## Start

In [3]:
import pandas as pd

In [4]:
df = pd.read_csv("../data/dataset_citizen_participation_training_extended.csv", sep=";")

In [5]:
labels_over_500_entries = df["label"].value_counts()[df["label"].value_counts() >= 500].index

In [6]:
df_over_500_entries = df[df["label"].isin(labels_over_500_entries)]
df_over_500_entries

Unnamed: 0,label,sentence
0,0,"Veranstaltungen, Entspannung"
1,0,denkabr wäre Kunst und
2,0,mehr Entspannungs- und Freizeitmöglichkeiten
3,0,"Freizeitmöglichkeit mit Wiese, eventuell Parkä..."
4,0,Nutzungsraum für Feste und Veranstaltungen im ...
...,...,...
30438,8,"Ich finde, dass wir in unserer Stadt dringend ..."
30439,8,"„Ich finde, dass wir mehr Mittel in die Aussta..."
30440,8,"Ich finde, dass wir dringend mehr Spiel- und L..."
30441,8,"Ich finde, dass wir dringend mehr Plätze in Ki..."


In [7]:
import spacy

nlp = spacy.load("de_core_news_lg")

In [8]:
def lemma_no_punct_no_stop(sentence):
    return " ".join([token.lemma_ for token in nlp(sentence) if not (token.is_punct or token.is_stop)])

In [9]:
df_over_500_entries["lemmatized_sentence"] = df_over_500_entries["sentence"].apply(lemma_no_punct_no_stop)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_over_500_entries["lemmatized_sentence"] = df_over_500_entries["sentence"].apply(lemma_no_punct_no_stop)


In [33]:
import numpy as np
from gensim.models import KeyedVectors

model = KeyedVectors.load_word2vec_format('w2c_german.model', binary=True)
def vectorizer_word_2_vec(sentence):
    word_vectors = [model[token] for token in sentence if token in model]
    
    if not word_vectors:
        return np.zeros(model.vector_size)
    
    return np.mean(word_vectors, axis=0)


In [36]:
df_over_500_entries["vectorized_sentence"] = df_over_500_entries["lemmatized_sentence"].apply(vectorizer_word_2_vec)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_over_500_entries["vectorized_sentence"] = df_over_500_entries["lemmatized_sentence"].apply(vectorizer_word_2_vec)


In [50]:
df_over_500_entries.head()

Unnamed: 0,label,sentence,lemmatized_sentence,vectorized_sentence
0,0,"Veranstaltungen, Entspannung",Veranstaltung Entspannung,"[-0.16233182, 0.05000949, 0.061412875, 0.20537..."
1,0,denkabr wäre Kunst und,Denkabr Kunst,"[-0.050729413, 0.092643626, 0.017704606, 0.177..."
2,0,mehr Entspannungs- und Freizeitmöglichkeiten,Entspannung Freizeitmöglichkeit,"[-0.19545369, 0.09129564, 0.0472225, 0.1664562..."
3,0,"Freizeitmöglichkeit mit Wiese, eventuell Parkä...",Freizeitmöglichkeit Wiese eventuell Parkähnlich,"[-0.13682537, 0.10294958, 0.01084141, 0.164365..."
4,0,Nutzungsraum für Feste und Veranstaltungen im ...,Nutzungsraum Fest Veranstaltung freie,"[-0.13489759, 0.073086545, 0.021854544, 0.1971..."


In [38]:
df_test = pd.read_csv("../data/output_file_citizen_unique_testdata.csv", sep=";")

In [51]:
df_test["lemmatized_new_sentences"] = df_test["new_sentences"].apply(lemma_no_punct_no_stop)

In [52]:
df_test["vectorized_new_sentences"] = df_test["lemmatized_new_sentences"].apply(vectorizer_word_2_vec)

In [53]:
from imblearn.over_sampling import SMOTE

smote = SMOTE(random_state=42)

X = np.vstack(df_over_500_entries["vectorized_sentence"])
y = df_over_500_entries["label"]

X_resampled, y_resampled = smote.fit_resample(X, y)

X_test = np.vstack(df_test["vectorized_new_sentences"])
y_test = df_test["new_labels"]

In [54]:
y_resampled.value_counts()

label
0    7558
1    7558
2    7558
3    7558
4    7558
5    7558
6    7558
7    7558
8    7558
9    7558
Name: count, dtype: int64

In [55]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn import svm
from sklearn.metrics import accuracy_score
import xgboost as xgb

knn_model = KNeighborsClassifier(n_neighbors=5)
svm_model = svm.SVC()
xgbc = xgb.XGBClassifier(learning_rate=0.3, max_depth=6, n_estimators=200, subsample=1.0, colsample_bytree=1.0, reg_lambda=1)

In [56]:
def test_models(models: list, X_train, y_train, X_test ,y_test):
	for model in models:
		model.fit(X_train, y_train)
		y_pred = model.predict(X_test)
		print(model)
		print(accuracy_score(y_test, y_pred))

In [57]:
test_models([xgbc], X_resampled, y_resampled, X_test, y_test)

XGBClassifier(base_score=None, booster=None, callbacks=None,
              colsample_bylevel=None, colsample_bynode=None,
              colsample_bytree=1.0, device=None, early_stopping_rounds=None,
              enable_categorical=False, eval_metric=None, feature_types=None,
              gamma=None, grow_policy=None, importance_type=None,
              interaction_constraints=None, learning_rate=0.3, max_bin=None,
              max_cat_threshold=None, max_cat_to_onehot=None,
              max_delta_step=None, max_depth=6, max_leaves=None,
              min_child_weight=None, missing=nan, monotone_constraints=None,
              multi_strategy=None, n_estimators=200, n_jobs=None,
              num_parallel_tree=None, objective='multi:softprob', ...)
0.8748148148148148
