In [4]:
from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score
from sklearn import svm
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
import numpy as np
import pandas as pd
from imblearn.over_sampling import SMOTE

In [5]:
df = pd.read_csv("../data/dataset_citizen_participation_training_extended.csv", sep=";")

In [6]:
labels_over_500_entries = df["label"].value_counts()[df["label"].value_counts() >= 500].index

In [7]:
df_over_500_entries = df[df["label"].isin(labels_over_500_entries)]
df_over_500_entries

Unnamed: 0,label,sentence
0,0,"Veranstaltungen, Entspannung"
1,0,denkabr wäre Kunst und
2,0,mehr Entspannungs- und Freizeitmöglichkeiten
3,0,"Freizeitmöglichkeit mit Wiese, eventuell Parkä..."
4,0,Nutzungsraum für Feste und Veranstaltungen im ...
...,...,...
30438,8,"Ich finde, dass wir in unserer Stadt dringend ..."
30439,8,"„Ich finde, dass wir mehr Mittel in die Aussta..."
30440,8,"Ich finde, dass wir dringend mehr Spiel- und L..."
30441,8,"Ich finde, dass wir dringend mehr Plätze in Ki..."


In [8]:
df_over_500_entries["label"].value_counts()

label
3    7558
2    3511
5    3316
4    2784
8    2555
6    2237
9    2188
7    2088
0    2042
1    1753
Name: count, dtype: int64

In [9]:
import spacy

nlp = spacy.load("de_core_news_lg")

In [10]:
def lemma_no_punct_no_stop(sentence):
    return " ".join([token.lemma_ for token in nlp(sentence) if not (token.is_punct or token.is_stop)])

In [11]:
df_over_500_entries["lemmatized_sentence"] = df_over_500_entries["sentence"].apply(lemma_no_punct_no_stop)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_over_500_entries["lemmatized_sentence"] = df_over_500_entries["sentence"].apply(lemma_no_punct_no_stop)


In [12]:
df_over_500_entries.head()

Unnamed: 0,label,sentence,lemmatized_sentence
0,0,"Veranstaltungen, Entspannung",Veranstaltung Entspannung
1,0,denkabr wäre Kunst und,Denkabr Kunst
2,0,mehr Entspannungs- und Freizeitmöglichkeiten,Entspannung Freizeitmöglichkeit
3,0,"Freizeitmöglichkeit mit Wiese, eventuell Parkä...",Freizeitmöglichkeit Wiese eventuell Parkähnlich
4,0,Nutzungsraum für Feste und Veranstaltungen im ...,Nutzungsraum Fest Veranstaltung freie


In [13]:
def vectorize(sentence):
    return nlp(sentence).vector

In [14]:
df_over_500_entries["vectorized_sentence"] = df_over_500_entries["lemmatized_sentence"].apply(vectorize)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_over_500_entries["vectorized_sentence"] = df_over_500_entries["lemmatized_sentence"].apply(vectorize)


In [15]:
df_test = pd.read_csv("../data/output_file_citizen_unique_testdata.csv", sep=";")

In [18]:
df_test["lemmatized_new_sentences"] = df_test["new_sentences"].apply(lemma_no_punct_no_stop)

In [19]:
df_test["vectorized_new_sentences"] = df_test["lemmatized_new_sentences"].apply(vectorize)

In [20]:
df_test.head()

Unnamed: 0,new_labels,new_sentences,lemmatized_new_sentences,vectorized_new_sentences
0,9,nix,nix,"[-4.8072, 1.9515, 9.36, -1.1409, -2.6833, 5.60..."
1,9,keine,,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
2,9,i.O.?,i.O.,"[-3.0574, 1.1077, 2.1435, -3.6359, -0.94273, 0..."
3,9,Keine.,,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
4,9,schild,Schild,"[-1.1334, -0.88104, 0.49688, -1.1014, 3.2141, ..."


In [21]:
df_over_500_entries.head()

Unnamed: 0,label,sentence,lemmatized_sentence,vectorized_sentence
0,0,"Veranstaltungen, Entspannung",Veranstaltung Entspannung,"[0.52938, 0.755215, 0.09990001, 0.75145, -1.07..."
1,0,denkabr wäre Kunst und,Denkabr Kunst,"[-0.92885, 0.8081, 1.00135, 0.5814, -1.2075, -..."
2,0,mehr Entspannungs- und Freizeitmöglichkeiten,Entspannung Freizeitmöglichkeit,"[-0.53087, -0.029850006, 0.53752, 1.377835, -2..."
3,0,"Freizeitmöglichkeit mit Wiese, eventuell Parkä...",Freizeitmöglichkeit Wiese eventuell Parkähnlich,"[-0.107137516, -0.61556005, -0.090515, -0.4693..."
4,0,Nutzungsraum für Feste und Veranstaltungen im ...,Nutzungsraum Fest Veranstaltung freie,"[-0.15308248, -0.776075, -0.36971247, 0.4225, ..."


In [22]:
X = np.vstack(df_over_500_entries["vectorized_sentence"])
y = df_over_500_entries["label"]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

knn_model = KNeighborsClassifier(n_neighbors=5)

knn_model.fit(X_train, y_train)

y_pred = knn_model.predict(X_test)

acc = accuracy_score(y_test, y_pred)

print(acc)

0.7467954053604129


In [23]:
smote = SMOTE(random_state=42)

# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

# X = np.vstack(df_over_500_entries["vectorized_sentence"])
# y = df_over_500_entries["label"]

X_resampled, y_resampled = smote.fit_resample(X, y)

X_test = np.vstack(df_test["vectorized_new_sentences"])
y_test = df_test["new_labels"]

In [24]:
y_resampled.value_counts()

label
0    7558
1    7558
2    7558
3    7558
4    7558
5    7558
6    7558
7    7558
8    7558
9    7558
Name: count, dtype: int64

In [25]:
import xgboost as xgb

knn_model = KNeighborsClassifier(n_neighbors=5)
svm_model = svm.SVC()
xgbc = xgb.XGBClassifier(learning_rate=0.3, max_depth=6, n_estimators=200, subsample=1.0, colsample_bytree=1.0, reg_lambda=1)

In [26]:
def test_models(models: list, X_train, y_train, X_test ,y_test):
	for model in models:
		model.fit(X_train, y_train)
		y_pred = model.predict(X_test)
		print(model)
		print(accuracy_score(y_test, y_pred))

In [27]:
test_models([xgbc], X_resampled, y_resampled, X_test, y_test)

XGBClassifier(base_score=None, booster=None, callbacks=None,
              colsample_bylevel=None, colsample_bynode=None,
              colsample_bytree=1.0, device=None, early_stopping_rounds=None,
              enable_categorical=False, eval_metric=None, feature_types=None,
              gamma=None, grow_policy=None, importance_type=None,
              interaction_constraints=None, learning_rate=0.3, max_bin=None,
              max_cat_threshold=None, max_cat_to_onehot=None,
              max_delta_step=None, max_depth=6, max_leaves=None,
              min_child_weight=None, missing=nan, monotone_constraints=None,
              multi_strategy=None, n_estimators=200, n_jobs=None,
              num_parallel_tree=None, objective='multi:softprob', ...)
0.9233333333333333
