# Supervised Classification with Word2Vec

We use the document vectors derived from Word2Vec to classify newspaper articles according to the type of crime they report. In this notebook, the classification is performed with supervised algorithms.

In [None]:
import pandas as pd
import numpy as np
from collections import Counter
from sklearn.metrics import classification_report, roc_auc_score, roc_curve
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier, AdaBoostClassifier, BaggingClassifier
from sklearn.naive_bayes import BernoulliNB, GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import LinearSVC, SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import SGDClassifier
from xgboost import XGBClassifier

In [None]:
configuration = 1    # possible values are: 1, 2, 3
# Name of the sub-folder (under the configuration folder) that holds the
# vectors CSV loaded below.
# NOTE(review): presumably the averaging scheme used to build the document
# vectors -- confirm against feature_extraction.ipynb.
mean = 'tfidf'

###
# configuration=1 --> P1: tokenization, stop word removal
# configuration=2 --> P2: tokenization, stop word removal, lemmatization
# configuration=3 --> P3: tokenization, stop word removal, lemmatization, keyphrase extraction
###

# Preprocessing flags implied by each configuration: (lemmatization, bigram).
_CONFIGURATION_FLAGS = {
    1: (False, False),
    2: (True, False),
    3: (True, True),
}

# Fail early on an unsupported value: otherwise the flags would stay undefined
# and `folder` would point at a directory that does not exist.
if configuration not in _CONFIGURATION_FLAGS:
    raise ValueError(
        "configuration must be one of {}, got {!r}".format(
            sorted(_CONFIGURATION_FLAGS), configuration
        )
    )

lemmatization, bigram = _CONFIGURATION_FLAGS[configuration]

# Root folder holding the artefacts produced for the selected configuration.
folder = "configuration_" + str(configuration)

Load the .csv file produced by the notebook named "feature_extraction.ipynb".

In [None]:
vectors = pd.read_csv(folder+"/"+mean+"/csv/retrained_vectors.csv")

In [None]:
vectors

In [None]:
vectors = vectors.reset_index(drop=True)

In [None]:
Y = vectors['target']
Y

In [None]:
label_to_index = {v: i for i, v in enumerate(dict(Counter(Y)).keys())}
label_to_index

In [None]:
index_to_label = {i: v  for i, v in enumerate(dict(Counter(Y)).keys())}
index_to_label

In [None]:
Y = Y.map(label_to_index)

In [None]:
Y = Y.values
Y

In [None]:
X = vectors.drop(columns=['url', 'title', 'newspaper', 'text', 'date', 'time', 'preprocessed', 'target']).values

In [None]:
X.shape

We split the dataset into a training set and a test set.

In [None]:
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.340, random_state=42)

In [None]:
X_train.shape

In [None]:
Y_train

In [None]:
Y_test

In [None]:
Y = np.vectorize(index_to_label.get)(Y_train)

In [None]:
Counter(Y)

In [None]:
Counter(Y_train)

In [None]:
Counter(Y_test)

# 1. Random Forest

In [None]:
%%time

forest = RandomForestClassifier(n_estimators = 100)
forest = forest.fit(X_train, Y_train)

In [None]:
result = forest.predict(X_test)

In [None]:
result

In [None]:
print(classification_report(np.vectorize(index_to_label.get)(Y_test), np.vectorize(index_to_label.get)(result)))

In [None]:
prediction = Counter(np.equal(result, Y_test))
prediction[True] / (prediction[True] + prediction[False])

# 2. ExtraTreesClassifier

In [None]:
%%time

etc = ExtraTreesClassifier(n_estimators=200)
etc = etc.fit(X_train, Y_train)

In [None]:
result = etc.predict(X_test)

In [None]:
print(classification_report(np.vectorize(index_to_label.get)(Y_test), np.vectorize(index_to_label.get)(result)))

In [None]:
prediction = Counter(np.equal(result, Y_test))
prediction[True] / (prediction[True] + prediction[False])

# 3. BernoulliNB

In [None]:
%%time

bnb = BernoulliNB()
bnb.fit(X_train, Y_train)

In [None]:
result = bnb.predict(X_test)

In [None]:
print(classification_report(np.vectorize(index_to_label.get)(Y_test), np.vectorize(index_to_label.get)(result)))

In [None]:
prediction = Counter(np.equal(result, Y_test))
prediction[True] / (prediction[True] + prediction[False])

In [None]:
confr3 = np.where(Y_test != result)

In [None]:
confr3[0].shape

# 4. Decision Tree

In [None]:
%%time

dtc = DecisionTreeClassifier(random_state=0)
dtc.fit(X_train, Y_train)

In [None]:
result = dtc.predict(X_test)

In [None]:
print(classification_report(np.vectorize(index_to_label.get)(Y_test), np.vectorize(index_to_label.get)(result)))

In [None]:
prediction = Counter(np.equal(result, Y_test))
prediction[True] / (prediction[True] + prediction[False])

# 5. GaussianNB

In [None]:
%%time

gnb = GaussianNB()
gnb.fit(X_train, Y_train)

In [None]:
result = gnb.predict(X_test)

In [None]:
print(classification_report(np.vectorize(index_to_label.get)(Y_test), np.vectorize(index_to_label.get)(result)))

In [None]:
confr5 = np.where(Y_test != result)

In [None]:
confr5[0].shape

# 6. KNeighborsClassifier

In [None]:
%%time

neigh = KNeighborsClassifier(n_neighbors=1)
neigh.fit(X_train, Y_train)

In [None]:
result = neigh.predict(X_test)

In [None]:
print(classification_report(np.vectorize(index_to_label.get)(Y_test), np.vectorize(index_to_label.get)(result)))

In [None]:
neigh3 = KNeighborsClassifier(n_neighbors=3)
neigh3.fit(X_train, Y_train)

In [None]:
result = neigh3.predict(X_test)

In [None]:
print(classification_report(np.vectorize(index_to_label.get)(Y_test), np.vectorize(index_to_label.get)(result)))

In [None]:
%%time

neigh5 = KNeighborsClassifier(n_neighbors=5)
neigh5.fit(X_train, Y_train)

In [None]:
result = neigh5.predict(X_test)

In [None]:
print(classification_report(np.vectorize(index_to_label.get)(Y_test), np.vectorize(index_to_label.get)(result)))

# 7. Linear SVC

In [None]:
svc = LinearSVC(random_state=0, tol=1e-5)
svc.fit(X_train, Y_train)

In [None]:
result = svc.predict(X_test)

In [None]:
print(classification_report(np.vectorize(index_to_label.get)(Y_test), np.vectorize(index_to_label.get)(result)))

# 8. SVC

In [None]:
scl = SVC()

In [None]:
scl.fit(X_train, Y_train)

In [None]:
result = scl.predict(X_test)

In [None]:
print(classification_report(np.vectorize(index_to_label.get)(Y_test), np.vectorize(index_to_label.get)(result)))

# 9. SGD Classifier

In [None]:
sgd = SGDClassifier(max_iter=1000, tol=1e-3)

In [None]:
sgd.fit(X_train, Y_train)

In [None]:
result = sgd.predict(X_test)

In [None]:
print(classification_report(np.vectorize(index_to_label.get)(Y_test), np.vectorize(index_to_label.get)(result)))

In [None]:
sgd2 = SGDClassifier(loss='perceptron', penalty='l1', max_iter=1000, tol=1e-3)

In [None]:
sgd2.fit(X_train, Y_train)

In [None]:
result = sgd2.predict(X_test)

In [None]:
print(classification_report(np.vectorize(index_to_label.get)(Y_test), np.vectorize(index_to_label.get)(result)))

# 10. AdaBoostClassifier 

In [None]:
adc = AdaBoostClassifier(DecisionTreeClassifier(random_state=0), n_estimators=100)

In [None]:
adc.fit(X_train, Y_train)

In [None]:
result = adc.predict(X_test)

In [None]:
print(classification_report(np.vectorize(index_to_label.get)(Y_test), np.vectorize(index_to_label.get)(result)))

# 11. BaggingClassifier

In [None]:
bc = BaggingClassifier(base_estimator=DecisionTreeClassifier(random_state=0), n_estimators=10, random_state=0)

In [None]:
bc.fit(X_train, Y_train)

In [None]:
result = bc.predict(X_test)

In [None]:
print(classification_report(np.vectorize(index_to_label.get)(Y_test), np.vectorize(index_to_label.get)(result)))

In [None]:
bc2 = BaggingClassifier(base_estimator=KNeighborsClassifier(n_neighbors=5), n_estimators=10, random_state=0)

In [None]:
bc2.fit(X_train, Y_train)

In [None]:
result = bc2.predict(X_test)

In [None]:
print(classification_report(np.vectorize(index_to_label.get)(Y_test), np.vectorize(index_to_label.get)(result)))

# 12. XGB Classifier

In [None]:
model = XGBClassifier(objective='multi:softmax')

In [None]:
model.fit(X_train, Y_train)

In [None]:
result = model.predict(X_test)

In [None]:
print(classification_report(np.vectorize(index_to_label.get)(Y_test), np.vectorize(index_to_label.get)(result)))

In [None]:
prediction = Counter(np.equal(result, Y_test))
prediction[True] / (prediction[True] + prediction[False])