In [1]:
from sklearn.feature_extraction.text import TfidfVectorizer

import sklearn

from sklearn.neural_network import MLPClassifier
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.svm import SVC

from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB, MultinomialNB, ComplementNB, BernoulliNB, CategoricalNB

from IPython.display import clear_output

import pandas as pd
import numpy as np
import time
import string
from tqdm import notebook
from tqdm.auto import tqdm

import warnings
warnings.filterwarnings("ignore")

# Helpers

In [2]:
def progress(done, left, start):
    """Redraw a text progress bar with elapsed time and advance the counters.

    Clears the current cell output, prints a bar made of ``done`` '#'
    characters followed by ``left`` spaces together with the completion
    percentage, then prints the wall-clock time elapsed since ``start``
    formatted as HH:MM:SS.

    Args:
        done: Number of steps finished so far.
        left: Number of steps still to run.
        start: ``time.time()`` timestamp taken when the run began.

    Returns:
        Tuple ``(done + 1, left - 1)`` to pass into the next call.
    """
    # Replace the previously printed bar instead of appending a new one.
    clear_output(wait=True)

    total = done + left
    # Guard against an empty run so the percentage never divides by zero.
    percent = round(done / total * 100, 2) if total else 100.0
    print(f"[{'#' * done}{' ' * left}] {percent}% ({done}/{total})")

    # divmod keeps minutes within 0-59; a plain `elapsed // 60` would keep
    # counting past the hour (e.g. 01:62:05 instead of 01:02:05).
    elapsed = int(time.time() - start)
    hours, remainder = divmod(elapsed, 3600)
    minutes, seconds = divmod(remainder, 60)
    print(f"""Time passed = {hours:02d}:{minutes:02d}:{seconds:02d}""")

    return done + 1, left - 1


# Load data

In [3]:
# Read the training videos; the "Unnamed: 0" column is dropped right away
# (presumably a row index written by an earlier export - verify against the CSV).
equal = pd.read_csv("../data/training_videos.csv").drop(columns="Unnamed: 0")

# Assign the filled column back instead of calling fillna(inplace=True) on the
# selected column: that chained in-place call is deprecated and leaves `equal`
# untouched when pandas copy-on-write is active.
equal["full_text"] = equal["full_text"].fillna("")

equal.head()

Unnamed: 0,channel,video_id,conspiracy,full_text
0,UCH9gafn41oPg7s6bQwiMwTg,hgOdDCd6N10,False,islam beat femin right end stone begin motherf...
1,UCHr26UnUE_FHYtAXc_ddfgg,um-kBMGmW5k,False,24 hour burn man 2019 micro edit littl tast bu...
2,UCsu6NM_ARGWzDwL1C9xEhqA,9Z2CMOdLoB8,False,polic reform time turmoil injustic world cri o...
3,UCTjxmzChimJa3X_rAgLAnxg,GNJny3dtmoQ,False,career job career support video-mak donat matt...
4,UCaoSDiNkFmGQfvhvZMWxOHw,lOoCe0OZgvc,False,socrat abl predict civil unrest comment curren...


# Vectorize

In [4]:
# Turn the `full_text` column into TF-IDF features.
# Terms found in more than 75% of the documents, or in fewer than 2 documents, are ignored.
# NOTE(review): the vectorizer is fitted on every row, before the train/valid/test split below.
v = TfidfVectorizer(min_df=2, max_df=0.75)
x = v.fit_transform(equal["full_text"])

x

<44312x134844 sparse matrix of type '<class 'numpy.float64'>'
	with 22015145 stored elements in Compressed Sparse Row format>

In [5]:
# Dimensions of the TF-IDF matrix: (number of documents, number of vocabulary terms).
x.shape

(44312, 134844)

In [6]:
# Features are the TF-IDF matrix; labels are the `conspiracy` column as a flat array.
X = x
Y = equal["conspiracy"].values.ravel()

# Hold out 10% of all rows as the test set.
X_trainval, X_test, y_trainval, y_test = sklearn.model_selection.train_test_split(
    X, Y, test_size=0.1, random_state=0
)

# Split the remaining 90% again: 90% of it for training, the rest for validation.
X_train, X_valid, y_train, y_valid = sklearn.model_selection.train_test_split(
    X_trainval, y_trainval, train_size=0.9, random_state=0
)

# K-nearest neighbors

In [21]:
# Validation metrics recorded for every k, in the column order of the result table.
scorers = {
    "Accuracy": sklearn.metrics.accuracy_score,
    "Precision": sklearn.metrics.precision_score,
    "Recall": sklearn.metrics.recall_score,
    "F1": sklearn.metrics.f1_score,
}
measures = {"K": [], **{name: [] for name in scorers}}

# Fit one k-nearest-neighbours classifier per k in 1..10 and score it on the validation split.
for k in notebook.tqdm(range(1, 11)):
    knn = KNeighborsClassifier(n_neighbors=k)
    knn.fit(X_train, y_train)

    y_pred = knn.predict(X_valid)

    measures["K"].append(k)
    for name, scorer in scorers.items():
        measures[name].append(scorer(y_valid, y_pred))

df_knn = pd.DataFrame(measures)
df_knn

  0%|          | 0/10 [00:00<?, ?it/s]

Unnamed: 0,K,Accuracy,Precision,Recall,F1
0,1,0.707874,0.646959,0.941032,0.766767
1,2,0.701354,0.650822,0.89484,0.753569
2,3,0.652708,0.601817,0.94398,0.73503
3,4,0.822217,0.822785,0.830467,0.826608
4,5,0.805416,0.75332,0.919902,0.828319
5,6,0.819458,0.813543,0.838329,0.82575
6,7,0.801655,0.755127,0.904668,0.823161
7,8,0.810181,0.796382,0.843735,0.819375
8,9,0.793882,0.753872,0.885012,0.814195
9,10,0.802658,0.784932,0.844717,0.813728


# Naïve Bayes  
### (Currently disabled: converting the sparse matrix to a dense array requires too much memory)

In [1]:
# NOTE(review): this whole cell is intentionally commented out.
# It densifies the sparse TF-IDF matrices with .toarray(); at the shape shown
# earlier (44312 x 134844, float64) the training split alone would need
# roughly 39 GB of memory.
# Presumably only GaussianNB needs dense input and the other variants could be
# fitted on the sparse matrices directly - TODO confirm against the
# scikit-learn documentation before re-enabling.

# X_train_dense = X_train.toarray()

# measures = {"NB":[], "Accuracy":[], "Precision":[],"Recall":[], "F1":[]}

# for i in notebook.tqdm([GaussianNB(), MultinomialNB(), ComplementNB(), BernoulliNB()]):
#     nb = i.fit(X_train_dense, y_train)
#     y_pred = nb.predict(X_valid.toarray())

#     measures["NB"].append(str(i))
#     measures["Accuracy"].append(sklearn.metrics.accuracy_score(y_valid, y_pred))
#     measures["Precision"].append(sklearn.metrics.precision_score(y_valid, y_pred))
#     measures["Recall"].append(sklearn.metrics.recall_score(y_valid, y_pred))
#     measures["F1"].append(sklearn.metrics.f1_score(y_valid, y_pred))
    
# df_nb = pd.DataFrame(measures)
# df_nb

# Neural Network

## Hyperparameter tuning

In [None]:
# Grid search over MLP activation function, depth and width, scored on the validation split.
measures = {"Activation function":[], "# of hidden layers":[], "Neurons per layer":[],
            "Accuracy":[], "Precision":[],"Recall":[], "F1":[]}

activations = ['identity', 'logistic', 'tanh', 'relu']
layer_counts = [1, 10, 25]
neuron_counts = [1, 10, 25]

# Progress bookkeeping; the total is derived from the grid so it cannot drift out of sync.
done = 0
left = len(activations) * len(layer_counts) * len(neuron_counts)
start = time.time()

for act in activations:
    for layers in layer_counts:
        for neurons in neuron_counts:
            # Progress bar
            done, left = progress(done, left, start)

            # Fit a network with `layers` hidden layers of `neurons` units each
            mlp = MLPClassifier(hidden_layer_sizes=[neurons] * layers, activation=act,
                                random_state=0).fit(X_train, y_train)

            # Predict once, on the validation split only: the test split must stay
            # untouched during tuning, and all four metrics share this prediction.
            y_pred = mlp.predict(X_valid)

            measures["Activation function"].append(act)
            measures["# of hidden layers"].append(layers)
            measures["Neurons per layer"].append(neurons)

            measures["Accuracy"].append(sklearn.metrics.accuracy_score(y_valid, y_pred))
            measures["Precision"].append(sklearn.metrics.precision_score(y_valid, y_pred))
            measures["Recall"].append(sklearn.metrics.recall_score(y_valid, y_pred))
            measures["F1"].append(sklearn.metrics.f1_score(y_valid, y_pred))

df_nn = pd.DataFrame(measures)
df_nn

## Optimal configuration

In [7]:
# Fit the single-neuron network once per activation function and print
# accuracy, precision, recall and F1 on the validation split for each.
# The activation is looped over explicitly so the cell no longer depends on an
# `act` variable left behind by the tuning loop (NameError on a fresh kernel).
for act in ['identity', 'logistic', 'tanh', 'relu']:
    mlp = MLPClassifier(hidden_layer_sizes=[1], activation=act,
                        random_state=0).fit(X_train, y_train)

    y_pred = mlp.predict(X_valid)

    print(f"{act}:")
    print(sklearn.metrics.accuracy_score(y_valid, y_pred))
    print(sklearn.metrics.precision_score(y_valid, y_pred))
    print(sklearn.metrics.recall_score(y_valid, y_pred))
    print(sklearn.metrics.f1_score(y_valid, y_pred))
    print()

identity:
0.8706118355065195
0.8743223262690981
0.8717444717444718
0.8730314960629922

logistic:
0.48971915747241723
0.0
0.0
0.0

tanh:
0.8570712136409228
0.8762198253723678
0.8383292383292383
0.8568558513309894

relu:
0.8610832497492478
0.866765725606736
0.85995085995086
0.863344844597928



# Linear Regression

In [None]:
# Fit ordinary least squares on the labels encoded as 0.0 / 1.0.
linr = LinearRegression(n_jobs=-1).fit(X_train, y_train.astype(float))

# The regression returns continuous scores, which the classification metrics
# below reject; threshold at 0.5 to obtain boolean class predictions.
y_pred = linr.predict(X_valid) >= 0.5

print(sklearn.metrics.accuracy_score(y_valid, y_pred))
print(sklearn.metrics.precision_score(y_valid, y_pred))
print(sklearn.metrics.recall_score(y_valid, y_pred))
print(sklearn.metrics.f1_score(y_valid, y_pred))

# Logistic Regression

In [7]:
# Grid search over solver, penalty and C for logistic regression, scored on the validation split.
measures = {"Penalty":[], "C":[], "Solver":[], "Accuracy":[], "Precision":[],"Recall":[], "F1":[]}

solvers = ['newton-cg', 'lbfgs', 'sag', 'saga']
penalties = ['l1', 'l2', 'none']
c_values = [1, 10, 20]

# Combinations that scikit-learn refused, kept for inspection instead of being dropped silently.
skipped = []

done = 0
left = len(solvers) * len(penalties) * len(c_values)
start = time.time()

for solver in solvers:
    for pen in penalties:
        for C in c_values:
            # Print progress
            done, left = progress(done, left, start)

            try:
                # Train solver
                logr = LogisticRegression(penalty=pen,
                                          C=C,
                                          solver=solver,
                                          n_jobs=-1).fit(X_train, y_train)
            except ValueError as err:
                # Not every solver supports every penalty; only that rejection is
                # tolerated. A bare `except` would also swallow KeyboardInterrupt
                # and hide genuine bugs.
                skipped.append((solver, pen, C, str(err)))
                continue

            # Predict
            y_pred = logr.predict(X_valid)

            measures["Solver"].append(solver)
            measures["Penalty"].append(pen)
            measures["C"].append(C)

            measures["Accuracy"].append(sklearn.metrics.accuracy_score(y_valid, y_pred))
            measures["Precision"].append(sklearn.metrics.precision_score(y_valid, y_pred))
            measures["Recall"].append(sklearn.metrics.recall_score(y_valid, y_pred))
            measures["F1"].append(sklearn.metrics.f1_score(y_valid, y_pred))

if skipped:
    print(f"Skipped {len(skipped)} unsupported solver/penalty combinations (see `skipped`)")

df_logr = pd.DataFrame(measures)
df_logr

[############################################### ] 97.92% (47/48)
Time passed = 00:18:48


Unnamed: 0,Penalty,C,Solver,Accuracy,Precision,Recall,F1
0,l1,0.1,saga,0.732949,0.762162,0.692875,0.725869
1,l1,1.0,saga,0.827232,0.844422,0.810811,0.827275
2,l1,10.0,saga,0.861334,0.871988,0.853563,0.862677
3,l2,0.1,newton-cg,0.794634,0.819328,0.766585,0.792079
4,l2,0.1,lbfgs,0.794634,0.819328,0.766585,0.792079
5,l2,0.1,sag,0.794634,0.819328,0.766585,0.792079
6,l2,0.1,saga,0.794634,0.819328,0.766585,0.792079
7,l2,1.0,newton-cg,0.845035,0.859098,0.832924,0.845808
8,l2,1.0,lbfgs,0.845035,0.859098,0.832924,0.845808
9,l2,1.0,sag,0.845035,0.859098,0.832924,0.845808


# Support-vector machine

In [None]:
# Grid search over SVM kernel and C, scored on the validation split.
measures = {"Kernel":[], "C":[], "Accuracy":[], "Precision":[], "Recall":[], "F1":[]}

kernels = ["linear", "poly", "rbf", "sigmoid"]
c_values = [0.1, 1, 10, 100]

done = 0
left = len(kernels) * len(c_values)
start = time.time()

for kernel in kernels:
    for C in c_values:
        # Use the shared progress helper, as the other grid searches do, instead
        # of a hand-rolled copy that printed raw float minutes (never wrapped at
        # 60) and a counter lagging one step behind.
        done, left = progress(done, left, start)

        # Each fit is capped at 500 solver iterations.
        svm = SVC(C=C, kernel=kernel, verbose=True, max_iter=500).fit(X_train, y_train)

        y_pred = svm.predict(X_valid)

        measures["Kernel"].append(kernel)
        measures["C"].append(C)

        measures["Accuracy"].append(sklearn.metrics.accuracy_score(y_valid, y_pred))
        measures["Precision"].append(sklearn.metrics.precision_score(y_valid, y_pred))
        measures["Recall"].append(sklearn.metrics.recall_score(y_valid, y_pred))
        measures["F1"].append(sklearn.metrics.f1_score(y_valid, y_pred))


df_svm = pd.DataFrame(measures)
df_svm

[LibSVM]

In [20]:
# Time a single RBF-kernel SVM fit (C=1, at most 15000 solver iterations) and print the seconds taken.
t0 = time.time()
svm = SVC(kernel="rbf", C=1, max_iter=15000)
svm.fit(X_train, y_train)
print(time.time() - t0)

y_pred = svm.predict(X_valid)

# Accuracy, precision, recall and F1 on the validation split, one value per line.
for metric in (
    sklearn.metrics.accuracy_score,
    sklearn.metrics.precision_score,
    sklearn.metrics.recall_score,
    sklearn.metrics.f1_score,
):
    print(metric(y_valid, y_pred))

2453.074837207794
0.8753761283851554
0.8968008255933952
0.8540540540540541
0.8749056128869871
