In [1]:
from sklearn.feature_extraction.text import TfidfVectorizer

import sklearn

from sklearn.neural_network import MLPClassifier
from sklearn.linear_model import LogisticRegression, RidgeClassifier
from sklearn.svm import SVC

from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB, MultinomialNB, ComplementNB, BernoulliNB, CategoricalNB

from IPython.display import clear_output

import pandas as pd
import numpy as np
import time
import string
from tqdm import notebook
from tqdm.auto import tqdm

import warnings
warnings.filterwarnings("ignore")

# Helpers

In [2]:
def progress(done, left, start):
    # Print bar
    clear_output(wait=True)
    print(f"[{'#' * done}{' ' * left}] {round(done/(done+left) * 100, 2)}% ({done}/{done+left})")
    
    # Bereken gepasseerde tijd
    current = time.time()
    hours = int((current - start) // 3600)
    minutes = int((current - start) % 3600 // 60)
    seconds = int((current - start) % 60)
    
    # Print tijd
    print(f"""Time passed = {hours:02d}:{minutes:02d}:{seconds:02d}""")
    
    new_done = done + 1
    new_left = left - 1
    
    return new_done, new_left


# Laad data in

In [3]:
equal = pd.read_csv("../data/training_videos.csv").drop("Unnamed: 0", axis = 1)
equal["full_text"].fillna("", inplace=True)

equal.head()

Unnamed: 0,channel,video_id,conspiracy,full_text
0,UCyuAKnN3g2fZ7_R9irgEUZQ,SECdW8n4ZGw,False,abraham lincoln came power abraham lincoln com...
1,UCAtdV6VWa02UpCB2YHuVmHg,jSXIrZx8lI4,False,thanksgiv celebr announc union rescu mission n...
2,UCASQzDb4SUC0gZNPbwBubIQ,s0YjJVyyVjI,False,unc health s mobil clinic aim reduc racial eth...
3,UC_giJ3xlEL9jUF1YfJdzzuQ,VMHCvdRdq10,False,establish impur accept criteria part spec dmf ...
4,UCT8RMFbTJV5ILaVykrluOQg,KKiR4FWOZ4w,False,ten panda cub make public debut ahead spring f...


# Vectorize

In [4]:
# Vectorize words
v = TfidfVectorizer(max_df=.75, min_df=2)
x = v.fit_transform(equal['full_text'])

x

<44312x140970 sparse matrix of type '<class 'numpy.float64'>'
	with 23152699 stored elements in Compressed Sparse Row format>

In [5]:
x.shape

(44312, 140970)

In [6]:
X = x
Y = equal['conspiracy'].values.ravel()

# 10% of the data will be the test set
X_trainval, X_test, y_trainval, y_test = sklearn.model_selection.train_test_split(X, Y, test_size=0.1,\
                                                                                  random_state=0)

# of the remaining 90%, 90% will be training set
X_train, X_valid, y_train, y_valid = sklearn.model_selection.train_test_split(X_trainval, y_trainval,\
                                                                              train_size=0.9,\
                                                                              random_state=0)

# Hyperparameter tuning

## K-nearest neighbors

In [9]:
measures = {"K":[], "Accuracy":[], "Precision":[],"Recall":[], "F1":[]}

for k in notebook.tqdm(range(1, 16)):
    knn = KNeighborsClassifier(k).fit(X_train, y_train)

    y_pred = knn.predict(X_valid)

    measures["K"].append(k)
    measures["Accuracy"].append(sklearn.metrics.accuracy_score(y_valid, y_pred))
    measures["Precision"].append(sklearn.metrics.precision_score(y_valid, y_pred))
    measures["Recall"].append(sklearn.metrics.recall_score(y_valid, y_pred))
    measures["F1"].append(sklearn.metrics.f1_score(y_valid, y_pred))

df_knn = pd.DataFrame(measures)
df_knn.to_csv("k-nearest_neighbors.csv")

  0%|          | 0/15 [00:00<?, ?it/s]

## Neural Network

In [7]:
measures = {"Activation function":[], "# of hidden layers":[], "Neurons per layer":[],
            "Accuracy":[], "Precision":[],"Recall":[], "F1":[]}

done = 0
left = 4*3*3
start = time.time()

for act in ['identity', 'logistic', 'tanh', 'relu']:
    for layers in [1, 10, 25]:
        for neurons in [1, 10, 20]:
            # Progress bar
            done, left = progress(done, left, start)
            
            # Fit
            mlp = MLPClassifier(hidden_layer_sizes=[neurons]*layers, activation=act,\
                    random_state=0).fit(X_train, y_train)

            # Predict en sla waardes op
            y_pred = mlp.predict(X_test)
            measures["Activation function"].append(act)
            measures["# of hidden layers"].append(layers)
            measures["Neurons per layer"].append(neurons)
            
            measures["Accuracy"].append(sklearn.metrics.accuracy_score(y_valid, mlp.predict(X_valid)))
            measures["Precision"].append(sklearn.metrics.precision_score(y_valid, mlp.predict(X_valid)))
            measures["Recall"].append(sklearn.metrics.recall_score(y_valid, mlp.predict(X_valid)))
            measures["F1"].append(sklearn.metrics.f1_score(y_valid, mlp.predict(X_valid)))
            
df_nn = pd.DataFrame(measures)
df_nn.to_csv("neural_network.csv")

[################################### ] 97.22% (35/36)
Time passed = 08:18:00


## Support-vector machine

In [None]:
measures = {"Kernel":[], "C":[], "Accuracy":[], "Precision":[], "Recall":[], "F1":[]}

done = 0
left = 4*4
start = time.time()

for kernel in ["linear", "poly", "rbf", "sigmoid"]:
    for C in [0.1, 1, 10, 100]:       
        # Print progress
        done, left = progress(done, left, start)
        
        # Train
        svm = SVC(C=C, kernel=kernel, max_iter = 10000).fit(X_train, y_train)
        
        # Predict
        y_pred = svm.predict(X_valid)

        # Store
        measures["Kernel"].append(kernel)
        measures["C"].append(C)

        measures["Accuracy"].append(sklearn.metrics.accuracy_score(y_valid, y_pred))
        measures["Precision"].append(sklearn.metrics.precision_score(y_valid, y_pred))
        measures["Recall"].append(sklearn.metrics.recall_score(y_valid, y_pred))
        measures["F1"].append(sklearn.metrics.f1_score(y_valid, y_pred))
        
df_svm = pd.DataFrame(measures)
df_svm.to_csv("support-vector_machine.csv")

[                ] 0.0% (0/16)
Time passed = 00:00:00


## Logistic Regression

In [9]:
measures = {"Penalty":[], "C":[], "Solver":[], "Accuracy":[], "Precision":[],"Recall":[], "F1":[]}

done = 0
left = 4*3*3
start = time.time()

for solver in ['newton-cg', 'lbfgs', 'sag', 'saga']:
    for pen in ['l1', 'l2', 'none']:
        for C in [1, 10, 20]:
            try:
                # Print progress
                done, left = progress(done, left, start)
                
                # Train solver
                logr = LogisticRegression(penalty=pen, 
                                          C=C, 
                                          solver=solver,
                                          n_jobs=-1).fit(X_train, y_train)

                # Predict
                y_pred = logr.predict(X_valid)

                measures["Solver"].append(solver)
                measures["Penalty"].append(pen)
                measures["C"].append(C)

                measures["Accuracy"].append(sklearn.metrics.accuracy_score(y_valid, y_pred))
                measures["Precision"].append(sklearn.metrics.precision_score(y_valid, y_pred))
                measures["Recall"].append(sklearn.metrics.recall_score(y_valid, y_pred))
                measures["F1"].append(sklearn.metrics.f1_score(y_valid, y_pred))
            except:
                pass
            
df_logr = pd.DataFrame(measures)
df_logr.to_csv("logistic_regression.csv")

[################################### ] 97.22% (35/36)
Time passed = 00:20:07


## Ridge Classification

In [10]:
measures = {"Solver":[], "Alpha":[], "Accuracy":[], "Precision":[], "Recall":[], "F1":[]}

done = 0
left = 3*4
start = time.time()

for solver in ['auto', 'sparse_cg', 'sag']:
    for alpha in [0.1, 1, 10, 100]:
        # Print progress
        done, left = progress(done, left, start)
        
        ridge = RidgeClassifier(solver=solver, alpha=alpha).fit(X_train, y_train)

        y_pred = ridge.predict(X_valid)
        
        measures["Solver"].append(solver)
        measures["Alpha"].append(alpha)

        measures["Accuracy"].append(sklearn.metrics.accuracy_score(y_valid, y_pred))
        measures["Precision"].append(sklearn.metrics.precision_score(y_valid, y_pred))
        measures["Recall"].append(sklearn.metrics.recall_score(y_valid, y_pred))
        measures["F1"].append(sklearn.metrics.f1_score(y_valid, y_pred))

df_ridge = pd.DataFrame(measures)
df_ridge.to_csv("ridge_classification.csv")

[########### ] 91.67% (11/12)
Time passed = 00:01:10


# Optimal configuration

## K-nearest neighbors

In [7]:
knn = KNeighborsClassifier(1).fit(X_train, y_train)

y_pred = knn.predict(X_test)

print("Accuracy :", sklearn.metrics.accuracy_score(y_test, y_pred))
print("Precision:", sklearn.metrics.precision_score(y_test, y_pred))
print("Recall   :", sklearn.metrics.recall_score(y_test, y_pred))
print("F1       :", sklearn.metrics.f1_score(y_test, y_pred))

Accuracy : 0.8786101083032491
Precision: 0.8621752531924263
Recall   : 0.896930829134219
F1       : 0.8792096991468343


## Neural network

In [8]:
mlp = MLPClassifier(hidden_layer_sizes=[10]*10, activation="identity",
                    random_state=0).fit(X_train, y_train)

y_pred = mlp.predict(X_test)

print("Accuracy :", sklearn.metrics.accuracy_score(y_test, y_pred))
print("Precision:", sklearn.metrics.precision_score(y_test, y_pred))
print("Recall   :", sklearn.metrics.recall_score(y_test, y_pred))
print("F1       :", sklearn.metrics.f1_score(y_test, y_pred))

Accuracy : 0.9217057761732852
Precision: 0.921875
Recall   : 0.918918918918919
F1       : 0.9203945859142005


## Support-vector machine

In [None]:
svm = SVC(C=10, kernel="rbf", max_iter = 10000).fit(X_train, y_train)

y_pred = svm.predict(X_test)

print("Accuracy :", sklearn.metrics.accuracy_score(y_test, y_pred))
print("Precision:", sklearn.metrics.precision_score(y_test, y_pred))
print("Recall   :", sklearn.metrics.recall_score(y_test, y_pred))
print("F1       :", sklearn.metrics.f1_score(y_test, y_pred))

## Logistic Regression

In [11]:
# Train solver
logr = LogisticRegression(penalty="l2", 
                          C=20, 
                          solver="newton-cg",
                          n_jobs=-1).fit(X_train, y_train)

y_pred = logr.predict(X_test)

print("Accuracy :", sklearn.metrics.accuracy_score(y_test, y_pred))
print("Precision:", sklearn.metrics.precision_score(y_test, y_pred))
print("Recall   :", sklearn.metrics.recall_score(y_test, y_pred))
print("F1       :", sklearn.metrics.f1_score(y_test, y_pred))

Accuracy : 0.9104241877256317
Precision: 0.9055404178019982
Recall   : 0.913421896472744
F1       : 0.9094640820980615


## Ridge Classifier

In [12]:
ridge = RidgeClassifier(solver="auto", alpha=0.1).fit(X_train, y_train)

y_pred = ridge.predict(X_test)

print("Accuracy :", sklearn.metrics.accuracy_score(y_test, y_pred))
print("Precision:", sklearn.metrics.precision_score(y_test, y_pred))
print("Recall   :", sklearn.metrics.recall_score(y_test, y_pred))
print("F1       :", sklearn.metrics.f1_score(y_test, y_pred))

Accuracy : 0.9131317689530686
Precision: 0.9086363636363637
Recall   : 0.9157123224919835
F1       : 0.9121606205795119


# Hybrid Machine Learning

In [None]:
# Predict the label for every video with every optimal model
hybrid = equal[["video_id", "conspiracy"]]
hybrid["ridge"] = ridge.predict(x)
hybrid["svm"] = svm.predict(x)
hybrid["neural_network"] = mlp.predict(x)
hybrid["log_reg"] = logr.predict(x)
hybrid["knn"] = knn.predict(x)

hybrid.head()

In [None]:
x2 = hybrid[["svm", "neural_network"]].values

In [None]:
X = x2
Y = hybrid['conspiracy'].values.ravel()

# 10% of the data will be the test set
X_trainval, X_test, y_trainval, y_test = sklearn.model_selection.train_test_split(X, Y, test_size=0.1,\
                                                                                  random_state=0)

# of the remaining 90%, 90% will be training set
X_train, X_valid, y_train, y_valid = sklearn.model_selection.train_test_split(X_trainval, y_trainval,\
                                                                              train_size=0.9,\
                                                                              random_state=0)