In [16]:
from sklearn.feature_extraction.text import TfidfVectorizer

import sklearn

from sklearn.neural_network import MLPClassifier
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.svm import SVC

from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB, MultinomialNB, ComplementNB, BernoulliNB, CategoricalNB

from IPython.display import clear_output

import pandas as pd
import numpy as np
import time
import string
from tqdm import notebook
from tqdm.auto import tqdm

import warnings
warnings.filterwarnings("ignore")

# Load the data

In [17]:
# Read the training videos and drop the "Unnamed: 0" column
# (presumably an index that was written out when the CSV was saved -- confirm).
equal = pd.read_csv("../data/training_videos.csv").drop("Unnamed: 0", axis = 1)

# Assign the filled column back instead of calling fillna(inplace=True) on the
# `equal["full_text"]` selection: with pandas Copy-on-Write the in-place call
# only changes a temporary object, so missing texts would stay NaN and break
# the vectorizer in the next section.
equal["full_text"] = equal["full_text"].fillna("")

equal.head()

Unnamed: 0,channel,video_id,conspiracy,full_text
0,UCH9gafn41oPg7s6bQwiMwTg,hgOdDCd6N10,False,islam beat femin right end stone begin motherf...
1,UCHr26UnUE_FHYtAXc_ddfgg,um-kBMGmW5k,False,24 hour burn man 2019 micro edit littl tast bu...
2,UCsu6NM_ARGWzDwL1C9xEhqA,9Z2CMOdLoB8,False,polic reform time turmoil injustic world cri o...
3,UCTjxmzChimJa3X_rAgLAnxg,GNJny3dtmoQ,False,career job career support video-mak donat matt...
4,UCaoSDiNkFmGQfvhvZMWxOHw,lOoCe0OZgvc,False,socrat abl predict civil unrest comment curren...


# Vectorize

In [18]:
# Turn every document in `full_text` into a TF-IDF weighted term vector.
# Terms found in more than 75% of the documents (max_df) or in fewer than
# 2 documents (min_df) are left out of the vocabulary.
v = TfidfVectorizer(min_df=2, max_df=0.75)
x = v.fit_transform(equal["full_text"])

x

<44312x134844 sparse matrix of type '<class 'numpy.float64'>'
	with 22015145 stored elements in Compressed Sparse Row format>

In [19]:
# Display the dimensions of the TF-IDF matrix as (rows, columns).
x.shape

(44312, 134844)

In [20]:
# Feature matrix (TF-IDF) and the flat array of conspiracy labels.
X = x
Y = equal["conspiracy"].values.ravel()

# Hold out 10% of all rows as the final test set.
X_trainval, X_test, y_trainval, y_test = sklearn.model_selection.train_test_split(
    X, Y, test_size=0.1, random_state=0
)

# Split what is left once more: 90% of it for training, the rest for validation.
X_train, X_valid, y_train, y_valid = sklearn.model_selection.train_test_split(
    X_trainval, y_trainval, train_size=0.9, random_state=0
)

# K-nearest neighbors

In [21]:
# Sweep k = 1..10 for k-nearest neighbours and score each model on the
# validation split. Every scorer is called as scorer(y_true, y_pred).
scorers = {
    "Accuracy": sklearn.metrics.accuracy_score,
    "Precision": sklearn.metrics.precision_score,
    "Recall": sklearn.metrics.recall_score,
    "F1": sklearn.metrics.f1_score,
}

# One list per result column; "K" comes first so the table starts with it.
measures = {"K": [], **{metric: [] for metric in scorers}}

for k in notebook.tqdm(range(1, 11)):
    knn = KNeighborsClassifier(k).fit(X_train, y_train)
    y_pred = knn.predict(X_valid)

    measures["K"].append(k)
    for metric, scorer in scorers.items():
        measures[metric].append(scorer(y_valid, y_pred))

df_knn = pd.DataFrame(measures)
df_knn

  0%|          | 0/10 [00:00<?, ?it/s]

Unnamed: 0,K,Accuracy,Precision,Recall,F1
0,1,0.707874,0.646959,0.941032,0.766767
1,2,0.701354,0.650822,0.89484,0.753569
2,3,0.652708,0.601817,0.94398,0.73503
3,4,0.822217,0.822785,0.830467,0.826608
4,5,0.805416,0.75332,0.919902,0.828319
5,6,0.819458,0.813543,0.838329,0.82575
6,7,0.801655,0.755127,0.904668,0.823161
7,8,0.810181,0.796382,0.843735,0.819375
8,9,0.793882,0.753872,0.885012,0.814195
9,10,0.802658,0.784932,0.844717,0.813728


# Naïve Bayes  
### (Currently cannot function because of required array size)

In [1]:
# Naive Bayes on the sparse TF-IDF matrix, scored on the validation split.
#
# The earlier draft of this cell converted the data with .toarray(); a dense
# float64 copy of a matrix with ~135k columns does not fit in memory, which is
# why the cell had been commented out. MultinomialNB, ComplementNB and
# BernoulliNB accept sparse input directly, so no conversion is needed.
# GaussianNB is not included: it only works on dense arrays and therefore
# still runs into the array-size problem mentioned in the section heading.
measures = {"NB":[], "Accuracy":[], "Precision":[],"Recall":[], "F1":[]}

for model in notebook.tqdm([MultinomialNB(), ComplementNB(), BernoulliNB()]):
    nb = model.fit(X_train, y_train)
    y_pred = nb.predict(X_valid)

    measures["NB"].append(str(model))
    measures["Accuracy"].append(sklearn.metrics.accuracy_score(y_valid, y_pred))
    measures["Precision"].append(sklearn.metrics.precision_score(y_valid, y_pred))
    measures["Recall"].append(sklearn.metrics.recall_score(y_valid, y_pred))
    measures["F1"].append(sklearn.metrics.f1_score(y_valid, y_pred))

df_nb = pd.DataFrame(measures)
df_nb

# Neural Network

In [None]:
# Grid search over MLP architectures: every combination of activation
# function, number of hidden layers and neurons per layer is trained on the
# training split and scored on the validation split.
measures = {"Activation function":[], "# of hidden layers":[], "Neurons per layer":[],
            "Accuracy":[], "Precision":[],"Recall":[], "F1":[]}

activations = ['identity', 'logistic', 'tanh', 'relu']
layer_counts = [1, 10, 25]
neuron_counts = [1, 10, 50]

# Progress bookkeeping; the total is derived from the grid so it cannot drift
# out of sync when the grid is edited.
done = 0
left = len(activations) * len(layer_counts) * len(neuron_counts)
start = time.time()

for act in activations:
    for layers in layer_counts:
        for neurons in neuron_counts:
            # Progress is shown before fitting: `done` counts finished models.
            clear_output(wait=True)
            print(f"[{'#' * done}{' ' * left}] {done}/({done}+{left})")
            elapsed = time.time() - start
            # Minutes are taken within the current hour (the old expression
            # printed the total number of elapsed minutes).
            print(f"""\nHours:   {elapsed // 3600}\nMinutes: {elapsed % 3600 // 60}\nSeconds: {elapsed % 60}""")
            done += 1
            left -= 1

            mlp = MLPClassifier(hidden_layer_sizes=[neurons]*layers, activation=act,
                                random_state=0).fit(X_train, y_train)

            # Predict once on the validation split and reuse the result for
            # all four metrics. The test split is deliberately not touched
            # while comparing models.
            y_pred = mlp.predict(X_valid)

            measures["Activation function"].append(act)
            measures["# of hidden layers"].append(layers)
            measures["Neurons per layer"].append(neurons)

            measures["Accuracy"].append(sklearn.metrics.accuracy_score(y_valid, y_pred))
            measures["Precision"].append(sklearn.metrics.precision_score(y_valid, y_pred))
            measures["Recall"].append(sklearn.metrics.recall_score(y_valid, y_pred))
            measures["F1"].append(sklearn.metrics.f1_score(y_valid, y_pred))

df_nn = pd.DataFrame(measures)
df_nn

[########                            ]

Hours:   0.0
Minutes: 2.0
Seconds: 4.095452785491943


ERROR:root:Internal Python error in the inspect module.
Below is the traceback from this internal error.



Traceback (most recent call last):
  File "D:\ProgramData\Anaconda3\lib\site-packages\IPython\core\interactiveshell.py", line 2963, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "<ipython-input-11-bfb57ce9bc69>", line 27, in <module>
    measures["Recall"].append(sklearn.metrics.recall_score(y_valid, mlp.predict(X_valid)))
  File "D:\ProgramData\Anaconda3\lib\site-packages\sklearn\metrics\_classification.py", line 1789, in recall_score
    zero_division=zero_division)
  File "D:\ProgramData\Anaconda3\lib\site-packages\sklearn\metrics\_classification.py", line 1490, in precision_recall_fscore_support
    labels=labels, samplewise=samplewise)
  File "D:\ProgramData\Anaconda3\lib\site-packages\sklearn\metrics\_classification.py", line 448, in multilabel_confusion_matrix
    y_true = le.transform(y_true)
  File "D:\ProgramData\Anaconda3\lib\site-packages\sklearn\preprocessing\_label.py", line 273, in transform
    _, y = _encode(y, uniques=self.classes_, encode=T

KeyboardInterrupt: 

ERROR:tornado.general:Uncaught exception in ZMQStream callback
Traceback (most recent call last):
  File "D:\ProgramData\Anaconda3\lib\site-packages\zmq\eventloop\zmqstream.py", line 432, in _run_callback
    callback(*args, **kwargs)
  File "D:\ProgramData\Anaconda3\lib\site-packages\tornado\stack_context.py", line 276, in null_wrapper
    return fn(*args, **kwargs)
  File "D:\ProgramData\Anaconda3\lib\site-packages\ipykernel\kernelbase.py", line 283, in dispatcher
    return self.dispatch_shell(stream, msg)
  File "D:\ProgramData\Anaconda3\lib\site-packages\ipykernel\kernelbase.py", line 233, in dispatch_shell
    handler(stream, idents, msg)
  File "D:\ProgramData\Anaconda3\lib\site-packages\ipykernel\kernelbase.py", line 421, in execute_request
    self._abort_queues()
  File "D:\ProgramData\Anaconda3\lib\site-packages\ipykernel\kernelbase.py", line 636, in _abort_queues
    self._abort_queue(stream)
  File "D:\ProgramData\Anaconda3\lib\site-packages\ipykernel\kernelbase.py", line 

Unnamed: 0,Activation function,# of hidden layers,Neurons per layer,Accuracy,Precision,Recall,F1
0,identity,1,1,0.724423,0.728293,0.733661,0.730967
1,identity,1,10,0.721916,0.723888,0.735627,0.72971
2,identity,1,50,0.719157,0.727952,0.717936,0.722909
3,identity,10,1,0.715145,0.713539,0.738084,0.725604
4,identity,10,10,0.718154,0.723833,0.723833,0.723833
5,identity,10,50,0.716901,0.716126,0.737592,0.726701
6,identity,25,1,0.71339,0.724121,0.708108,0.716025
7,identity,25,10,0.717653,0.726457,0.716462,0.721425
8,identity,25,50,0.715898,0.71415,0.739066,0.726395
9,logistic,1,1,0.744233,0.752865,0.742506,0.74765


In [16]:
# Save the neural-network grid-search table.
# NOTE(review): the previous call used `results`, which no visible cell of this
# notebook defines, and an empty path, which cannot be opened for writing.
# `df_nn` and the file name below are the assumed intent -- confirm.
df_nn.to_csv("../data/results_neural_network.csv", index=False)

# Linear Regression

# Logistic Regression

# Support-vector machine

In [None]:
# Grid search over SVM kernels and regularisation strengths C; each model is
# trained on the training split and scored on the validation split.
measures = {"Kernel":[], "C":[], "Accuracy":[], "Precision":[], "Recall":[], "F1":[]}

kernels = ["linear", "poly", "rbf", "sigmoid"]
c_values = [0.1, 1, 10, 100]

# Progress bookkeeping; the total is derived from the grid so it cannot drift
# out of sync when the grid is edited.
done = 0
left = len(kernels) * len(c_values)
start = time.time()

for kernel in kernels:
    for C in c_values:
        svm = SVC(C=C, kernel=kernel, verbose=True).fit(X_train, y_train)

        y_pred = svm.predict(X_valid)

        measures["Kernel"].append(kernel)
        measures["C"].append(C)

        measures["Accuracy"].append(sklearn.metrics.accuracy_score(y_valid, y_pred))
        measures["Precision"].append(sklearn.metrics.precision_score(y_valid, y_pred))
        measures["Recall"].append(sklearn.metrics.recall_score(y_valid, y_pred))
        measures["F1"].append(sklearn.metrics.f1_score(y_valid, y_pred))

        # This model is finished: update the counters first so the progress
        # line includes it (printing before the update lagged one step behind).
        done += 1
        left -= 1

        clear_output(wait=True)
        print(f"[{'#' * done}{' ' * left}] {done}/({done}+{left})")
        elapsed = time.time() - start
        # Minutes are taken within the current hour (the old expression
        # printed the total number of elapsed minutes).
        print(f"""\nHours:   {elapsed // 3600}\nMinutes: {elapsed % 3600 // 60}\nSeconds: {elapsed % 60}""")


df_svm = pd.DataFrame(measures)
df_svm

[LibSVM]