In [1]:
import sklearn
from sklearn.neural_network import MLPClassifier
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np
from tqdm import notebook

In [2]:
df = pd.read_csv("../data/channels.csv").drop("Unnamed: 0", axis=1)

In [17]:
df.head()

Unnamed: 0,channel,channel_description,country,keywords,uploads,conspiracy
0,UCgzqRRDGThOlH4EHaqSXXPA,,GB,,UUgzqRRDGThOlH4EHaqSXXPA,True
1,UCfI5jzpoUbwP4wkmQ6ZNqbA,"""The Stoa is not a fucking YouTube channel.""\n...",CA,,UUfI5jzpoUbwP4wkmQ6ZNqbA,False
2,UCByZMNYpHFEetI0s3deYH2g,,US,"politics ""give them an argument"" progressive l...",UUByZMNYpHFEetI0s3deYH2g,False
3,UC3VlH7lPbKlzVda5aJgyUvQ,Welcome to 3D to 5D Consciousness channel!!\n\...,PT,"""law of attraction"" ascension awakening",UU3VlH7lPbKlzVda5aJgyUvQ,True
4,UCz-1G_PzQZ5fZVYjfoz_4CQ,,US,,UUz-1G_PzQZ5fZVYjfoz_4CQ,True


In [18]:
df["keywords"] = df["keywords"].fillna("")
df["channel_description"] = df["channel_description"].fillna("")

In [19]:
c = df[df["conspiracy"] == True]
nc = df[df["conspiracy"] == False].sample(len(c))

equal = pd.concat([nc, c]).reset_index().drop("index", axis=1)
equal.head()

Unnamed: 0,channel,channel_description,country,keywords,uploads,conspiracy
0,UCxc0Frhn1Pg-k6CpIbzOqQw,mister metokur live stream archive,US,"""mister metokur"" live stream metokur ""rabbi el...",UUxc0Frhn1Pg-k6CpIbzOqQw,False
1,UCIoH_r4sl6bIxoSx0Oxckuw,,,,UUIoH_r4sl6bIxoSx0Oxckuw,False
2,UC9A_wj7G-zjcyELg32NVntw,Thank you for taking time to check out our You...,,,UU9A_wj7G-zjcyELg32NVntw,False
3,UCJfhyQqYUuUUCyLb3ZaNOOw,Israelite Congregation Based In South Florida,,"""Thee Light of Zion"" ""Hebrew Israelite""",UUJfhyQqYUuUUCyLb3ZaNOOw,False
4,UCpYCxV51bykhMY-wSUozQRg,Paid for by The Lincoln Project. Not authorize...,US,,UUpYCxV51bykhMY-wSUozQRg,False


In [20]:
equal["text"] = equal["channel_description"] + equal["keywords"]

In [21]:
equal.head()

Unnamed: 0,channel,channel_description,country,keywords,uploads,conspiracy,text
0,UCxc0Frhn1Pg-k6CpIbzOqQw,mister metokur live stream archive,US,"""mister metokur"" live stream metokur ""rabbi el...",UUxc0Frhn1Pg-k6CpIbzOqQw,False,"mister metokur live stream archive""mister meto..."
1,UCIoH_r4sl6bIxoSx0Oxckuw,,,,UUIoH_r4sl6bIxoSx0Oxckuw,False,
2,UC9A_wj7G-zjcyELg32NVntw,Thank you for taking time to check out our You...,,,UU9A_wj7G-zjcyELg32NVntw,False,Thank you for taking time to check out our You...
3,UCJfhyQqYUuUUCyLb3ZaNOOw,Israelite Congregation Based In South Florida,,"""Thee Light of Zion"" ""Hebrew Israelite""",UUJfhyQqYUuUUCyLb3ZaNOOw,False,"Israelite Congregation Based In South Florida""..."
4,UCpYCxV51bykhMY-wSUozQRg,Paid for by The Lincoln Project. Not authorize...,US,,UUpYCxV51bykhMY-wSUozQRg,False,Paid for by The Lincoln Project. Not authorize...


In [23]:
v = TfidfVectorizer()
x = v.fit_transform(equal['text'])

x

<4366x28647 sparse matrix of type '<class 'numpy.float64'>'
	with 217978 stored elements in Compressed Sparse Row format>

In [25]:
import warnings
warnings.filterwarnings('ignore')

X = x
Y = equal['conspiracy'].values.ravel()

# 10% of the data will be the test set
X_trainval, X_test, y_trainval, y_test = sklearn.model_selection.train_test_split(X, Y, test_size=0.1,\
                                                                                  random_state=0)

# of the remaining 90%, 90% will be training set
X_train, X_valid, y_train, y_valid = sklearn.model_selection.train_test_split(X_trainval, y_trainval,\
                                                                              train_size=0.9,\
                                                                              random_state=0)


measures = {"activation":[], "Nbr. of layers":[], "neurons":[], "Accuracy":[], "Precision":[],"Recall":[], "F1":[]}


for i in notebook.tqdm(["identity", "logistic", "tanh", "relu"]):
    for j in [1, 10, 25]:
        for k in [1, 10, 50]:
            print(f"current: {i} with {k} layer(s) of {j} neuron(s)")
            mlp = MLPClassifier(hidden_layer_sizes=[j]*k, activation=i,\
                                random_state=0).fit(X_train, y_train)

            measures["activation"].append(i)
            measures["Nbr. of layers"].append(k)
            measures["neurons"].append(j)
            measures["Accuracy"].append(sklearn.metrics.accuracy_score(y_valid, mlp.predict(X_valid)))
            measures["Precision"].append(sklearn.metrics.precision_score(y_valid, mlp.predict(X_valid)))
            measures["Recall"].append(sklearn.metrics.recall_score(y_valid, mlp.predict(X_valid)))
            measures["F1"].append(sklearn.metrics.f1_score(y_valid, mlp.predict(X_valid)))

print("Predicting now")
y_pred = mlp.predict(X_test)

  0%|          | 0/4 [00:00<?, ?it/s]

current: identity with 1 layers of 1
current: identity with 10 layers of 1
current: identity with 50 layers of 1
current: identity with 1 layers of 10
current: identity with 10 layers of 10
current: identity with 50 layers of 10
current: identity with 1 layers of 25
current: identity with 10 layers of 25
current: identity with 50 layers of 25
current: logistic with 1 layers of 1
current: logistic with 10 layers of 1
current: logistic with 50 layers of 1
current: logistic with 1 layers of 10
current: logistic with 10 layers of 10
current: logistic with 50 layers of 10
current: logistic with 1 layers of 25
current: logistic with 10 layers of 25
current: logistic with 50 layers of 25
current: tanh with 1 layers of 1
current: tanh with 10 layers of 1
current: tanh with 50 layers of 1
current: tanh with 1 layers of 10
current: tanh with 10 layers of 10
current: tanh with 50 layers of 10
current: tanh with 1 layers of 25
current: tanh with 10 layers of 25
current: tanh with 50 layers of 25
c

In [26]:
pd.DataFrame(measures)

Unnamed: 0,activation,Nbr. of layers,neurons,Accuracy,Precision,Recall,F1
0,identity,1,1,0.821883,0.811321,0.851485,0.830918
1,identity,10,1,0.816794,0.806604,0.846535,0.826087
2,identity,50,1,0.486005,0.0,0.0,0.0
3,identity,1,10,0.814249,0.805687,0.841584,0.823245
4,identity,10,10,0.826972,0.801802,0.881188,0.839623
5,identity,50,10,0.80916,0.777293,0.881188,0.825986
6,identity,1,25,0.816794,0.806604,0.846535,0.826087
7,identity,10,25,0.824427,0.884393,0.757426,0.816
8,identity,50,25,0.819338,0.861878,0.772277,0.814621
9,logistic,1,1,0.821883,0.814286,0.846535,0.830097


In [30]:
mlp = MLPClassifier(hidden_layer_sizes=[10]*10, activation="tanh").fit(X_train, y_train)

acc = sklearn.metrics.accuracy_score(y_test, mlp.predict(X_test))
pr = sklearn.metrics.precision_score(y_test, mlp.predict(X_test))
rec = sklearn.metrics.recall_score(y_test, mlp.predict(X_test))
f1 = sklearn.metrics.f1_score(y_test, mlp.predict(X_test))

acc, pr, rec, f1

(0.7688787185354691,
 0.7341269841269841,
 0.8447488584474886,
 0.7855626326963906)