In [1]:
from gensim.models import Word2Vec
import fasttext.util
import pandas as pd
from sklearn import preprocessing
from sklearn.metrics import confusion_matrix, accuracy_score, f1_score, precision_score, recall_score, balanced_accuracy_score
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import AdaBoostClassifier, GradientBoostingClassifier, RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier as KNN
from sklearn.metrics import roc_auc_score

In [2]:
model = fasttext.load_model("./ubertext.fiction_news_wikipedia.filter_rus+short.tokens.txt.algo-cbow.epochs-15.subwords-2..5.neg_sampling-15.bin")




In [3]:
# NON_PROPAGANDA NEWS LOADING

non_propaganda_news = ["./dataset/pravda_articles.csv", "./dataset/ukrinform.csv"]
df_non_prop = pd.read_csv("./dataset/tsn_articles.csv", delimiter="\t", lineterminator="\n")

df_non_prop.columns =["number", "Article"]
for f_ in non_propaganda_news:
    new_file = pd.read_csv(f_, delimiter="\t")
    new_file.columns =["number", "Article"]
    df_non_prop = pd.concat([df_non_prop, new_file], ignore_index=True, axis=0)

df_non_prop.columns = ["class", "article"]
df_non_prop["class"] = [0]*len(df_non_prop)


In [4]:
# PROPAGANDA NEWS LOADING

propaganda_news = ["./dataset/0strana_articles.csv", "./dataset/newsone.csv", "./dataset/112.csv"]

df_prop = pd.DataFrame(columns=["number", "Article"])
for f_ in propaganda_news:
    new_file = pd.read_csv(f_, delimiter="\t")
    new_file.columns =["number", "Article"]
    df_prop = pd.concat([df_prop, new_file], ignore_index=True, axis=0)
df_prop.columns = ["class", "article"]
df_prop["class"] = [1]*len(df_prop)
df_prop

Unnamed: 0,class,article
0,1,Російські нафта і газ підтримують позитивний б...
1,1,Фінансування Мінкульту не буде переглянуто у р...
2,1,Соратниця Санду пішла з виборів через скандал ...
3,1,"Супутникова система, що прибула в Україну. Фот..."
4,1,"контейнеровоз, що сів на мілину, шість днів бл..."
...,...,...
16412,1,"112.uaОлена ГолубєваЖурналіст, 112.uaНезаба..."
16413,1,Топ-Новина Журналісти незаконно закритих ...
16414,1,Починаючи з 6 лютого середньодобова темп...
16415,1,Починаючи з 6 лютого середньодобова темп...


In [5]:
# COMBINING DATASETS TO GET A VECTOR MATRIX
len_prop, len_non_prop = len(df_prop), len(df_non_prop)
df_combined = pd.concat([df_prop, df_non_prop], ignore_index=True)

l = len(df_combined)
df_features = pd.DataFrame(columns=list(range(300))+["class"], index=list(range(l)))
for i in range(l):
    df_features.iloc[i, :-1] = model.get_sentence_vector(df_combined.loc[i, "article"])
    df_features.loc[i, "class"] = df_combined.loc[i, "class"]
df_features

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,291,292,293,294,295,296,297,298,299,class
0,0.053785,0.030771,-0.024119,0.017724,0.012248,0.049205,0.040134,-0.007578,0.022239,-0.011448,...,0.024128,0.010302,0.01164,-0.068097,-0.000891,0.022995,0.014834,-0.011634,-0.000477,1
1,0.060539,0.036665,-0.01827,0.020474,0.031075,0.039284,0.052787,-0.010481,0.038217,-0.014943,...,0.02307,0.023217,0.019742,-0.073852,0.020249,0.030565,0.013555,-0.018821,-0.00516,1
2,0.045141,0.038666,-0.00998,0.021052,0.019275,0.033237,0.029213,-0.024069,0.013772,0.002028,...,0.030286,0.01986,0.019684,-0.057726,0.018682,0.026395,-0.000498,0.026569,-0.00517,1
3,0.048836,0.01772,-0.020393,0.0022,0.023625,0.037664,0.040669,-0.006856,0.0373,-0.004788,...,0.025089,0.015195,0.018691,-0.062463,0.011937,0.021975,0.009584,-0.01079,-0.001798,1
4,0.05222,0.018972,-0.007758,0.011744,0.015459,0.033291,0.032425,-0.005831,0.025484,0.000944,...,0.019612,0.009173,0.021169,-0.056847,0.012908,0.030069,0.012719,-0.006473,-0.002318,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
144999,0.05662,0.027371,-0.02626,0.048068,0.027356,0.042534,0.033864,-0.016754,0.003167,0.001174,...,0.016539,0.003711,0.032465,-0.059208,0.022664,0.046177,0.00574,-0.002303,-0.000775,0
145000,0.059505,0.022722,-0.02149,0.019147,0.026896,0.042152,0.050826,-0.008525,0.023968,0.000975,...,0.014374,0.014451,0.025345,-0.065596,0.021373,0.03065,0.015581,-0.019273,0.00482,0
145001,0.031304,0.015849,-0.009609,0.012811,0.018374,0.040383,0.042436,-0.005907,0.008896,0.010836,...,0.028376,0.01423,0.018503,-0.067944,0.010874,0.020956,0.007556,-0.006693,0.010351,0
145002,0.061459,0.020769,-0.011358,0.018487,0.02958,0.04011,0.046455,-0.008975,0.031053,-0.003237,...,0.016222,0.020005,0.02484,-0.061148,0.01588,0.034496,0.009945,-0.024901,0.002021,0


In [6]:
# SPLITTING TRAIN\TEST SAMPLES

X_train, X_test, y_train, y_test = train_test_split(df_features.iloc[:, :-1], df_features["class"], test_size=0.20, random_state=42, shuffle=True)
y_train, y_test = y_train.astype(int), y_test.astype(int)
X_train, X_test = X_train.astype(float), X_test.astype(float)
X_train, X_test = preprocessing.StandardScaler().fit_transform(X_train), preprocessing.StandardScaler().fit_transform(X_test)
X_train, y_test

(array([[ 0.06091989,  0.74803935,  1.52852962, ...,  0.12723808,
          0.54915438,  0.13086457],
        [ 0.31471053, -0.57911948,  0.18603503, ...,  1.02621107,
          1.4703906 ,  0.77954306],
        [-0.56095502, -0.27487681, -1.2589241 , ..., -1.5739782 ,
         -0.33624449, -0.9574096 ],
        ...,
        [ 0.10018549,  1.39557612,  0.55528058, ..., -0.03634658,
          1.09532672, -0.30622785],
        [ 0.48924609, -0.94552226,  0.76858958, ...,  1.80493491,
         -0.31021141,  1.49582015],
        [ 0.77088237, -1.07485255,  0.8803184 , ...,  0.0533789 ,
          0.85094062,  0.73035258]]),
 24145     0
 42206     0
 198       1
 33474     0
 26193     0
          ..
 41861     0
 25461     0
 133691    0
 111586    0
 68183     0
 Name: class, Length: 29001, dtype: int64)

In [7]:
print(y_train.shape)
print(X_train.shape)

(116003,)
(116003, 300)


In [8]:
def print_report(model, y_true, predictions):
    conf = confusion_matrix(y_true, predictions)
    acc = accuracy_score(y_true, predictions)
    balanced_acc = balanced_accuracy_score(y_true, predictions)
    precision = precision_score(y_true, predictions, zero_division=0)
    recall = recall_score(y_true, predictions, zero_division=0)
    f1 = f1_score(y_true, predictions, zero_division=0)
    print("Results for: ", model)
    print("Confusion matrix:\n", conf)
    print("Accuracy: ", acc)
    print("Balanced accuracy: ", balanced_acc)
    print("Precision: ", precision)
    print("Recall: ", recall)
    print("F1: ", f1)

In [9]:
def test_model_without_cross_validation(model):
    model.fit(X_train, y_train)
    predictions = model.predict(X_test)
    print_report(model, y_test, predictions)
    return model
test_model_without_cross_validation(LogisticRegression())

Results for:  LogisticRegression()
Confusion matrix:
 [[25324   420]
 [  736  2521]]
Accuracy:  0.9601393055411882
Balanced accuracy:  0.878855347749352
Precision:  0.8571914314858892
Recall:  0.7740251765428309
F1:  0.813488222007099


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


LogisticRegression()

In [10]:
params = {
    "DecisionTreeClassifier": {"max_depth":[5, 7, 8, 10],
                              "min_samples_split": [3, 2, 4],
                              "min_samples_leaf": [5, 10, 20],
                              "min_weight_fraction_leaf": [0.5, 0.10, 0.15],
                              "max_features": [5, 7, 10]
                              },
    "RandomForest": {"n_estimators": [20, 50, 100, 120],
                     "max_features": [5, 9, 12],
                     "max_depth": [5, 7, 10, 15],
                     "min_samples_leaf": [5, 10, 15]
                    },
    "KNN":  {'n_neighbors':[9, 10, 12],
             'leaf_size': [20, 30, 50]
             },
    "AdaBoostClassifier": {"n_estimators": [20, 50, 100, 120],
                            "learning_rate": [0.005, 0.01, 0.03]
                            },
    "GradientBoostingClassifier": {"n_estimators": [20, 50, 100, 120],
                            "learning_rate": [0.005, 0.01, 0.03],
                            "min_samples_leaf": [5, 10, 20],
                            "max_features": [5, 7, 10]
                            }


}

In [11]:
def test_estimator(estimator):
    estimator.fit(X_train, y_train.astype(int))
    y_pred_ = estimator.predict(X_test)
    print_report(estimator, y_test, y_pred_)



In [12]:
def test_best_model(model, param_dict, cv=5):
    g = GridSearchCV(model, param_dict, cv=cv).fit(X_train, y_train)
    m = g.best_estimator_
    preds = m.predict(X_test)
    print_report(m, y_test, preds)
    return g.best_estimator_

In [14]:
best_tree_model = test_best_model(DecisionTreeClassifier(),  params["DecisionTreeClassifier"])

Results for:  DecisionTreeClassifier(max_depth=5, max_features=5, min_samples_leaf=5,
                       min_samples_split=3, min_weight_fraction_leaf=0.5)
Confusion matrix:
 [[25744     0]
 [ 3257     0]]
Accuracy:  0.8876935278093859
Balanced accuracy:  0.5
Precision:  0.0
Recall:  0.0
F1:  0.0


In [15]:
test_best_model(KNN(), params["KNN"])


Results for:  KNeighborsClassifier(leaf_size=20, n_neighbors=10)
Confusion matrix:
 [[24940   804]
 [ 1531  1726]]
Accuracy:  0.9194855349815524
Balanced accuracy:  0.7493524727445577
Precision:  0.6822134387351778
Recall:  0.5299355234878723
F1:  0.596509417660273


KNeighborsClassifier(leaf_size=20, n_neighbors=10)

In [16]:
test_best_model(AdaBoostClassifier(best_tree_model),  params["AdaBoostClassifier"])


Results for:  AdaBoostClassifier(base_estimator=DecisionTreeClassifier(max_depth=5,
                                                         max_features=5,
                                                         min_samples_leaf=5,
                                                         min_samples_split=3,
                                                         min_weight_fraction_leaf=0.5),
                   learning_rate=0.005, n_estimators=20)
Confusion matrix:
 [[25744     0]
 [ 3257     0]]
Accuracy:  0.8876935278093859
Balanced accuracy:  0.5
Precision:  0.0
Recall:  0.0
F1:  0.0


AdaBoostClassifier(base_estimator=DecisionTreeClassifier(max_depth=5,
                                                         max_features=5,
                                                         min_samples_leaf=5,
                                                         min_samples_split=3,
                                                         min_weight_fraction_leaf=0.5),
                   learning_rate=0.005, n_estimators=20)

In [None]:
test_best_model(GradientBoostingClassifier(),  params["GradientBoostingClassifier"])

In [None]:
test_best_model(RandomForestClassifier(),  params["RandomForest"])