# Summary

For Task 2 (Sentence classification), we decided to develop multiple models independently and ensemble them into one final classifier. <br>We tried both <b>Stacking</b> via Logistic Regression and <b>Majority Voting</b> as ensembling strategies. We chose to go with <b>Stacking</b> as it performed better on the DEV set.

# Components

In short, we developed the following 4 components:
1. Random Forest classifier over Glove-based sentence embeddings
2. Neural Net with LSTM layer
3. XGB classifier over W2V-based sentence embeddings
4. Predictions of Task 3 Model (Neural Net)

# Import Components

In [1]:
import pandas as pd
import numpy as np
import os
import pickle

path = r"C:\Projects\Propaganda Detection\FlipFlops_Propaganda_Project"

In [2]:
mod_1_rf_train = pd.read_pickle(os.path.join(path,"modelling","task2","result_task2_train_niki.pkl"))
print(mod_1_rf_train.shape)
mod_1_rf_train.head(10)

(15170, 4)


Unnamed: 0,aid,sid,rf_200d_slen_SCORE,rf_200d_slen_PRED
0,111111112,1,0.416656,0
1,111111112,2,0.427768,0
2,111111112,3,0.431391,0
3,111111112,4,0.427768,0
4,111111112,5,0.503309,1
5,111111112,6,0.427768,0
6,111111112,7,0.45419,0
7,111111112,8,0.427768,0
8,111111112,9,0.504925,1
9,111111112,10,0.427768,0


In [5]:
mod_23_nn_xgb_train = pd.read_csv(os.path.join(path,"modelling","task2","task2_train_scoring.csv"))
print(mod_23_nn_xgb_train.shape)
mod_23_nn_xgb_train.head(10)

(15170, 4)


Unnamed: 0,article_id,N_sentence,lstm_v_sent,xgb_v_sent
0,111111112,1,0.00022,0.163508
1,111111112,2,0.430642,0.137205
2,111111112,3,0.000388,0.222913
3,111111112,4,0.430642,0.137205
4,111111112,5,0.999856,0.678963
5,111111112,6,0.430642,0.137205
6,111111112,7,1.1e-05,0.260172
7,111111112,8,0.430642,0.137205
8,111111112,9,0.000744,0.23522
9,111111112,10,0.430642,0.137205


In [16]:
mod_4_nn_train = pd.read_csv(os.path.join(path,"modelling","task2","result_task2_TRAIN_mitko.txt"),
                             sep="\t",names=["aid","sid","mod_4_nn_PRED"])
print(mod_4_nn_train.shape)
mod_4_nn_train.head(10)

(15170, 3)


Unnamed: 0,aid,sid,mod_4_nn_PRED
0,111111112,1,propaganda
1,111111112,2,propaganda
2,111111112,3,propaganda
3,111111112,4,propaganda
4,111111112,5,propaganda
5,111111112,6,propaganda
6,111111112,7,propaganda
7,111111112,8,propaganda
8,111111112,9,propaganda
9,111111112,10,propaganda


In [9]:
mod_4_nn_train["mod_4_nn_PRED"].value_counts()

propaganda        14956
non-propaganda      214
Name: mod_4_nn_PRED, dtype: int64

# Import Performance

In [11]:
path2 = r"C:\Projects\Propaganda Detection"
task2_train_flag = pd.read_pickle(os.path.join(path2,"Data","task2_train_sent_flag.pkl"))
print(task2_train_flag.shape)
task2_train_flag.head(10)

(15170, 7)


Unnamed: 0,aid,sid,sentence,flag,flag_bin,slen_ch,slen_w
0,111111112,1,US bloggers banned from entering UK,non-propaganda,0,35,6
1,111111112,2,,non-propaganda,0,0,0
2,111111112,3,Two prominent US bloggers have been banned fro...,non-propaganda,0,90,16
3,111111112,4,,non-propaganda,0,0,0
4,111111112,5,Pamela Geller and Robert Spencer co-founded an...,propaganda,1,91,14
5,111111112,6,,non-propaganda,0,0,0
6,111111112,7,They were due to speak at an English Defence L...,non-propaganda,0,106,19
7,111111112,8,,non-propaganda,0,0,0
8,111111112,9,A government spokesman said individuals whose ...,non-propaganda,0,133,20
9,111111112,10,,non-propaganda,0,0,0


# Evaluate Individual Components

In [19]:
mod_1_rf_train["key"] = mod_1_rf_train["aid"].map(str)+"_"+mod_1_rf_train["sid"].map(str)
mod_23_nn_xgb_train["key"] = mod_23_nn_xgb_train["article_id"].map(str)+"_"+mod_23_nn_xgb_train["N_sentence"].map(str)
mod_4_nn_train["key"] = mod_4_nn_train["aid"].map(str)+"_"+mod_4_nn_train["sid"].map(str)
task2_train_flag["key"] = task2_train_flag["aid"].map(str)+"_"+task2_train_flag["sid"]
task2_train_flag.head()

Unnamed: 0,aid,sid,sentence,flag,flag_bin,slen_ch,slen_w,key
0,111111112,1,US bloggers banned from entering UK,non-propaganda,0,35,6,111111112_1
1,111111112,2,,non-propaganda,0,0,0,111111112_2
2,111111112,3,Two prominent US bloggers have been banned fro...,non-propaganda,0,90,16,111111112_3
3,111111112,4,,non-propaganda,0,0,0,111111112_4
4,111111112,5,Pamela Geller and Robert Spencer co-founded an...,propaganda,1,91,14,111111112_5


In [20]:
mod_1234_train = mod_1_rf_train.set_index("key")[["rf_200d_slen_SCORE","rf_200d_slen_PRED"]].join(task2_train_flag.set_index("key"))
print(mod_1234_train.shape)
mod_1234_train.head(10)

(15170, 9)


Unnamed: 0_level_0,rf_200d_slen_SCORE,rf_200d_slen_PRED,aid,sid,sentence,flag,flag_bin,slen_ch,slen_w
key,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
111111112_1,0.416656,0,111111112,1,US bloggers banned from entering UK,non-propaganda,0,35,6
111111112_2,0.427768,0,111111112,2,,non-propaganda,0,0,0
111111112_3,0.431391,0,111111112,3,Two prominent US bloggers have been banned fro...,non-propaganda,0,90,16
111111112_4,0.427768,0,111111112,4,,non-propaganda,0,0,0
111111112_5,0.503309,1,111111112,5,Pamela Geller and Robert Spencer co-founded an...,propaganda,1,91,14
111111112_6,0.427768,0,111111112,6,,non-propaganda,0,0,0
111111112_7,0.45419,0,111111112,7,They were due to speak at an English Defence L...,non-propaganda,0,106,19
111111112_8,0.427768,0,111111112,8,,non-propaganda,0,0,0
111111112_9,0.504925,1,111111112,9,A government spokesman said individuals whose ...,non-propaganda,0,133,20
111111112_10,0.427768,0,111111112,10,,non-propaganda,0,0,0


In [23]:
mod_1234_train = mod_1234_train.join(mod_23_nn_xgb_train.set_index("key")[["lstm_v_sent","xgb_v_sent"]])
mod_1234_train.head(10)

Unnamed: 0_level_0,rf_200d_slen_SCORE,rf_200d_slen_PRED,aid,sid,sentence,flag,flag_bin,slen_ch,slen_w,lstm_v_sent,xgb_v_sent
key,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
111111112_1,0.416656,0,111111112,1,US bloggers banned from entering UK,non-propaganda,0,35,6,0.00022,0.163508
111111112_2,0.427768,0,111111112,2,,non-propaganda,0,0,0,0.430642,0.137205
111111112_3,0.431391,0,111111112,3,Two prominent US bloggers have been banned fro...,non-propaganda,0,90,16,0.000388,0.222913
111111112_4,0.427768,0,111111112,4,,non-propaganda,0,0,0,0.430642,0.137205
111111112_5,0.503309,1,111111112,5,Pamela Geller and Robert Spencer co-founded an...,propaganda,1,91,14,0.999856,0.678963
111111112_6,0.427768,0,111111112,6,,non-propaganda,0,0,0,0.430642,0.137205
111111112_7,0.45419,0,111111112,7,They were due to speak at an English Defence L...,non-propaganda,0,106,19,1.1e-05,0.260172
111111112_8,0.427768,0,111111112,8,,non-propaganda,0,0,0,0.430642,0.137205
111111112_9,0.504925,1,111111112,9,A government spokesman said individuals whose ...,non-propaganda,0,133,20,0.000744,0.23522
111111112_10,0.427768,0,111111112,10,,non-propaganda,0,0,0,0.430642,0.137205


In [25]:
mod_1234_train = mod_1234_train.join(mod_4_nn_train.set_index("key")[["mod_4_nn_PRED"]])
mod_1234_train.head(10)

Unnamed: 0_level_0,rf_200d_slen_SCORE,rf_200d_slen_PRED,aid,sid,sentence,flag,flag_bin,slen_ch,slen_w,lstm_v_sent,xgb_v_sent,mod_4_nn_PRED
key,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
111111112_1,0.416656,0,111111112,1,US bloggers banned from entering UK,non-propaganda,0,35,6,0.00022,0.163508,propaganda
111111112_2,0.427768,0,111111112,2,,non-propaganda,0,0,0,0.430642,0.137205,propaganda
111111112_3,0.431391,0,111111112,3,Two prominent US bloggers have been banned fro...,non-propaganda,0,90,16,0.000388,0.222913,propaganda
111111112_4,0.427768,0,111111112,4,,non-propaganda,0,0,0,0.430642,0.137205,propaganda
111111112_5,0.503309,1,111111112,5,Pamela Geller and Robert Spencer co-founded an...,propaganda,1,91,14,0.999856,0.678963,propaganda
111111112_6,0.427768,0,111111112,6,,non-propaganda,0,0,0,0.430642,0.137205,propaganda
111111112_7,0.45419,0,111111112,7,They were due to speak at an English Defence L...,non-propaganda,0,106,19,1.1e-05,0.260172,propaganda
111111112_8,0.427768,0,111111112,8,,non-propaganda,0,0,0,0.430642,0.137205,propaganda
111111112_9,0.504925,1,111111112,9,A government spokesman said individuals whose ...,non-propaganda,0,133,20,0.000744,0.23522,propaganda
111111112_10,0.427768,0,111111112,10,,non-propaganda,0,0,0,0.430642,0.137205,propaganda


In [27]:
mod_1234_train["mod_4_nn_SCORE"] = (mod_1234_train["mod_4_nn_PRED"]=="propaganda").map(int)
mod_1234_train.head(10)

Unnamed: 0_level_0,rf_200d_slen_SCORE,rf_200d_slen_PRED,aid,sid,sentence,flag,flag_bin,slen_ch,slen_w,lstm_v_sent,xgb_v_sent,mod_4_nn_PRED,mod_4_nn_SCORE
key,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
111111112_1,0.416656,0,111111112,1,US bloggers banned from entering UK,non-propaganda,0,35,6,0.00022,0.163508,propaganda,1
111111112_2,0.427768,0,111111112,2,,non-propaganda,0,0,0,0.430642,0.137205,propaganda,1
111111112_3,0.431391,0,111111112,3,Two prominent US bloggers have been banned fro...,non-propaganda,0,90,16,0.000388,0.222913,propaganda,1
111111112_4,0.427768,0,111111112,4,,non-propaganda,0,0,0,0.430642,0.137205,propaganda,1
111111112_5,0.503309,1,111111112,5,Pamela Geller and Robert Spencer co-founded an...,propaganda,1,91,14,0.999856,0.678963,propaganda,1
111111112_6,0.427768,0,111111112,6,,non-propaganda,0,0,0,0.430642,0.137205,propaganda,1
111111112_7,0.45419,0,111111112,7,They were due to speak at an English Defence L...,non-propaganda,0,106,19,1.1e-05,0.260172,propaganda,1
111111112_8,0.427768,0,111111112,8,,non-propaganda,0,0,0,0.430642,0.137205,propaganda,1
111111112_9,0.504925,1,111111112,9,A government spokesman said individuals whose ...,non-propaganda,0,133,20,0.000744,0.23522,propaganda,1
111111112_10,0.427768,0,111111112,10,,non-propaganda,0,0,0,0.430642,0.137205,propaganda,1


# Model

In [26]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score, f1_score

In [29]:
#Train-Test split
train_ids = np.random.choice(mod_1234_train["aid"].unique(),size=220,replace=False)
train_ids

array([ 729303442,  999000870,  787668628,  729670169,  789615291,
        999000145,  770945799,  725498022,  701225819,  754508491,
        777488669,  741655444,  999000880,  778139122,  999001621,
        763761219,  764518567,  758472954,  757843275,  703806098,
        729578579,  721890296,  761564397,  999000159,  712382330,
        729348908,  758756657,  788900262,  795693029,  736231219,
        758882558, 7651970399,  762206044,  761610997,  727736557,
        111111112,  999001226,  711566593,  732154721,  771406408,
        790665855,  730222442,  999000147,  733754480, 7709564349,
        724095598,  738060046,  111111137,  730573740,  787759779,
        770376380,  718312499,  701553469,  790266787,  764609985,
        754111899,  757243988,  725238842,  774145019,  741802985,
        789121798,  698018235,  706636401,  731178960,  770156851,
        761546223,  758812201,  705409419,  762147609,  700551604,
        782086447,  728343601,  755459860,  729940206,  790720

In [30]:
model_vars = ["rf_200d_slen_SCORE","lstm_v_sent","xgb_v_sent","mod_4_nn_SCORE"]

X_train = mod_1234_train.loc[mod_1234_train["aid"].isin(train_ids)][model_vars]
y_train = mod_1234_train.loc[mod_1234_train["aid"].isin(train_ids)]["flag_bin"]

X_test = mod_1234_train.loc[~mod_1234_train["aid"].isin(train_ids)][model_vars]
y_test = mod_1234_train.loc[~mod_1234_train["aid"].isin(train_ids)]["flag_bin"]

print(X_train.shape,y_train.shape,X_test.shape,y_test.shape)

(11591, 4) (11591,) (3579, 4) (3579,)


In [31]:
ensemble_v1 = LogisticRegression(C=5)
ensemble_v1.fit(X_train,y_train)

print("F1 Train:",f1_score(y_train,ensemble_v1.predict(X_train)))
print("F1 Test:",f1_score(y_test,ensemble_v1.predict(X_test)))

print("Gini Train:",2*roc_auc_score(y_train,ensemble_v1.predict_proba(X_train)[:,1])-1)
print("Gini Test:",2*roc_auc_score(y_test,ensemble_v1.predict_proba(X_test)[:,1])-1)

F1 Train: 0.8349480359344725
F1 Test: 0.8365384615384616
Gini Train: 0.8793627823045815
Gini Test: 0.883550721160087


In [33]:
with open(os.path.join(path2,"Models","ensemble_v1.pkl"),"wb") as f:
    pickle.dump(ensemble_v1,f,protocol=-1)