# PROPAGANDA CLASSIFICATION MODEL OF META FEATURES

## Imports

In [21]:
import numpy as np
import pandas as pd
import en_core_web_sm
from wordcloud import WordCloud, STOPWORDS
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
import string
import spacy
from spacy.lang.en.stop_words import STOP_WORDS
from spacy.lang.en import English
import re
import sklearn

## Loading in Meta-Features

In [2]:
df = pd.read_csv('meta_features.csv')

## Visualizing DataFrame

In [4]:
df.head()

Unnamed: 0,article_id,propaganda,propaganda_type,text,prop_txt_snippet,sent_#,sentiment_score,abs_sent_score,punct_count,word_count,%adj,%verb,%adv,%noun,avg_word_length,strong_subjectives_count
0,701225819,non-propaganda,,South Florida Muslim Leader Sofian Zakkout’s D...,,1,0.0,0.0,0,9,0.0,0.0,0.0,0.0,5.444444,0
1,701225819,propaganda,"Name_Calling,Labeling","David Duke, the white supremacist icon and for...",Grand Wizard of the Ku Klux Klan,2,0.5423,0.5423,4,26,0.020548,0.006849,0.013699,0.006849,4.423077,2
2,701225819,propaganda,Loaded_Language,"However, one individual who represents the Mus...",enamored,3,0.3612,0.3612,4,27,0.017241,0.017241,0.005747,0.022989,5.0,0
3,701225819,non-propaganda,,"Last month, once again, Zakkout chose to showc...",,4,0.0,0.0,5,22,0.021127,0.021127,0.014085,0.035211,5.045455,0
4,701225819,non-propaganda,,The postings can be rivaled only by Zakkout’s ...,,5,0.0,0.0,1,11,0.014493,0.043478,0.014493,0.028986,4.636364,0


## Dropping Non-Meta and Deterministic Columns

In [5]:
meta_df = df.drop(['propaganda_type','text','prop_txt_snippet','sent_#'], axis = 1)

## Previewing Final DataFrame and Missing Values Before Diving In

In [6]:
meta_df.head()

Unnamed: 0,article_id,propaganda,sentiment_score,abs_sent_score,punct_count,word_count,%adj,%verb,%adv,%noun,avg_word_length,strong_subjectives_count
0,701225819,non-propaganda,0.0,0.0,0,9,0.0,0.0,0.0,0.0,5.444444,0
1,701225819,propaganda,0.5423,0.5423,4,26,0.020548,0.006849,0.013699,0.006849,4.423077,2
2,701225819,propaganda,0.3612,0.3612,4,27,0.017241,0.017241,0.005747,0.022989,5.0,0
3,701225819,non-propaganda,0.0,0.0,5,22,0.021127,0.021127,0.014085,0.035211,5.045455,0
4,701225819,non-propaganda,0.0,0.0,1,11,0.014493,0.043478,0.014493,0.028986,4.636364,0


In [7]:
meta_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 15172 entries, 0 to 15171
Data columns (total 12 columns):
article_id                  15172 non-null int64
propaganda                  15172 non-null object
sentiment_score             15172 non-null float64
abs_sent_score              15172 non-null float64
punct_count                 15172 non-null int64
word_count                  15172 non-null int64
%adj                        15172 non-null float64
%verb                       15172 non-null float64
%adv                        15172 non-null float64
%noun                       15172 non-null float64
avg_word_length             15172 non-null float64
strong_subjectives_count    15172 non-null int64
dtypes: float64(7), int64(4), object(1)
memory usage: 1.4+ MB


## Train-Test Split

In [8]:
y = meta_df['propaganda']
X = meta_df.drop('propaganda', axis=1)

In [73]:
y = [1 if label == 'propaganda' else 0 for label in y]

In [74]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

## Feature Scaling

In [75]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()


In [76]:
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

## Dummy Classifier

In [77]:
from sklearn.dummy import DummyClassifier

dummy_clf = DummyClassifier(strategy='most_frequent')
dummy_clf.fit(X_train_scaled, y_train)

dummy_preds = dummy_clf.predict(X_test_scaled)

In [78]:
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix

In [79]:
# Print the confusion matrix
print(sklearn.metrics.confusion_matrix(y_test, dummy_preds))

# Print the precision and recall, among other metrics
print(sklearn.metrics.classification_report(y_test, dummy_preds, digits=3))

[[3510    0]
 [1497    0]]
              precision    recall  f1-score   support

           0      0.701     1.000     0.824      3510
           1      0.000     0.000     0.000      1497

    accuracy                          0.701      5007
   macro avg      0.351     0.500     0.412      5007
weighted avg      0.491     0.701     0.578      5007



  _warn_prf(average, modifier, msg_start, len(result))


In [95]:
roc_auc_score(y_test, dummy_preds)

0.5

## Logistic Regression

In [98]:
from sklearn import linear_model
from sklearn import ensemble
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import RandomizedSearchCV

In [99]:
# Create logistic regression
logistic = linear_model.LogisticRegression()

In [105]:
hyperparam_grid_logistic = {'penalty' : ['l1', 'l2'],
    'C' : np.logspace(-4, 4, 20),
    'solver' : ['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga'],
    'class_weight': 'balanced'}


In [106]:
# # Create regularization penalty space
# penalty = ['l1', 'l2']

# # Create regularization hyperparameter distribution using uniform distribution
# C = uniform(loc=0, scale=4)

# # Create hyperparameter options
# hyperparameters = dict(C=C, penalty=penalty)

In [107]:
# Create randomized search 5-fold cross validation and 100 iterations
clf_log = RandomizedSearchCV(logistic, hyperparam_grid_logistic, random_state=1, n_iter=200, cv=5, 
                         verbose=True, n_jobs=-1, scoring = 'f1')

In [109]:
# Fit randomized search
best_model_log = clf_log.fit(X_train_scaled, y_train)

Fitting 5 folds for each of 200 candidates, totalling 1000 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:    3.0s
[Parallel(n_jobs=-1)]: Done 192 tasks      | elapsed:    9.2s
[Parallel(n_jobs=-1)]: Done 442 tasks      | elapsed:   19.0s
[Parallel(n_jobs=-1)]: Done 792 tasks      | elapsed:   36.6s
[Parallel(n_jobs=-1)]: Done 1000 out of 1000 | elapsed:   46.3s finished


In [110]:
# View best hyperparameters
print('Best Penalty:', best_model_log.best_estimator_.get_params()['penalty'])
print('Best C:', best_model_log.best_estimator_.get_params()['C'])
print('Best solver:', best_model_log.best_estimator_.get_params()['solver'])

Best Penalty: l2
Best C: 10000.0
Best solver: sag


In [111]:
# Predict target vector
log_preds = best_model_log.predict(X_test_scaled)

In [112]:
# Print the confusion matrix
print(sklearn.metrics.confusion_matrix(y_test, log_preds))

# Print the precision and recall, among other metrics
print(sklearn.metrics.classification_report(y_test, log_preds, digits=3))

[[3334  176]
 [1249  248]]
              precision    recall  f1-score   support

           0      0.727     0.950     0.824      3510
           1      0.585     0.166     0.258      1497

    accuracy                          0.715      5007
   macro avg      0.656     0.558     0.541      5007
weighted avg      0.685     0.715     0.655      5007



In [113]:
from sklearn.metrics import roc_auc_score

roc_auc_score(y_test, log_preds)

0.5577611062581003

## Random Forest

In [115]:
randomforest = ensemble.RandomForestClassifier()

In [118]:
hyperparam_grid_rf=    {'n_estimators' : list(range(10,101,10)),
    'max_features' : list(range(6,32,5)),
    'criterion':['gini','entropy'],
    'class_weight':['balanced']}


In [119]:
clf_rf = RandomizedSearchCV(randomforest, hyperparam_grid_rf, random_state=1, n_iter=200, cv=5, 
                         verbose=True, n_jobs=-1, scoring = 'f1')

In [120]:
# Fit randomized search
best_model_rf = clf_rf.fit(X_train_scaled, y_train)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.


Fitting 5 folds for each of 120 candidates, totalling 600 fits


[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:   29.5s
[Parallel(n_jobs=-1)]: Done 192 tasks      | elapsed:  1.8min
[Parallel(n_jobs=-1)]: Done 442 tasks      | elapsed:  4.1min
[Parallel(n_jobs=-1)]: Done 600 out of 600 | elapsed:  4.2min finished


In [121]:
# View best hyperparameters
print('Best Penalty:', best_model_rf.best_estimator_.get_params()['n_estimators'])
print('Best C:', best_model_rf.best_estimator_.get_params()['max_features'])
print('Best solver:', best_model_rf.best_estimator_.get_params()['criterion'])

Best Penalty: 100
Best C: 11
Best solver: entropy


In [122]:
# Predict target vector
rf_preds = best_model_rf.predict(X_test_scaled)

# Print the confusion matrix
print(sklearn.metrics.confusion_matrix(y_test, rf_preds))

# Print the precision and recall, among other metrics
print(sklearn.metrics.classification_report(y_test, rf_preds, digits=3))

print(roc_auc_score(y_test, rf_preds))

[[3241  269]
 [1106  391]]
              precision    recall  f1-score   support

           0      0.746     0.923     0.825      3510
           1      0.592     0.261     0.363      1497

    accuracy                          0.725      5007
   macro avg      0.669     0.592     0.594      5007
weighted avg      0.700     0.725     0.687      5007

0.5922754340590013


## Gradient Boosted Decision Trees

In [123]:
from sklearn.ensemble import GradientBoostingClassifier


In [131]:
clf_gboost = GradientBoostingClassifier(n_estimators=100, learning_rate=1.0,
    max_depth=1)

In [132]:
gboost_model = clf_gboost.fit(X_train_scaled, y_train)

In [133]:
# Predict target vector
gboost_preds = gboost_model.predict(X_test_scaled)

# Print the confusion matrix
print(sklearn.metrics.confusion_matrix(y_test, gboost_preds))

# Print the precision and recall, among other metrics
print(sklearn.metrics.classification_report(y_test, gboost_preds, digits=3))

print(roc_auc_score(y_test, gboost_preds))

[[3293  217]
 [1135  362]]
              precision    recall  f1-score   support

           0      0.744     0.938     0.830      3510
           1      0.625     0.242     0.349      1497

    accuracy                          0.730      5007
   macro avg      0.684     0.590     0.589      5007
weighted avg      0.708     0.730     0.686      5007

0.5899968027222537


In [None]:
# hyperparam_grid_gb=    {'n_estimators' : list(range(10,101,10)),
#     'max_features' : list(range(6,32,5)),
#     'criterion':['gini','entropy'],
#     'class_weight':'balanced'}

## Neural Network

In [138]:
# Create first network with Keras
import keras
from keras import layers
from keras import callbacks
from keras import regularizers
from keras import optimizers
from keras.layers import Dense, Dropout, Activation
from keras.callbacks import EarlyStopping, ModelCheckpoint
from keras import regularizers
from keras.optimizers import SGD
from keras.layers import Dense
from keras.models import Sequential

In [141]:
n_cols = len(X_train.columns)
n_cols

11

In [154]:
network1 = Sequential()

# Add a dropout layer for input layer
network1.add(Dropout(0.2, input_shape=(n_cols,)))
# Add fully connected layer with a ReLU activation function
network1.add(Dense(units=15, activation='relu'))
# Add a dropout layer for previous hidden layer
network1.add(Dropout(0.2))
# Add fully connected layer with a ReLU activation function and L2 regularization
network1.add(Dense(units=15, kernel_regularizer=regularizers.l2(0.01),activation='relu'))


network1.add(Dense(1, activation='softmax'))

In [155]:
network1.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

In [156]:
# Set callback functions to early stop training and save the best model so far
callbacks = [EarlyStopping(monitor='val_loss', patience=20), #if higher loss for 3 consecutive epoch(?), cut-off
             ModelCheckpoint(filepath='best_model.h5', monitor='val_loss', save_best_only=True)]

In [157]:
# Train neural network
history = network1.fit(x = X_train_scaled, # Features
                      y = y_train, # Target
                      epochs=1000, # Number of epochs
                      verbose=2, # Some output
                      batch_size=15, # Number of observations per batch
                      callbacks=callbacks,
                      validation_data=(X_test_scaled, y_test)) # Data for evaluation

Train on 10165 samples, validate on 5007 samples
Epoch 1/1000
 - 2s - loss: 11.0140 - accuracy: 0.2835 - val_loss: 10.7492 - val_accuracy: 0.2990
Epoch 2/1000
 - 2s - loss: 10.9860 - accuracy: 0.2835 - val_loss: 10.7489 - val_accuracy: 0.2990
Epoch 3/1000
 - 2s - loss: 10.9859 - accuracy: 0.2835 - val_loss: 10.7489 - val_accuracy: 0.2990
Epoch 4/1000
 - 2s - loss: 10.9859 - accuracy: 0.2835 - val_loss: 10.7489 - val_accuracy: 0.2990
Epoch 5/1000
 - 2s - loss: 10.9859 - accuracy: 0.2835 - val_loss: 10.7489 - val_accuracy: 0.2990
Epoch 6/1000
 - 2s - loss: 10.9859 - accuracy: 0.2835 - val_loss: 10.7489 - val_accuracy: 0.2990
Epoch 7/1000
 - 2s - loss: 10.9859 - accuracy: 0.2835 - val_loss: 10.7489 - val_accuracy: 0.2990
Epoch 8/1000
 - 2s - loss: 10.9859 - accuracy: 0.2835 - val_loss: 10.7489 - val_accuracy: 0.2990
Epoch 9/1000
 - 2s - loss: 10.9859 - accuracy: 0.2835 - val_loss: 10.7489 - val_accuracy: 0.2990
Epoch 10/1000
 - 2s - loss: 10.9859 - accuracy: 0.2835 - val_loss: 10.7489 - v