### Import Libraries and Read in Data

In [5]:
import numpy as np 
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.pipeline import Pipeline
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import VarianceThreshold

from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

# from keras.models import Model, Sequential
# from keras.layers import LSTM, Activation, Dense, Dropout, Input, Embedding, SpatialDropout1D
# from keras.optimizers import RMSprop
# from keras.preprocessing.text import Tokenizer
# from keras.preprocessing import sequence
# from keras.utils import to_categorical
# from keras.callbacks import EarlyStopping
# from keras.preprocessing.sequence import pad_sequences

import itertools
import os

%matplotlib inline

In [2]:
# Read in Data
df = pd.read_csv("../input/cs366-full-data/meta_plus_eda_feats_data.csv", index_col=0)

In [3]:
# Row and Column Num of data
df.shape

(53851, 18)

### Prepare Data for ML

In [4]:
del df['url'] # drop url column

In [7]:
# n-gram features

countvec = CountVectorizer(ngram_range=(1,3),max_features=1000,analyzer='word')
countvec_features = countvec.fit_transform(df['title'])

In [8]:
# encoded target(y) variable

labels = df['target'].replace({'real':0,'fake':1,'sarcasm':2})

In [9]:
# n-gram features + All Features I created
countvec_meta_features = pd.concat([df.iloc[:,2:], pd.DataFrame(countvec_features.toarray())], axis=1)

In [13]:
# Shape of data with n-gram features + all features I created
countvec_meta_features.shape

(53851, 1015)

In [31]:
selector = VarianceThreshold(threshold=0.01)
selector.fit(countvec_meta_features)

f = np.vectorize(lambda x : not x) # Function to toggle boolean array elements

v = countvec_meta_features.columns[f(selector.get_support())]

print('{} variables have too low variance.'.format(len(v)))

print('These variables are {}'.format(list(v)))

countvec_meta_features_low_var_rmv = selector.fit_transform(countvec_meta_features) # Features with low variances removed

918 variables have too low variance.
These variables are [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 30, 31, 32, 33, 34, 35, 36, 37, 38, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 53, 54, 55, 56, 57, 58, 59, 62, 63, 64, 65, 66, 68, 69, 70, 72, 73, 74, 75, 76, 77, 78, 79, 81, 82, 83, 84, 85, 86, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 128, 129, 130, 131, 132, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156, 157, 158, 159, 160, 161, 162, 163, 164, 165, 166, 167, 168, 169, 170, 171, 172, 173, 174, 175, 176, 177, 178, 179, 180, 181, 182, 183, 184, 185, 186, 187, 188, 189, 190, 191, 193, 194, 195, 196, 197, 198, 199, 200, 201, 202, 203, 204, 205, 206, 207, 208, 209, 210, 211, 212, 213, 214, 215, 216, 217, 218, 221, 222, 223, 2

In [25]:
# Shape of data with n-gram features + all features I created with low variance features removed
countvec_meta_features_low_var_rmv.shape

(53851, 97)

### LGBM on new data with low variance features removed

In [26]:
import lightgbm as lgb
from lightgbm import LGBMClassifier

lgbm = LGBMClassifier(boosting_type='gbdt', n_estimators = 5000, learning_rate=0.03, max_depth=-1,
                     n_jobs=-1,objective='multiclass', random_state=42)

In [27]:
%%time

print("LGBM on ngram feats + feats I created with low variance feats removed: CV Balanced Accuracy ", 
      cross_val_score(lgbm, countvec_meta_features_low_var_rmv, labels, scoring='balanced_accuracy', cv=5).mean())

print("LGBM on ngram feats + feats I created with low variance feats removed: CV AUC Score ", 
      cross_val_score(lgbm, countvec_meta_features_low_var_rmv, labels, scoring='roc_auc_ovr', cv=5).mean())

print("LGBM on ngram feats + feats I created with low variance feats removed: CV Weighted F1 Score ",
      cross_val_score(lgbm, countvec_meta_features_low_var_rmv, labels, scoring='f1_weighted', cv=5).mean())

LGBM on ngram feats + feats I created with low variance feats removed: CV Balanced Accuracy  0.4747349810978941
LGBM on ngram feats + feats I created with low variance feats removed: CV AUC Score  0.7130539233037368
LGBM on ngram feats + feats I created with low variance feats removed: CV Weighted F1 Score  0.6662419447842423
CPU times: user 1h 2min 58s, sys: 30.8 s, total: 1h 3min 29s
Wall time: 16min 19s


### Dealing with Imbalanced Dataset

#### Undersampling with Tomek Links + LGBM

In [37]:
import imblearn

from imblearn.under_sampling import TomekLinks

tl = TomekLinks(sampling_strategy='majority')

countvec_meta_features_low_var_rmv_tl, labels_tl = tl.fit_sample(countvec_meta_features_low_var_rmv, labels)

# print('Removed indexes:', id_tl)
# print('Number of removed observations from majority class: ', len(id_tl))

In [40]:
print("{} observations from majority class has been removed: ".format(countvec_meta_features_low_var_rmv.shape[0] - countvec_meta_features_low_var_rmv_tl.shape[0]))

3944 observations from majority class has been removed: 


In [45]:
lgbm2 = LGBMClassifier(boosting_type='gbdt', n_estimators = 5000, learning_rate=0.03, max_depth=-1,
                     n_jobs=-1,objective='multiclass', random_state=42)

In [48]:
%%time

print("LGBM on ngram feats + feats I created with low variance feats removed and some majority class removed with TomekLinks: CV Balanced Accuracy ", 
      cross_val_score(lgbm2, countvec_meta_features_low_var_rmv_tl, labels_tl, scoring='balanced_accuracy', cv=5).mean())

print("LGBM on ngram feats + feats I created with low variance feats removed and some majority class removed with TomekLinks: CV AUC Score ", 
      cross_val_score(lgbm2, countvec_meta_features_low_var_rmv_tl, labels_tl, scoring='roc_auc_ovr', cv=5).mean())

print("LGBM on ngram feats + feats I created with low variance feats removed and some majority class removed with TomekLinks: CV Weighted F1 Score ",
      cross_val_score(lgbm2, countvec_meta_features_low_var_rmv_tl, labels_tl, scoring='f1_weighted', cv=5).mean())

LGBM on ngram feats + feats I created with low variance feats removed and some majority class removed with TomekLinks: CV Balanced Accuracy  0.49655579799910504
LGBM on ngram feats + feats I created with low variance feats removed and some majority class removed with TomekLinks: CV AUC Score  0.7339314227935876
LGBM on ngram feats + feats I created with low variance feats removed and some majority class removed with TomekLinks: CV Weighted F1 Score  0.665369269245436
CPU times: user 57min 26s, sys: 25 s, total: 57min 51s
Wall time: 14min 48s


#### SMOTE + Tomek (combination of over-sampling and under-sampling) + LGBM

In [50]:
from imblearn.combine import SMOTETomek

smt = SMOTETomek(sampling_strategy='auto')
countvec_meta_features_low_var_rmv_tl_smt, labels_tl_smt = smt.fit_resample(countvec_meta_features_low_var_rmv_tl, labels_tl)

In [53]:
# Breakdown of observations by different classes

labels_tl_smt.value_counts() # 0: real 1: fake 2: sarcasm (now very balanced)

2    32267
1    32253
0    32213
Name: target, dtype: int64

In [56]:
lgbm3 = LGBMClassifier(boosting_type='gbdt', n_estimators = 5000, learning_rate=0.03, max_depth=-1,
                     n_jobs=-1,objective='multiclass', random_state=42)

In [57]:
%%time

print("LGBM on ngram feats + feats I created with low variance feats removed + under-over-under sampling: CV Balanced Accuracy ", 
      cross_val_score(lgbm3, countvec_meta_features_low_var_rmv_tl_smt, labels_tl_smt, scoring='balanced_accuracy', cv=5).mean())

print("LGBM on ngram feats + feats I created with low variance feats removed and some majority class removed with TomekLinks + under-over-under sampling: CV AUC Score ", 
      cross_val_score(lgbm3, countvec_meta_features_low_var_rmv_tl_smt, labels_tl_smt, scoring='roc_auc_ovr', cv=5).mean())

print("LGBM on ngram feats + feats I created with low variance feats removed + under-over-under sampling: CV Weighted F1 Score ",
      cross_val_score(lgbm3, countvec_meta_features_low_var_rmv_tl_smt, labels_tl_smt, scoring='f1_weighted', cv=5).mean())

LGBM on ngram feats + feats I created with low variance feats removed + under-over-under sampling: CV Balanced Accuracy  0.8043558882066403
LGBM on ngram feats + feats I created with low variance feats removed and some majority class removed with TomekLinks + under-over-under sampling: CV AUC Score  0.933481617868944
LGBM on ngram feats + feats I created with low variance feats removed + under-over-under sampling: CV Weighted F1 Score  0.7932983686197146
CPU times: user 2h 49min 48s, sys: 1min 12s, total: 2h 51min 1s
Wall time: 44min 1s
