### Import Libraries and Read in Data

In [28]:
import itertools
import os

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import cross_val_score, cross_validate, train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import LabelEncoder, StandardScaler

# from keras.models import Model, Sequential
# from keras.layers import LSTM, Activation, Dense, Dropout, Input, Embedding, SpatialDropout1D
# from keras.optimizers import RMSprop
# from keras.preprocessing.text import Tokenizer
# from keras.preprocessing import sequence
# from keras.utils import to_categorical
# from keras.callbacks import EarlyStopping
# from keras.preprocessing.sequence import pad_sequences

%matplotlib inline

In [18]:
# Read in Data
# The first CSV column is used as the DataFrame index (index_col=0).
DATA_PATH = "../input/cs366-full-data/meta_plus_eda_feats_data.csv"
df = pd.read_csv(DATA_PATH, index_col=0)

### Prepare Data for ML

In [19]:
# Drop the url column. `errors='ignore'` keeps this cell idempotent: re-running it
# after the column is already gone is a no-op instead of raising KeyError.
df = df.drop(columns=['url'], errors='ignore')

In [20]:
# n-gram features
# Word-level uni-, bi- and tri-grams of the title; the vocabulary is capped at the
# 1000 most frequent n-grams. The result is a sparse document-term count matrix.
# NOTE(review): the vocabulary is fitted on every row of df, i.e. before any
# cross-validation split is made.
countvec = CountVectorizer(
    analyzer='word',
    ngram_range=(1, 3),
    max_features=1000,
)
countvec_features = countvec.fit_transform(df['title'])

In [21]:
# encoded target(y) variable
# `.map` with an explicit cast is used instead of `.replace`: replacing strings with
# ints on an object column relies on implicit downcasting, which pandas 2.2 deprecates.
target_codes = {'real': 0, 'fake': 1, 'sarcasm': 2}
labels = df['target'].map(target_codes)

# Fail loudly on missing or unexpected classes rather than passing them downstream.
unmapped_targets = df.loc[labels.isna(), 'target'].unique()
if len(unmapped_targets):
    raise ValueError(f"Unexpected values in 'target': {list(unmapped_targets)}")

labels = labels.astype('int64')

In [22]:
# n-gram features + All Features I created
# The n-gram counts are given df's own index so that concat(axis=1) pairs rows by
# position; a fresh RangeIndex would be aligned by label against the CSV index and
# could introduce NaN rows. Columns are prefixed so every feature name is a string
# (scikit-learn rejects DataFrames whose column names mix int and str).
ngram_frame = pd.DataFrame(countvec_features.toarray(), index=df.index).add_prefix('ngram_')

# NOTE(review): iloc[:, 2:] assumes the first two remaining columns of df are the
# non-feature columns (presumably title and target) — TODO confirm.
countvec_meta_features = pd.concat([df.iloc[:, 2:], ngram_frame], axis=1)

assert len(countvec_meta_features) == len(df), "feature matrix lost row alignment with df"

### RandomForest Classifier Baseline

In [29]:
# Random Forest Baseline
# 300 trees, trained on all available cores, with a fixed seed for repeatable runs.
rf = RandomForestClassifier(
    n_estimators=300,
    random_state=42,
    n_jobs=-1,
)

In [30]:
%%time

print("Random Forest Baseline on ngram feats + feats I created: CV Balanced Accuracy ", 
      cross_val_score(rf, countvec_meta_features, labels, scoring='balanced_accuracy', cv=5).mean())

print("Random Forest Baseline on ngram feats + feats I createds: CV AUC Score ", 
      cross_val_score(rf, countvec_meta_features, labels, scoring='roc_auc_ovr', cv=5).mean())

print("Random Forest Baseline on ngram feats + feats I created: CV Weighted F1 Score ", 
      cross_val_score(rf, countvec_meta_features, labels, scoring='f1_weighted', cv=5).mean())

Random Forest Baseline on ngram feats + feats I created: CV Balanced Accuracy  0.3957871567936477
Random Forest Baseline on ngram feats + feats I createds: CV AUC Score  0.6379976977160987
Random Forest Baseline on ngram feats + feats I created: CV Weighted F1 Score  0.6004050140935633
CPU times: user 1min 45s, sys: 14.3 s, total: 1min 59s
Wall time: 30min 11s


### SVC (with SGD)

In [38]:
from sklearn.linear_model import SGDClassifier

# Linear classifier trained with SGD: modified Huber loss, L2 penalty.
# tol=None disables the convergence check, so exactly max_iter=5 epochs are run.
lsvc = SGDClassifier(
    loss='modified_huber',
    penalty='l2',
    alpha=1e-3,
    max_iter=5,
    tol=None,
    random_state=42,
)

In [36]:
# standardize data for better performance for SVC
# NOTE(review): the scaler statistics are computed from every row of
# countvec_meta_features, i.e. before any cross-validation split is made.
stdscaler = StandardScaler()
stdscaler.fit(countvec_meta_features)
countvec_meta_features_std = stdscaler.transform(countvec_meta_features)

In [40]:
%%time

print("SVC Baseline on standardized ngram feats + feats I created: CV Balanced Accuracy ", 
      cross_val_score(lsvc, countvec_meta_features_std, labels, scoring='balanced_accuracy', cv=5).mean())

print("SVC Baseline on standardized ngram feats + feats I createds: CV AUC Score ", 
      cross_val_score(lsvc, countvec_meta_features_std, labels, scoring='roc_auc_ovr', cv=5).mean())

print("RSVC Baseline on standardized ngram feats + feats I created: CV Weighted F1 Score ", 
      cross_val_score(lsvc, countvec_meta_features_std, labels, scoring='f1_weighted', cv=5).mean())

SVC Baseline on standardized ngram feats + feats I created: CV Balanced Accuracy  0.3433255223821575
SVC Baseline on standardized ngram feats + feats I createds: CV AUC Score  0.5101970954419194
RSVC Baseline on standardized ngram feats + feats I created: CV Weighted F1 Score  0.5360376455809919
CPU times: user 43.1 s, sys: 1.68 s, total: 44.7 s
Wall time: 36.1 s


### LGBM

In [41]:
import lightgbm as lgb
from lightgbm import LGBMClassifier

# Gradient-boosted trees for the multiclass target: 5000 boosting rounds at a
# learning rate of 0.03, no depth limit (max_depth=-1), all cores, fixed seed.
lgbm = LGBMClassifier(
    objective='multiclass',
    boosting_type='gbdt',
    n_estimators=5000,
    learning_rate=0.03,
    max_depth=-1,
    random_state=42,
    n_jobs=-1,
)

In [42]:
%%time

print("LGBM Baseline on ngram feats + feats I created: CV Balanced Accuracy ", 
      cross_val_score(lgbm, countvec_meta_features, labels, scoring='balanced_accuracy', cv=5).mean())

print("LGBM Baseline on ngram feats + feats I createds: CV AUC Score ", 
      cross_val_score(lgbm, countvec_meta_features, labels, scoring='roc_auc_ovr', cv=5).mean())

print("LGBM Baseline on ngram feats + feats I created: CV Weighted F1 Score ",
      cross_val_score(lgbm, countvec_meta_features, labels, scoring='f1_weighted', cv=5).mean())

LGBM Baseline on ngram feats + feats I created: CV Balanced Accuracy  0.4641518562491561
LGBM Baseline on ngram feats + feats I createds: CV AUC Score  0.7065070986436768
LGBM Baseline on ngram feats + feats I created: CV Weighted F1 Score  0.6560459299850352
CPU times: user 1h 54min 9s, sys: 54.5 s, total: 1h 55min 4s
Wall time: 29min 36s
