### Import Libraries and Read in Data

In [1]:
import numpy as np 
import pandas as pd
from sklearn.pipeline import Pipeline
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import cross_validate
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

from sklearn.linear_model import LogisticRegression
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.ensemble import RandomForestClassifier

In [2]:
# Load the prepared dataset; the first CSV column is used as the row index.
DATA_PATH = "../input/cs366-full-data/data_meta_feat_added_drop_all_missing.csv"
df = pd.read_csv(DATA_PATH, index_col=0)

In [3]:
# Reindex to a clean 0..n-1 RangeIndex for better / easier concatenation later.
# drop=True discards the old index directly; the previous reset-then-`del df['index']`
# only worked when the CSV index was unnamed (a named index raised KeyError).
df = df.reset_index(drop=True)

### Prepare Data for ML (e.g. Countvectorize, split data to train and test data etc.)

In [4]:
# The url column is not used as a feature, so remove it from the frame.
df = df.drop(columns=['url'])

In [5]:
# n-gram features: counts of word 1- to 3-grams in the titles, vocabulary capped at 1000 terms.
# NOTE(review): the vocabulary is fitted on every row before cross-validation, so the
# validation folds influence which n-grams are kept. Consider fitting the vectorizer
# inside a Pipeline so it only ever sees the training folds.

countvec = CountVectorizer(ngram_range=(1,3),max_features=1000,analyzer='word')
countvec_features = countvec.fit_transform(df['title'])

# Encode the class names as integer codes. map() plus an explicit check fails loudly on
# an unexpected class name (replace() would pass it through unchanged), and the int cast
# guarantees a numeric label dtype instead of relying on implicit object->int downcasting.
label_codes = {'real':0,'fake':1,'sarcasm':2}
labels = df['target'].map(label_codes)
if labels.isna().any():
    unexpected = sorted(df.loc[labels.isna(), 'target'].astype(str).unique())
    raise ValueError("Unexpected values in 'target': {}".format(unexpected))
labels = labels.astype('int64')

In [6]:
# n-gram features + Meta Features I created
# NOTE(review): iloc[:, 2:] assumes the first two remaining columns of df are the title
# and the target, so that only meta features are selected - confirm against the CSV schema.
# The n-gram columns get string names ('ngram_0', ...): a frame mixing string and integer
# column names is rejected by scikit-learn >= 1.2. Passing index=df.index keeps the rows
# aligned with df regardless of what df's index looks like.
ngram_frame = pd.DataFrame(countvec_features.toarray(), index=df.index).add_prefix('ngram_')
countvec_meta_features = pd.concat([df.iloc[:,2:], ngram_frame], axis=1)

In [7]:
# wf-idf features: TF-IDF over word 1- to 3-grams of the titles, capped at 1000 terms.
# sublinear_tf=True replaces the raw term frequency with wf = 1 + log(tf).
tfidf = TfidfVectorizer(
    analyzer='word',
    ngram_range=(1, 3),
    max_features=1000,
    sublinear_tf=True,
)
titles = df['title']
tfidf_features = tfidf.fit_transform(titles)

In [8]:
# wf-idf + Meta Features I created
# NOTE(review): iloc[:, 2:] assumes the first two remaining columns of df are the title
# and the target, so that only meta features are selected - confirm against the CSV schema.
# The wf-idf columns get string names ('tfidf_0', ...): a frame mixing string and integer
# column names is rejected by scikit-learn >= 1.2. Passing index=df.index keeps the rows
# aligned with df regardless of what df's index looks like.
tfidf_frame = pd.DataFrame(tfidf_features.toarray(), index=df.index).add_prefix('tfidf_')
tfidf_meta_features = pd.concat([df.iloc[:,2:], tfidf_frame], axis=1)

In [9]:
# NOTE: hold-out split left disabled. X and y are not defined in the cells shown in this
# notebook; every model below is evaluated with 5-fold cross-validation instead.
# # Split into train and test data
# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=42)

### Naive Bayes

#### Just N-gram features

In [10]:
# Multinomial Naive Bayes for the n-gram counts; library defaults written out explicitly.
nb_ngram = MultinomialNB(alpha=1.0, fit_prior=True)

In [11]:
# 5-fold cross-validation of Naive Bayes on the n-gram counts.
# cross_validate scores all four metrics on the same fitted folds, so the model is
# trained 5 times in total instead of 5 times per metric (20 fits). With cv=5 the folds
# are the same ones cross_val_score would use, so the reported means are unchanged.
nb_ngram_scores = cross_validate(
    nb_ngram,
    countvec_features,
    labels,
    scoring=['accuracy', 'balanced_accuracy', 'roc_auc_ovr', 'f1_weighted'],
    cv=5,
)

print("Naive Bayes on n-gram features: CV Accuracy ", nb_ngram_scores['test_accuracy'].mean())

print("Naive Bayes on n-gram features: CV Balanced Accuracy ", nb_ngram_scores['test_balanced_accuracy'].mean())

print("Naive Bayes on n-gram features: CV AUC Score ", nb_ngram_scores['test_roc_auc_ovr'].mean())

print("Naive Bayes on n-gram features: CV Weighted F1 Score ", nb_ngram_scores['test_f1_weighted'].mean())

Naive Bayes on n-gram features: CV Accuracy  0.630314897795906
Naive Bayes on n-gram features: CV Balanced Accuracy  0.3408914655863888
Naive Bayes on n-gram features: CV AUC Score  0.5444357469317371
Naive Bayes on n-gram features: CV Weighted F1 Score  0.5384413563858288


#### N-gram features + Meta Features

In [12]:
# Multinomial Naive Bayes for n-gram counts + meta features; library defaults written out explicitly.
nb_ngram_meta = MultinomialNB(alpha=1.0, fit_prior=True)

In [13]:
# 5-fold cross-validation of Naive Bayes on n-gram counts + meta features.
# cross_validate scores all four metrics on the same fitted folds, so the model is
# trained 5 times in total instead of 5 times per metric (20 fits). With cv=5 the folds
# are the same ones cross_val_score would use, so the reported means are unchanged.
nb_ngram_meta_scores = cross_validate(
    nb_ngram_meta,
    countvec_meta_features,
    labels,
    scoring=['accuracy', 'balanced_accuracy', 'roc_auc_ovr', 'f1_weighted'],
    cv=5,
)

print("Naive Bayes on n-gram+meta feats: CV Accuracy ",
      nb_ngram_meta_scores['test_accuracy'].mean())

print("Naive Bayes on n-gram+meta feats: CV Balanced Accuracy ",
      nb_ngram_meta_scores['test_balanced_accuracy'].mean())

print("Naive Bayes on n-gram+meta feats: CV AUC Score ",
      nb_ngram_meta_scores['test_roc_auc_ovr'].mean())

print("Naive Bayes on n-gram+meta feats: CV Weighted F1 Score ",
      nb_ngram_meta_scores['test_f1_weighted'].mean())

Naive Bayes on n-gram+meta feats: CV Accuracy  0.5578744293176242
Naive Bayes on n-gram+meta feats: CV Balanced Accuracy  0.3671380346130233
Naive Bayes on n-gram+meta feats: CV AUC Score  0.5796555178159668
Naive Bayes on n-gram+meta feats: CV Weighted F1 Score  0.5415781097603072


#### TF-IDF Features

In [14]:
# Multinomial Naive Bayes for the wf-idf features; library defaults written out explicitly.
nb_tfidf = MultinomialNB(alpha=1.0, fit_prior=True)

In [15]:
# 5-fold cross-validation of Naive Bayes on the wf-idf features.
# cross_validate scores all four metrics on the same fitted folds, so the model is
# trained 5 times in total instead of 5 times per metric (20 fits). With cv=5 the folds
# are the same ones cross_val_score would use, so the reported means are unchanged.
nb_tfidf_scores = cross_validate(
    nb_tfidf,
    tfidf_features,
    labels,
    scoring=['accuracy', 'balanced_accuracy', 'roc_auc_ovr', 'f1_weighted'],
    cv=5,
)

print("Naive Bayes on wfidf feats: CV Accuracy ",
      nb_tfidf_scores['test_accuracy'].mean())

print("Naive Bayes on wfidf feats: CV Balanced Accuracy ",
      nb_tfidf_scores['test_balanced_accuracy'].mean())

print("Naive Bayes on wfidf feats: CV AUC Score ",
      nb_tfidf_scores['test_roc_auc_ovr'].mean())

print("Naive Bayes on wfidf feats: CV Weighted F1 Score ",
      nb_tfidf_scores['test_f1_weighted'].mean())

Naive Bayes on wfidf feats: CV Accuracy  0.6714454103046912
Naive Bayes on wfidf feats: CV Balanced Accuracy  0.3332080432538941
Naive Bayes on wfidf feats: CV AUC Score  0.5409808512265
Naive Bayes on wfidf feats: CV Weighted F1 Score  0.5420834728499294


#### TF-IDF Features + Meta Features

In [16]:
# Multinomial Naive Bayes for wf-idf + meta features; library defaults written out explicitly.
nb_tfidf_meta = MultinomialNB(alpha=1.0, fit_prior=True)

In [17]:
# 5-fold cross-validation of Naive Bayes on wf-idf + meta features.
# cross_validate scores all four metrics on the same fitted folds, so the model is
# trained 5 times in total instead of 5 times per metric (20 fits). With cv=5 the folds
# are the same ones cross_val_score would use, so the reported means are unchanged.
nb_tfidf_meta_scores = cross_validate(
    nb_tfidf_meta,
    tfidf_meta_features,
    labels,
    scoring=['accuracy', 'balanced_accuracy', 'roc_auc_ovr', 'f1_weighted'],
    cv=5,
)

print("Naive Bayes on wfidf+meta feats: CV Accuracy ",
      nb_tfidf_meta_scores['test_accuracy'].mean())

print("Naive Bayes on wfidf+meta feats: CV Balanced Accuracy ",
      nb_tfidf_meta_scores['test_balanced_accuracy'].mean())

print("Naive Bayes on wfidf+meta feats: CV AUC Score ",
      nb_tfidf_meta_scores['test_roc_auc_ovr'].mean())

print("Naive Bayes on wfidf+meta feats: CV Weighted F1 Score ",
      nb_tfidf_meta_scores['test_f1_weighted'].mean())

Naive Bayes on wfidf+meta feats: CV Accuracy  0.5868055536518801
Naive Bayes on wfidf+meta feats: CV Balanced Accuracy  0.35457579083974283
Naive Bayes on wfidf+meta feats: CV AUC Score  0.5749725490750386
Naive Bayes on wfidf+meta feats: CV Weighted F1 Score  0.5487799777436372


### Logistic Regression

In [18]:
# NOTE: disabled scaling step for the n-gram counts (StandardScaler without mean-centering).
# NOTE(review): the commented-out n-gram logistic regression cell that follows reads
# countvec_features, not countvec_features_scaled, so this result would be unused there.
# stdscaler = StandardScaler(with_mean=False)
# countvec_features_scaled = stdscaler.fit_transform(countvec_features)

#### N-gram features

In [19]:
# NOTE: disabled experiment - multinomial logistic regression on the n-gram counts,
# scored with 5-fold CV. The notes at the end of this notebook report convergence
# issues for logistic regression.
# lr_ngram = LogisticRegression(multi_class='multinomial', max_iter=500)

# print("Logistic Regression on n-gram features: CV Accuracy ", cross_val_score(lr_ngram, countvec_features, labels, scoring='accuracy', cv=5).mean())

# print("Logistic Regression on n-gram features: CV Balanced Accuracy ", cross_val_score(lr_ngram, countvec_features, labels, scoring='balanced_accuracy', cv=5).mean())

# print("Logistic Regression on n-gram features: CV AUC Score ", cross_val_score(lr_ngram, countvec_features, labels, scoring='roc_auc_ovr', cv=5).mean())

# print("Logistic Regression on n-gram features: CV Weighted F1 Score ", cross_val_score(lr_ngram, countvec_features, labels, scoring='f1_weighted', cv=5).mean())

#### n-gram + meta features

In [20]:
# NOTE: disabled scaling step for the n-gram + meta feature frame (StandardScaler without
# mean-centering); its output feeds the commented-out logistic regression cell that follows.
# NOTE(review): the scaler is fitted on all rows, i.e. outside the cross-validation folds.
# stdscaler = StandardScaler(with_mean=False)
# countvec_meta_features_scaled = stdscaler.fit_transform(countvec_meta_features)

In [21]:
# NOTE: disabled experiment - multinomial logistic regression (max_iter=1000) on the scaled
# n-gram + meta features, scored with 5-fold CV. The notes at the end of this notebook
# report convergence issues for logistic regression.
# lr_ngram_meta = LogisticRegression(multi_class='multinomial', max_iter=1000)

# print("Logistic Regression on n-gram+meta feats: CV Accuracy ", 
#       cross_val_score(lr_ngram_meta, countvec_meta_features_scaled, labels, scoring='accuracy', cv=5).mean())

# print("Logistic Regression on n-gram+meta feats: CV Balanced Accuracy ", 
#       cross_val_score(lr_ngram_meta, countvec_meta_features_scaled, labels, scoring='balanced_accuracy', cv=5).mean())

# print("Logistic Regression on n-gram+meta feats: CV AUC Score ", 
#       cross_val_score(lr_ngram_meta, countvec_meta_features_scaled, labels, scoring='roc_auc_ovr', cv=5).mean())

# print("Logistic Regression on n-gram+meta feats: CV Weighted F1 Score ", 
#       cross_val_score(lr_ngram_meta, countvec_meta_features_scaled, labels, scoring='f1_weighted', cv=5).mean())

#### TF-IDF features

In [22]:
# NOTE: disabled experiment - multinomial logistic regression on the wf-idf features,
# scored with 5-fold CV. The notes at the end of this notebook report convergence
# issues for logistic regression.
# lr_tfidf = LogisticRegression(multi_class='multinomial', max_iter=500)

# print("Logistic Regression on wfidf feats: CV Accuracy ", 
#       cross_val_score(lr_tfidf, tfidf_features, labels, scoring='accuracy', cv=5).mean())

# print("Logistic Regression on wfidf feats: CV Balanced Accuracy ", 
#       cross_val_score(lr_tfidf, tfidf_features, labels, scoring='balanced_accuracy', cv=5).mean())

# print("Logistic Regression on wfidf feats: CV AUC Score ", 
#       cross_val_score(lr_tfidf, tfidf_features, labels, scoring='roc_auc_ovr', cv=5).mean())

# print("Logistic Regression on wfidf feats: CV Weighted F1 Score ", 
#       cross_val_score(lr_tfidf, tfidf_features, labels, scoring='f1_weighted', cv=5).mean())

#### TF-IDF + meta features

In [23]:
# NOTE: disabled experiment - multinomial logistic regression on wf-idf + meta features,
# scored with 5-fold CV. The notes at the end of this notebook report convergence
# issues for logistic regression.
# NOTE(review): unlike the n-gram + meta variant, this cell uses the unscaled feature frame.
# lr_tfidf_meta = LogisticRegression(multi_class='multinomial', max_iter=500)

# print("Logistic Regression on wfidf+meta feats: CV Accuracy ", 
#       cross_val_score(lr_tfidf_meta, tfidf_meta_features, labels, scoring='accuracy', cv=5).mean())

# print("Logistic Regression on wfidf+meta feats: CV Balanced Accuracy ", 
#       cross_val_score(lr_tfidf_meta, tfidf_meta_features, labels, scoring='balanced_accuracy', cv=5).mean())

# print("Logistic Regression on wfidf+meta feats: CV AUC Score ", 
#       cross_val_score(lr_tfidf_meta, tfidf_meta_features, labels, scoring='roc_auc_ovr', cv=5).mean())

# print("Logistic Regression on wfidf+meta feats: CV Weighted F1 Score ", 
#       cross_val_score(lr_tfidf_meta, tfidf_meta_features, labels, scoring='f1_weighted', cv=5).mean())

### QuadraticDiscriminantAnalysis

#### N-gram features

In [24]:
# NOTE: disabled experiment - QDA on the densified n-gram counts, scored with 5-fold CV.
# The notes at the end of this notebook report that QDA was too computationally
# expensive (the kernel died) and ran into collinearity.
# qda_ngram = QuadraticDiscriminantAnalysis()

# print("QuadraticDiscriminantAnalysis on n-gram features: CV Accuracy ", 
#       cross_val_score(qda_ngram, countvec_features.toarray(), labels, scoring='accuracy', cv=5).mean())

# print("QuadraticDiscriminantAnalysis on n-gram features: CV Balanced Accuracy ", 
#       cross_val_score(qda_ngram, countvec_features.toarray(), labels, scoring='balanced_accuracy', cv=5).mean())

# print("QuadraticDiscriminantAnalysis on n-gram features: CV AUC Score ", 
#       cross_val_score(qda_ngram, countvec_features.toarray(), labels, scoring='roc_auc_ovr', cv=5).mean())

# print("QuadraticDiscriminantAnalysis on n-gram features: CV Weighted F1 Score ", 
#       cross_val_score(qda_ngram, countvec_features.toarray(), labels, scoring='f1_weighted', cv=5).mean())

#### N-gram + meta features

In [25]:
# NOTE: disabled experiment - QDA on n-gram + meta features (passed as a NumPy array via
# .values), scored with 5-fold CV. The notes at the end of this notebook report that QDA
# was too computationally expensive (the kernel died) and ran into collinearity.
# qda_ngram_meta = QuadraticDiscriminantAnalysis()

# print("QuadraticDiscriminantAnalysis on n-gram+meta feats: CV Accuracy ", 
#       cross_val_score(qda_ngram_meta, countvec_meta_features.values, labels, scoring='accuracy', cv=5).mean())

# print("QuadraticDiscriminantAnalysis on n-gram+meta feats: CV Balanced Accuracy ", 
#       cross_val_score(qda_ngram_meta, countvec_meta_features.values, labels, scoring='balanced_accuracy', cv=5).mean())

# print("QuadraticDiscriminantAnalysis on n-gram+meta feats: CV AUC Score ", 
#       cross_val_score(qda_ngram_meta, countvec_meta_features.values, labels, scoring='roc_auc_ovr', cv=5).mean())

# print("QuadraticDiscriminantAnalysis on n-gram+meta feats: CV Weighted F1 Score ", 
#       cross_val_score(qda_ngram_meta, countvec_meta_features.values, labels, scoring='f1_weighted', cv=5).mean())

#### TF-IDF features

In [26]:
# NOTE: disabled experiment - QDA on the densified wf-idf features, scored with 5-fold CV.
# The notes at the end of this notebook report that QDA was too computationally
# expensive (the kernel died) and ran into collinearity.
# qda_tfidf = QuadraticDiscriminantAnalysis()

# print("QuadraticDiscriminantAnalysis on wfidf feats: CV Accuracy ", 
#       cross_val_score(qda_tfidf, tfidf_features.toarray(), labels, scoring='accuracy', cv=5).mean())

# print("QuadraticDiscriminantAnalysis on wfidf feats: CV Balanced Accuracy ", 
#       cross_val_score(qda_tfidf, tfidf_features.toarray(), labels, scoring='balanced_accuracy', cv=5).mean())

# print("QuadraticDiscriminantAnalysis on wfidf feats: CV AUC Score ", 
#       cross_val_score(qda_tfidf, tfidf_features.toarray(), labels, scoring='roc_auc_ovr', cv=5).mean())

# print("QuadraticDiscriminantAnalysis on wfidf feats: CV Weighted F1 Score ", 
#       cross_val_score(qda_tfidf, tfidf_features.toarray(), labels, scoring='f1_weighted', cv=5).mean())

#### TF-IDF + meta features

In [27]:
# NOTE: disabled experiment - QDA on wf-idf + meta features, scored with 5-fold CV.
# The notes at the end of this notebook report that QDA was too computationally
# expensive (the kernel died) and ran into collinearity.
# NOTE(review): this cell passes the DataFrame directly, whereas the n-gram + meta QDA
# cell passes .values - make the two consistent if these are re-enabled.
# qda_tfidf_meta = QuadraticDiscriminantAnalysis()

# print("QuadraticDiscriminantAnalysis on wfidf+meta feats: CV Accuracy ", 
#       cross_val_score(qda_tfidf_meta, tfidf_meta_features, labels, scoring='accuracy', cv=5).mean())

# print("QuadraticDiscriminantAnalysis on wfidf+meta feats: CV Balanced Accuracy ", 
#       cross_val_score(qda_tfidf_meta, tfidf_meta_features, labels, scoring='balanced_accuracy', cv=5).mean())

# print("QuadraticDiscriminantAnalysis on wfidf+meta feats: CV AUC Score ", 
#       cross_val_score(qda_tfidf_meta, tfidf_meta_features, labels, scoring='roc_auc_ovr', cv=5).mean())

# print("QuadraticDiscriminantAnalysis on wfidf+meta feats: CV Weighted F1 Score ", 
#       cross_val_score(qda_tfidf_meta, tfidf_meta_features, labels, scoring='f1_weighted', cv=5).mean())

### Linear Discriminant Analysis

#### N-gram features

In [28]:
# 5-fold cross-validation of Linear Discriminant Analysis on the n-gram counts.
lda_ngram = LinearDiscriminantAnalysis()

# The sparse count matrix is densified once (the original did it once per metric), and
# cross_validate scores all four metrics on the same fitted folds: 5 fits instead of 20.
# With cv=5 the folds are the same ones cross_val_score would use, so the means are unchanged.
lda_ngram_scores = cross_validate(
    lda_ngram,
    countvec_features.toarray(),
    labels,
    scoring=['accuracy', 'balanced_accuracy', 'roc_auc_ovr', 'f1_weighted'],
    cv=5,
)

print("LinearDiscriminantAnalysis on n-gram features: CV Accuracy ",
      lda_ngram_scores['test_accuracy'].mean())

print("LinearDiscriminantAnalysis on n-gram features: CV Balanced Accuracy ",
      lda_ngram_scores['test_balanced_accuracy'].mean())

print("LinearDiscriminantAnalysis on n-gram features: CV AUC Score ",
      lda_ngram_scores['test_roc_auc_ovr'].mean())

print("LinearDiscriminantAnalysis on n-gram features: CV Weighted F1 Score ",
      lda_ngram_scores['test_f1_weighted'].mean())

LinearDiscriminantAnalysis on n-gram features: CV Accuracy  0.6508525325103939
LinearDiscriminantAnalysis on n-gram features: CV Balanced Accuracy  0.3309978383575979
LinearDiscriminantAnalysis on n-gram features: CV AUC Score  0.5264214242801442
LinearDiscriminantAnalysis on n-gram features: CV Weighted F1 Score  0.5379412536136597


#### N-gram + meta features

In [29]:
# 5-fold cross-validation of Linear Discriminant Analysis on n-gram counts + meta features.
lda_ngram_meta = LinearDiscriminantAnalysis()

# cross_validate scores all four metrics on the same fitted folds, so the model is
# trained 5 times in total instead of 5 times per metric (20 fits). With cv=5 the folds
# are the same ones cross_val_score would use, so the reported means are unchanged.
lda_ngram_meta_scores = cross_validate(
    lda_ngram_meta,
    countvec_meta_features,
    labels,
    scoring=['accuracy', 'balanced_accuracy', 'roc_auc_ovr', 'f1_weighted'],
    cv=5,
)

print("LinearDiscriminantAnalysis on n-gram+meta feats: CV Accuracy ",
      lda_ngram_meta_scores['test_accuracy'].mean())

print("LinearDiscriminantAnalysis on n-gram+meta feats: CV Balanced Accuracy ",
      lda_ngram_meta_scores['test_balanced_accuracy'].mean())

print("LinearDiscriminantAnalysis on n-gram+meta feats: CV AUC Score ",
      lda_ngram_meta_scores['test_roc_auc_ovr'].mean())

print("LinearDiscriminantAnalysis on n-gram+meta feats: CV Weighted F1 Score ",
      lda_ngram_meta_scores['test_f1_weighted'].mean())

LinearDiscriminantAnalysis on n-gram+meta feats: CV Accuracy  0.6447248194820043
LinearDiscriminantAnalysis on n-gram+meta feats: CV Balanced Accuracy  0.3322563219494901
LinearDiscriminantAnalysis on n-gram+meta feats: CV AUC Score  0.5650787341357901
LinearDiscriminantAnalysis on n-gram+meta feats: CV Weighted F1 Score  0.537384628649013


#### TF-IDF features

In [30]:
# 5-fold cross-validation of Linear Discriminant Analysis on the wf-idf features.
lda_tfidf = LinearDiscriminantAnalysis()

# The sparse wf-idf matrix is densified once (the original did it once per metric), and
# cross_validate scores all four metrics on the same fitted folds: 5 fits instead of 20.
# With cv=5 the folds are the same ones cross_val_score would use, so the means are unchanged.
lda_tfidf_scores = cross_validate(
    lda_tfidf,
    tfidf_features.toarray(),
    labels,
    scoring=['accuracy', 'balanced_accuracy', 'roc_auc_ovr', 'f1_weighted'],
    cv=5,
)

print("LinearDiscriminantAnalysis on wfidf feats: CV Accuracy ",
      lda_tfidf_scores['test_accuracy'].mean())

print("LinearDiscriminantAnalysis on wfidf feats: CV Balanced Accuracy ",
      lda_tfidf_scores['test_balanced_accuracy'].mean())

print("LinearDiscriminantAnalysis on wfidf feats: CV AUC Score ",
      lda_tfidf_scores['test_roc_auc_ovr'].mean())

print("LinearDiscriminantAnalysis on wfidf feats: CV Weighted F1 Score ",
      lda_tfidf_scores['test_f1_weighted'].mean())

LinearDiscriminantAnalysis on wfidf feats: CV Accuracy  0.6509641376001294
LinearDiscriminantAnalysis on wfidf feats: CV Balanced Accuracy  0.3306102812515366
LinearDiscriminantAnalysis on wfidf feats: CV AUC Score  0.5270385204660031
LinearDiscriminantAnalysis on wfidf feats: CV Weighted F1 Score  0.5366771950899587


#### TF-IDF + Meta features

In [31]:
# 5-fold cross-validation of Linear Discriminant Analysis on wf-idf + meta features.
lda_tfidf_meta = LinearDiscriminantAnalysis()

# cross_validate scores all four metrics on the same fitted folds, so the model is
# trained 5 times in total instead of 5 times per metric (20 fits). With cv=5 the folds
# are the same ones cross_val_score would use, so the reported means are unchanged.
lda_tfidf_meta_scores = cross_validate(
    lda_tfidf_meta,
    tfidf_meta_features,
    labels,
    scoring=['accuracy', 'balanced_accuracy', 'roc_auc_ovr', 'f1_weighted'],
    cv=5,
)

print("LinearDiscriminantAnalysis on wfidf+meta feats: CV Accuracy ",
      lda_tfidf_meta_scores['test_accuracy'].mean())

print("LinearDiscriminantAnalysis on wfidf+meta feats: CV Balanced Accuracy ",
      lda_tfidf_meta_scores['test_balanced_accuracy'].mean())

print("LinearDiscriminantAnalysis on wfidf+meta feats: CV AUC Score ",
      lda_tfidf_meta_scores['test_roc_auc_ovr'].mean())

print("LinearDiscriminantAnalysis on wfidf+meta feats: CV Weighted F1 Score ",
      lda_tfidf_meta_scores['test_f1_weighted'].mean())

LinearDiscriminantAnalysis on wfidf+meta feats: CV Accuracy  0.6439264740503468
LinearDiscriminantAnalysis on wfidf+meta feats: CV Balanced Accuracy  0.33185774860546413
LinearDiscriminantAnalysis on wfidf+meta feats: CV AUC Score  0.5656580533258846
LinearDiscriminantAnalysis on wfidf+meta feats: CV Weighted F1 Score  0.535679496652975


Some issues encountered with the other algorithms:

- Logistic Regression ran into convergence issues.
- QDA was too computationally expensive (the kernel died) and also ran into collinearity.