### Import Libraries and Read in Data

In [53]:
import numpy as np
import pandas as pd
from sklearn.pipeline import Pipeline
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import cross_validate
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

from sklearn.linear_model import LogisticRegression
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.ensemble import RandomForestClassifier

In [54]:
# CSV location inside the Kaggle input mount; the first CSV column is used as the row index.
DATA_CSV_PATH = "/kaggle/input/cs366-full-data/data_meta_feat_added.csv"
df = pd.read_csv(DATA_CSV_PATH, index_col=0)

In [55]:
# Reindex for better / easier concatenation later.
# drop=True discards the old index directly instead of materialising it as a column
# and deleting it afterwards; this also works when the index loaded from the CSV has
# a name (in which case the column would not be called 'index' and the delete would fail).
df = df.reset_index(drop=True)

### Prepare Data for ML (e.g. Countvectorize, split data to train and test data etc.)

In [56]:
# The url column is not used as a model input, so remove it from the frame in place.
df.drop(columns=['url'], inplace=True)

In [57]:
# n-gram features

# Bag-of-words counts over word uni-, bi- and tri-grams of the titles, capped at
# 1000 vocabulary entries.
# NOTE(review): the vocabulary is fitted on every row before cross-validation, so the
# held-out folds influence the feature set — consider fitting inside a Pipeline.
countvec = CountVectorizer(ngram_range=(1,3),max_features=1000,analyzer='word')
countvec_features = countvec.fit_transform(df['title'])

# Encode the class names as integers.
# Series.map is used rather than Series.replace: replace relies on an implicit
# object -> int downcast (deprecated since pandas 2.2) and silently leaves unknown
# class names in place, whereas map lets us detect them explicitly.
target_codes = {'real':0,'fake':1,'sarcasm':2}
labels = df['target'].map(target_codes)
if labels.isna().any():
    unexpected_targets = sorted(df.loc[labels.isna(), 'target'].astype(str).unique())
    raise ValueError(f"Unmapped values in 'target' column: {unexpected_targets}")
labels = labels.astype(int)

In [58]:
# n-gram features + Meta Features I created
# The count matrix is densified and given string column names ('ngram_0', 'ngram_1', ...):
# concatenating integer-labelled columns next to the string-labelled columns of df
# produces mixed label types, which newer scikit-learn releases refuse to fit on.
# index=df.index keeps the rows aligned with df during the column-wise concat.
# NOTE(review): df.iloc[:, 2:] presumably selects the engineered meta-feature columns
# (everything after the first two remaining columns) — confirm against the CSV layout.
countvec_ngram_frame = pd.DataFrame(countvec_features.toarray(), index=df.index).add_prefix('ngram_')
countvec_meta_features = pd.concat([df.iloc[:,2:], countvec_ngram_frame], axis=1)

In [59]:
# wf-idf features: with sublinear_tf=True the term frequency is replaced by wf = 1 + log(tf)
tfidf = TfidfVectorizer(
    analyzer='word',
    ngram_range=(1, 3),
    max_features=1000,
    sublinear_tf=True,
)
# Learn the vocabulary / idf weights from the titles and vectorise them in one pass.
tfidf_features = tfidf.fit_transform(df['title'])

In [60]:
# wf-idf + Meta Features I created
# The wf-idf matrix is densified and given string column names ('tfidf_0', 'tfidf_1', ...):
# concatenating integer-labelled columns next to the string-labelled columns of df
# produces mixed label types, which newer scikit-learn releases refuse to fit on.
# index=df.index keeps the rows aligned with df during the column-wise concat.
# NOTE(review): df.iloc[:, 2:] presumably selects the engineered meta-feature columns
# (everything after the first two remaining columns) — confirm against the CSV layout.
tfidf_term_frame = pd.DataFrame(tfidf_features.toarray(), index=df.index).add_prefix('tfidf_')
tfidf_meta_features = pd.concat([df.iloc[:,2:], tfidf_term_frame], axis=1)

In [61]:
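# Disabled hold-out split: the models below are scored with 5-fold cross-validation instead.
# NOTE: X and y are not defined anywhere above, so the line below cannot be re-enabled as-is.
# # Split into train and test data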
# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=42)

### Naive Bayes

#### Just N-gram features

In [62]:
# Multinomial Naive Bayes with default settings; cross-validated on the sparse n-gram counts.
nb_ngram = MultinomialNB()

In [63]:
# Evaluate all four metrics from a single 5-fold cross-validation run.
# Calling cross_val_score once per metric refits the model on every fold each time;
# cross_validate fits each fold once and scores it with every requested metric.
nb_ngram_cv = cross_validate(
    nb_ngram, countvec_features, labels,
    scoring=['accuracy', 'balanced_accuracy', 'roc_auc_ovr', 'f1_weighted'],
    cv=5,
)

# Result keys are 'test_<scorer name>'; each holds one score per fold.
print("Naive Bayes on n-gram features: CV Accuracy ", nb_ngram_cv['test_accuracy'].mean())
print("Naive Bayes on n-gram features: CV Balanced Accuracy ", nb_ngram_cv['test_balanced_accuracy'].mean())
print("Naive Bayes on n-gram features: CV AUC Score ", nb_ngram_cv['test_roc_auc_ovr'].mean())
print("Naive Bayes on n-gram features: CV Weighted F1 Score ", nb_ngram_cv['test_f1_weighted'].mean())

Naive Bayes on n-gram features: CV Accuracy  0.6282222167518337
Naive Bayes on n-gram features: CV Balanced Accuracy  0.3401051985307645
Naive Bayes on n-gram features: CV AUC Score  0.5428401162612612
Naive Bayes on n-gram features: CV Weighted F1 Score  0.5384551440750265


#### N-gram features + Meta Features

In [64]:
# Multinomial Naive Bayes with default settings; cross-validated on n-gram counts plus meta features.
nb_ngram_meta = MultinomialNB()

In [65]:
# Evaluate all four metrics from a single 5-fold cross-validation run.
# Calling cross_val_score once per metric refits the model on every fold each time;
# cross_validate fits each fold once and scores it with every requested metric.
nb_ngram_meta_cv = cross_validate(
    nb_ngram_meta, countvec_meta_features, labels,
    scoring=['accuracy', 'balanced_accuracy', 'roc_auc_ovr', 'f1_weighted'],
    cv=5,
)

# Result keys are 'test_<scorer name>'; each holds one score per fold.
print("Naive Bayes on n-gram+meta feats: CV Accuracy ",
      nb_ngram_meta_cv['test_accuracy'].mean())
print("Naive Bayes on n-gram+meta feats: CV Balanced Accuracy ",
      nb_ngram_meta_cv['test_balanced_accuracy'].mean())
print("Naive Bayes on n-gram+meta feats: CV AUC Score ",
      nb_ngram_meta_cv['test_roc_auc_ovr'].mean())
print("Naive Bayes on n-gram+meta feats: CV Weighted F1 Score ",
      nb_ngram_meta_cv['test_f1_weighted'].mean())

Naive Bayes on n-gram+meta feats: CV Accuracy  0.5561773753223165
Naive Bayes on n-gram+meta feats: CV Balanced Accuracy  0.36823982014340906
Naive Bayes on n-gram+meta feats: CV AUC Score  0.5789852713993392
Naive Bayes on n-gram+meta feats: CV Weighted F1 Score  0.54308887350404


#### TF-IDF Features

In [66]:
# Multinomial Naive Bayes with default settings; cross-validated on the sparse wf-idf matrix.
nb_tfidf = MultinomialNB()

In [67]:
# Evaluate all four metrics from a single 5-fold cross-validation run.
# Calling cross_val_score once per metric refits the model on every fold each time;
# cross_validate fits each fold once and scores it with every requested metric.
nb_tfidf_cv = cross_validate(
    nb_tfidf, tfidf_features, labels,
    scoring=['accuracy', 'balanced_accuracy', 'roc_auc_ovr', 'f1_weighted'],
    cv=5,
)

# Result keys are 'test_<scorer name>'; each holds one score per fold.
print("Naive Bayes on wfidf feats: CV Accuracy ",
      nb_tfidf_cv['test_accuracy'].mean())
print("Naive Bayes on wfidf feats: CV Balanced Accuracy ",
      nb_tfidf_cv['test_balanced_accuracy'].mean())
print("Naive Bayes on wfidf feats: CV AUC Score ",
      nb_tfidf_cv['test_roc_auc_ovr'].mean())
print("Naive Bayes on wfidf feats: CV Weighted F1 Score ",
      nb_tfidf_cv['test_f1_weighted'].mean())

Naive Bayes on wfidf feats: CV Accuracy  0.671736880911342
Naive Bayes on wfidf feats: CV Balanced Accuracy  0.3329821107459642
Naive Bayes on wfidf feats: CV AUC Score  0.5394165027620433
Naive Bayes on wfidf feats: CV Weighted F1 Score  0.5423815953666933


#### TF-IDF Features + Meta Features

In [68]:
# Multinomial Naive Bayes with default settings; cross-validated on wf-idf weights plus meta features.
nb_tfidf_meta = MultinomialNB()

In [69]:
# Evaluate all four metrics from a single 5-fold cross-validation run.
# Calling cross_val_score once per metric refits the model on every fold each time;
# cross_validate fits each fold once and scores it with every requested metric.
nb_tfidf_meta_cv = cross_validate(
    nb_tfidf_meta, tfidf_meta_features, labels,
    scoring=['accuracy', 'balanced_accuracy', 'roc_auc_ovr', 'f1_weighted'],
    cv=5,
)

# Result keys are 'test_<scorer name>'; each holds one score per fold.
print("Naive Bayes on wfidf+meta feats: CV Accuracy ",
      nb_tfidf_meta_cv['test_accuracy'].mean())
print("Naive Bayes on wfidf+meta feats: CV Balanced Accuracy ",
      nb_tfidf_meta_cv['test_balanced_accuracy'].mean())
print("Naive Bayes on wfidf+meta feats: CV AUC Score ",
      nb_tfidf_meta_cv['test_roc_auc_ovr'].mean())
print("Naive Bayes on wfidf+meta feats: CV Weighted F1 Score ",
      nb_tfidf_meta_cv['test_f1_weighted'].mean())

Naive Bayes on wfidf+meta feats: CV Accuracy  0.5837963630275835
Naive Bayes on wfidf+meta feats: CV Balanced Accuracy  0.3522714594544075
Naive Bayes on wfidf+meta feats: CV AUC Score  0.574568637488699
Naive Bayes on wfidf+meta feats: CV Weighted F1 Score  0.5491765055220822


### Logistic Regression

In [73]:
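# Disabled: the Logistic Regression experiments in this section ran into convergence
# issues (see the notes at the end of the notebook), so their cells are left commented out.
# stdscaler = StandardScaler(with_mean=False)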
# countvec_features_scaled = stdscaler.fit_transform(countvec_features)

#### N-gram features

In [74]:
# lr_ngram = LogisticRegression(multi_class='multinomial', max_iter=500)

# print("Logistic Regression on n-gram features: CV Accuracy ", cross_val_score(lr_ngram, countvec_features, labels, scoring='accuracy', cv=5).mean())

# print("Logistic Regression on n-gram features: CV Balanced Accuracy ", cross_val_score(lr_ngram, countvec_features, labels, scoring='balanced_accuracy', cv=5).mean())

# print("Logistic Regression on n-gram features: CV AUC Score ", cross_val_score(lr_ngram, countvec_features, labels, scoring='roc_auc_ovr', cv=5).mean())

# print("Logistic Regression on n-gram features: CV Weighted F1 Score ", cross_val_score(lr_ngram, countvec_features, labels, scoring='f1_weighted', cv=5).mean())

#### n-gram + meta features

In [75]:
# stdscaler = StandardScaler(with_mean=False)
# countvec_meta_features_scaled = stdscaler.fit_transform(countvec_meta_features)

In [76]:
# lr_ngram_meta = LogisticRegression(multi_class='multinomial', max_iter=1000)

# print("Logistic Regression on n-gram+meta feats: CV Accuracy ", 
#       cross_val_score(lr_ngram_meta, countvec_meta_features_scaled, labels, scoring='accuracy', cv=5).mean())

# print("Logistic Regression on n-gram+meta feats: CV Balanced Accuracy ", 
#       cross_val_score(lr_ngram_meta, countvec_meta_features_scaled, labels, scoring='balanced_accuracy', cv=5).mean())

# print("Logistic Regression on n-gram+meta feats: CV AUC Score ", 
#       cross_val_score(lr_ngram_meta, countvec_meta_features_scaled, labels, scoring='roc_auc_ovr', cv=5).mean())

# print("Logistic Regression on n-gram+meta feats: CV Weighted F1 Score ", 
#       cross_val_score(lr_ngram_meta, countvec_meta_features_scaled, labels, scoring='f1_weighted', cv=5).mean())

#### TF-IDF features

In [77]:
# lr_tfidf = LogisticRegression(multi_class='multinomial', max_iter=500)

# print("Logistic Regression on wfidf feats: CV Accuracy ", 
#       cross_val_score(lr_tfidf, tfidf_features, labels, scoring='accuracy', cv=5).mean())

# print("Logistic Regression on wfidf feats: CV Balanced Accuracy ", 
#       cross_val_score(lr_tfidf, tfidf_features, labels, scoring='balanced_accuracy', cv=5).mean())

# print("Logistic Regression on wfidf feats: CV AUC Score ", 
#       cross_val_score(lr_tfidf, tfidf_features, labels, scoring='roc_auc_ovr', cv=5).mean())

# print("Logistic Regression on wfidf feats: CV Weighted F1 Score ", 
#       cross_val_score(lr_tfidf, tfidf_features, labels, scoring='f1_weighted', cv=5).mean())

#### TF-IDF + meta features

In [78]:
# lr_tfidf_meta = LogisticRegression(multi_class='multinomial', max_iter=500)

# print("Logistic Regression on wfidf+meta feats: CV Accuracy ", 
#       cross_val_score(lr_tfidf_meta, tfidf_meta_features, labels, scoring='accuracy', cv=5).mean())

# print("Logistic Regression on wfidf+meta feats: CV Balanced Accuracy ", 
#       cross_val_score(lr_tfidf_meta, tfidf_meta_features, labels, scoring='balanced_accuracy', cv=5).mean())

# print("Logistic Regression on wfidf+meta feats: CV AUC Score ", 
#       cross_val_score(lr_tfidf_meta, tfidf_meta_features, labels, scoring='roc_auc_ovr', cv=5).mean())

# print("Logistic Regression on wfidf+meta feats: CV Weighted F1 Score ", 
#       cross_val_score(lr_tfidf_meta, tfidf_meta_features, labels, scoring='f1_weighted', cv=5).mean())

### QuadraticDiscriminantAnalysis

#### N-gram features

In [79]:
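# Disabled: the QDA experiments in this section were too computationally expensive
# (the kernel died) and hit collinearity problems, so their cells are left commented out.
# qda_ngram = QuadraticDiscriminantAnalysis()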

# print("QuadraticDiscriminantAnalysis on n-gram features: CV Accuracy ", 
#       cross_val_score(qda_ngram, countvec_features.toarray(), labels, scoring='accuracy', cv=5).mean())

# print("QuadraticDiscriminantAnalysis on n-gram features: CV Balanced Accuracy ", 
#       cross_val_score(qda_ngram, countvec_features.toarray(), labels, scoring='balanced_accuracy', cv=5).mean())

# print("QuadraticDiscriminantAnalysis on n-gram features: CV AUC Score ", 
#       cross_val_score(qda_ngram, countvec_features.toarray(), labels, scoring='roc_auc_ovr', cv=5).mean())

# print("QuadraticDiscriminantAnalysis on n-gram features: CV Weighted F1 Score ", 
#       cross_val_score(qda_ngram, countvec_features.toarray(), labels, scoring='f1_weighted', cv=5).mean())

#### N-gram + meta features

In [80]:
# qda_ngram_meta = QuadraticDiscriminantAnalysis()

# print("QuadraticDiscriminantAnalysis on n-gram+meta feats: CV Accuracy ", 
#       cross_val_score(qda_ngram_meta, countvec_meta_features.values, labels, scoring='accuracy', cv=5).mean())

# print("QuadraticDiscriminantAnalysis on n-gram+meta feats: CV Balanced Accuracy ", 
#       cross_val_score(qda_ngram_meta, countvec_meta_features.values, labels, scoring='balanced_accuracy', cv=5).mean())

# print("QuadraticDiscriminantAnalysis on n-gram+meta feats: CV AUC Score ", 
#       cross_val_score(qda_ngram_meta, countvec_meta_features.values, labels, scoring='roc_auc_ovr', cv=5).mean())

# print("QuadraticDiscriminantAnalysis on n-gram+meta feats: CV Weighted F1 Score ", 
#       cross_val_score(qda_ngram_meta, countvec_meta_features.values, labels, scoring='f1_weighted', cv=5).mean())

#### TF-IDF features

In [81]:
# qda_tfidf = QuadraticDiscriminantAnalysis()

# print("QuadraticDiscriminantAnalysis on wfidf feats: CV Accuracy ", 
#       cross_val_score(qda_tfidf, tfidf_features.toarray(), labels, scoring='accuracy', cv=5).mean())

# print("QuadraticDiscriminantAnalysis on wfidf feats: CV Balanced Accuracy ", 
#       cross_val_score(qda_tfidf, tfidf_features.toarray(), labels, scoring='balanced_accuracy', cv=5).mean())

# print("QuadraticDiscriminantAnalysis on wfidf feats: CV AUC Score ", 
#       cross_val_score(qda_tfidf, tfidf_features.toarray(), labels, scoring='roc_auc_ovr', cv=5).mean())

# print("QuadraticDiscriminantAnalysis on wfidf feats: CV Weighted F1 Score ", 
#       cross_val_score(qda_tfidf, tfidf_features.toarray(), labels, scoring='f1_weighted', cv=5).mean())

#### TF-IDF + meta features

In [82]:
# qda_tfidf_meta = QuadraticDiscriminantAnalysis()

# print("QuadraticDiscriminantAnalysis on wfidf+meta feats: CV Accuracy ", 
#       cross_val_score(qda_tfidf_meta, tfidf_meta_features, labels, scoring='accuracy', cv=5).mean())

# print("QuadraticDiscriminantAnalysis on wfidf+meta feats: CV Balanced Accuracy ", 
#       cross_val_score(qda_tfidf_meta, tfidf_meta_features, labels, scoring='balanced_accuracy', cv=5).mean())

# print("QuadraticDiscriminantAnalysis on wfidf+meta feats: CV AUC Score ", 
#       cross_val_score(qda_tfidf_meta, tfidf_meta_features, labels, scoring='roc_auc_ovr', cv=5).mean())

# print("QuadraticDiscriminantAnalysis on wfidf+meta feats: CV Weighted F1 Score ", 
#       cross_val_score(qda_tfidf_meta, tfidf_meta_features, labels, scoring='f1_weighted', cv=5).mean())

### Linear Discriminant Analysis

#### N-gram features

In [71]:
# Linear Discriminant Analysis with default settings on the n-gram counts.
lda_ngram = LinearDiscriminantAnalysis()

# The estimator is given a dense array here, so densify the sparse count matrix once
# instead of once per metric.
countvec_dense = countvec_features.toarray()

# Evaluate all four metrics from a single 5-fold cross-validation run.
# Calling cross_val_score once per metric refits the model on every fold each time;
# cross_validate fits each fold once and scores it with every requested metric.
lda_ngram_cv = cross_validate(
    lda_ngram, countvec_dense, labels,
    scoring=['accuracy', 'balanced_accuracy', 'roc_auc_ovr', 'f1_weighted'],
    cv=5,
)

# Result keys are 'test_<scorer name>'; each holds one score per fold.
print("LinearDiscriminantAnalysis on n-gram features: CV Accuracy ",
      lda_ngram_cv['test_accuracy'].mean())
print("LinearDiscriminantAnalysis on n-gram features: CV Balanced Accuracy ",
      lda_ngram_cv['test_balanced_accuracy'].mean())
print("LinearDiscriminantAnalysis on n-gram features: CV AUC Score ",
      lda_ngram_cv['test_roc_auc_ovr'].mean())
print("LinearDiscriminantAnalysis on n-gram features: CV Weighted F1 Score ",
      lda_ngram_cv['test_f1_weighted'].mean())

LinearDiscriminantAnalysis on n-gram features: CV Accuracy  0.6515935422784407
LinearDiscriminantAnalysis on n-gram features: CV Balanced Accuracy  0.3311268037839172
LinearDiscriminantAnalysis on n-gram features: CV AUC Score  0.523739537674828
LinearDiscriminantAnalysis on n-gram features: CV Weighted F1 Score  0.5383751902433236


#### N-gram + meta features

In [72]:
# Linear Discriminant Analysis with default settings on n-gram counts plus meta features.
lda_ngram_meta = LinearDiscriminantAnalysis()

# Evaluate all four metrics from a single 5-fold cross-validation run.
# Calling cross_val_score once per metric refits the model on every fold each time;
# cross_validate fits each fold once and scores it with every requested metric.
lda_ngram_meta_cv = cross_validate(
    lda_ngram_meta, countvec_meta_features, labels,
    scoring=['accuracy', 'balanced_accuracy', 'roc_auc_ovr', 'f1_weighted'],
    cv=5,
)

# Result keys are 'test_<scorer name>'; each holds one score per fold.
print("LinearDiscriminantAnalysis on n-gram+meta feats: CV Accuracy ",
      lda_ngram_meta_cv['test_accuracy'].mean())
print("LinearDiscriminantAnalysis on n-gram+meta feats: CV Balanced Accuracy ",
      lda_ngram_meta_cv['test_balanced_accuracy'].mean())
print("LinearDiscriminantAnalysis on n-gram+meta feats: CV AUC Score ",
      lda_ngram_meta_cv['test_roc_auc_ovr'].mean())
print("LinearDiscriminantAnalysis on n-gram+meta feats: CV Weighted F1 Score ",
      lda_ngram_meta_cv['test_f1_weighted'].mean())

LinearDiscriminantAnalysis on n-gram+meta feats: CV Accuracy  0.6451758973368376
LinearDiscriminantAnalysis on n-gram+meta feats: CV Balanced Accuracy  0.33238793564808095
LinearDiscriminantAnalysis on n-gram+meta feats: CV AUC Score  0.5636643421601997
LinearDiscriminantAnalysis on n-gram+meta feats: CV Weighted F1 Score  0.5376750432295665


#### TF-IDF features

In [84]:
# Linear Discriminant Analysis with default settings on the wf-idf features.
lda_tfidf = LinearDiscriminantAnalysis()

# The estimator is given a dense array here, so densify the sparse wf-idf matrix once
# instead of once per metric.
tfidf_dense = tfidf_features.toarray()

# Evaluate all four metrics from a single 5-fold cross-validation run.
# Calling cross_val_score once per metric refits the model on every fold each time;
# cross_validate fits each fold once and scores it with every requested metric.
lda_tfidf_cv = cross_validate(
    lda_tfidf, tfidf_dense, labels,
    scoring=['accuracy', 'balanced_accuracy', 'roc_auc_ovr', 'f1_weighted'],
    cv=5,
)

# Result keys are 'test_<scorer name>'; each holds one score per fold.
print("LinearDiscriminantAnalysis on wfidf feats: CV Accuracy ",
      lda_tfidf_cv['test_accuracy'].mean())
print("LinearDiscriminantAnalysis on wfidf feats: CV Balanced Accuracy ",
      lda_tfidf_cv['test_balanced_accuracy'].mean())
print("LinearDiscriminantAnalysis on wfidf feats: CV AUC Score ",
      lda_tfidf_cv['test_roc_auc_ovr'].mean())
print("LinearDiscriminantAnalysis on wfidf feats: CV Weighted F1 Score ",
      lda_tfidf_cv['test_f1_weighted'].mean())

LinearDiscriminantAnalysis on wfidf feats: CV Accuracy  0.6503695737808607
LinearDiscriminantAnalysis on wfidf feats: CV Balanced Accuracy  0.32969476811838494
LinearDiscriminantAnalysis on wfidf feats: CV AUC Score  0.5255024558299167
LinearDiscriminantAnalysis on wfidf feats: CV Weighted F1 Score  0.5362117584167023


#### TF-IDF + Meta features

In [85]:
# Linear Discriminant Analysis with default settings on wf-idf weights plus meta features.
lda_tfidf_meta = LinearDiscriminantAnalysis()

# Evaluate all four metrics from a single 5-fold cross-validation run.
# Calling cross_val_score once per metric refits the model on every fold each time;
# cross_validate fits each fold once and scores it with every requested metric.
lda_tfidf_meta_cv = cross_validate(
    lda_tfidf_meta, tfidf_meta_features, labels,
    scoring=['accuracy', 'balanced_accuracy', 'roc_auc_ovr', 'f1_weighted'],
    cv=5,
)

# Result keys are 'test_<scorer name>'; each holds one score per fold.
print("LinearDiscriminantAnalysis on wfidf+meta feats: CV Accuracy ",
      lda_tfidf_meta_cv['test_accuracy'].mean())
print("LinearDiscriminantAnalysis on wfidf+meta feats: CV Balanced Accuracy ",
      lda_tfidf_meta_cv['test_balanced_accuracy'].mean())
print("LinearDiscriminantAnalysis on wfidf+meta feats: CV AUC Score ",
      lda_tfidf_meta_cv['test_roc_auc_ovr'].mean())
print("LinearDiscriminantAnalysis on wfidf+meta feats: CV Weighted F1 Score ",
      lda_tfidf_meta_cv['test_f1_weighted'].mean())

LinearDiscriminantAnalysis on wfidf+meta feats: CV Accuracy  0.6445454144011216
LinearDiscriminantAnalysis on wfidf+meta feats: CV Balanced Accuracy  0.33244651652215607
LinearDiscriminantAnalysis on wfidf+meta feats: CV AUC Score  0.5648639002922533
LinearDiscriminantAnalysis on wfidf+meta feats: CV Weighted F1 Score  0.5365335356957444


Issues encountered with the other algorithms (their cells are left commented out above):

- Logistic Regression: the solver ran into convergence issues.
- QDA: too computationally expensive (the kernel died), and the features are collinear.