### Import Libraries and Read in Data

In [1]:
import numpy as np
import pandas as pd
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import cross_validate
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

In [2]:
# Read the dataset CSV; its first column is used as the row index.
df = pd.read_csv(
    "/kaggle/input/cs366-full-data/data_meta_feat_added_drop_all_missing.csv",
    index_col=0,
)

FileNotFoundError: [Errno 2] File b'/kaggle/input/data_meta_feat_added_drop_all_missing.csv' does not exist: b'/kaggle/input/data_meta_feat_added_drop_all_missing.csv'

In [None]:
# Reindex for better / easier concatenation later: replace the index read from
# the CSV with a fresh 0..n-1 RangeIndex.
# drop=True discards the old index outright. Materialising it as a column and
# deleting df['index'] afterwards fails with KeyError when the old index has a
# name, and removes the wrong column if the data already has an 'index' column.
df.reset_index(drop=True, inplace=True)

### Prepare Data for ML (e.g. count-vectorize the titles, split the data into train and test sets, etc.)

In [None]:
# Drop the url column.
# errors='ignore' keeps this cell idempotent: running it again after the column
# is already gone is a no-op instead of a KeyError.
df = df.drop(columns='url', errors='ignore')

In [None]:
# n-gram features: word-level uni-, bi- and tri-gram counts of the titles,
# with the vocabulary capped at 1000 terms.
# NOTE(review): the vocabulary is fit on every row before cross-validation, so
# the held-out folds influence which n-grams are kept; fitting the vectorizer
# inside a Pipeline would avoid that.
countvec = CountVectorizer(
    analyzer='word',
    ngram_range=(1, 3),
    max_features=1000,
)
countvec_features = countvec.fit_transform(df['title'])

# Integer-encode the three target classes.
labels = df['target'].replace(
    {
        'real': 0,
        'fake': 1,
        'sarcasm': 2,
    }
)

In [None]:
# n-gram features + Meta Features I created
# df.iloc[:, 2:] keeps every column from the third one on (presumably the
# meta features -- TODO confirm the column order of the CSV).
# The n-gram block is built on df's own index so the column-wise concat lines
# up row for row, and its columns get string names ("ngram_0", ...) so the
# combined frame does not mix str and int column labels, which recent
# scikit-learn versions reject when a DataFrame is passed to an estimator.
countvec_meta_features = pd.concat(
    [
        df.iloc[:, 2:],
        pd.DataFrame(countvec_features.toarray(), index=df.index).add_prefix('ngram_'),
    ],
    axis=1,
)

In [None]:
# wf-idf features: with sublinear_tf=True the raw term frequency tf is
# replaced by wf = 1 + log(tf). Same n-gram range and vocabulary cap as the
# count features.
# NOTE(review): the idf weights are fit on every row before cross-validation,
# so the held-out folds contribute to them; fitting the vectorizer inside a
# Pipeline would avoid that.
tfidf = TfidfVectorizer(
    analyzer='word',
    ngram_range=(1, 3),
    max_features=1000,
    sublinear_tf=True,
)
tfidf_features = tfidf.fit_transform(df['title'])

In [None]:
# wf-idf + Meta Features I created
# df.iloc[:, 2:] keeps every column from the third one on (presumably the
# meta features -- TODO confirm the column order of the CSV).
# The wf-idf block is built on df's own index so the column-wise concat lines
# up row for row, and its columns get string names ("tfidf_0", ...) so the
# combined frame does not mix str and int column labels, which recent
# scikit-learn versions reject when a DataFrame is passed to an estimator.
tfidf_meta_features = pd.concat(
    [
        df.iloc[:, 2:],
        pd.DataFrame(tfidf_features.toarray(), index=df.index).add_prefix('tfidf_'),
    ],
    axis=1,
)

In [None]:
# Disabled cell: hold-out split. The models below are evaluated with 5-fold
# cross-validation on the full data instead.
# NOTE(review): `X` and `y` are not defined anywhere in the visible notebook,
# so this cell would need them supplied before it could be re-enabled.
# # Split into train and test data
# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=42)

### Naive Bayes

#### Just N-gram features

In [None]:
# Multinomial Naive Bayes for the raw n-gram counts; alpha=1.0 is the
# library default smoothing, spelled out for the reader.
nb_ngram = MultinomialNB(alpha=1.0)

In [None]:
# 5-fold CV of Naive Bayes on the n-gram counts. One cross_validate call
# computes all four metrics from the same fold fits, so each fold is trained
# once rather than once per metric.
# Maps scorer name -> label used in the printed report (report order).
nb_ngram_metrics = {
    'accuracy': 'Accuracy',
    'balanced_accuracy': 'Balanced Accuracy',
    'roc_auc_ovr': 'AUC Score',
    'f1_weighted': 'Weighted F1 Score',
}
nb_ngram_cv = cross_validate(
    nb_ngram,
    countvec_features,
    labels,
    scoring=list(nb_ngram_metrics),
    cv=5,
)

# cross_validate stores the per-fold scores under 'test_<scorer name>'.
for scorer, metric_label in nb_ngram_metrics.items():
    print("Naive Bayes on n-gram features: CV " + metric_label + " ",
          nb_ngram_cv['test_' + scorer].mean())

#### N-gram features + Meta Features

In [None]:
# Multinomial Naive Bayes for the n-gram counts combined with the meta
# features; alpha=1.0 is the library default smoothing, spelled out.
# NOTE(review): MultinomialNB expects non-negative feature values -- confirm
# that every meta feature column satisfies this.
nb_ngram_meta = MultinomialNB(alpha=1.0)

In [None]:
# 5-fold CV of Naive Bayes on the n-gram + meta feature frame. One
# cross_validate call computes all four metrics from the same fold fits, so
# each fold is trained once rather than once per metric.
# Maps scorer name -> label used in the printed report (report order).
nb_ngram_meta_metrics = {
    'accuracy': 'Accuracy',
    'balanced_accuracy': 'Balanced Accuracy',
    'roc_auc_ovr': 'AUC Score',
    'f1_weighted': 'Weighted F1 Score',
}
nb_ngram_meta_cv = cross_validate(
    nb_ngram_meta,
    countvec_meta_features,
    labels,
    scoring=list(nb_ngram_meta_metrics),
    cv=5,
)

# cross_validate stores the per-fold scores under 'test_<scorer name>'.
for scorer, metric_label in nb_ngram_meta_metrics.items():
    print("Naive Bayes on n-gram+meta feats: CV " + metric_label + " ",
          nb_ngram_meta_cv['test_' + scorer].mean())

#### TF-IDF Features

In [None]:
# Multinomial Naive Bayes for the wf-idf features; alpha=1.0 is the library
# default smoothing, spelled out for the reader.
nb_tfidf = MultinomialNB(alpha=1.0)

In [None]:
# 5-fold CV of Naive Bayes on the wf-idf features. One cross_validate call
# computes all four metrics from the same fold fits, so each fold is trained
# once rather than once per metric.
# Maps scorer name -> label used in the printed report (report order).
nb_tfidf_metrics = {
    'accuracy': 'Accuracy',
    'balanced_accuracy': 'Balanced Accuracy',
    'roc_auc_ovr': 'AUC Score',
    'f1_weighted': 'Weighted F1 Score',
}
nb_tfidf_cv = cross_validate(
    nb_tfidf,
    tfidf_features,
    labels,
    scoring=list(nb_tfidf_metrics),
    cv=5,
)

# cross_validate stores the per-fold scores under 'test_<scorer name>'.
for scorer, metric_label in nb_tfidf_metrics.items():
    print("Naive Bayes on wfidf feats: CV " + metric_label + " ",
          nb_tfidf_cv['test_' + scorer].mean())

#### TF-IDF Features + Meta Features

In [None]:
# Multinomial Naive Bayes for the wf-idf features combined with the meta
# features; alpha=1.0 is the library default smoothing, spelled out.
# NOTE(review): MultinomialNB expects non-negative feature values -- confirm
# that every meta feature column satisfies this.
nb_tfidf_meta = MultinomialNB(alpha=1.0)

In [None]:
# 5-fold CV of Naive Bayes on the wf-idf + meta feature frame. One
# cross_validate call computes all four metrics from the same fold fits, so
# each fold is trained once rather than once per metric.
# Maps scorer name -> label used in the printed report (report order).
nb_tfidf_meta_metrics = {
    'accuracy': 'Accuracy',
    'balanced_accuracy': 'Balanced Accuracy',
    'roc_auc_ovr': 'AUC Score',
    'f1_weighted': 'Weighted F1 Score',
}
nb_tfidf_meta_cv = cross_validate(
    nb_tfidf_meta,
    tfidf_meta_features,
    labels,
    scoring=list(nb_tfidf_meta_metrics),
    cv=5,
)

# cross_validate stores the per-fold scores under 'test_<scorer name>'.
for scorer, metric_label in nb_tfidf_meta_metrics.items():
    print("Naive Bayes on wfidf+meta feats: CV " + metric_label + " ",
          nb_tfidf_meta_cv['test_' + scorer].mean())

### Logistic Regression

In [None]:
# Disabled cell: variance-scaling of the sparse n-gram counts
# (with_mean=False, i.e. no centering).
# NOTE(review): `countvec_features_scaled` is not referenced by any other cell
# in the visible notebook; the commented-out logistic regression on n-grams
# uses the unscaled `countvec_features`.
# stdscaler = StandardScaler(with_mean=False)
# countvec_features_scaled = stdscaler.fit_transform(countvec_features)

#### N-gram features

In [None]:
# Disabled cell: multinomial logistic regression on the n-gram counts,
# 5-fold CV on four metrics. Presumably left commented out because of the
# convergence issue listed in the notes at the end of this notebook.
# lr_ngram = LogisticRegression(multi_class='multinomial', max_iter=500)

# print("Logistic Regression on n-gram features: CV Accuracy ", cross_val_score(lr_ngram, countvec_features, labels, scoring='accuracy', cv=5).mean())

# print("Logistic Regression on n-gram features: CV Balanced Accuracy ", cross_val_score(lr_ngram, countvec_features, labels, scoring='balanced_accuracy', cv=5).mean())

# print("Logistic Regression on n-gram features: CV AUC Score ", cross_val_score(lr_ngram, countvec_features, labels, scoring='roc_auc_ovr', cv=5).mean())

# print("Logistic Regression on n-gram features: CV Weighted F1 Score ", cross_val_score(lr_ngram, countvec_features, labels, scoring='f1_weighted', cv=5).mean())

#### n-gram + meta features

In [None]:
# Disabled cell: variance-scaling (with_mean=False, i.e. no centering) of the
# n-gram + meta feature frame. Its output feeds the logistic regression cell
# below, which is commented out as well.
# stdscaler = StandardScaler(with_mean=False)
# countvec_meta_features_scaled = stdscaler.fit_transform(countvec_meta_features)

In [None]:
# Disabled cell: multinomial logistic regression on the scaled n-gram + meta
# features, 5-fold CV on four metrics. Presumably left commented out because
# of the convergence issue listed in the notes at the end of this notebook.
# Depends on `countvec_meta_features_scaled` from the (also disabled) cell above.
# lr_ngram_meta = LogisticRegression(multi_class='multinomial', max_iter=1000)

# print("Logistic Regression on n-gram+meta feats: CV Accuracy ", 
#       cross_val_score(lr_ngram_meta, countvec_meta_features_scaled, labels, scoring='accuracy', cv=5).mean())

# print("Logistic Regression on n-gram+meta feats: CV Balanced Accuracy ", 
#       cross_val_score(lr_ngram_meta, countvec_meta_features_scaled, labels, scoring='balanced_accuracy', cv=5).mean())

# print("Logistic Regression on n-gram+meta feats: CV AUC Score ", 
#       cross_val_score(lr_ngram_meta, countvec_meta_features_scaled, labels, scoring='roc_auc_ovr', cv=5).mean())

# print("Logistic Regression on n-gram+meta feats: CV Weighted F1 Score ", 
#       cross_val_score(lr_ngram_meta, countvec_meta_features_scaled, labels, scoring='f1_weighted', cv=5).mean())

#### TF-IDF features

In [None]:
# Disabled cell: multinomial logistic regression on the wf-idf features,
# 5-fold CV on four metrics. Presumably left commented out because of the
# convergence issue listed in the notes at the end of this notebook.
# lr_tfidf = LogisticRegression(multi_class='multinomial', max_iter=500)

# print("Logistic Regression on wfidf feats: CV Accuracy ", 
#       cross_val_score(lr_tfidf, tfidf_features, labels, scoring='accuracy', cv=5).mean())

# print("Logistic Regression on wfidf feats: CV Balanced Accuracy ", 
#       cross_val_score(lr_tfidf, tfidf_features, labels, scoring='balanced_accuracy', cv=5).mean())

# print("Logistic Regression on wfidf feats: CV AUC Score ", 
#       cross_val_score(lr_tfidf, tfidf_features, labels, scoring='roc_auc_ovr', cv=5).mean())

# print("Logistic Regression on wfidf feats: CV Weighted F1 Score ", 
#       cross_val_score(lr_tfidf, tfidf_features, labels, scoring='f1_weighted', cv=5).mean())

#### TF-IDF + meta features

In [None]:
# Disabled cell: multinomial logistic regression on the wf-idf + meta
# features, 5-fold CV on four metrics. Presumably left commented out because
# of the convergence issue listed in the notes at the end of this notebook.
# lr_tfidf_meta = LogisticRegression(multi_class='multinomial', max_iter=500)

# print("Logistic Regression on wfidf+meta feats: CV Accuracy ", 
#       cross_val_score(lr_tfidf_meta, tfidf_meta_features, labels, scoring='accuracy', cv=5).mean())

# print("Logistic Regression on wfidf+meta feats: CV Balanced Accuracy ", 
#       cross_val_score(lr_tfidf_meta, tfidf_meta_features, labels, scoring='balanced_accuracy', cv=5).mean())

# print("Logistic Regression on wfidf+meta feats: CV AUC Score ", 
#       cross_val_score(lr_tfidf_meta, tfidf_meta_features, labels, scoring='roc_auc_ovr', cv=5).mean())

# print("Logistic Regression on wfidf+meta feats: CV Weighted F1 Score ", 
#       cross_val_score(lr_tfidf_meta, tfidf_meta_features, labels, scoring='f1_weighted', cv=5).mean())

### QuadraticDiscriminantAnalysis

#### N-gram features

In [None]:
# Disabled cell: QDA on the densified n-gram counts, 5-fold CV on four
# metrics. Presumably left commented out because of the computational cost
# (kernel dying) and collinearity listed in the notes at the end of this notebook.
# qda_ngram = QuadraticDiscriminantAnalysis()

# print("QuadraticDiscriminantAnalysis on n-gram features: CV Accuracy ", 
#       cross_val_score(qda_ngram, countvec_features.toarray(), labels, scoring='accuracy', cv=5).mean())

# print("QuadraticDiscriminantAnalysis on n-gram features: CV Balanced Accuracy ", 
#       cross_val_score(qda_ngram, countvec_features.toarray(), labels, scoring='balanced_accuracy', cv=5).mean())

# print("QuadraticDiscriminantAnalysis on n-gram features: CV AUC Score ", 
#       cross_val_score(qda_ngram, countvec_features.toarray(), labels, scoring='roc_auc_ovr', cv=5).mean())

# print("QuadraticDiscriminantAnalysis on n-gram features: CV Weighted F1 Score ", 
#       cross_val_score(qda_ngram, countvec_features.toarray(), labels, scoring='f1_weighted', cv=5).mean())

#### N-gram + meta features

In [None]:
# Disabled cell: QDA on the n-gram + meta features (as a NumPy array), 5-fold
# CV on four metrics. Presumably left commented out because of the
# computational cost (kernel dying) and collinearity listed in the notes at
# the end of this notebook.
# qda_ngram_meta = QuadraticDiscriminantAnalysis()

# print("QuadraticDiscriminantAnalysis on n-gram+meta feats: CV Accuracy ", 
#       cross_val_score(qda_ngram_meta, countvec_meta_features.values, labels, scoring='accuracy', cv=5).mean())

# print("QuadraticDiscriminantAnalysis on n-gram+meta feats: CV Balanced Accuracy ", 
#       cross_val_score(qda_ngram_meta, countvec_meta_features.values, labels, scoring='balanced_accuracy', cv=5).mean())

# print("QuadraticDiscriminantAnalysis on n-gram+meta feats: CV AUC Score ", 
#       cross_val_score(qda_ngram_meta, countvec_meta_features.values, labels, scoring='roc_auc_ovr', cv=5).mean())

# print("QuadraticDiscriminantAnalysis on n-gram+meta feats: CV Weighted F1 Score ", 
#       cross_val_score(qda_ngram_meta, countvec_meta_features.values, labels, scoring='f1_weighted', cv=5).mean())

#### TF-IDF features

In [None]:
# Disabled cell: QDA on the densified wf-idf features, 5-fold CV on four
# metrics. Presumably left commented out because of the computational cost
# (kernel dying) and collinearity listed in the notes at the end of this notebook.
# qda_tfidf = QuadraticDiscriminantAnalysis()

# print("QuadraticDiscriminantAnalysis on wfidf feats: CV Accuracy ", 
#       cross_val_score(qda_tfidf, tfidf_features.toarray(), labels, scoring='accuracy', cv=5).mean())

# print("QuadraticDiscriminantAnalysis on wfidf feats: CV Balanced Accuracy ", 
#       cross_val_score(qda_tfidf, tfidf_features.toarray(), labels, scoring='balanced_accuracy', cv=5).mean())

# print("QuadraticDiscriminantAnalysis on wfidf feats: CV AUC Score ", 
#       cross_val_score(qda_tfidf, tfidf_features.toarray(), labels, scoring='roc_auc_ovr', cv=5).mean())

# print("QuadraticDiscriminantAnalysis on wfidf feats: CV Weighted F1 Score ", 
#       cross_val_score(qda_tfidf, tfidf_features.toarray(), labels, scoring='f1_weighted', cv=5).mean())

#### TF-IDF + meta features

In [None]:
# Disabled cell: QDA on the wf-idf + meta feature frame, 5-fold CV on four
# metrics. Presumably left commented out because of the computational cost
# (kernel dying) and collinearity listed in the notes at the end of this notebook.
# qda_tfidf_meta = QuadraticDiscriminantAnalysis()

# print("QuadraticDiscriminantAnalysis on wfidf+meta feats: CV Accuracy ", 
#       cross_val_score(qda_tfidf_meta, tfidf_meta_features, labels, scoring='accuracy', cv=5).mean())

# print("QuadraticDiscriminantAnalysis on wfidf+meta feats: CV Balanced Accuracy ", 
#       cross_val_score(qda_tfidf_meta, tfidf_meta_features, labels, scoring='balanced_accuracy', cv=5).mean())

# print("QuadraticDiscriminantAnalysis on wfidf+meta feats: CV AUC Score ", 
#       cross_val_score(qda_tfidf_meta, tfidf_meta_features, labels, scoring='roc_auc_ovr', cv=5).mean())

# print("QuadraticDiscriminantAnalysis on wfidf+meta feats: CV Weighted F1 Score ", 
#       cross_val_score(qda_tfidf_meta, tfidf_meta_features, labels, scoring='f1_weighted', cv=5).mean())

### Linear Discriminant Analysis

#### N-gram features

In [None]:
# LDA on the n-gram counts, evaluated with 5-fold CV.
lda_ngram = LinearDiscriminantAnalysis()

# Maps scorer name -> label used in the printed report (report order).
lda_ngram_metrics = {
    'accuracy': 'Accuracy',
    'balanced_accuracy': 'Balanced Accuracy',
    'roc_auc_ovr': 'AUC Score',
    'f1_weighted': 'Weighted F1 Score',
}
# The sparse count matrix is converted to a dense array once, and a single
# cross_validate call computes all four metrics from the same fold fits, so
# each fold is trained once rather than once per metric.
lda_ngram_cv = cross_validate(
    lda_ngram,
    countvec_features.toarray(),
    labels,
    scoring=list(lda_ngram_metrics),
    cv=5,
)

# cross_validate stores the per-fold scores under 'test_<scorer name>'.
for scorer, metric_label in lda_ngram_metrics.items():
    print("LinearDiscriminantAnalysis on n-gram features: CV " + metric_label + " ",
          lda_ngram_cv['test_' + scorer].mean())

#### N-gram + meta features

In [None]:
# LDA on the n-gram + meta feature frame, evaluated with 5-fold CV.
lda_ngram_meta = LinearDiscriminantAnalysis()

# Maps scorer name -> label used in the printed report (report order).
lda_ngram_meta_metrics = {
    'accuracy': 'Accuracy',
    'balanced_accuracy': 'Balanced Accuracy',
    'roc_auc_ovr': 'AUC Score',
    'f1_weighted': 'Weighted F1 Score',
}
# A single cross_validate call computes all four metrics from the same fold
# fits, so each fold is trained once rather than once per metric.
lda_ngram_meta_cv = cross_validate(
    lda_ngram_meta,
    countvec_meta_features,
    labels,
    scoring=list(lda_ngram_meta_metrics),
    cv=5,
)

# cross_validate stores the per-fold scores under 'test_<scorer name>'.
for scorer, metric_label in lda_ngram_meta_metrics.items():
    print("LinearDiscriminantAnalysis on n-gram+meta feats: CV " + metric_label + " ",
          lda_ngram_meta_cv['test_' + scorer].mean())

#### TF-IDF features

In [None]:
# LDA on the wf-idf features, evaluated with 5-fold CV.
lda_tfidf = LinearDiscriminantAnalysis()

# Maps scorer name -> label used in the printed report (report order).
lda_tfidf_metrics = {
    'accuracy': 'Accuracy',
    'balanced_accuracy': 'Balanced Accuracy',
    'roc_auc_ovr': 'AUC Score',
    'f1_weighted': 'Weighted F1 Score',
}
# The sparse wf-idf matrix is converted to a dense array once, and a single
# cross_validate call computes all four metrics from the same fold fits, so
# each fold is trained once rather than once per metric.
lda_tfidf_cv = cross_validate(
    lda_tfidf,
    tfidf_features.toarray(),
    labels,
    scoring=list(lda_tfidf_metrics),
    cv=5,
)

# cross_validate stores the per-fold scores under 'test_<scorer name>'.
for scorer, metric_label in lda_tfidf_metrics.items():
    print("LinearDiscriminantAnalysis on wfidf feats: CV " + metric_label + " ",
          lda_tfidf_cv['test_' + scorer].mean())

#### TF-IDF + Meta features

In [None]:
# LDA on the wf-idf + meta feature frame, evaluated with 5-fold CV.
lda_tfidf_meta = LinearDiscriminantAnalysis()

# Maps scorer name -> label used in the printed report (report order).
lda_tfidf_meta_metrics = {
    'accuracy': 'Accuracy',
    'balanced_accuracy': 'Balanced Accuracy',
    'roc_auc_ovr': 'AUC Score',
    'f1_weighted': 'Weighted F1 Score',
}
# A single cross_validate call computes all four metrics from the same fold
# fits, so each fold is trained once rather than once per metric.
lda_tfidf_meta_cv = cross_validate(
    lda_tfidf_meta,
    tfidf_meta_features,
    labels,
    scoring=list(lda_tfidf_meta_metrics),
    cv=5,
)

# cross_validate stores the per-fold scores under 'test_<scorer name>'.
for scorer, metric_label in lda_tfidf_meta_metrics.items():
    print("LinearDiscriminantAnalysis on wfidf+meta feats: CV " + metric_label + " ",
          lda_tfidf_meta_cv['test_' + scorer].mean())

Some issues encountered with the other algorithms:

- Logistic Regression: convergence issues.
- QDA: too computationally expensive (the kernel died), plus collinearity.