### Import Libraries and Read in Files

In [2]:
import numpy as np 
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.pipeline import Pipeline
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import PolynomialFeatures

In [3]:
# Read in Data with meta featured in it

df = pd.read_csv("../input/cs366-full-data/data_meta_feat_added_drop_all_missing.csv", index_col=0)

In [4]:
# Reindex for better / easier concatenation later
df.reset_index(inplace=True)
del df['index']

In [5]:
# # Dividing data into real, fake and sarcasm data for easier n-gram visualizations

# df_real = df.copy()[df.target=='real']
# df_fake = df.copy()[df.target=='fake']
# df_sarc = df.copy()[df.target=='sarcasm']

### Creating new features based on EDA

#### Features from title_cc EDA

- Based on the kdeplot for "title_cc", we saw that there are two bins where the density distributions of the three types of news did not match
    - One bin was the 45-60 range where fake news had the highest density for character counts followed by real and sarcastic news. 
    - Another bin was the 60 to 70ish range where sarcastic news had the highest density for character counts followed by real and fake news. We will create two new features that indicate whether the number of characters fall under these bins (1 yes, 0 no).

In [6]:
df['title_cc_45_60'] = np.where((df.title_cc > 45) & (df.title_cc < 60), 1, 0)
df['title_cc_60_70'] = np.where((df.title_cc > 60) & (df.title_cc < 70), 1, 0)

#### Features from word count and unique word count

Similar to how we created features from the kdeplot of title_cc, we focus on the ranges where the distributions of word count and unique word count do not match across the three different types of news

In [7]:
# unique word count 7 to 9 yes or no?
df['title_unique_wc_7_9'] = np.where((df.title_unique_wc >= 7) & (df.title_unique_wc <= 9), 1, 0)

# word count 6 to 9 or no?
df['title_wc_6_9'] = np.where((df.title_wc >= 6) & (df.title_wc <= 9), 1, 0)

#### Features from Average Word Length 

In [8]:
# Title Average Word Length 4.0 - 4.1 yes or no?
df['title_mean_wl_4.0_4.1'] = np.where((df.title_mean_wl >= 4.0) & (df.title_mean_wl <= 4.1), 1, 0)

# Title Average Word Length 4.9 - 5.0 yes or no
df['title_mean_wl_4.9_5.0'] = np.where((df.title_mean_wl >= 4.9) & (df.title_mean_wl <= 5.0), 1, 0)

#### Features from Median Word Length 

In [9]:
df['title_median_wl_5'] = np.where((df.title_median_wl <= 5), 1, 0)

#### Feature from TF-IDF Based Wordcloud for Sarcastic News

In [10]:
# Top 8 words identified to be important words for sarcasm news based on tf-idf weighting

sarcasm_tfidf_words = ["election","as","gop","eliminates","commission","fear","fraud"]

In [11]:
# Making 8 new columns that indicate whether each sarcasm new word is in each title

for sarc_word in sarcasm_tfidf_words:
    df["word_" + sarc_word+"_included"] = np.where(df.title.str.contains(sarc_word), 1, 0)

In [12]:
# Make a new column 
df['sarc_tfidf_wc'] = \
df['word_election_included'] + df['word_as_included'] + df['word_gop_included'] + df['word_eliminates_included'] + df['word_commission_included'] +\
df['word_fear_included'] + df['word_fraud_included']

In [13]:
# Drop the sarcasm word indicator variable
for sarc_word in sarcasm_tfidf_words:
    del df["word_" + sarc_word+"_included"]

### Prepare Data for ML (e.g. Countvectorize, split data to train and test data etc.)

In [14]:
del df['url'] # drop url column

In [15]:
# n-gram features

countvec = CountVectorizer(ngram_range=(1,3),max_features=1000,analyzer='word')
countvec_features = countvec.fit_transform(df['title'])
labels = df['target'].replace({'real':0,'fake':1,'sarcasm':2})

In [16]:
# n-gram features + All Meta Features I created
countvec_meta_features = pd.concat([df.iloc[:,2:], pd.DataFrame(countvec_features.toarray())], axis=1)

In [17]:
# wf-idf features (using sublinear_tf = True where wf = 1+ log(tf))
tfidf = TfidfVectorizer(sublinear_tf = True, ngram_range=(1,3),max_features=1000,analyzer='word')
tfidf_features = tfidf.fit_transform(df['title'])

In [18]:
# wf-idf + All Meta Features I created
tfidf_meta_features = pd.concat([df.iloc[:,2:], pd.DataFrame(tfidf_features.toarray())], axis=1)

### NB on All Features I created + n-gram features

In [31]:
nb_ngram = MultinomialNB()

print("Naive Bayes on n-gram+all meta feats: CV Balanced Accuracy ", cross_val_score(nb_ngram, countvec_meta_features, labels, scoring='balanced_accuracy', cv=5).mean())

print("Naive Bayes on n-gram+all meta feats: CV AUC Score ", cross_val_score(nb_ngram, countvec_meta_features, labels, scoring='roc_auc_ovr', cv=5).mean())

print("Naive Bayes on n-gram+all meta feats: CV Weighted F1 Score ", cross_val_score(nb_ngram, countvec_meta_features, labels, scoring='f1_weighted', cv=5).mean())

Naive Bayes on n-gram+all meta feats: CV Balanced Accuracy  0.37002259180718616
Naive Bayes on n-gram+all meta feats: CV AUC Score  0.582400539231516
Naive Bayes on n-gram+all meta feats: CV Weighted F1 Score  0.5417892842699839


### NB on All Features I created + tf-idf features

In [20]:
nb_tfidf_meta = MultinomialNB()


print("Naive Bayes on wfidf+meta feats: CV Balanced Accuracy ", 
      cross_val_score(nb_tfidf_meta, tfidf_meta_features, labels, scoring='balanced_accuracy', cv=5).mean())

print("Naive Bayes on wfidf+meta feats: CV AUC Score ", 
      cross_val_score(nb_tfidf_meta, tfidf_meta_features, labels, scoring='roc_auc_ovr', cv=5).mean())

print("Naive Bayes on wfidf+meta feats: CV Weighted F1 Score ", 
      cross_val_score(nb_tfidf_meta, tfidf_meta_features, labels, scoring='f1_weighted', cv=5).mean())

Naive Bayes on wfidf+meta feats: CV Balanced Accuracy  0.35967446007567505
Naive Bayes on wfidf+meta feats: CV AUC Score  0.5790361777859548
Naive Bayes on wfidf+meta feats: CV Weighted F1 Score  0.5503809449357993


### NB on n-grams feats + meta features I created on which polynomial transformation was applied

In [27]:
poly = PolynomialFeatures(degree=2)
meta_poly_feats = poly.fit_transform(df.iloc[:,2:])

In [28]:
poly_feat_names = ['x'.join(['{}^{}'.format(pair[0],pair[1]) for pair in tuple if pair[1]!=0]) for tuple in [zip(df.iloc[:,2:].columns,p) for p in poly.powers_]]
meta_poly_feats = pd.DataFrame(meta_poly_feats, columns = poly_feat_names)

In [30]:
# n-gram features + All Meta Features I created with polynomial transformations
countvec_meta_poly_features = pd.concat([meta_poly_feats, pd.DataFrame(countvec_features.toarray())], axis=1)

In [36]:
print("After polynomial transformation, number of features increased from {} to {}".format(countvec_meta_features.shape[1], countvec_meta_poly_features.shape[1]))

After polynomial transformation, number of features increased from 1015 to 1136


In [32]:
nb_ngram2 = MultinomialNB()

print("Naive Bayes on n-gram+polynomial meta feats: CV Balanced Accuracy ", cross_val_score(nb_ngram2, countvec_meta_poly_features, labels, scoring='balanced_accuracy', cv=5).mean())

print("Naive Bayes on n-gram+polynomial meta feats: CV AUC Score ", cross_val_score(nb_ngram2, countvec_meta_poly_features, labels, scoring='roc_auc_ovr', cv=5).mean())

print("Naive Bayes on n-gram+polynomial meta feats: CV Weighted F1 Score ", cross_val_score(nb_ngram2, countvec_meta_poly_features, labels, scoring='f1_weighted', cv=5).mean())

Naive Bayes on n-gram+polynomial meta feats: CV Balanced Accuracy  0.37048923875597717
Naive Bayes on n-gram+polynomial meta feats: CV AUC Score  0.5471220450193515
Naive Bayes on n-gram+polynomial meta feats: CV Weighted F1 Score  0.46921090807563426


### NB on tf-idf feats + meta features I created on which polynomial transformation was applied

In [38]:
# tf-idf features + All Meta Features I created with polynomial transformations
tfidf_meta_poly_features = pd.concat([meta_poly_feats, pd.DataFrame(tfidf_features.toarray())], axis=1)

In [39]:
print("After polynomial transformation, number of features increased from {} to {}".format(tfidf_meta_features.shape[1], tfidf_meta_poly_features.shape[1]))

After polynomial transformation, number of features increased from 1015 to 1136


In [40]:
nb_tfidf_meta2 = MultinomialNB()


print("Naive Bayes on wfidf+polynomial transformed meta feats: CV Balanced Accuracy ", 
      cross_val_score(nb_tfidf_meta2, tfidf_meta_poly_features, labels, scoring='balanced_accuracy', cv=5).mean())

print("Naive Bayes on wfidf+polynomial transformed meta feats: CV AUC Score ", 
      cross_val_score(nb_tfidf_meta2, tfidf_meta_poly_features, labels, scoring='roc_auc_ovr', cv=5).mean())

print("Naive Bayes on wfidf+polynomial transformed meta feats: CV Weighted F1 Score ", 
      cross_val_score(nb_tfidf_meta2, tfidf_meta_poly_features, labels, scoring='f1_weighted', cv=5).mean())

Naive Bayes on wfidf+polynomial transformed meta feats: CV Balanced Accuracy  0.3701961703893667
Naive Bayes on wfidf+polynomial transformed meta feats: CV AUC Score  0.5469537358149904
Naive Bayes on wfidf+polynomial transformed meta feats: CV Weighted F1 Score  0.4690564379517725
