In [1]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer, HashingVectorizer
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
import re
from tqdm import tqdm
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
import matplotlib.pyplot as plt
from sklearn import metrics
import numpy as np
import itertools
from sklearn.linear_model import PassiveAggressiveClassifier

In [2]:
# Load the labelled fake-news training set into a DataFrame.
# NOTE(review): this is an absolute, machine-specific path, so the cell fails on any
# other machine; consider a path relative to a configurable data directory.
df = pd.read_csv('/Users/somyatripathi/Desktop/nlp/dataset/fake_news/train.csv')

In [3]:
# Preview the first rows to check the columns that were read.
df.head()

Unnamed: 0,id,title,author,text,label
0,0,House Dem Aide: We Didn’t Even See Comey’s Let...,Darrell Lucus,House Dem Aide: We Didn’t Even See Comey’s Let...,1
1,1,"FLYNN: Hillary Clinton, Big Woman on Campus - ...",Daniel J. Flynn,Ever get the feeling your life circles the rou...,0
2,2,Why the Truth Might Get You Fired,Consortiumnews.com,"Why the Truth Might Get You Fired October 29, ...",1
3,3,15 Civilians Killed In Single US Airstrike Hav...,Jessica Purkiss,Videos 15 Civilians Killed In Single US Airstr...,1
4,4,Iranian woman jailed for fictional unpublished...,Howard Portnoy,Print \nAn Iranian woman has been sentenced to...,1


In [4]:
# Independent features: every column of the raw frame except the target `label`.
x = df.drop(columns='label')

In [5]:
# Preview the feature columns (the frame without `label`).
x.head()

Unnamed: 0,id,title,author,text
0,0,House Dem Aide: We Didn’t Even See Comey’s Let...,Darrell Lucus,House Dem Aide: We Didn’t Even See Comey’s Let...
1,1,"FLYNN: Hillary Clinton, Big Woman on Campus - ...",Daniel J. Flynn,Ever get the feeling your life circles the rou...
2,2,Why the Truth Might Get You Fired,Consortiumnews.com,"Why the Truth Might Get You Fired October 29, ..."
3,3,15 Civilians Killed In Single US Airstrike Hav...,Jessica Purkiss,Videos 15 Civilians Killed In Single US Airstr...
4,4,Iranian woman jailed for fictional unpublished...,Howard Portnoy,Print \nAn Iranian woman has been sentenced to...


In [6]:
## Dependent feature: the `label` column of the raw frame.
# NOTE(review): this is taken before rows with missing values are dropped; the target
# is re-derived from the cleaned frame (`message['label']`) further down before training.
y = df['label']

In [7]:
# Preview the first target values.
y.head()

0    1
1    0
2    1
3    1
4    1
Name: label, dtype: int64

In [8]:
# (rows, columns) of the raw frame, checked before missing values are dropped.
df.shape

(20800, 5)

In [9]:
# Drop every row that has a missing value in any column.
# Note: this rebinds `df`, so the raw frame is no longer available under this name,
# and the `x` / `y` objects created above still refer to the unfiltered rows.
df = df.dropna()

In [10]:
# Work on an independent copy so the text preprocessing below does not touch `df`.
message = df.copy()

In [11]:
# Renumber the rows 0..n-1 after the dropna above left gaps in the index; the
# preprocessing loop looks titles up by position-like labels (`message['title'][i]`).
# The old index is kept as an extra `index` column.
message = message.reset_index()

In [None]:
# Normalise every headline into a space-joined string of stemmed, lower-cased,
# non-stopword tokens. `corpus` keeps the same row order as `message`.
ps = PorterStemmer()

# Build the stopword set once: the original re-read the NLTK word list and did a
# linear list scan for every single token, which dominated the runtime.
try:
    stop_words = set(stopwords.words('english'))
except LookupError:
    # The stopword corpus is not bundled with NLTK; fetch it on a fresh environment.
    nltk.download('stopwords')
    stop_words = set(stopwords.words('english'))

corpus = []
for title in tqdm(message['title']):
    # Replace everything except ASCII letters with spaces, lower-case, split on whitespace.
    tokens = re.sub('[^a-zA-Z]', ' ', title).lower().split()
    corpus.append(' '.join(ps.stem(token) for token in tokens if token not in stop_words))

 18%|███████████████████████████▌                                                                                                                              | 3277/18285 [00:06<00:30, 497.33it/s]

In [None]:
# Show only a sample of the cleaned headlines; echoing the whole list would write
# one output line per document into the notebook.
corpus[:10]

In [None]:
## Apply CountVectorizer to build the bag-of-words model:
## keep at most 5000 features, drawn from unigrams, bigrams and trigrams.
cv = CountVectorizer(max_features=5000,ngram_range=(1,3))
# NOTE(review): `.toarray()` materialises the full dense document-term matrix, which is
# large for this corpus; the estimators used below presumably accept the sparse matrix
# directly — confirm before changing.
X = cv.fit_transform(corpus).toarray()

In [None]:
# (documents, vocabulary size) of the count matrix.
X.shape

In [None]:
# Target labels taken from the cleaned frame, so they stay row-aligned with `corpus` / `X`
# (this replaces the earlier `y` that was built from the unfiltered frame).
y = message['label']

In [None]:
# Hold out a quarter of the rows for evaluation; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=0,
)

In [None]:
# Peek at the first 20 vocabulary entries (n-grams) learned by the vectorizer.
# `get_feature_names()` was removed from scikit-learn; `get_feature_names_out()`
# is the supported replacement.
cv.get_feature_names_out()[:20]

In [None]:
# Inspect the vectorizer's full configuration, including defaults that were not set explicitly.
cv.get_params()

In [None]:
# Training count matrix as a DataFrame with one named column per vocabulary entry.
# Uses `get_feature_names_out()` because `get_feature_names()` was removed from scikit-learn.
count_df = pd.DataFrame(X_train, columns=cv.get_feature_names_out())

In [None]:
# Preview the first rows of the labelled count matrix.
count_df.head()

In [None]:
def plot_confusion_matrix(cm, classes,
                          normalize=False,
                          title='Confusion matrix',
                          cmap=plt.cm.Blues):
    """
    Print a one-line banner and plot a confusion matrix as an annotated heatmap.

    Adapted from the scikit-learn example:
    http://scikit-learn.org/stable/auto_examples/model_selection/plot_confusion_matrix.html

    Parameters
    ----------
    cm : array-like of shape (n_classes, n_classes)
        Confusion matrix; rows are drawn as true labels, columns as predicted labels.
    classes : sequence
        Tick labels, in the same order as the rows/columns of `cm`.
    normalize : bool, default False
        When True, each row is divided by its sum before plotting, so both the
        colours and the cell annotations show per-true-class fractions.
    title : str
        Figure title.
    cmap : matplotlib colormap
        Colormap used for the heatmap.

    Returns
    -------
    None. Draws on the current matplotlib figure.
    """
    cm = np.asarray(cm)

    # Normalise BEFORE drawing: previously the heatmap was rendered from the raw
    # counts and only the text annotations used the normalised values.
    if normalize:
        row_totals = cm.sum(axis=1, keepdims=True)
        # A class with no true samples has a zero row sum; leave that row at 0
        # instead of producing NaN from 0/0.
        cm = np.divide(cm.astype('float'), row_totals,
                       out=np.zeros(cm.shape, dtype=float),
                       where=row_totals != 0)
        print("Normalized confusion matrix")
    else:
        print('Confusion matrix, without normalization')

    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title)
    plt.colorbar()
    tick_marks = np.arange(len(classes))
    plt.xticks(tick_marks, classes, rotation=45)
    plt.yticks(tick_marks, classes)

    # Integer counts are printed as-is; fractions are rounded to two decimals
    # so the annotations stay readable.
    fmt = 'd' if np.issubdtype(cm.dtype, np.integer) else '.2f'
    # Switch the annotation colour at half of the maximum so text stays legible
    # on both light and dark cells.
    thresh = cm.max() / 2.
    for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
        plt.text(j, i, format(cm[i, j], fmt),
                 horizontalalignment="center",
                 color="white" if cm[i, j] > thresh else "black")

    plt.tight_layout()
    plt.ylabel('True label')
    plt.xlabel('Predicted label')

In [None]:
## Multinomial Naive Bayes baseline, constructed with its default hyperparameters.
classifier = MultinomialNB()

In [None]:
# Fit the baseline on the training split, then predict labels for the held-out split.
model = classifier.fit(X_train, y_train)
y_pred = model.predict(X_test)


In [None]:
# Fraction of held-out rows whose predicted label matches the true label.
score = metrics.accuracy_score(y_test,y_pred)

In [None]:
# Report the baseline accuracy, rounded to three decimals.
print("accuracy %0.3f" %score)

In [None]:
# Confusion matrix of the baseline on the held-out split (rows: true, columns: predicted).
cm = metrics.confusion_matrix(y_test,y_pred)
# The tick labels are applied in row/column order, so 'Fake' is attached to the first
# class and 'Real' to the second.
# NOTE(review): with integer labels 0/1 this presumably maps 0 -> 'Fake' and 1 -> 'Real';
# confirm that this matches the dataset's label definition.
plot_confusion_matrix(cm, classes = ['Fake','Real'])

In [None]:
### Multinomial Naive Bayes with hyperparameter (alpha) tuning


In [None]:
# Placeholder classifier with alpha = 0.1. It is not fitted here; the search loop in
# the next cell rebinds `classifier` to the best-scoring fitted candidate.
classifier = MultinomialNB(alpha = 0.1)

In [None]:
# Grid-search the smoothing parameter `alpha`, keeping the best-scoring fitted model
# in `classifier` and its accuracy in `previous_score`.
# NOTE(review): candidates are compared on the test split, so the winning accuracy is
# an optimistic estimate; a separate validation split would give an unbiased number.
previous_score = 0
# Start at 0.1 rather than 0: alpha = 0 disables smoothing, which makes the log
# probabilities of unseen n-grams degenerate.
for alpha in np.arange(0.1, 1, 0.1):
    # `alpha` has to be passed by keyword; current scikit-learn rejects the
    # positional form `MultinomialNB(alpha)`.
    sub_classifier = MultinomialNB(alpha=alpha)
    model = sub_classifier.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    score = metrics.accuracy_score(y_test, y_pred)
    if score > previous_score:
        previous_score = score
        classifier = sub_classifier
    print('alpha: {} and score: {}'.format(alpha, score))

In [None]:
# Vocabulary entries in column order, used below to label the per-feature scores.
# `get_feature_names()` was removed from scikit-learn; use `get_feature_names_out()`.
feature_name = cv.get_feature_names_out()

In [None]:
# Per-feature log P(feature | label == 1) of the selected model.
# `MultinomialNB.coef_` was removed from scikit-learn; for a two-class model
# `coef_[0]` was exactly `feature_log_prob_[1]`, so this shows the same values.
classifier.feature_log_prob_[1]

In [None]:
## The 20 n-grams with the highest log P(feature | label == 1).
# NOTE(review): this cell was originally headed "most real"; that only holds if
# label 1 denotes real news — confirm against the dataset's label definition.
# `feature_log_prob_[1]` replaces the removed `coef_[0]` (same values for two classes).
sorted(zip(classifier.feature_log_prob_[1], feature_name), reverse=True)[:20]

In [None]:
## The 20 n-grams with the lowest log P(feature | label == 1).
# NOTE(review): this cell was originally headed "most fake"; these are the n-grams
# rarest in the label-1 class, which matches "fake" only if label 0 denotes fake
# news — confirm against the dataset's label definition.
# `feature_log_prob_[1]` replaces the removed `coef_[0]` (same values for two classes).
sorted(zip(classifier.feature_log_prob_[1], feature_name))[:20]

In [None]:
##PassiveAggressiveClassifier

In [None]:
# Linear Passive-Aggressive classifier, capped at 50 passes over the training data.
# The estimator shuffles the training data between passes, so the seed is fixed
# (matching the `random_state=0` used for the train/test split) to make the
# reported accuracy repeatable across runs.
linear_clss = PassiveAggressiveClassifier(max_iter=50, random_state=0)

In [None]:
# Fit the Passive-Aggressive classifier and evaluate it on the held-out split.
model1 = linear_clss.fit(X_train, y_train)
y_pred1 = model1.predict(X_test)
score1 = metrics.accuracy_score(y_test, y_pred1)
# Report this model's own accuracy (`score1`); the original printed `score`, the
# stale value left behind by the Naive Bayes cells.
print("accuracy %0.3f" % score1)
cm1 = metrics.confusion_matrix(y_test, y_pred1)
plot_confusion_matrix(cm1, classes=['Fake', 'Real'])