In [None]:
import re
from collections import Counter, defaultdict

import matplotlib.pyplot as plt 
import numpy as np # linear algebra
import pandas as pd #data processing
import seaborn as sns
from sklearn.metrics import confusion_matrix

In [None]:
train=pd.read_csv('train.csv')
test=pd.read_csv('test.csv')

In [None]:
#print(train.shape, test.shape)
print("Shape of Training data: ", train.shape)
print("Shape of Testing data: ", test.shape)

In [None]:
# Every training label as a plain Python list, plus the distinct values among them.
class_labels = train['label'].values.tolist()
class_labels_set = set(class_labels)

# Number of occurrences of each distinct label, in the set's iteration order.
freq_list = [class_labels.count(label) for label in class_labels_set]

print ('Freq',freq_list)
print ('number',class_labels_set)

In [None]:
# Global plotting defaults for the whole notebook.
large = 22; med = 16; small = 12

# Use the style sheet under whichever name the installed matplotlib provides:
# 'seaborn-whitegrid' was renamed to 'seaborn-v0_8-whitegrid' in matplotlib 3.6.
for style_name in ('seaborn-v0_8-whitegrid', 'seaborn-whitegrid'):
    if style_name in plt.style.available:
        plt.style.use(style_name)
        break
sns.set_style("darkgrid")

# Sizes are applied after the styles so the style sheets cannot override them.
# 'axes.titlesize' appeared twice in the original dict (large, then med); the
# later entry silently won, so `med` is the value kept here.
params = {'legend.fontsize': med,
          'figure.figsize': (16, 10),
          'axes.labelsize': med,
          'axes.titlesize': med,
          'xtick.labelsize': med,
          'ytick.labelsize': med,
          'figure.titlesize': large}
plt.rcParams.update(params)

In [None]:
# plot the number of samples per class
import seaborn as sns
plt.figure(figsize=(8,8))
sns.countplot(x="label", data=train)

In [None]:
train.head()

In [None]:
train.describe()

In [None]:
train.info()

In [None]:
train.columns

## Check number of NULL values in the dataset

In [None]:
# how many null values in the dataset
print("Null values in train data:")
print(train.isnull().sum())
print('\n\n')

print("Null values in test data:")
print(test.isnull().sum())

In [None]:
sns.heatmap(train.isnull(),cmap='YlGnBu_r')

In [None]:
print(train.dtypes)

## Missing data imputation

### _Datasets may have missing values, and this can cause problems for many machine learning algorithms. As such, it is good practice to identify and replace missing values for each column in your input data prior to modeling your prediction task. This is called missing data imputation, or imputing for short._

In [None]:
#imputing the data
test=test.fillna(' ') 
train=train.fillna(' ') 

In [None]:
sns.heatmap(train.isnull(), cmap='YlGnBu_r')

In [None]:
from sklearn.feature_extraction.text import CountVectorizer
def get_top_n_words(corpus, n=None, stop_words=None):
    """Return the most frequent single words in `corpus`.

    Parameters
    ----------
    corpus : iterable of str
        The documents to count words in.
    n : int or None
        How many (word, count) pairs to return; None returns all of them.
    stop_words : str, list or None
        Forwarded to CountVectorizer (e.g. 'english'); None keeps every word.

    Returns
    -------
    list of (word, count) tuples sorted by count, highest first.
    """
    vec = CountVectorizer(stop_words=stop_words)
    # fit_transform learns the vocabulary and counts in one pass; a separate
    # fit() followed by transform() tokenises the whole corpus twice.
    bag_of_words = vec.fit_transform(corpus)
    sum_words = bag_of_words.sum(axis=0)
    words_freq = [(word, sum_words[0, idx]) for word, idx in vec.vocabulary_.items()]
    words_freq.sort(key=lambda pair: pair[1], reverse=True)
    return words_freq[:n]
common_words = get_top_n_words(train['text'], 20)
for word, freq in common_words:
    print(word, freq)
df1 = pd.DataFrame(common_words, columns = ['text' , 'count'])
df1.groupby('text').sum()['count'].sort_values(ascending=False).plot(
kind='bar', title='Top 20 words in dataset before removing stop words',color=['slateblue', 'blueviolet', 'violet', 'orchid', 'lightpink'])

In [None]:
def get_top_n_bigram(corpus, n=None, stop_words=None):
    """Return the most frequent two-word sequences (bigrams) in `corpus`.

    Parameters
    ----------
    corpus : iterable of str
        The documents to count bigrams in.
    n : int or None
        How many (bigram, count) pairs to return; None returns all of them.
    stop_words : str, list or None
        Forwarded to CountVectorizer (e.g. 'english'); None keeps every word.

    Returns
    -------
    list of (bigram, count) tuples sorted by count, highest first.
    """
    vec = CountVectorizer(ngram_range=(2, 2), stop_words=stop_words)
    # One pass over the corpus instead of fit() followed by transform().
    bag_of_words = vec.fit_transform(corpus)
    sum_words = bag_of_words.sum(axis=0)
    words_freq = [(word, sum_words[0, idx]) for word, idx in vec.vocabulary_.items()]
    words_freq.sort(key=lambda pair: pair[1], reverse=True)
    return words_freq[:n]
common_words = get_top_n_bigram(train['text'], 20)
for word, freq in common_words:
    print(word, freq)
df3 = pd.DataFrame(common_words, columns = ['text' , 'count'])
df3.groupby('text').sum()['count'].sort_values(ascending=False).plot(
kind='bar', title='Top 20 bigrams in dataset before removing stop words',color=['slateblue', 'blueviolet', 'violet', 'orchid', 'lightpink'])

In [None]:
train['label'].plot(
    kind='hist',
    bins=50,
    title='Number of True vs Fake News')

## Merging the columns (title, author, text) into one column

In [None]:
# Join title, author and body with single spaces. Without the separator between
# author and text, the last token of the author name fuses with the first word
# of the article and becomes a bogus vocabulary entry.
test['total']=test['title']+' '+test['author']+' '+test['text']
train['total']=train['title']+' '+train['author']+' '+train['text']

In [None]:
train.head()

In [None]:
test.head()

# Cleaning and preprocessing 

# 1. Regex

In [None]:
#Remove punctuations from the String  
# Raw string: the text contains a literal backslash followed by a space, which is
# an invalid escape sequence in a normal string literal (a warning today, an
# error in future Python versions). The string value itself is unchanged.
sample = r"!</> NLP is $$ </>^sh!!!o%%rt &&%$fo@@@r^^^&&!& </>*Natural@# Language&&\ Pro@@@##%^^&cessing!@# %%$"

In [None]:
# what is gonna get selected we r gonna replace that with the empty string(2nd parameter)
sample = re.sub(r'[^\w\s]','',sample)

In [None]:
print(sample)

# 2. Tokenization

In [None]:
import nltk 
# nltk library is v important for text preprocessing and dealing with the textual data

In [None]:
#Downloading nltk data
# nltk uses some data, punkt will hold some data related to tokenisation
# different functions and techniques of nltk uses different data
nltk.download('punkt')
# Recent NLTK releases load the tokenizer models from the 'punkt_tab' package
# instead; fetching both keeps word_tokenize/sent_tokenize working either way.
nltk.download('punkt_tab')

In [None]:
print("The NLTK tokeniser has tokenised \"Computers are not as great at understanding words as they are numbers.\" into a list of tokens ", end="\n\n")
print(nltk.word_tokenize("Computers are not as great at understanding words as they are numbers."))

# 3. StopWords

In [None]:
sample_text = "Does this thing really work? Lets see." 

In [None]:
print("Priniting all the different sentences in sample_text: ", end="\n\n")
for i in nltk.sent_tokenize(sample_text):
    print(i)

In [None]:
words = nltk.word_tokenize(sample_text)

In [None]:
print("Priniting all the different words in sample_text: ", end="\n\n")
for i in nltk.word_tokenize(sample_text):
    print(i)

In [None]:
from nltk.corpus import stopwords
# corpus of nltk will hold the stopwords

In [None]:
# The stop-word lists are a separate NLTK data package; without it
# stopwords.words() raises LookupError. The download is a no-op if already present.
nltk.download('stopwords')
stop=stopwords.words("english")
print(stop)

In [None]:
clean_words = [w for w in words if not w in stop]
#this is basically saying go through each word and add it into this new array only if it's not a part of the stopwords

In [None]:
for i in clean_words:
    print(i)

In [None]:
words = nltk.word_tokenize(sample_text.lower())

In [None]:
clean_words = [w for w in words if not w in stop]
for i in clean_words:
    print(i)

In [None]:
import string
punctuations = list(string.punctuation)

In [None]:
print(punctuations)

In [None]:
stop = stop + punctuations

In [None]:
print(stop)

In [None]:
clean_words = [w for w in words if not w in stop]
clean_words
#clean_words includes all the words in the sentence excluding the stopwords

# 4. Lemmatization

In [None]:
from nltk.stem import WordNetLemmatizer
#Using WordNet lemmatizer
lemmatizer=WordNetLemmatizer()

input_str="Kites Babies Meeting Is Done Languages Cities Mice"

In [None]:
from nltk.tokenize import sent_tokenize, word_tokenize 

In [None]:
#Tokenize the sentence
input_str=nltk.word_tokenize(input_str)

In [None]:
import nltk
nltk.download('wordnet')

In [None]:
# import these modules 
from nltk.stem import WordNetLemmatizer 
  
lemmatizer = WordNetLemmatizer() # the sentence was tokenised above; each token is lemmatized separately


print("Below, see how kites->kite, babies->baby, languages -> language, cities -> city, mice -> mouse. Stemming couldn't have done this", end="\n\n")
# Lemmatize each token. The token is lower-cased BEFORE the lookup: WordNet's
# entries are lower-case, so a capitalised token such as "Kites" is not found and
# would be returned unchanged (lower-casing afterwards just prints "kites").
for word in input_str:
    print(lemmatizer.lemmatize(word.lower()))

# Applying Text Preprocessing techniques discussed above on Train data

In [None]:
from nltk.stem import WordNetLemmatizer 
lemmatizer=WordNetLemmatizer()
# Set membership is O(1); `stop` is a list and would be scanned for every token.
stop_set = set(stop)


def preprocess_text(sentence):
    """Clean one document: strip punctuation, lower-case, drop stop words, lemmatize.

    Parameters
    ----------
    sentence : str
        Raw text of one row's 'total' column.

    Returns
    -------
    str
        The remaining lemmatized tokens joined by single spaces.
    """
    sentence = re.sub(r'[^\w\s]','',sentence)
    # Lower-case first: the stop-word list and WordNet are lower-case, so
    # capitalised tokens ("The", "Cities") would otherwise slip through the
    # stop-word filter and come back from the lemmatizer unchanged.
    tokens = nltk.word_tokenize(sentence.lower())
    return ' '.join(lemmatizer.lemmatize(token) for token in tokens if token not in stop_set)


# Replace the raw text with its cleaned form, row by row, without iterrows()/.loc writes.
train['total'] = train['total'].apply(preprocess_text)

In [None]:
def get_top_n_words(corpus, n=None, stop_words='english'):
    """Return the most frequent single words in `corpus`, ignoring stop words.

    Parameters
    ----------
    corpus : iterable of str
        The documents to count words in.
    n : int or None
        How many (word, count) pairs to return; None returns all of them.
    stop_words : str, list or None
        Forwarded to CountVectorizer; defaults to its built-in English list.

    Returns
    -------
    list of (word, count) tuples sorted by count, highest first.
    """
    vec = CountVectorizer(stop_words=stop_words)
    # One pass over the corpus instead of fit() followed by transform().
    bag_of_words = vec.fit_transform(corpus)
    sum_words = bag_of_words.sum(axis=0)
    words_freq = [(word, sum_words[0, idx]) for word, idx in vec.vocabulary_.items()]
    words_freq.sort(key=lambda pair: pair[1], reverse=True)
    return words_freq[:n]
common_words = get_top_n_words(train['total'], 20)
for word, freq in common_words:
    print(word, freq)
df2 = pd.DataFrame(common_words, columns = ['total' , 'count'])
df2.groupby('total').sum()['count'].sort_values(ascending=False).plot(
kind='bar', title='Top 20 words in dataset after text-preprocessing',color=['salmon', 'tomato', 'darksalmon', 'coral', 'orangered'])

In [None]:
def get_top_n_bigram(corpus, n=None, stop_words='english'):
    """Return the most frequent bigrams in `corpus`, ignoring stop words.

    Parameters
    ----------
    corpus : iterable of str
        The documents to count bigrams in.
    n : int or None
        How many (bigram, count) pairs to return; None returns all of them.
    stop_words : str, list or None
        Forwarded to CountVectorizer; defaults to its built-in English list.

    Returns
    -------
    list of (bigram, count) tuples sorted by count, highest first.
    """
    vec = CountVectorizer(ngram_range=(2, 2), stop_words=stop_words)
    # One pass over the corpus instead of fit() followed by transform().
    bag_of_words = vec.fit_transform(corpus)
    sum_words = bag_of_words.sum(axis=0)
    words_freq = [(word, sum_words[0, idx]) for word, idx in vec.vocabulary_.items()]
    words_freq.sort(key=lambda pair: pair[1], reverse=True)
    return words_freq[:n]
common_words = get_top_n_bigram(train['total'], 20)
for word, freq in common_words:
    print(word, freq)
df4 = pd.DataFrame(common_words, columns = ['total' , 'count'])
df4.groupby('total').sum()['count'].sort_values(ascending=False).plot(
kind='bar', title='Top 20 bi-grams in dataset after text-preprocessing', color=['darkmagenta', 'orchid', 'mediumvioletred', 'deeppink', 'hotpink', 'palevioletred'])

- **Word Counts**

We expected the length of the title to carry little signal, so only the word count of the text is compared here.

In [None]:
def plot_word_number_histogram(textno, textye):
    """Plot, side by side, the distribution of words per document.

    `textno` is drawn on the left panel (titled 'Reliable') and `textye` on the
    right panel (titled 'Unreliable'); both are pandas Series of strings.
    """
    fig, (ax_left, ax_right) = plt.subplots(ncols=2, nrows=1, figsize=(18, 6), sharey=True)

    # Word count = number of whitespace-separated tokens in each document.
    for series, ax in ((textno, ax_left), (textye, ax_right)):
        sns.distplot(series.str.split().map(len), ax=ax, color='#e74c3c')

    ax_left.set_xlabel('Word Count')
    ax_left.set_ylabel('Frequency')
    ax_left.set_title('Reliable')
    ax_right.set_xlabel('Word Count')
    ax_right.set_title('Unreliable')

    fig.suptitle('Fake News', fontsize=24, va='baseline')

    fig.tight_layout()

In [None]:
plot_word_number_histogram(train[train['label'] == 0]['total'],
                           train[train['label'] == 1]['total'])

Unreliable has fewer words than Reliable.

- Word Lengths

In [None]:
def plot_word_len_histogram(textno, textye):
    """Plot, side by side, the distribution of the mean word length per document.

    `textno` is drawn on the left panel (titled 'Reliable') and `textye` on the
    right panel (titled 'Unreliable'); both are pandas Series of strings.
    """
    def _mean_token_length(tokens):
        # Average number of characters over one document's tokens.
        return np.mean([len(token) for token in tokens])

    fig, (ax_left, ax_right) = plt.subplots(ncols=2, nrows=1, figsize=(18, 6), sharey=True)

    for series, ax in ((textno, ax_left), (textye, ax_right)):
        sns.distplot(series.str.split().map(_mean_token_length), ax=ax, color='#e74c3c')

    ax_left.set_xlabel('Word Length')
    ax_left.set_ylabel('Frequency')
    ax_left.set_title('Reliable')
    ax_right.set_xlabel('Word Length')
    ax_right.set_title('Unreliable')

    fig.suptitle('Mean Word Lengths', fontsize=24, va='baseline')
    fig.tight_layout()

In [None]:
plot_word_len_histogram(train[train['label'] == 0]['total'],
                        train[train['label'] == 1]['total'])

In [None]:
# Per-class views of the training data; index 0 holds label 0, index 1 holds label 1.

# Cleaned, merged text ('total') of each class.
lis_text = [
    train.loc[train['label'] == 0, 'total'],
    train.loc[train['label'] == 1, 'total'],
]

# Headline ('title') of each class. Selecting 'total' here as well would make
# lis_title an exact copy of lis_text, so every "... in Title" plot would just
# repeat the corresponding text plot.
lis_title = [
    train.loc[train['label'] == 0, 'title'],
    train.loc[train['label'] == 1, 'title'],
]

In [None]:
# Bar chart of the 15 most frequent stop words still present in each class.
fig, axes = plt.subplots(1, 2, figsize=(18, 8))
axes = axes.flatten()

# Set membership is O(1); `stop` is a list and would be scanned for every token.
stop_lookup = set(stop)

for texts, ax in zip(lis_text, axes):
    tokens = (word.lower() for doc in texts.str.split() for word in doc)
    stop_counts = Counter(token for token in tokens if token in stop_lookup)
    top = stop_counts.most_common(15)

    # Explicit empty check instead of a bare `except:`, which also hid genuine
    # errors (e.g. a missing import) behind the "no stopwords" message.
    if not top:
        plt.close(fig)
        print('No stopwords left in texts.')
        break

    # Built from (word, count) pairs so 'Count' is numeric, not object dtype.
    stop_df = pd.DataFrame(top, columns=['Stopword', 'Count'])
    sns.barplot(x='Count', y='Stopword', data=stop_df, palette='plasma', ax=ax)
else:
    # Runs only when both panels were drawn.
    plt.tight_layout()

- Most Common Words

In [None]:
# Displaying most common words.
from collections import Counter, defaultdict

fig, axes = plt.subplots(1, 2, figsize=(18, 8))
axes = axes.flatten()

for texts, ax in zip(lis_text, axes):
    # Flatten the whitespace-split documents of this class into one token stream.
    corpus = [word for tokens in texts.str.split().values.tolist() for word in tokens]

    # Among the 30 most frequent tokens, keep the ones that are not stop words.
    top_30 = Counter(corpus).most_common()[:30]
    kept = [(word, count) for word, count in top_30 if word not in stop]
    x = [word for word, _ in kept]
    y = [count for _, count in kept]

    sns.barplot(x=y, y=x, palette='plasma', ax=ax)
# Prints the words/counts of the last panel only.
print(x, y)

for ax, panel_title in zip(axes, ('Reliable', 'Unreliable')):
    ax.set_title(panel_title)
    ax.set_xlabel('Count')
    ax.set_ylabel('Word')

fig.suptitle('Most Common Unigrams in Text', fontsize=24, va='baseline')
plt.tight_layout()

In [None]:
fig, axes = plt.subplots(1, 2, figsize=(18, 8))
axes = axes.flatten()

for texts, ax in zip(lis_title, axes):
    # Flatten the whitespace-split documents of this class into one token stream.
    corpus = [word for tokens in texts.str.split().values.tolist() for word in tokens]

    # Among the 30 most frequent tokens, keep the ones that are not stop words.
    top_30 = Counter(corpus).most_common()[:30]
    kept = [(word, count) for word, count in top_30 if word not in stop]
    x = [word for word, _ in kept]
    y = [count for _, count in kept]

    sns.barplot(x=y, y=x, palette='plasma', ax=ax)
# Prints the words/counts of the last panel only.
print(x, y)

for ax, panel_title in zip(axes, ('Reliable', 'Unreliable')):
    ax.set_title(panel_title)
    ax.set_xlabel('Count')
    ax.set_ylabel('Word')

fig.suptitle('Most Common Unigrams in Title', fontsize=24, va='baseline')
plt.tight_layout()

- Most Common Bigrams

In [None]:
from sklearn.feature_extraction.text import CountVectorizer
def ngrams(n, title, lis_type):
    """Plot the 15 most common n-grams for each of two text collections.

    Parameters
    ----------
    n : int
        Size of the n-grams (2 for bigrams).
    title : str
        Figure-level title.
    lis_type : sequence of two pandas Series of str
        The first is drawn on the left panel ('Reliable'), the second on the
        right panel ('Unreliable').
    """
    def _get_top_ngram(corpus, n=None):
        """Return the 15 most frequent n-grams of `corpus` as (ngram, count) pairs."""
        vec = CountVectorizer(ngram_range=(n, n),
                              max_df=0.9,
                              stop_words='english')
        # One pass over the corpus instead of fit() followed by transform().
        bag_of_words = vec.fit_transform(corpus)
        sum_words = bag_of_words.sum(axis=0)
        words_freq = [(word, sum_words[0, idx])
                      for word, idx in vec.vocabulary_.items()]
        words_freq.sort(key=lambda pair: pair[1], reverse=True)
        return words_freq[:15]

    fig, axes = plt.subplots(1, 2, figsize=(18, 8))
    axes = axes.flatten()

    for texts, ax in zip(lis_type, axes):
        top_ngrams = _get_top_ngram(texts, n)
        x, y = map(list, zip(*top_ngrams))
        sns.barplot(x=y, y=x, palette='plasma', ax=ax)

    # Labels do not depend on the loop variable, so they are set once afterwards.
    axes[0].set_title('Reliable')
    axes[1].set_title('Unreliable')
    axes[0].set_xlabel('Count')
    axes[0].set_ylabel('Words')
    axes[1].set_xlabel('Count')
    axes[1].set_ylabel('Words')
    fig.suptitle(title, fontsize=24, va='baseline')
    plt.tight_layout()

In [None]:
ngrams(2, 'Most Common Bigrams', lis_text)

In [None]:
ngrams(2, 'Most Common Bigrams', lis_title)

In [None]:
train.head()

In [None]:
# Obtain the total words present in the dataset
# Each entry of train.total is one string, so it has to be split into tokens:
# iterating over the string itself yields single characters, which would turn
# the "word" totals below into character counts.
list_of_words = [word for text in train.total for word in text.split()]

In [None]:
len(list_of_words)

In [None]:
# Obtain the total number of unique words
total_words = len(list(set(list_of_words)))
total_words

In [None]:
# dataframe information
train.info()
# check for null values
train.isnull().sum()

## We require only the following 2 columns now:
- 'total' --- holds the preprocessed text
- 'label' --- holds the predictions from where the machine will learn

In [None]:
train = train[['total','label']]

In [None]:
train.head()
#column 'total' has the preprocessed text

Once the data is cleaned, we can use CountVectorizer to convert our data into the format in which sklearn requires

1. CountVectorizer helps in <u>Feature Extraction</u> 
2. It converts a collection of text documents to a matrix of token counts 

CountVectorizer produces a sparse matrix.
CountVectorizer can also take care of stop words. There is an option in count vectorizer(stop_words) which takes list of stop words and can do the work for us.

### Count Vectorizer Demo

In [None]:
from sklearn.feature_extraction.text import CountVectorizer
# A list (not a set) fixes the document order, so the rows of the resulting
# matrix appear in the same order on every run.
train_set = ["the sky is blue", "the sun is bright"]
count_vec = CountVectorizer(max_features = 3)
a = count_vec.fit_transform(train_set)
a.todense()

In [None]:
# get_feature_names() was removed from scikit-learn; get_feature_names_out() is its replacement.
count_vec.get_feature_names_out()

# Applying NLP Techniques

In [None]:
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

In [None]:
X_train = train['total']
Y_train = train['label']

# Bag-of-words / CountVectorizer

In [None]:
# the corpus holds some sentences
corpus = [
    'This is the first document.',
    'This document is the second document.',
    'And this is the third one.',
    'Is this the first document?',
]

# in sklearn, we can import Bag of Words model through CountVectorizer. This works pretty much like Bag of words
# it is modified version of Bag of Words. It replaces the vector, instead of 1, w the frequency...

# we r making the object, vectorizer, of class/module CountVectorizer()
vectorizer = CountVectorizer()

In [None]:
# Learn the vocabulary of the demo corpus and build its document-term count matrix.
X = vectorizer.fit_transform(corpus)
# get_feature_names() was removed from scikit-learn; get_feature_names_out() is its replacement.
print(vectorizer.get_feature_names_out())
print("The above words are the unique words and consists of the feature set")

In [None]:
print("The below matrix will show the frequency of the features in the feature set", end="\n\n")
X.todense()

# TF-iDF Vectorizer

In [None]:
def vectorize_text(features, max_features):
    """Return a dense TF-IDF matrix for the documents in `features`.

    English stop words are dropped, both single words and bigrams are used as
    terms, and at most `max_features` terms are kept.
    """
    tfidf_options = dict(
        stop_words='english',   # drop English stop words
        decode_error='strict',
        analyzer='word',
        ngram_range=(1, 2),     # single words and 2-word sequences (bigrams)
        max_features=max_features,
        # max_df=0.5  # used in the ML course under "Preprocessing"
    )
    tfidf_vectorizer = TfidfVectorizer(**tfidf_options)
    return tfidf_vectorizer.fit_transform(features).toarray()

In [None]:
tfidf_features = vectorize_text(['hello how are you doing','hi i am doing fine'],30) 
# 30 here is the number of max_features

In [None]:
print("creates some weight for all these words: ", end='\n\n')
tfidf_features

# Applying Feature Extraction using count vectorization and tfidf

The pre-processed text is in X_train and the labels in Y_train as follows (done above):

X_train = train['total']
Y_train = train['label']


We first run a CountVectorizer and then apply a TF-IDF transformer on top of its output. The CountVectorizer produces, for every document, a vector of raw term counts (term frequencies); the TF-IDF transformer then re-weights those counts by inverse document frequency. TF-IDF works on top of the CountVectorizer output, so we use both.

In [None]:
#Feature extraction using count vectorization and tfidf.
count_vectorizer = CountVectorizer()
# fit_transform learns the vocabulary and returns the count matrix in a single
# pass; calling it and then transform() again tokenised the corpus twice.
freq_term_matrix = count_vectorizer.fit_transform(X_train)
tfidf = TfidfTransformer(norm="l2")
# Likewise, fit_transform alone is enough; a preceding fit() is redundant.
tf_idf_matrix = tfidf.fit_transform(freq_term_matrix)
# NOTE(review): vocabulary and IDF weights are learned from all rows before the
# train/test split further down, so held-out rows influence the features.

In [None]:
#count_vectorizer.vocabulary_.items()

In [None]:
print("10 feature names are:", end = '\n\n')
# get_feature_names() was removed from scikit-learn; get_feature_names_out() is its replacement.
count_vectorizer.get_feature_names_out()[9000:9010]

In [None]:
tf_idf_matrix

In [None]:
# The sparse matrix exposes .shape directly; .toarray() would first allocate the
# full dense (documents x vocabulary) array just to read its dimensions.
tf_idf_matrix.shape

20800 samples and 220387 features were generated, so each text sample is represented by a vector of length 220387.
Now, text data has been converted into numbers using TF-IDF transformer and CountVectorizer

# Modelling

The transform is applied to all of the data, including the rows we hold out for testing, because once the models are built we need to predict on that held-out data. We then split the data into train and test sets.

In [None]:
#split in samples
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(tf_idf_matrix, Y_train, random_state=0)

In [None]:
print(X_train.shape, y_train.shape)

In [None]:
print(X_test.shape, y_test.shape)

that is, for testing we now have 5200 samples

training data - X_train.shape- 15600 -- these are those samples for which we have the labels also.
Now we will train the model on these 15600 samples and the model is able to give some predictions after that.

We have to test on new(test data) data because the model may always give correct answer to the training data possibly maybe because of OVERFITTING

Now these 5200 X_test samples, we know the actual output (or, label) for these samples
Once our model is trained, we will test it with these 5200 X_test samples and get some predictions. 
Now we compare the prediction with the actual labels, then we know that how our model has performed

### PassiveAggressiveClassifier

In [None]:
from sklearn.linear_model import PassiveAggressiveClassifier
model = PassiveAggressiveClassifier(max_iter=10000, random_state=1,tol=1e-3).fit(X_train, y_train)

In [None]:
y_pred_pa = model.predict(X_test)
print(model.score(X_test, y_test))

In [None]:
from sklearn.metrics import classification_report, confusion_matrix
print(classification_report(y_test,y_pred_pa))

In [None]:
# Confusion matrix of the Passive Aggressive predictions on the held-out split.
cm = confusion_matrix(y_test, y_pred_pa)

fig, ax = plt.subplots()
sns.set(font_scale=3.0) # Adjust to fit
sns.heatmap(cm, annot=True, ax=ax, cmap="Blues", fmt="g")

# Labels, title and ticks
label_font = {'size':'18'}  # Adjust to fit
ax.set_xlabel('Predicted labels', fontdict=label_font)
ax.set_ylabel('True labels', fontdict=label_font)

title_font = {'size':'21'}  # Adjust to fit
ax.set_title('Confusion Matrix for Passive Aggressive Classifier on Fake News Dataset', fontdict=title_font)

ax.tick_params(axis='both', which='major', labelsize=10)  # Adjust to fit
ax.xaxis.set_ticklabels(['Real', 'Fake'])
ax.yaxis.set_ticklabels(['Real', 'Fake'])

# Save BEFORE plt.show(): once show() returns the figure has been released, and
# a later plt.savefig() writes an empty canvas.
fig.savefig('PassiveAgressive.png')
plt.show()

# Multi Layer Perceptron	

In [None]:
from sklearn.neural_network import MLPClassifier
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split

clf = MLPClassifier(random_state=1, max_iter=300).fit(X_train, y_train)

In [None]:
clf.predict_proba(X_test[:1])

In [None]:
pred_ann= clf.predict(X_test)
clf.score(X_test, y_test)

In [None]:
print(classification_report(y_test,pred_ann))

In [None]:
# Confusion matrix of the Multi Layer Perceptron predictions on the held-out split.
cm = confusion_matrix(y_test, pred_ann)

fig, ax = plt.subplots()
sns.set(font_scale=3.0) # Adjust to fit
sns.heatmap(cm, annot=True, ax=ax, cmap="PiYG", fmt="g")

# Labels, title and ticks
label_font = {'size':'18'}  # Adjust to fit
ax.set_xlabel('Predicted labels', fontdict=label_font)
ax.set_ylabel('True labels', fontdict=label_font)

title_font = {'size':'21'}  # Adjust to fit
ax.set_title('Confusion Matrix for Multi Layer Perceptron on Fake News Dataset', fontdict=title_font)

ax.tick_params(axis='both', which='major', labelsize=10)  # Adjust to fit
ax.xaxis.set_ticklabels(['Real', 'Fake'])
ax.yaxis.set_ticklabels(['Real', 'Fake'])

# Save BEFORE plt.show(): once show() returns the figure has been released, and
# a later plt.savefig() writes an empty canvas.
fig.savefig('MLP.png')
plt.show()

In [None]:
print(pred_ann)

# Logistic Regression

Learn about Logistic Regression at 1 hr, 22 mins
`C=1e5` sets the regularisation. Regularisation is used to avoid overfitting when we have a lot of features — here each of the 15600 training samples has 220387 features. Note that in scikit-learn `C` is the *inverse* of the regularisation strength: smaller values mean stronger regularisation, so `C=1e5` applies almost none. To reduce overfitting, `C` should be lowered; this can be fine-tuned later.

In [None]:
from sklearn.linear_model import LogisticRegression
logreg = LogisticRegression(C=1e5, random_state=110, max_iter=300)

In [None]:
logreg.fit(X_train, y_train)

In [None]:
pred = logreg.predict(X_test)

print('Accuracy of Logistic Regression on test set: {:.5f}'
     .format(logreg.score(X_test, y_test)))

In [None]:
print(pred)

In [None]:
from sklearn.metrics import classification_report, confusion_matrix
print(classification_report(y_test,pred))

In [None]:
# Confusion matrix of the Logistic Regression predictions on the held-out split.
cm = confusion_matrix(y_test, pred)

fig, ax = plt.subplots()
sns.set(font_scale=3.0) # Adjust to fit
sns.heatmap(cm, annot=True, ax=ax, cmap="coolwarm", fmt="g")

# Labels, title and ticks
label_font = {'size':'18'}  # Adjust to fit
ax.set_xlabel('Predicted labels', fontdict=label_font)
ax.set_ylabel('True labels', fontdict=label_font)

title_font = {'size':'21'}  # Adjust to fit
ax.set_title('Confusion Matrix for Logistic Regression on Fake News Dataset', fontdict=title_font)

ax.tick_params(axis='both', which='major', labelsize=10)  # Adjust to fit
ax.xaxis.set_ticklabels(['Real', 'Fake'])
ax.yaxis.set_ticklabels(['Real', 'Fake'])

# Save BEFORE plt.show(): once show() returns the figure has been released, and
# a later plt.savefig() writes an empty canvas.
fig.savefig('Logistic.png')
plt.show()

# MultinomialNB

Multinomial Naive Bayes Classifier works on Conditional Probabilities

In [None]:
from sklearn.naive_bayes import MultinomialNB

NB = MultinomialNB()
NB.fit(X_train, y_train)
pred_NB = NB.predict(X_test)
print('Accuracy of MultinomialNB classifier on test set: {:.2f}'
     .format(NB.score(X_test, y_test)))

In [None]:
from sklearn.metrics import classification_report, confusion_matrix
print(classification_report(y_test,pred_NB))

In [None]:
# Confusion matrix of the Multinomial Naive Bayes predictions on the held-out split.
cm = confusion_matrix(y_test, pred_NB)

fig, ax = plt.subplots()
sns.set(font_scale=3.0) # Adjust to fit
sns.heatmap(cm, annot=True, ax=ax, cmap="bwr", fmt="g")

# Labels, title and ticks
label_font = {'size':'18'}  # Adjust to fit
ax.set_xlabel('Predicted labels', fontdict=label_font)
ax.set_ylabel('True labels', fontdict=label_font)

title_font = {'size':'21'}  # Adjust to fit
ax.set_title('Confusion Matrix for Multinomial Naive Bayes Classifier on Fake News Dataset', fontdict=title_font)

ax.tick_params(axis='both', which='major', labelsize=10)  # Adjust to fit
ax.xaxis.set_ticklabels(['Real', 'Fake'])
ax.yaxis.set_ticklabels(['Real', 'Fake'])

# Save BEFORE plt.show(): once show() returns the figure has been released, and
# a later plt.savefig() writes an empty canvas.
fig.savefig('MultinomialNB.png')
plt.show()

# Decision Tree

In [None]:
from sklearn.tree import DecisionTreeClassifier

DT = DecisionTreeClassifier()
DT.fit(X_train, y_train)
pred_dt = DT.predict(X_test)
DT.score(X_test, y_test)

In [None]:
print(classification_report(y_test,pred_dt))

In [None]:
# Confusion matrix of the Decision Tree predictions on the held-out split.
cm = confusion_matrix(y_test, pred_dt)

fig, ax = plt.subplots()
sns.set(font_scale=3.0) # Adjust to fit
sns.heatmap(cm, annot=True, ax=ax, cmap="seismic", fmt="g")

# Labels, title and ticks
label_font = {'size':'18'}  # Adjust to fit
ax.set_xlabel('Predicted labels', fontdict=label_font)
ax.set_ylabel('True labels', fontdict=label_font)

title_font = {'size':'21'}  # Adjust to fit
ax.set_title('Confusion Matrix for Decision Tree Classifier on Fake News Dataset', fontdict=title_font)

ax.tick_params(axis='both', which='major', labelsize=10)  # Adjust to fit
ax.xaxis.set_ticklabels(['Real', 'Fake'])
ax.yaxis.set_ticklabels(['Real', 'Fake'])

# Save BEFORE plt.show(): once show() returns the figure has been released, and
# a later plt.savefig() writes an empty canvas.
fig.savefig('DT.png')
plt.show()

# GradientBoostingClassifier

In [None]:
from sklearn.ensemble import GradientBoostingClassifier
GBC = GradientBoostingClassifier(random_state=0)
GBC.fit(X_train, y_train)
pred_gbc = GBC.predict(X_test)
GBC.score(X_test, y_test)

In [None]:
print(classification_report(y_test, pred_gbc))
print(GBC.score(X_test, y_test))

In [None]:
# Confusion matrix of the Gradient Boosting predictions on the held-out split.
cm = confusion_matrix(y_test, pred_gbc)

fig, ax = plt.subplots()
sns.set(font_scale=3.0) # Adjust to fit
sns.heatmap(cm, annot=True, ax=ax, cmap="RdGy", fmt="g")

# Labels, title and ticks
label_font = {'size':'18'}  # Adjust to fit
ax.set_xlabel('Predicted labels', fontdict=label_font)
ax.set_ylabel('True labels', fontdict=label_font)

title_font = {'size':'21'}  # Adjust to fit
ax.set_title('Confusion Matrix for Gradient Boosting Classifier on Fake News Dataset', fontdict=title_font)

ax.tick_params(axis='both', which='major', labelsize=10)  # Adjust to fit
ax.xaxis.set_ticklabels(['Real', 'Fake'])
ax.yaxis.set_ticklabels(['Real', 'Fake'])

# Save BEFORE plt.show(): once show() returns the figure has been released, and
# a later plt.savefig() writes an empty canvas.
fig.savefig('GradientBoostingClassifier.png')
plt.show()

# Random Forest Classifier

In [None]:
from sklearn.ensemble import RandomForestClassifier

RFC = RandomForestClassifier(n_estimators=50, criterion="entropy")
RFC.fit(X_train, y_train)
pred_RFC = RFC.predict(X_test)

In [None]:
print(RFC.score(X_test, y_test))
print(classification_report(y_test, pred_RFC))

In [None]:
# Confusion matrix of the Random Forest predictions on the held-out split.
cm = confusion_matrix(y_test, pred_RFC)

fig, ax = plt.subplots()
sns.set(font_scale=3.0) # Adjust to fit
sns.heatmap(cm, annot=True, ax=ax, cmap="PuOr", fmt="g")

# Labels, title and ticks
label_font = {'size':'18'}  # Adjust to fit
ax.set_xlabel('Predicted labels', fontdict=label_font)
ax.set_ylabel('True labels', fontdict=label_font)

title_font = {'size':'21'}  # Adjust to fit
ax.set_title('Confusion Matrix for Random Forest Classifier on Fake News Dataset', fontdict=title_font)

ax.tick_params(axis='both', which='major', labelsize=10)  # Adjust to fit
ax.xaxis.set_ticklabels(['Real', 'Fake'])
ax.yaxis.set_ticklabels(['Real', 'Fake'])

# Save BEFORE plt.show(): once show() returns the figure has been released, and
# a later plt.savefig() writes an empty canvas.
fig.savefig('RandomForestClassifier.png')
plt.show()

# KNN

In [None]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report, confusion_matrix


knn = KNeighborsClassifier(n_neighbors=3)
knn.fit(X_train, y_train)
pred_knn = knn.predict(X_test)

In [None]:
print(knn.score(X_test, y_test))
print(classification_report(y_test, pred_knn))

In [None]:
# Confusion matrix of the current k-NN predictions (pred_knn) on the held-out split.
cm = confusion_matrix(y_test, pred_knn)

fig, ax = plt.subplots()
sns.set(font_scale=3.0) # Adjust to fit
sns.heatmap(cm, annot=True, ax=ax, cmap="Pastel1", fmt="g")

# Labels, title and ticks
label_font = {'size':'18'}  # Adjust to fit
ax.set_xlabel('Predicted labels', fontdict=label_font)
ax.set_ylabel('True labels', fontdict=label_font)

title_font = {'size':'21'}  # Adjust to fit
ax.set_title('Confusion Matrix for K-Nearest-Neighbours Classifier on Fake News Dataset', fontdict=title_font)

ax.tick_params(axis='both', which='major', labelsize=10)  # Adjust to fit
ax.xaxis.set_ticklabels(['Real', 'Fake'])
ax.yaxis.set_ticklabels(['Real', 'Fake'])

# Save BEFORE plt.show(): once show() returns the figure has been released, and
# a later plt.savefig() writes an empty canvas.
fig.savefig('K-Nearest-Neighbours1.png')
plt.show()

In [None]:
neighbors = np.arange(1, 15)
train_accuracy = np.empty(len(neighbors))
test_accuracy = np.empty(len(neighbors))

In [None]:
for i, k in enumerate(neighbors):
    knn = KNeighborsClassifier(n_neighbors=k)
    knn.fit(X_train, y_train)
    train_accuracy[i] = knn.score(X_train, y_train)
    test_accuracy[i] = knn.score(X_test, y_test)

In [None]:
plt.title('k-NN: Analysis of varying Number of Neighbors')
plt.plot(neighbors, test_accuracy, label = 'Testing Accuracy')
plt.plot(neighbors, train_accuracy, label = 'Training Accuracy')
plt.legend()
plt.xlabel('Number of Neighbors')
plt.ylabel('Accuracy')
plt.show()

In [None]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report, confusion_matrix

knn = KNeighborsClassifier(n_neighbors=7)
knn.fit(X_train, y_train)
pred_knn = knn.predict(X_test)

In [None]:
print(knn.score(X_test, y_test))
print(classification_report(y_test, pred_knn))

In [None]:
# Confusion matrix of the current k-NN predictions (pred_knn) on the held-out split.
cm = confusion_matrix(y_test, pred_knn)

fig, ax = plt.subplots()
sns.set(font_scale=3.0) # Adjust to fit
sns.heatmap(cm, annot=True, ax=ax, cmap="Pastel2", fmt="g")

# Labels, title and ticks
label_font = {'size':'18'}  # Adjust to fit
ax.set_xlabel('Predicted labels', fontdict=label_font)
ax.set_ylabel('True labels', fontdict=label_font)

title_font = {'size':'21'}  # Adjust to fit
ax.set_title('Confusion Matrix for K-Nearest-Neighbours Classifier on Fake News Dataset', fontdict=title_font)

ax.tick_params(axis='both', which='major', labelsize=10)  # Adjust to fit
ax.xaxis.set_ticklabels(['Real', 'Fake'])
ax.yaxis.set_ticklabels(['Real', 'Fake'])

# Save BEFORE plt.show(): once show() returns the figure has been released, and
# a later plt.savefig() writes an empty canvas.
fig.savefig('K-Nearest-Neighbours2.png')
plt.show()

# SVM -Linear Kernel

In [None]:
from sklearn import svm, metrics

svm_ = svm.SVC(kernel="linear")
svm_.fit(X_train, y_train)
pred_svm = svm_.predict(X_test)

In [None]:
print(svm_.score(X_test, y_test))
print(classification_report(y_test, pred_svm))

In [None]:
# Confusion matrix of the linear-kernel SVM predictions on the held-out split.
cm = confusion_matrix(y_test, pred_svm)

fig, ax = plt.subplots()
sns.set(font_scale=3.0) # Adjust to fit
sns.heatmap(cm, annot=True, ax=ax, cmap="Paired", fmt="g")

# Labels, title and ticks
label_font = {'size':'18'}  # Adjust to fit
ax.set_xlabel('Predicted labels', fontdict=label_font)
ax.set_ylabel('True labels', fontdict=label_font)

title_font = {'size':'21'}  # Adjust to fit
ax.set_title('Confusion Matrix for SVM Classifier on Fake News Dataset', fontdict=title_font)

ax.tick_params(axis='both', which='major', labelsize=10)  # Adjust to fit
ax.xaxis.set_ticklabels(['Real', 'Fake'])
ax.yaxis.set_ticklabels(['Real', 'Fake'])

# Save BEFORE plt.show(): once show() returns the figure has been released, and
# a later plt.savefig() writes an empty canvas.
fig.savefig('SVM.png')
plt.show()

# AdaBoost

In [None]:
from sklearn.ensemble import AdaBoostClassifier
from sklearn.datasets import make_classification

clf = AdaBoostClassifier(n_estimators=100, random_state=0)

In [None]:
clf.fit(X_train, y_train)

In [None]:
pred_adaBoost = clf.predict(X_test)
print(clf.score(X_test, y_test))

In [None]:
print(classification_report(y_test,pred_adaBoost))

In [None]:
# Confusion matrix of the AdaBoost predictions on the held-out split.
cm = confusion_matrix(y_test, pred_adaBoost)

fig, ax = plt.subplots()
sns.set(font_scale=3.0) # Adjust to fit
sns.heatmap(cm, annot=True, ax=ax, cmap="rainbow", fmt="g")

# Labels, title and ticks
label_font = {'size':'18'}  # Adjust to fit
ax.set_xlabel('Predicted labels', fontdict=label_font)
ax.set_ylabel('True labels', fontdict=label_font)

title_font = {'size':'21'}  # Adjust to fit
ax.set_title('Confusion Matrix for AdaBoost Classifier on Fake News Dataset', fontdict=title_font)

ax.tick_params(axis='both', which='major', labelsize=10)  # Adjust to fit
ax.xaxis.set_ticklabels(['Real', 'Fake'])
ax.yaxis.set_ticklabels(['Real', 'Fake'])

# Save BEFORE plt.show(): once show() returns the figure has been released, and
# a later plt.savefig() writes an empty canvas.
fig.savefig('AdaBoost.png')
plt.show()

# XGBoost (implemented here with scikit-learn's `GradientBoostingClassifier`, not the `xgboost` library)

In [None]:
from sklearn.ensemble import GradientBoostingClassifier

clf = GradientBoostingClassifier(n_estimators=100, learning_rate=1.0,
    max_depth=1, random_state=0).fit(X_train, y_train)

In [None]:
pred_XGBoost = clf.predict(X_test)

In [None]:
clf.score(X_test, y_test)

In [None]:
from sklearn.metrics import classification_report, confusion_matrix
print(classification_report(y_test,pred_XGBoost))

In [None]:
# Confusion matrix of the predictions stored in pred_XGBoost on the held-out split.
# NOTE(review): pred_XGBoost comes from scikit-learn's GradientBoostingClassifier,
# not from the xgboost library; the title/file name below keep the original label.
cm = confusion_matrix(y_test, pred_XGBoost)

fig, ax = plt.subplots()
sns.set(font_scale=3.0) # Adjust to fit
sns.heatmap(cm, annot=True, ax=ax, cmap="gist_ncar", fmt="g")

# Labels, title and ticks
label_font = {'size':'18'}  # Adjust to fit
ax.set_xlabel('Predicted labels', fontdict=label_font)
ax.set_ylabel('True labels', fontdict=label_font)

title_font = {'size':'21'}  # Adjust to fit
ax.set_title('Confusion Matrix for XGBoost Classifier on Fake News Dataset', fontdict=title_font)

ax.tick_params(axis='both', which='major', labelsize=10)  # Adjust to fit
ax.xaxis.set_ticklabels(['Real', 'Fake'])
ax.yaxis.set_ticklabels(['Real', 'Fake'])

# Save BEFORE plt.show(): once show() returns the figure has been released, and
# a later plt.savefig() writes an empty canvas.
fig.savefig('XGBoost.png')
plt.show()