### Importing necessary libraries

In [65]:
import os

import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score
import nltk
nltk.download("stopwords")
from nltk.corpus import stopwords

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\samri\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


### Reading the dataset into a dataframe ###

In [66]:
# Location of the news CSV. The original author's local path remains the default so
# existing behavior is unchanged; set the NEWS_CSV_PATH environment variable to point
# at the file on any other machine instead of editing this cell.
NEWS_CSV_PATH = os.environ.get(
    'NEWS_CSV_PATH',
    'C:\\Users\\samri\\OneDrive\\Desktop\\news.csv',
)
news_dataset = pd.read_csv(NEWS_CSV_PATH)

### First few lines of the dataset

In [67]:
# Preview the first five rows (five is the pandas default, spelled out here).
news_dataset.head(n=5)

Unnamed: 0.1,Unnamed: 0,title,text,label
0,8476,You Can Smell Hillary’s Fear,"Daniel Greenfield, a Shillman Journalism Fello...",FAKE
1,10294,Watch The Exact Moment Paul Ryan Committed Pol...,Google Pinterest Digg Linkedin Reddit Stumbleu...,FAKE
2,3608,Kerry to go to Paris in gesture of sympathy,U.S. Secretary of State John F. Kerry said Mon...,REAL
3,10142,Bernie supporters on Twitter erupt in anger ag...,"— Kaydee King (@KaydeeKing) November 9, 2016 T...",FAKE
4,875,The Battle of New York: Why This Primary Matters,It's primary day in New York and front-runners...,REAL


### Last lines of the dataset

In [68]:
# Preview the last five rows (five is the pandas default, spelled out here).
news_dataset.tail(n=5)


Unnamed: 0.1,Unnamed: 0,title,text,label
6330,4490,State Department says it can't find emails fro...,The State Department told the Republican Natio...,REAL
6331,8062,The ‘P’ in PBS Should Stand for ‘Plutocratic’ ...,The ‘P’ in PBS Should Stand for ‘Plutocratic’ ...,FAKE
6332,8622,Anti-Trump Protesters Are Tools of the Oligarc...,Anti-Trump Protesters Are Tools of the Oligar...,FAKE
6333,4021,"In Ethiopia, Obama seeks progress on peace, se...","ADDIS ABABA, Ethiopia —President Obama convene...",REAL
6334,4330,Jeb Bush Is Suddenly Attacking Trump. Here's W...,Jeb Bush Is Suddenly Attacking Trump. Here's W...,REAL


### x contains "text" of the news articles and y contains labels

In [70]:
# Features are the article bodies; targets are the label column
# (the preview above shows the values FAKE / REAL).
x, y = news_dataset['text'], news_dataset['label']

### Splitting the dataset into training and testing sets: 30% for testing, 70% for training

In [71]:
# Hold out 30% of the rows for testing; the fixed random_state keeps the
# split identical across re-runs.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.3,
    random_state=53,
)

### Creating TF-IDF object and setting max document frequency to 70%

In [76]:
# TF-IDF vectorizer using the built-in English stop-word list and a maximum
# document frequency of 0.7.
tfidf_vectorizer = TfidfVectorizer(max_df=0.7, stop_words='english')


### Fitting the vectorizer to the training data and storing TF-IDF feature vectors for the training set
 

In [77]:
# Learn the vocabulary/IDF weights from the training texts only, and vectorize them.
tfidf_train = tfidf_vectorizer.fit_transform(raw_documents=x_train)


### Transforming the testing set of text data into TF-IDF feature vectors

In [78]:
# Vectorize the test texts with the already-fitted vectorizer (transform only, no refit).
tfidf_test = tfidf_vectorizer.transform(raw_documents=x_test)


### Creating a dataframe where each row represents a document and each column corresponds to a unique word in the vocabulary, with values indicating the TF-IDF scores.

In [82]:
# Dense view of the training TF-IDF matrix: one row per document, one column per
# vocabulary term. `.toarray()` replaces the `.A` shorthand, which is deprecated and
# no longer available on sparse matrices in recent SciPy releases.
# NOTE: this materializes a documents-by-vocabulary dense array, which can be large.
tfidf_df = pd.DataFrame(
    tfidf_train.toarray(),
    columns=tfidf_vectorizer.get_feature_names_out(),
)


### Importing Naive Bayes, classification report and pipeline

In [88]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score,classification_report
from sklearn.pipeline import Pipeline


### Creating MultinomialNB object

In [89]:
# Multinomial Naive Bayes with the library-default smoothing (alpha=1.0), made explicit.
nb_classifier = MultinomialNB(alpha=1.0)


### Training Naive Bayes classifier using TF-IDF feature vectors and target labels

In [98]:
# Fit the classifier on the training TF-IDF vectors and their labels.
nb_classifier.fit(X=tfidf_train, y=y_train)


### Creating a pipeline of 2 steps, TF-IDF vectorization with removal of stopwords and Multinomial Naive Bayes classifier

In [105]:
# Raw text -> TF-IDF features -> Multinomial Naive Bayes, bundled as one estimator.
# NOTE(review): this vectorizer does not set max_df=0.7 like the standalone
# tfidf_vectorizer does, and the classifier uses the default alpha - confirm intended.
pipeline = Pipeline(
    steps=[
        ('tfidf', TfidfVectorizer(stop_words='english')),
        ('nbmodel', MultinomialNB()),
    ]
)


### Fitting the pipeline to the training data, after this, pipeline is ready to make predictions

In [106]:
# Fit both pipeline steps on the raw training texts and labels.
pipeline.fit(X=x_train, y=y_train)


### Predictions on the testing data

In [110]:
# Predict labels for the held-out test texts.
pred = pipeline.predict(X=x_test)

In [111]:
# Per-class precision / recall / F1 of the pipeline on the test set.
print(classification_report(y_true=y_test, y_pred=pred))

              precision    recall  f1-score   support

        FAKE       0.97      0.74      0.84       913
        REAL       0.80      0.98      0.88       988

    accuracy                           0.86      1901
   macro avg       0.88      0.86      0.86      1901
weighted avg       0.88      0.86      0.86      1901



### Creating a numpy array of candidate alpha values (0.0 - 0.9)

In [113]:
# Candidate smoothing values 0.0, 0.1, ..., 0.9 (the stop value 1.0 is excluded).
# Note that the first candidate is alpha = 0.0.
alphas = np.arange(0.0, 1.0, 0.1)

### Defining a function that trains a Naive Bayes classifier with a given alpha and returns its accuracy on the testing set

In [114]:
def train_and_predict(alpha):
    """Fit a MultinomialNB with the given smoothing and return its test accuracy.

    Reads the notebook-level ``tfidf_train``/``y_train`` for fitting and
    ``tfidf_test``/``y_test`` for scoring.

    Parameters
    ----------
    alpha : float
        Value passed to ``MultinomialNB(alpha=...)``.

    Returns
    -------
    float
        ``accuracy_score`` of the fitted model's predictions on the test set.
    """
    # NOTE(review): the score comes from the test split, so choosing alpha from
    # these numbers tunes on test data - consider a validation split.
    model = MultinomialNB(alpha=alpha).fit(tfidf_train, y_train)
    return accuracy_score(y_test, model.predict(tfidf_test))

### Accuracy for different values of alpha

In [115]:
# Report the test accuracy for every candidate alpha, separated by blank lines.
for alpha in alphas:
    print('Alpha: ', alpha)
    # end='\n\n' emits the blank separator line after each score.
    print('Score: ', train_and_predict(alpha), end='\n\n')

Alpha:  0.0
Score:  0.8858495528669121

Alpha:  0.1
Score:  0.9042609153077328

Alpha:  0.2
Score:  0.9011046817464492

Alpha:  0.30000000000000004
Score:  0.8953182535507628

Alpha:  0.4
Score:  0.8921620199894792

Alpha:  0.5
Score:  0.8884797475013151

Alpha:  0.6000000000000001
Score:  0.8826933193056287

Alpha:  0.7000000000000001
Score:  0.875854813256181

Alpha:  0.8
Score:  0.8695423461336139

Alpha:  0.9
Score:  0.8679642293529721





In [123]:
import pickle

### Saving the model

In [124]:
# Serialize the fitted pipeline (vectorizer + classifier) to model.pkl in the
# current working directory, using the newest pickle protocol available.
# NOTE(review): this is the default-alpha pipeline, not a model from the alpha sweep.
with open('model.pkl', mode='wb') as model_file:
    pickle.dump(pipeline, model_file, protocol=pickle.HIGHEST_PROTOCOL)