**Fake News Classifier**

In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input mount and print the full path of every file found.
for folder, _, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(folder, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

Dataset: https://www.kaggle.com/c/fake-news/data#

**Load the dataset**

In [2]:
# Read the competition training file into a DataFrame.
df=pd.read_csv('../input/fake-news/train.csv')

In [3]:
# Preview the first rows to see which columns are available.
df.head()

In [4]:
# Exploratory view of every column except the target.
# `X` is rebound to the bag-of-words matrix in a later cell.
X = df.drop(columns='label')
X.head()

In [5]:
# Exploratory look at the target column. This is taken from `df` before rows
# with missing values are dropped; `y` is rebound from `data` in a later cell.
y=df['label']
y.head()

In [6]:
# (rows, columns) of the frame before rows with missing values are dropped.
df.shape

In [7]:
# Number of missing values in each column.
df.isnull().sum()

In [8]:
# Drop every row that has a missing value in any column. This rebinds `df`,
# so the unfiltered frame is no longer available after this cell.
df=df.dropna()

In [9]:
# (rows, columns) after dropping rows with missing values.
df.shape

In [10]:
# First ten remaining rows; the row labels keep their original values, so
# they may have gaps where rows were dropped.
df.head(10)

In [11]:
# Work on a copy so later changes to `data` do not touch `df`.
data=df.copy()

In [12]:
# Reset the index
# dropna() leaves gaps in the row labels; renumber them 0..n-1 so label-based
# lookups such as data['title'][i] line up with positions. The old labels are
# kept in an `index` column, exactly as before.
# Rebuilding from `df` (rather than mutating `data` in place) keeps this cell
# safe to re-run: a second in-place reset_index would add a `level_0` column
# and a third would raise because `level_0` already exists.
data = df.reset_index()
data.head(10)

In [13]:
# Inspect one raw title (row label 8) before any text cleaning.
data.loc[8, 'title']

In [14]:
import re
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
ps=PorterStemmer()
corpus=[]

# Load the English stop-word list once and keep it in a set. Calling
# stopwords.words('english') inside the comprehension rebuilt the list for
# every single word of every title, which dominated the cell's run time.
stop_words = set(stopwords.words('english'))

# Clean each title: keep letters only, lower-case, split on whitespace,
# drop stop words, stem what is left, and re-join into one string.
# Iterating the column directly yields the titles in row order and does not
# rely on the index being 0..n-1.
for title in data['title']:
    review = re.sub('[^a-zA-Z]',' ',title)
    review = review.lower()
    review = review.split()

    review = [ps.stem(word) for word in review if word not in stop_words]
    review = ' '.join(review)
    corpus.append(review)

In [17]:
# Spot-check one cleaned, stemmed title.
corpus[3]

In [18]:
from sklearn.feature_extraction.text import CountVectorizer

# Bag-of-words over unigrams, bigrams and trigrams, capped at 5000 features.
cv = CountVectorizer(ngram_range=(1, 3), max_features=5000)
# Learn the vocabulary from the cleaned titles, then densify the count
# matrix; a later cell wraps the dense array in a DataFrame.
title_counts = cv.fit_transform(corpus)
X = title_counts.toarray()

In [19]:
# (number of titles, number of n-gram features).
X.shape

In [22]:
# Take the target from `data`, the same frame the corpus (and therefore `X`)
# was built from, so labels and feature rows line up one-to-one.
y=data['label']
y

In [23]:
# Train test split
from sklearn.model_selection import train_test_split

# Hold out 33% of the rows for evaluation; the fixed seed keeps the split
# reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.33,
    random_state=0,
)

In [26]:
# First 20 entries of the fitted vectorizer's vocabulary.
cv.get_feature_names_out()[:20]

In [27]:
# Show the vectorizer's full configuration, including defaults.
cv.get_params()

In [28]:
# Label the training count matrix with its n-gram column names for inspection.
count_data=pd.DataFrame(X_train, columns=cv.get_feature_names_out())
count_data

In [34]:
from sklearn import metrics 
import numpy as np
import itertools
import matplotlib.pyplot as plt

In [31]:
def plot_confusion_matrix(cm, classes,
                          normalize=False,
                          title='Confusion matrix',
                          cmap=plt.cm.Blues):
    """Draw a confusion matrix as an annotated heat map on the current figure.

    Parameters
    ----------
    cm : array-like, 2-D
        Confusion matrix; rows are labelled "True label" and columns
        "Predicted label".
    classes : sequence
        Tick labels, in the same order as the rows/columns of ``cm``.
    normalize : bool, default False
        If True, divide every row by its sum before drawing, so each row
        shows proportions instead of counts.
    title : str
        Figure title.
    cmap : matplotlib colormap
        Colormap used for the heat map.

    Returns
    -------
    None
        The plot is drawn with pyplot; one status line is printed.
    """
    cm = np.asarray(cm)

    # Normalise BEFORE drawing so the heat-map colours, the cell text and the
    # text-colour threshold are all computed from the same matrix.
    if normalize:
        cm = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]
        print("Normalized confusion matrix")
    else:
        print('Confusion matrix, without normalization')

    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title)
    plt.colorbar()
    tick_marks = np.arange(len(classes))
    plt.xticks(tick_marks, classes, rotation=45)
    plt.yticks(tick_marks, classes)

    # Cells darker than half the maximum get white text so it stays legible.
    thresh = cm.max() / 2.
    for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
        # Proportions are shown with two decimals; raw values are shown as-is.
        label = format(cm[i, j], '.2f') if normalize else cm[i, j]
        plt.text(j, i, label,
                 horizontalalignment="center",
                 color="white" if cm[i, j] > thresh else "black")

    plt.tight_layout()
    plt.ylabel('True label')
    plt.xlabel('Predicted label')

**MultinomialNB Algorithm**

In [32]:
# Baseline model: multinomial naive Bayes with all-default parameters.
from sklearn.naive_bayes import MultinomialNB
classifier=MultinomialNB()

In [35]:
# Fit the baseline on the training split and score it on the held-out split.
classifier.fit(X_train, y_train)
pred = classifier.predict(X_test)
score = metrics.accuracy_score(y_test, pred)
print("accuracy:   %0.3f" % score)
cm = metrics.confusion_matrix(y_test, pred)
# confusion_matrix orders rows/columns by sorted label value (0, then 1).
# The Kaggle data description for this competition defines 0 = reliable and
# 1 = unreliable, so the tick labels must be REAL first, FAKE second.
plot_confusion_matrix(cm, classes=['REAL', 'FAKE'])

**MultinomialNB Classifier with Hyperparameter Tuning**

In [36]:
# Unfitted placeholder with alpha=0.1; the alpha search in the next cell
# rebinds `classifier` to a fitted model whenever a candidate qualifies.
classifier=MultinomialNB(alpha=0.1)

In [37]:
# Try alpha = 0.0, 0.1, ..., 0.9 and keep the model with the highest accuracy.
# NOTE(review): candidates are compared on X_test, the same split used to
# report accuracy, so the winning score is an optimistic estimate.
previous_score=0
for alpha in np.arange(0,1,0.1):
    sub_classifier=MultinomialNB(alpha=alpha)
    sub_classifier.fit(X_train,y_train)
    y_pred=sub_classifier.predict(X_test)
    score = metrics.accuracy_score(y_test, y_pred)
    if score>previous_score:
        # Record the new best score. Without this update the comparison was
        # always against 0, so `classifier` ended up as the last model tried
        # rather than the best one.
        previous_score=score
        classifier=sub_classifier
    print("Alpha: {}, Score : {}".format(alpha,score))

In [40]:
# Get Features names
feature_names = cv.get_feature_names_out()

In [41]:
# Log-probability of each vocabulary token under class index 1.
# `MultinomialNB.coef_` was removed in scikit-learn 1.1; for a two-class model
# `coef_[0]` was defined as `feature_log_prob_[1]`, so this is the same array.
classifier.feature_log_prob_[1]

In [42]:
# The most real
sorted(zip(classifier.coef_[0],feature_names),reverse=True)[:20]

In [43]:
# The most fake
sorted(zip(classifier.coef_[0], feature_names))[:20]