# Step 01 - Import Modules

In [None]:
import pandas as pd
import numpy as np
import re
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression

# Step 02 - Getting Data

In [None]:
! mkdir ~/.kaggle
! cp /content/kaggle.json ~/.kaggle


In [None]:
# Restrict kaggle.json to owner read/write only (mode 600).
! chmod 600 ~/.kaggle/kaggle.json

In [None]:
# Download the data files of the "fake-news" Kaggle competition.
! kaggle competitions download -c fake-news

In [None]:
# ! unzip /content/fake-news.zip

# Step 03 - Data Preprocessing

In [None]:
# Load the training CSV from the working directory
# (presumably extracted from the fake-news download - confirm).
data = pd.read_csv('./train.csv')

In [None]:
# Summarize the frame: columns, dtypes and non-null counts.
data.info()

In [None]:
# Preview the first rows of the raw data.
data.head()

In [None]:
# Number of missing values in each column of the data set
# print(data.isna().any())
data.isnull().sum()

In [None]:
# Replace every null value with an empty string (applies to all columns)
news_data = data.fillna('')

In [None]:
# Merge the author and title columns into a single 'content' column
# (author first, then title, separated by two spaces)
news_data['content'] = news_data['author']+'  '+ news_data['title']

In [None]:
# Inspect the combined text of the row labelled 0.
news_data['content'][0]

In [None]:
# Porter stemmer instance used by clean_msg to stem each word
p_stem = PorterStemmer()

In [None]:
from functools import lru_cache


@lru_cache(maxsize=1)
def _english_stopwords():
    """Return the NLTK English stop words as a frozenset, loaded only once."""
    return frozenset(stopwords.words('english'))


def clean_msg(content):
    """Normalize one document into a space-separated string of word stems.

    The text is tokenized with ``word_tokenize``; each token is lower-cased,
    kept only if it is purely alphabetic and not an English stop word, and
    then stemmed with the notebook-level ``p_stem`` stemmer.

    The NLTK data that ``word_tokenize`` and ``stopwords`` read must already
    be available; this function does not download it.

    Args:
        content: Raw text of a single document.

    Returns:
        The surviving stems joined by single spaces ('' when none survive).
    """
    # Loaded once and reused: this function runs once per row under
    # Series.apply, so rebuilding the set on every call is wasted work.
    stop_words = _english_stopwords()
    lowered = (token.lower() for token in word_tokenize(content))
    return ' '.join(
        p_stem.stem(word)
        for word in lowered
        if word.isalpha() and word not in stop_words
    )

In [None]:
%%time
# Clean every row of the 'content' column with clean_msg (%%time reports how long it takes).
content = news_data['content'].apply(clean_msg)

In [None]:
# Features: the cleaned text as an array; target: the 'label' column.
x = content.values
y = news_data['label']

In [None]:
# Convert the cleaned text into a TF-IDF feature matrix.
# fit_transform learns the vocabulary/IDF weights and vectorizes in one pass,
# instead of tokenizing the whole corpus twice with fit() followed by transform().
# NOTE(review): the vectorizer is fitted on the full corpus before the
# train/test split, so test documents influence the learned vocabulary and
# IDF weights. Consider fitting on the training split only.
vectorizer = TfidfVectorizer()
x = vectorizer.fit_transform(x)

In [None]:
# Split the data into training (80%) and testing (20%) sets;
# random_state=0 makes the shuffle reproducible.
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=0)

# Creating Model & Training

## Multinomial Naive Bayes Classifier

In [None]:
# Multinomial Naive Bayes classifier with default hyperparameters.
model = MultinomialNB()

In [None]:
# Fit the Naive Bayes classifier on the training split.
model.fit(x_train, y_train)

In [None]:
# Predict labels for the held-out test split with the Naive Bayes model

pred = model.predict(x_test)

In [None]:
# Spot check: predicted vs. true label for the third test sample.
pred[2], y_test.iloc[2]

In [None]:
# Mean accuracy of the Naive Bayes model on the test split.
model.score(x_test, y_test)

## Logistic Regression Model

In [None]:
# Train a logistic-regression classifier on the same training split.
# fit() returns the estimator itself, so construction and training can be chained.
model_2 = LogisticRegression().fit(x_train, y_train)
# The trailing expression keeps the cell's displayed output (the fitted estimator).
model_2

In [None]:
# Predict with the logistic-regression model (model_2). `model` is the
# Naive Bayes classifier; using it here would make the evaluation that
# follows report Naive Bayes results instead.
pred_2 = model_2.predict(x_test)

**Evaluating the Model**

In [None]:
# Accuracy of the pred_2 predictions against the true test labels.
print(accuracy_score(y_test, pred_2))

In [None]:
# Confusion matrix for pred_2 (rows: true labels, columns: predicted labels).
print(confusion_matrix(y_test, pred_2))