In [93]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report

In [94]:
import nltk 
nltk.download("stopwords")

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\akhil\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [95]:
print(stopwords.words('english'))

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', '

In [96]:
#Loading the Dataset

news_dataset = pd.read_csv('./Indian_Fake_News/news_dataset.csv')
news_dataset.head()

Unnamed: 0,label,text
0,REAL,Payal has accused filmmaker Anurag Kashyap of ...
1,FAKE,A four-minute-long video of a woman criticisin...
2,FAKE,"Republic Poll, a fake Twitter account imitatin..."
3,REAL,"Delhi teen finds place on UN green list, turns..."
4,REAL,Delhi: A high-level meeting underway at reside...


In [97]:
news_dataset.isnull().sum()
news_dataset = news_dataset.fillna(' ')

In [98]:
for i in range(0,len(news_dataset)):
    if news_dataset['label'][i] == 'REAL':
        news_dataset['label'][i] = 1
    
    elif news_dataset['label'][i] == 'FAKE':
        news_dataset['label'][i] = 0

news_dataset.head()

Unnamed: 0,label,text
0,1,Payal has accused filmmaker Anurag Kashyap of ...
1,0,A four-minute-long video of a woman criticisin...
2,0,"Republic Poll, a fake Twitter account imitatin..."
3,1,"Delhi teen finds place on UN green list, turns..."
4,1,Delhi: A high-level meeting underway at reside...


In [99]:
import re

port_stem = PorterStemmer()


def stemming(content):
    stemmed_content = re.sub('[^a-zA-Z]',' ',content)
    stemmed_content = stemmed_content.lower()
    stemmed_content = stemmed_content.split()

    stemmed_content = [port_stem.stem(word) for word in stemmed_content if not word in stopwords.words('english')]
    stemmed_content = ' '.join(stemmed_content)

    return stemmed_content

In [100]:
news_dataset['text'] = news_dataset['text'].apply(stemming)

In [101]:
X = news_dataset['text']
Y = news_dataset['label']

vectorizer = TfidfVectorizer()
vectorizer.fit(X)

X = vectorizer.transform(X)


from sklearn import preprocessing
from sklearn import utils

#convert y values to categorical values
lab = preprocessing.LabelEncoder()
Y = lab.fit_transform(Y)

#view transformed values
print(Y)


[1 0 0 ... 0 1 1]


In [102]:
X_train , X_test , Y_train , Y_test = train_test_split(X ,Y , test_size=0.2, stratify=Y, random_state=2)

print(X_train)
print(Y_train)

  (0, 25661)	0.1573839003857635
  (0, 25367)	0.12414563730395412
  (0, 25047)	0.06207281865197706
  (0, 23313)	0.07919403827475545
  (0, 23282)	0.06512601665509031
  (0, 23117)	0.03507185142641947
  (0, 23029)	0.05354626001350953
  (0, 22692)	0.05574601268970623
  (0, 21769)	0.10363890590719183
  (0, 21543)	0.06299523353333619
  (0, 21078)	0.057009394820966786
  (0, 21053)	0.10625290562882383
  (0, 20879)	0.14751539450999962
  (0, 20196)	0.07850508531142532
  (0, 20131)	0.042545057880162
  (0, 20041)	0.10625290562882383
  (0, 19211)	0.058250080983829056
  (0, 19133)	0.09554407897049338
  (0, 18704)	0.08562185124612166
  (0, 17851)	0.10408400046986317
  (0, 17246)	0.11998725343655409
  (0, 17065)	0.054204698777292115
  (0, 16961)	0.05983924631402946
  (0, 16907)	0.08604965083243618
  (0, 16729)	0.13105899213536576
  :	:
  (2981, 3499)	0.05312723713342368
  (2981, 3276)	0.08183805846244145
  (2981, 2803)	0.5070750157386559
  (2981, 1892)	0.026508213530639976
  (2981, 1558)	0.113675188170

In [103]:
from sklearn.linear_model import LogisticRegression

model = LogisticRegression()

In [104]:
model.fit(X_train, Y_train)

In [115]:
Y_pred = model.predict(X_test)
print(classification_report(Y_pred, Y_test))

              precision    recall  f1-score   support

           0       1.00      0.99      0.99       379
           1       0.99      1.00      0.99       367

    accuracy                           0.99       746
   macro avg       0.99      0.99      0.99       746
weighted avg       0.99      0.99      0.99       746



In [139]:
def process_data(content:str):
    a = stemming(content)
    print(a)
    ml_inp = vectorizer.transform([a])
    return ml_inp

In [140]:
ml_inp = process_data('IRCTC denies banning non-veg food during shravan month')

irctc deni ban non veg food shravan month


  (0, 25661)	0.1573839003857635
  (0, 25367)	0.12414563730395412
  (0, 25047)	0.06207281865197706
  (0, 23313)	0.07919403827475545
  (0, 23282)	0.06512601665509031
  (0, 23117)	0.03507185142641947
  (0, 23029)	0.05354626001350953
  (0, 22692)	0.05574601268970623
  (0, 21769)	0.10363890590719183
  (0, 21543)	0.06299523353333619
  (0, 21078)	0.057009394820966786
  (0, 21053)	0.10625290562882383
  (0, 20879)	0.14751539450999962
  (0, 20196)	0.07850508531142532
  (0, 20131)	0.042545057880162
  (0, 20041)	0.10625290562882383
  (0, 19211)	0.058250080983829056
  (0, 19133)	0.09554407897049338
  (0, 18704)	0.08562185124612166
  (0, 17851)	0.10408400046986317
  (0, 17246)	0.11998725343655409
  (0, 17065)	0.054204698777292115
  (0, 16961)	0.05983924631402946
  (0, 16907)	0.08604965083243618
  (0, 16729)	0.13105899213536576
  :	:
  (0, 7974)	0.09977792247525379
  (0, 7450)	0.14976495026743958
  (0, 7311)	0.0953206100144284
  (0, 7062)	0.09204498528848253
  (0, 7059)	0.09869906751150803
  (0, 6855

ValueError: Expected 2D array, got 1D array instead:
array=[<1x8 sparse matrix of type '<class 'numpy.float64'>'
 	with 8 stored elements in Compressed Sparse Row format>].
Reshape your data either using array.reshape(-1, 1) if your data has a single feature or array.reshape(1, -1) if it contains a single sample.