In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
data=pd.read_csv("fake_news.csv")
data.head()

Unnamed: 0,id,title,author,text,label
0,0,House Dem Aide: We Didn’t Even See Comey’s Let...,Darrell Lucus,House Dem Aide: We Didn’t Even See Comey’s Let...,1
1,1,"FLYNN: Hillary Clinton, Big Woman on Campus - ...",Daniel J. Flynn,Ever get the feeling your life circles the rou...,0
2,2,Why the Truth Might Get You Fired,Consortiumnews.com,"Why the Truth Might Get You Fired October 29, ...",1
3,3,15 Civilians Killed In Single US Airstrike Hav...,Jessica Purkiss,Videos 15 Civilians Killed In Single US Airstr...,1
4,4,Iranian woman jailed for fictional unpublished...,Howard Portnoy,Print \nAn Iranian woman has been sentenced to...,1


In [3]:
data.shape

(20800, 5)

In [4]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20800 entries, 0 to 20799
Data columns (total 5 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   id      20800 non-null  int64 
 1   title   20242 non-null  object
 2   author  18843 non-null  object
 3   text    20761 non-null  object
 4   label   20800 non-null  int64 
dtypes: int64(2), object(3)
memory usage: 812.6+ KB


In [5]:
data.isna().sum()

id           0
title      558
author    1957
text        39
label        0
dtype: int64

In [6]:
data=data.drop(['id'],axis=1)

In [7]:
#fill missing values with empty string
data=data.fillna('')

In [8]:
data['content']=data['author']+''+data['title']+''+data['text']

In [9]:
data=data.drop(['title','author','text'],axis=1)

In [10]:
data.head()

Unnamed: 0,label,content
0,1,Darrell LucusHouse Dem Aide: We Didn’t Even Se...
1,0,"Daniel J. FlynnFLYNN: Hillary Clinton, Big Wom..."
2,1,Consortiumnews.comWhy the Truth Might Get You ...
3,1,Jessica Purkiss15 Civilians Killed In Single U...
4,1,Howard PortnoyIranian woman jailed for fiction...


# Data Preprocessing 

In [11]:
#convert to lowercase
data['content']=data['content'].apply(lambda x: " ".join(x.lower() for x in x.split()))

In [12]:
#remove punctuation
data['content']=data['content'].str.replace('[^\w\s]','')

  data['content']=data['content'].str.replace('[^\w\s]','')


In [13]:
#remove stop words
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\91810\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [14]:
from nltk.corpus import stopwords
stop=stopwords.words('english')
data['content']=data['content'].apply(lambda x: " ".join(x for x in x.split() if x not in stop))

In [15]:
#do lemmatization
from nltk.stem import WordNetLemmatizer
from textblob import Word
data['content']=data['content'].apply(lambda x: " ".join([Word(word).lemmatize() for word in x.split()]))
data['content'].head()

0    darrell lucushouse dem aide didnt even see com...
1    daniel j flynnflynn hillary clinton big woman ...
2    consortiumnewscomwhy truth might get firedwhy ...
3    jessica purkiss15 civilian killed single u air...
4    howard portnoyiranian woman jailed fictional u...
Name: content, dtype: object

In [16]:
#separating the data and label
x=data[['content']]
y=data['label']

In [17]:
from sklearn.model_selection import train_test_split

In [18]:
#splitting into training and testing data
x_train,x_test,y_train,y_test=train_test_split(x,y,test_size=0.3,random_state=45,stratify=y)

In [19]:
#validate the shape of train and testing data
print(x_train.shape)
print(y_train.shape)
print(x_test.shape)
print(y_test.shape)

(14560, 1)
(14560,)
(6240, 1)
(6240,)


In [20]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [21]:
tfidf_vect=TfidfVectorizer(analyzer='word',token_pattern=r'\w{1,}',max_features=5000)
tfidf_vect.fit(data['content'])
xtrain_tfidf=tfidf_vect.transform(x_train['content'])
xtest_tfidf=tfidf_vect.transform(x_test['content'])