In [1]:
import pandas as pd
import numpy as np

# **Loading the training dataset**

In [2]:
df = pd.read_csv('train.csv')

In [3]:
df.head()

Unnamed: 0,id,title,author,text,label
0,0,House Dem Aide: We Didn’t Even See Comey’s Let...,Darrell Lucus,House Dem Aide: We Didn’t Even See Comey’s Let...,1
1,1,"FLYNN: Hillary Clinton, Big Woman on Campus - ...",Daniel J. Flynn,Ever get the feeling your life circles the rou...,0
2,2,Why the Truth Might Get You Fired,Consortiumnews.com,"Why the Truth Might Get You Fired October 29, ...",1
3,3,15 Civilians Killed In Single US Airstrike Hav...,Jessica Purkiss,Videos 15 Civilians Killed In Single US Airstr...,1
4,4,Iranian woman jailed for fictional unpublished...,Howard Portnoy,Print \r\nAn Iranian woman has been sentenced ...,1


In [4]:
df.describe()

Unnamed: 0,id,label
count,20800.0,20800.0
mean,10399.5,0.500625
std,6004.587135,0.500012
min,0.0,0.0
25%,5199.75,0.0
50%,10399.5,1.0
75%,15599.25,1.0
max,20799.0,1.0


In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20800 entries, 0 to 20799
Data columns (total 5 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   id      20800 non-null  int64 
 1   title   20242 non-null  object
 2   author  18843 non-null  object
 3   text    20761 non-null  object
 4   label   20800 non-null  int64 
dtypes: int64(2), object(3)
memory usage: 812.6+ KB


# **Missing Values in dataset**

In [6]:
df.isnull().sum()

id           0
title      558
author    1957
text        39
label        0
dtype: int64

# **Filling missing values with empty string**

In [7]:
df = df.fillna('')

In [8]:
df.isnull().sum()

id        0
title     0
author    0
text      0
label     0
dtype: int64

# **Feature Selection**

In [9]:
df = df.drop(columns=['id','title','author'],axis=1)
df.head()

Unnamed: 0,text,label
0,House Dem Aide: We Didn’t Even See Comey’s Let...,1
1,Ever get the feeling your life circles the rou...,0
2,"Why the Truth Might Get You Fired October 29, ...",1
3,Videos 15 Civilians Killed In Single US Airstr...,1
4,Print \r\nAn Iranian woman has been sentenced ...,1


# **Applying NLP**

In [10]:
import nltk
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
import re  #for regex check

In [11]:
port_stem = PorterStemmer()

In [42]:
port_stem.stem("Hi ThIS$%56 is &%$RishABH")

'hi this$%56 is &%$rishabh'

In [43]:
x = re.sub('[^a-zA-Z]',' ',"hi this$%56 is &%$rishabh")
x.split()

['hi', 'this', 'is', 'rishabh']

In [41]:
def stemming(text):
    retext = re.sub('[^a-zA-Z]',' ',text)
    retext=retext.lower()
    retext=retext.split()
    retext=[port_stem.stem(word) for word in retext if not word in stopwords.words('english')]
    retext=''.join(retext)
    return retext

In [15]:
stemming("Hi ThIS$%56 is &%$RishABH")

'hirishabh'

In [16]:
df['text'] = df['text'].apply(stemming)

In [17]:
df.head()

Unnamed: 0,text,label
0,housdemaidevenseecomeyletterjasonchaffetztweet...,1
1,evergetfeellifecirclroundaboutratherheadstraig...,0
2,truthmightgetfireoctobtensionintelliganalystpo...,1
3,videociviliankillsinglusairstrikidentifirateci...,1
4,printiranianwomansentencsixyearprisoniranrevol...,1


# **Dividing input and output datasets**

In [18]:
X = df['text']
y = df['label']

In [19]:
X.shape

(20800,)

In [20]:
y.shape

(20800,)

# **Spliting into training and testing data**

In [21]:
from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.25,random_state=2)

In [22]:
X_test.head()

15920    postoriginpublishsitesouthfrontorgdonatsyriana...
2420     johnwhiteheadrutherfordinstitutpeoplpowerawake...
3589     liketrampolinsayhedgehoghedgehogconfirmliketra...
2490     postoctobseanadltabatabainewsuscommentattorney...
13482    headliwesterfieldfrioctthpmmegynkelligarnerhea...
Name: text, dtype: object

# **Feature Extraction from text using TfidfVectorizer**

In [23]:
from sklearn.feature_extraction.text import TfidfVectorizer
vec = TfidfVectorizer()

In [24]:
X_train = vec.fit_transform(X_train)
X_test = vec.transform(X_test)

In [25]:
X_train.shape

(15600, 15227)

In [26]:
X_test.shape

(5200, 15227)

# **Training DecisionTreeClassifier Model**

In [27]:
from sklearn.tree import DecisionTreeClassifier
dt = DecisionTreeClassifier()

In [28]:
dt.fit(X_train,y_train)

In [29]:
y_pred = dt.predict(X_test)
y_pred

array([1, 1, 1, ..., 1, 1, 1], dtype=int64)

In [30]:
from sklearn.metrics import accuracy_score
accuracy = accuracy_score(y_test, y_pred)
accuracy

0.5067307692307692

In [31]:
from sklearn.metrics import classification_report
print(classification_report(y_test,y_pred))

              precision    recall  f1-score   support

           0       0.00      0.00      0.00      2565
           1       0.51      1.00      0.67      2635

    accuracy                           0.51      5200
   macro avg       0.25      0.50      0.34      5200
weighted avg       0.26      0.51      0.34      5200



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


# **Converting our model and vectorizer in .pkl format files to use our model to predict on a live website**

In [32]:
import pickle

**Open file for write only mode in binary format**

In [33]:
pickle.dump(vec, open('vector.pkl', 'wb'))
pickle.dump(dt, open('model.pkl', 'wb'))

**Open file for read only mode in binary format**

In [34]:
vector_form = pickle.load(open('vector.pkl','rb'))
model_form = pickle.load(open('model.pkl','rb'))

# **Function to check whether the news is Fake or Real**

In [35]:
def check_news(news):
    news = stemming(news)
    input_data = [news]
    vector = vector_form.transform(input_data)
    pred = model_form.predict(vector)
    return pred

In [36]:
val = check_news("""I am a star""")

In [37]:
if(val==[0]):
    print('Real News')
else:
    print('Fake News')

Fake News
