#### Importing the Dependencies

In [1]:
import numpy as np
import pandas as pd
import re
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

#### Download Stopwords from nltk library

In [2]:
import nltk

nltk.download("stopwords")

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Dell\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

#### Print StopWords

In [3]:
print(stopwords.words("English"))

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', '

#### Data Preprocessing

In [4]:
# import dataset 
df = pd.read_csv("train.csv")
df.head(5)

Unnamed: 0,id,title,author,text,label
0,0,House Dem Aide: We Didn’t Even See Comey’s Let...,Darrell Lucus,House Dem Aide: We Didn’t Even See Comey’s Let...,1
1,1,"FLYNN: Hillary Clinton, Big Woman on Campus - ...",Daniel J. Flynn,Ever get the feeling your life circles the rou...,0
2,2,Why the Truth Might Get You Fired,Consortiumnews.com,"Why the Truth Might Get You Fired October 29, ...",1
3,3,15 Civilians Killed In Single US Airstrike Hav...,Jessica Purkiss,Videos 15 Civilians Killed In Single US Airstr...,1
4,4,Iranian woman jailed for fictional unpublished...,Howard Portnoy,Print \nAn Iranian woman has been sentenced to...,1


#### Check Shape of the Dataset

In [5]:
df.shape

(20800, 5)

#### Check information about the Dataset

In [6]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20800 entries, 0 to 20799
Data columns (total 5 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   id      20800 non-null  int64 
 1   title   20242 non-null  object
 2   author  18843 non-null  object
 3   text    20761 non-null  object
 4   label   20800 non-null  int64 
dtypes: int64(2), object(3)
memory usage: 812.6+ KB


#### Check is there is any null value into the Dataset

In [7]:
df.isnull().sum()

id           0
title      558
author    1957
text        39
label        0
dtype: int64

#### Fillup Null values with empty string

In [8]:
# filling the None values with empty string:
df = df.fillna("")

# again checking is there is any null values into the Dataset
df.isnull().sum()

id        0
title     0
author    0
text      0
label     0
dtype: int64

In [9]:
# Combining the title and author columns by space 
df["content"] = df["title"]+" "+df["author"]

df.head()

Unnamed: 0,id,title,author,text,label,content
0,0,House Dem Aide: We Didn’t Even See Comey’s Let...,Darrell Lucus,House Dem Aide: We Didn’t Even See Comey’s Let...,1,House Dem Aide: We Didn’t Even See Comey’s Let...
1,1,"FLYNN: Hillary Clinton, Big Woman on Campus - ...",Daniel J. Flynn,Ever get the feeling your life circles the rou...,0,"FLYNN: Hillary Clinton, Big Woman on Campus - ..."
2,2,Why the Truth Might Get You Fired,Consortiumnews.com,"Why the Truth Might Get You Fired October 29, ...",1,Why the Truth Might Get You Fired Consortiumne...
3,3,15 Civilians Killed In Single US Airstrike Hav...,Jessica Purkiss,Videos 15 Civilians Killed In Single US Airstr...,1,15 Civilians Killed In Single US Airstrike Hav...
4,4,Iranian woman jailed for fictional unpublished...,Howard Portnoy,Print \nAn Iranian woman has been sentenced to...,1,Iranian woman jailed for fictional unpublished...


In [10]:
# checking the first valuse of the content column
df["content"][0]

'House Dem Aide: We Didn’t Even See Comey’s Letter Until Jason Chaffetz Tweeted It Darrell Lucus'

In [11]:
# saperate the label data to the other data
X = df.drop(columns="label", axis=1)

y = df["label"]

In [12]:
# check the X and y
print(X)
print(y)

          id                                              title  \
0          0  House Dem Aide: We Didn’t Even See Comey’s Let...   
1          1  FLYNN: Hillary Clinton, Big Woman on Campus - ...   
2          2                  Why the Truth Might Get You Fired   
3          3  15 Civilians Killed In Single US Airstrike Hav...   
4          4  Iranian woman jailed for fictional unpublished...   
...      ...                                                ...   
20795  20795  Rapper T.I.: Trump a ’Poster Child For White S...   
20796  20796  N.F.L. Playoffs: Schedule, Matchups and Odds -...   
20797  20797  Macy’s Is Said to Receive Takeover Approach by...   
20798  20798  NATO, Russia To Hold Parallel Exercises In Bal...   
20799  20799                          What Keeps the F-35 Alive   

                                          author  \
0                                  Darrell Lucus   
1                                Daniel J. Flynn   
2                             Consortiu

In [13]:
# create a port stremming function
# Initialize stemmer
port_stem = PorterStemmer()

def stemming(content):
    # Replace non-alphabetic characters with a space
    stemmed_content = re.sub('[^a-zA-Z]', ' ', content)
    stemmed_content = stemmed_content.lower()
    stemmed_content = stemmed_content.split()

    # Stem the words and remove stopwords
    stemmed_content = [port_stem.stem(word) for word in stemmed_content if word not in stopwords.words('english')]

    # Join the list back into a string
    stemmed_content = " ".join(stemmed_content)
    return stemmed_content

In [14]:
df["content"]  = df["content"].apply(stemming)

In [15]:
print(df["content"])

0        hous dem aid even see comey letter jason chaff...
1        flynn hillari clinton big woman campu breitbar...
2                   truth might get fire consortiumnew com
3        civilian kill singl us airstrik identifi jessi...
4        iranian woman jail fiction unpublish stori wom...
                               ...                        
20795    rapper trump poster child white supremaci jero...
20796    n f l playoff schedul matchup odd new york tim...
20797    maci said receiv takeov approach hudson bay ne...
20798    nato russia hold parallel exercis balkan alex ...
20799                            keep f aliv david swanson
Name: content, Length: 20800, dtype: object


In [16]:
# now we using two columns one is content column and other is label column
X = df["content"].values
y = df["label"].values

In [17]:
print(X)
print(y)

['hous dem aid even see comey letter jason chaffetz tweet darrel lucu'
 'flynn hillari clinton big woman campu breitbart daniel j flynn'
 'truth might get fire consortiumnew com' ...
 'maci said receiv takeov approach hudson bay new york time michael j de la merc rachel abram'
 'nato russia hold parallel exercis balkan alex ansari'
 'keep f aliv david swanson']
[1 0 1 ... 0 1 1]


In [18]:
# we should convert text into vectorized numarical values 
vectorizer  = TfidfVectorizer()
vectorizer.fit(X)


X = vectorizer.transform(X)

In [19]:
print(X)

  (0, 15686)	0.28485063562728646
  (0, 13473)	0.2565896679337957
  (0, 8909)	0.3635963806326075
  (0, 8630)	0.29212514087043684
  (0, 7692)	0.24785219520671603
  (0, 7005)	0.21874169089359144
  (0, 4973)	0.233316966909351
  (0, 3792)	0.2705332480845492
  (0, 3600)	0.3598939188262559
  (0, 2959)	0.2468450128533713
  (0, 2483)	0.3676519686797209
  (0, 267)	0.27010124977708766
  (1, 16799)	0.30071745655510157
  (1, 6816)	0.1904660198296849
  (1, 5503)	0.7143299355715573
  (1, 3568)	0.26373768806048464
  (1, 2813)	0.19094574062359204
  (1, 2223)	0.3827320386859759
  (1, 1894)	0.15521974226349364
  (1, 1497)	0.2939891562094648
  (2, 15611)	0.41544962664721613
  (2, 9620)	0.49351492943649944
  (2, 5968)	0.3474613386728292
  (2, 5389)	0.3866530551182615
  (2, 3103)	0.46097489583229645
  :	:
  (20797, 13122)	0.2482526352197606
  (20797, 12344)	0.27263457663336677
  (20797, 12138)	0.24778257724396507
  (20797, 10306)	0.08038079000566466
  (20797, 9588)	0.174553480255222
  (20797, 9518)	0.295420

In [22]:
# now we are doing train test split funtion that saperates the dataset into test and training data for model training

X_train, X_test, y_train, y_test = train_test_split(X,y, train_size=0.2, random_state=2,stratify=y)


In [23]:
print(X_train)

  (0, 15295)	0.1007807967397786
  (0, 13412)	0.37902255352286907
  (0, 9985)	0.32500432301444965
  (0, 9512)	0.3414279025447053
  (0, 8123)	0.3452715339306703
  (0, 7093)	0.3557007823665418
  (0, 6341)	0.3669721768972648
  (0, 3568)	0.24628193969421852
  (0, 1204)	0.42769786200081683
  (1, 8863)	0.6926972991877945
  (1, 8062)	0.72122843239707
  (2, 14782)	0.21043716954333871
  (2, 14080)	0.3506791622984718
  (2, 14032)	0.238582401148627
  (2, 13270)	0.17881546559647304
  (2, 11853)	0.30889618657046
  (2, 10412)	0.3185945150384629
  (2, 8662)	0.2536193833283985
  (2, 8560)	0.25114575050500304
  (2, 7719)	0.3781036956834786
  (2, 7596)	0.235505107871417
  (2, 4792)	0.3055971733418447
  (2, 4466)	0.2690286496399378
  (2, 1691)	0.2352368679401513
  (3, 16197)	0.359521591733607
  :	:
  (4157, 13520)	0.5952603406476158
  (4157, 12576)	0.19200525616832312
  (4157, 12099)	0.2452016139164209
  (4157, 10864)	0.29364969672411856
  (4157, 7138)	0.2213313364684617
  (4157, 6505)	0.2407174264764414


In [24]:
# implementing model on the taining data
lg = LogisticRegression()
lg.fit(X_train, y_train)


In [27]:
# Evaluation of the model by prediction of training data
X_Training_data = lg.predict(X_train)
Trainng_data_accrucy = accuracy_score(X_Training_data, y_train)

In [28]:
print("The Accrucy Score of the training data:", Trainng_data_accrucy)

The Accrucy Score of the training data: 0.9776442307692308


In [30]:
# Evaluation of the model by prediction of testing data
X_Testing_data = lg.predict(X_test)
Testing_data_accrucy = accuracy_score(X_Testing_data, y_test)

In [31]:
print("The Accrucy Score of the Testing data:", Testing_data_accrucy)

The Accrucy Score of the Testing data: 0.9513822115384616


In [32]:
X_new = X_test[0]
prediction  = lg.predict(X_new)


if (prediction[0] == 0):
    print("This News is Real.")
else:
    print("This News is Fake.")

This News is Fake.


In [36]:
print("What my model predict:",prediction[0],"\nWhat is the actual Answer is:",y_test[0])

What my model predict: 1 
What is the actual Answer is: 1


In [None]:
# now we can export our model by using pickle library
import pickle
with open("Model.pkl", "wb") as file:
    pickle.dump()