#### Import libraries

In [2]:
import numpy as np
import pandas as pd
import re# regular expression---> used for searching a text in a document
from nltk.corpus import stopwords#corpus----> body of the text;nltk--->natural language toolkit;stopwords---> words that doesnt add much value to the text
from nltk.stem.porter import PorterStemmer#stemming removes the prefix and suffix of a word and returns the root word of it.
from sklearn.feature_extraction.text import TfidfVectorizer#TfidVectorization--->converts the text into feature vector(number)
from sklearn.model_selection import train_test_split# helps in splitting the data into training and testing data
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression


In [3]:
# Fetch the NLTK stopword corpus that stopwords.words('english') reads later.
# The recorded log shows the package is skipped when already up to date.
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /usr/share/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

### Printing the stop words in English

In [4]:
print(stopwords.words('english'))

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', '

### Data Pre-Processing

In [5]:
# Read the Kaggle fake-news training CSV into a pandas DataFrame.
TRAIN_CSV_PATH = '/kaggle/input/fake-news/train.csv'
news_dataset = pd.read_csv(TRAIN_CSV_PATH)

In [6]:
news_dataset.shape

(20800, 5)

### We have 20,800 news articles and 5 columns (id, title, author, text, and the label)

In [7]:
# Show the first five rows of the dataset
news_dataset.head()

Unnamed: 0,id,title,author,text,label
0,0,House Dem Aide: We Didn’t Even See Comey’s Let...,Darrell Lucus,House Dem Aide: We Didn’t Even See Comey’s Let...,1
1,1,"FLYNN: Hillary Clinton, Big Woman on Campus - ...",Daniel J. Flynn,Ever get the feeling your life circles the rou...,0
2,2,Why the Truth Might Get You Fired,Consortiumnews.com,"Why the Truth Might Get You Fired October 29, ...",1
3,3,15 Civilians Killed In Single US Airstrike Hav...,Jessica Purkiss,Videos 15 Civilians Killed In Single US Airstr...,1
4,4,Iranian woman jailed for fictional unpublished...,Howard Portnoy,Print \nAn Iranian woman has been sentenced to...,1


#### Checking the missing values

In [8]:
news_dataset.isnull().sum()

id           0
title      558
author    1957
text        39
label        0
dtype: int64

### Replacing the missing values with empty string

In [9]:
# Swap every missing value for an empty string so the text columns can be
# concatenated without producing NaN.
news_dataset = news_dataset.fillna(value='')

### Merging the author name and news title

In [10]:
# Build one text field per article: author, two spaces, then the title.
author_col = news_dataset['author']
title_col = news_dataset['title']
news_dataset['content'] = author_col + '  ' + title_col
print(news_dataset['content'])

0        Darrell Lucus  House Dem Aide: We Didn’t Even ...
1        Daniel J. Flynn  FLYNN: Hillary Clinton, Big W...
2        Consortiumnews.com  Why the Truth Might Get Yo...
3        Jessica Purkiss  15 Civilians Killed In Single...
4        Howard Portnoy  Iranian woman jailed for ficti...
                               ...                        
20795    Jerome Hudson  Rapper T.I.: Trump a ’Poster Ch...
20796    Benjamin Hoffman  N.F.L. Playoffs: Schedule, M...
20797    Michael J. de la Merced and Rachel Abrams  Mac...
20798    Alex Ansary  NATO, Russia To Hold Parallel Exe...
20799             David Swanson  What Keeps the F-35 Alive
Name: content, Length: 20800, dtype: object


In [11]:
# Separate the label from the rest of the frame: X keeps every column except
# 'label' (not just 'content'), y is the label column on its own.
X = news_dataset.drop(columns='label')
y = news_dataset['label']
print(X)
print(y)

          id                                              title  \
0          0  House Dem Aide: We Didn’t Even See Comey’s Let...   
1          1  FLYNN: Hillary Clinton, Big Woman on Campus - ...   
2          2                  Why the Truth Might Get You Fired   
3          3  15 Civilians Killed In Single US Airstrike Hav...   
4          4  Iranian woman jailed for fictional unpublished...   
...      ...                                                ...   
20795  20795  Rapper T.I.: Trump a ’Poster Child For White S...   
20796  20796  N.F.L. Playoffs: Schedule, Matchups and Odds -...   
20797  20797  Macy’s Is Said to Receive Takeover Approach by...   
20798  20798  NATO, Russia To Hold Parallel Exercises In Bal...   
20799  20799                          What Keeps the F-35 Alive   

                                          author  \
0                                  Darrell Lucus   
1                                Daniel J. Flynn   
2                             Consortiu

### Stemming:
#### Stemming is a process of reducing a word to its Root word
E.g.: actor, acting, actress → root word is "act"

In [12]:
 # Shared Porter stemmer instance; the stemming() helper reads this global.
 port_stem=PorterStemmer()
    

In [13]:
def stemming(content):
    """Normalize a text string into space-joined, stemmed, stopword-free words.

    Every character that is not an ASCII letter is replaced by a space, the
    text is lowercased and split on whitespace, English stopwords are dropped,
    and each remaining word is stemmed with the module-level ``port_stem``.

    Parameters
    ----------
    content : str
        Raw text to clean (in this notebook: author name plus title).

    Returns
    -------
    str
        The stemmed words joined by single spaces.
    """
    # The class must be 'A-Z': the previous 'A-z' range also covered the ASCII
    # characters between 'Z' and 'a' ([ \ ] ^ _ `), so they were not removed.
    letters_only = re.sub('[^a-zA-Z]', ' ', content)
    tokens = letters_only.lower().split()
    # Build the stopword collection once per call instead of once per token,
    # and use a set so each membership test is a hash lookup, not a list scan.
    english_stopwords = set(stopwords.words('english'))
    return ' '.join(
        port_stem.stem(token) for token in tokens if token not in english_stopwords
    )
    

In [14]:
# Clean and stem every 'content' entry, overwriting the column in place.
# Re-running this cell stems the already-stemmed text again.
news_dataset['content']=news_dataset['content'].apply(stemming)

In [15]:
print(news_dataset['content'])

0        darrel lucu hous dem aid even see comey letter...
1        daniel j flynn flynn hillari clinton big woman...
2                   consortiumnew com truth might get fire
3        jessica purkiss civilian kill singl us airstri...
4        howard portnoy iranian woman jail fiction unpu...
                               ...                        
20795    jerom hudson rapper trump poster child white s...
20796    benjamin hoffman n f l playoff schedul matchup...
20797    michael j de la merc rachel abram maci said re...
20798    alex ansari nato russia hold parallel exercis ...
20799                            david swanson keep f aliv
Name: content, Length: 20800, dtype: object


### The content now has no uppercase letters, punctuation, or stopwords

### Separating Data and label

In [16]:
# Rebind X to the stemmed 'content' values (this replaces the X assigned
# earlier) and y to the label values.
X=news_dataset['content'].values
y=news_dataset['label'].values

In [17]:
print(X)
print(y)


['darrel lucu hous dem aid even see comey letter jason chaffetz tweet'
 'daniel j flynn flynn hillari clinton big woman campu breitbart'
 'consortiumnew com truth might get fire' ...
 'michael j de la merc rachel abram maci said receiv takeov approach hudson bay new york time'
 'alex ansari nato russia hold parallel exercis balkan'
 'david swanson keep f aliv']
[1 0 1 ... 0 1 1]


In [18]:
y.shape

(20800,)

### Converting textual data into numerical data

In [19]:
# TF-IDF turns each document into a numeric vector: term frequency (tf) rewards
# words that occur often within a document, while inverse document frequency
# (idf) down-weights words that occur across many documents.
# NOTE(review): the vocabulary and idf weights are learned from every row,
# including rows that later end up in the test split; consider fitting on the
# training rows only.
vectorizer = TfidfVectorizer()
X = vectorizer.fit(X).transform(X)  # fit() returns the vectorizer itself
print(X)

  (0, 15697)	0.28485063562728646
  (0, 13480)	0.2565896679337957
  (0, 8915)	0.3635963806326075
  (0, 8636)	0.29212514087043684
  (0, 7698)	0.24785219520671603
  (0, 7012)	0.21874169089359144
  (0, 4979)	0.233316966909351
  (0, 3795)	0.2705332480845492
  (0, 3603)	0.3598939188262559
  (0, 2962)	0.2468450128533713
  (0, 2485)	0.3676519686797209
  (0, 268)	0.27010124977708766
  (1, 16812)	0.30071745655510157
  (1, 6823)	0.1904660198296849
  (1, 5510)	0.7143299355715573
  (1, 3571)	0.26373768806048464
  (1, 2816)	0.19094574062359204
  (1, 2224)	0.3827320386859759
  (1, 1895)	0.15521974226349364
  (1, 1498)	0.2939891562094648
  (2, 15622)	0.41544962664721613
  (2, 9625)	0.49351492943649944
  (2, 5975)	0.3474613386728292
  (2, 5396)	0.3866530551182615
  (2, 3106)	0.46097489583229645
  :	:
  (20797, 13128)	0.2482526352197606
  (20797, 12350)	0.27263457663336677
  (20797, 12144)	0.24778257724396507
  (20797, 10311)	0.08038079000566466
  (20797, 9593)	0.174553480255222
  (20797, 9523)	0.295420

#### Splitting into Training and Test data

In [20]:
# Hold out 20% of the rows for testing. stratify=y is passed so the split is
# stratified on the labels, and random_state is fixed so the split is
# repeatable. y_train holds the labels for X_train; y_test those for X_test.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=2,
)


#### Training the model

In [21]:
# Logistic-regression classifier created with its default settings.
model=LogisticRegression()


In [22]:
# Fit the classifier on the training vectors and their labels.
model.fit(X_train,y_train)

LogisticRegression()

### Evaluation

In [23]:
# Accuracy score on the training data
X_train_prediction = model.predict(X_train)
# accuracy_score takes (y_true, y_pred): the original labels go first and the
# model's predictions second. The score is the fraction of matching entries.
training_data_accuracy = accuracy_score(y_train, X_train_prediction)


In [24]:
print(" accuracy score of Training data :", training_data_accuracy)

 accuracy score of Training data : 0.9864783653846154


### Accuracy on Test data

In [25]:
# Accuracy score on the held-out test data
X_test_prediction = model.predict(X_test)
# accuracy_score takes (y_true, y_pred): true labels first, predictions second.
test_data_accuracy = accuracy_score(y_test, X_test_prediction)

In [26]:
print("Accuracy score of Testing data :", test_data_accuracy)

Accuracy score of Testing data : 0.9790865384615385


### Making Prediction

In [27]:
# Classify the first row of the test split and report the verdict in words.
X_new = X_test[0]
prediction = model.predict(X_new)
print(prediction)
# A predicted label of 0 is reported as real news; anything else as fake.
verdict = "The News is Real" if prediction[0] == 0 else "The News is Fake"
print(verdict)

[1]
The News is Fake


In [28]:
# checking with the original
print(y_test[0])

1
