#### Importing libraries

In [1]:
import numpy as np
import pandas as pd
import re
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

In [3]:
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [4]:
print(stopwords.words('english'))

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', '

#### Data Pre-processing

### About the dataset:
1.   id: unique id for a news article
2.   title: the title of a news article
3.   author: author of the news article
4.   text: the content of the article
5.   label: 1 for fake, 0 for real news








In [5]:
# Path of the training CSV, named so it is easy to spot and change
train_csv_path = '/content/drive/MyDrive/datasets/fake_news_train.csv'
df = pd.read_csv(train_csv_path)

In [6]:
df.shape

(20800, 5)

In [7]:
df.head()

Unnamed: 0,id,title,author,text,label
0,0,House Dem Aide: We Didn’t Even See Comey’s Let...,Darrell Lucus,House Dem Aide: We Didn’t Even See Comey’s Let...,1
1,1,"FLYNN: Hillary Clinton, Big Woman on Campus - ...",Daniel J. Flynn,Ever get the feeling your life circles the rou...,0
2,2,Why the Truth Might Get You Fired,Consortiumnews.com,"Why the Truth Might Get You Fired October 29, ...",1
3,3,15 Civilians Killed In Single US Airstrike Hav...,Jessica Purkiss,Videos 15 Civilians Killed In Single US Airstr...,1
4,4,Iranian woman jailed for fictional unpublished...,Howard Portnoy,Print \nAn Iranian woman has been sentenced to...,1


In [8]:
# counting the missing values
df.isnull().sum()

id           0
title      558
author    1957
text        39
label        0
dtype: int64

In [9]:
# dropping rows where text is null
# .copy() makes the filtered result an independent frame: later cells fill
# values in place and add a 'content' column, and doing that on a
# boolean-mask selection can trigger pandas' SettingWithCopyWarning.
df = df[df['text'].notna()].copy()

In [10]:
df.isnull().sum()

id           0
title      558
author    1918
text         0
label        0
dtype: int64

In [11]:
# replacing missing values with empty string
# (after the previous step only title and author still contain nulls)
# Reassign rather than use inplace=True: the result is a fresh frame, so this
# cell never mutates an object that an earlier cell produced by filtering.
df = df.fillna('')

In [12]:
df.groupby('label').count()

Unnamed: 0_level_0,id,title,author,text
label,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,10387,10387,10387,10387
1,10374,10374,10374,10374


In [14]:
# merging the columns author, title and text into one space-separated string per row
# NOTE: this relies on missing values having been replaced with '' beforehand;
# concatenating a string with NaN would make the whole row's content NaN.
df['content'] = df['author'] + ' ' + df['title'] + ' ' + df['text']

In [15]:
# author, title and text now live inside 'content'; drop the separate columns
df = df.drop(columns=['author', 'title', 'text'])

In [16]:
df.head()

Unnamed: 0,id,label,content
0,0,1,Darrell Lucus House Dem Aide: We Didn’t Even S...
1,1,0,"Daniel J. Flynn FLYNN: Hillary Clinton, Big Wo..."
2,2,1,Consortiumnews.com Why the Truth Might Get You...
3,3,1,Jessica Purkiss 15 Civilians Killed In Single ...
4,4,1,Howard Portnoy Iranian woman jailed for fictio...


#### Stemming:

Stemming is the process of reducing a word to its root form.

In [20]:
port_stem = PorterStemmer()

In [21]:
# Built once per cell run instead of once per token: the original evaluated
# stopwords.words('english') inside the comprehension condition, i.e. for every
# word of every article, and then searched the returned list linearly.
_ENGLISH_STOPWORDS = frozenset(stopwords.words('english'))
_NON_LETTERS = re.compile('[^a-zA-Z]')

def stemming(content):
  """Normalise a raw text string into a space-separated string of stems.

  Steps, in order:
    1. replace every character that is not an ASCII letter with a space,
    2. lower-case the text,
    3. split on whitespace,
    4. drop English stop words and stem the remaining words with the
       module-level ``port_stem`` stemmer,
    5. join the stems back with single spaces.

  Parameters
  ----------
  content : str
      The text to clean.

  Returns
  -------
  str
      The cleaned text; an empty string when no word survives.
  """
  words = _NON_LETTERS.sub(' ', content).lower().split()
  # Stop words are filtered on the lower-cased word *before* stemming.
  stems = [port_stem.stem(word) for word in words if word not in _ENGLISH_STOPWORDS]
  return ' '.join(stems)

In [22]:
# Run the stemming pipeline over every row's content.
# NOTE: the column is overwritten with its own stemmed version, so this cell is
# not idempotent: running it a second time stems the already-stemmed text.
df['content'] = df['content'].apply(stemming)

In [23]:
df.head()

Unnamed: 0,id,label,content
0,0,1,darrel lucu hous dem aid even see comey letter...
1,1,0,daniel j flynn flynn hillari clinton big woman...
2,2,1,consortiumnew com truth might get fire truth m...
3,3,1,jessica purkiss civilian kill singl us airstri...
4,4,1,howard portnoy iranian woman jail fiction unpu...


In [24]:
# Split the frame into the model input (stemmed text) and the target labels
df_x, df_y = df['content'].values, df['label'].values

In [28]:
print(f'content {df_x.shape}\n', df_x)
print(f'\n\nlabels {df_y.shape}\n', df_y)

content (20761,)
 ['darrel lucu hous dem aid even see comey letter jason chaffetz tweet hous dem aid even see comey letter jason chaffetz tweet darrel lucu octob subscrib jason chaffetz stump american fork utah imag courtesi michael jolley avail creativ common licens apolog keith olbermann doubt worst person world week fbi director jame comey accord hous democrat aid look like also know second worst person well turn comey sent infam letter announc fbi look email may relat hillari clinton email server rank democrat relev committe hear comey found via tweet one republican committe chairmen know comey notifi republican chairmen democrat rank member hous intellig judiciari oversight committe agenc review email recent discov order see contain classifi inform long letter went oversight committe chairman jason chaffetz set polit world ablaz tweet fbi dir inform fbi learn exist email appear pertin investig case reopen jason chaffetz jasoninthehous octob cours know case comey actual say review 

#### Converting the textual data to numeric data

In [29]:
# Learn the vocabulary and IDF weights and build the TF-IDF matrix in a single
# pass; fit_transform gives the same result as fit followed by transform.
# NOTE(review): the vectorizer is fitted on the whole corpus, before the
# train/test split further down, so the IDF statistics also see the test
# documents. For a strictly held-out evaluation, split first and fit on the
# training part only.
# NOTE: df_x is rebound from raw strings to a sparse matrix, so this cell
# cannot be re-run without re-running the cell that creates df_x.
vectorizer = TfidfVectorizer()
df_x = vectorizer.fit_transform(df_x)

In [31]:
print(df_x)

  (0, 109709)	0.04916651639419508
  (0, 109654)	0.0190500930646299
  (0, 108699)	0.044160677041691444
  (0, 108695)	0.09476793161968479
  (0, 108652)	0.037566527939016235
  (0, 108615)	0.011299552224499044
  (0, 107964)	0.017088213540562627
  (0, 107147)	0.017101608766394646
  (0, 107056)	0.012537122382585187
  (0, 106970)	0.029126788979233546
  (0, 106891)	0.012857332989318316
  (0, 106691)	0.011765302846626823
  (0, 105842)	0.025726240772580792
  (0, 105806)	0.03128690578523631
  (0, 104796)	0.021533899899692427
  (0, 103381)	0.0654491099495065
  (0, 102695)	0.033151133173839296
  (0, 102444)	0.01639152275149079
  (0, 101676)	0.03807579437920063
  (0, 101036)	0.011075720440438485
  (0, 101026)	0.04319193648243406
  (0, 100973)	0.13600836947921974
  (0, 100825)	0.07128203423915312
  (0, 99536)	0.0394542973241227
  (0, 98968)	0.027119946472978686
  :	:
  (20760, 7467)	0.010628533694759731
  (20760, 7140)	0.02815504242188914
  (20760, 6845)	0.03957302315880808
  (20760, 6807)	0.02536251

```
Note that you are printing a sparse matrix so the output looks different compared to printing a standard dense matrix. See below the main components:

1.   The tuple represents: (document_id, token_id)
2.   The value following the tuple represents the tf-idf score of a given token in a given document
3.   The tuples that are not there have a tf-idf score of 0


If you want to find what token the token_id corresponds to, check the get_feature_names method.

From Stack Overflow: <https://stackoverflow.com/questions/50906210/confused-with-the-return-result-of-tfidfvectorizer-fit-transform>
```

In [32]:
# Hold out 20% of the rows for testing; stratify=df_y keeps the class ratio
# the same in both parts. random_state pins the shuffle so the split, and
# therefore the accuracy reported later, is reproducible between runs.
x_train, x_test, y_train, y_test = train_test_split(
    df_x, df_y, test_size=0.2, stratify=df_y, random_state=42
)

### Training the model

#### Logistic Regression

In [33]:
model_logistic_regression = LogisticRegression()

In [34]:
model_logistic_regression.fit(x_train, y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

#### Evaluating the Logistic Regression model

In [35]:
# accuracy score on the held-out test set
pred = model_logistic_regression.predict(x_test)
# accuracy_score expects (y_true, y_pred). Accuracy itself is symmetric, so the
# number is unchanged, but the documented order keeps this call consistent
# with metrics where the order matters (precision, recall, ...).
accuracy = accuracy_score(y_test, pred)
print(accuracy)

0.9568986274981941


#### Building a predictive system

In [49]:
# Unseen article taken from BBC <https://www.bbc.com/news/world-latin-america-57844864>
bbc_article = 'Cuba: Customs on food and medicine lifted after unrest Cuba has temporarily lifted import duties on food, medicine and other essentials following recent unrest. As of next Monday, there will be no limit on such goods brought in by travellers until the end of the year. Thousands took to the streets on Sunday in protests over food and medicine shortages, price increases and the governments handling of Covid-19. One demand they had was for people arriving in Cuba to bring in supplies without paying customs duties. Dozens have been arrested nationwide since the unrest began on Sunday. Authorities confirmed on Tuesday that one man had died. Unauthorised demonstrations are illegal in the country, and anti-government protests are rare.'

# Same preparation as the training data: clean and stem the text, then map it
# onto the vocabulary of the already-fitted TF-IDF vectorizer
news = vectorizer.transform([stemming(bbc_article)])

# Predicted label for the article (1 = fake, 0 = real)
print(model_logistic_regression.predict(news))

[1]


## Acknowledgments

#### Project 4. Fake News Prediction using Machine Learning with Python | Machine Learning Projects: <https://www.youtube.com/watch?v=nacLBdyG6jE>

Dataset: Fake News - Build a system to identify unreliable news articles. Kaggle <https://www.kaggle.com/c/fake-news/data>