In [140]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [141]:
import pandas as pd
import numpy as np
import re
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
import nltk
nltk.download('stopwords')
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [142]:
path = "/content/drive/MyDrive/Machine_Learning_Project/"

In [143]:
# printing the stowords in English
print(stopwords.words('english'))

['a', 'about', 'above', 'after', 'again', 'against', 'ain', 'all', 'am', 'an', 'and', 'any', 'are', 'aren', "aren't", 'as', 'at', 'be', 'because', 'been', 'before', 'being', 'below', 'between', 'both', 'but', 'by', 'can', 'couldn', "couldn't", 'd', 'did', 'didn', "didn't", 'do', 'does', 'doesn', "doesn't", 'doing', 'don', "don't", 'down', 'during', 'each', 'few', 'for', 'from', 'further', 'had', 'hadn', "hadn't", 'has', 'hasn', "hasn't", 'have', 'haven', "haven't", 'having', 'he', "he'd", "he'll", 'her', 'here', 'hers', 'herself', "he's", 'him', 'himself', 'his', 'how', 'i', "i'd", 'if', "i'll", "i'm", 'in', 'into', 'is', 'isn', "isn't", 'it', "it'd", "it'll", "it's", 'its', 'itself', "i've", 'just', 'll', 'm', 'ma', 'me', 'mightn', "mightn't", 'more', 'most', 'mustn', "mustn't", 'my', 'myself', 'needn', "needn't", 'no', 'nor', 'not', 'now', 'o', 'of', 'off', 'on', 'once', 'only', 'or', 'other', 'our', 'ours', 'ourselves', 'out', 'over', 'own', 're', 's', 'same', 'shan', "shan't", 'she

# **Data Pre-Processing**


In [144]:
# Load the fake and true news datasets from the specified path
df_fake = pd.read_csv(path + "Fake.csv")
df_true = pd.read_csv(path + "True.csv")

# Assign labels: 1 for fake news, 0 for true news
df_fake['label'] = 1
df_true['label'] = 0

# Combine both datasets into a single DataFrame and reset the index
df = pd.concat([df_fake, df_true], ignore_index=True)

In [145]:
df.shape

(44898, 5)

In [146]:
# print the first five rows of the dataframe
df.head()

Unnamed: 0,title,text,subject,date,label
0,Donald Trump Sends Out Embarrassing New Year’...,Donald Trump just couldn t wish all Americans ...,News,"December 31, 2017",1
1,Drunk Bragging Trump Staffer Started Russian ...,House Intelligence Committee Chairman Devin Nu...,News,"December 31, 2017",1
2,Sheriff David Clarke Becomes An Internet Joke...,"On Friday, it was revealed that former Milwauk...",News,"December 30, 2017",1
3,Trump Is So Obsessed He Even Has Obama’s Name...,"On Christmas day, Donald Trump announced that ...",News,"December 29, 2017",1
4,Pope Francis Just Called Out Donald Trump Dur...,Pope Francis used his annual Christmas Day mes...,News,"December 25, 2017",1


In [147]:
# counting the number of missing values in the dataset
df.isnull().sum()

Unnamed: 0,0
title,0
text,0
subject,0
date,0
label,0


In [148]:
# get stats on article lengths
df['text'].apply(lambda x: len(str(x))).describe()

Unnamed: 0,text
count,44898.0
mean,2469.109693
std,2171.617091
min,1.0
25%,1234.0
50%,2186.0
75%,3105.0
max,51794.0


In [149]:
# Remove articles shorter than 30 characters
df = df[df['text'].apply(lambda x: len(str(x)) > 30 )]

In [150]:
# Count duplicate rows
df.duplicated().sum()

np.int64(209)

In [151]:
# Drop duplicate rows
df = df.drop_duplicates()

In [152]:
# Check shape after cleaning
df.shape

(43987, 5)

In [153]:
# Count real vs fake labels
df['label'].value_counts()

Unnamed: 0_level_0,count
label,Unnamed: 1_level_1
1,22777
0,21210


In [154]:
# Show unique values in the 'subject' column
df['subject'].unique()

array(['News', 'politics', 'Government News', 'left-news', 'US_News',
       'Middle-east', 'politicsNews', 'worldnews'], dtype=object)

In [155]:
# Combine 'subject' and 'title' into a new 'content' column
df['content'] = df['subject'] + ' ' + df['title']

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['content'] = df['subject'] + ' ' + df['title']


In [156]:
df['content']

Unnamed: 0,content
0,News Donald Trump Sends Out Embarrassing New ...
1,News Drunk Bragging Trump Staffer Started Rus...
2,News Sheriff David Clarke Becomes An Internet...
3,News Trump Is So Obsessed He Even Has Obama’s...
4,News Pope Francis Just Called Out Donald Trum...
...,...
44893,worldnews 'Fully committed' NATO backs new U.S...
44894,worldnews LexisNexis withdrew two products fro...
44895,worldnews Minsk cultural hub becomes haven fro...
44896,worldnews Vatican upbeat on possibility of Pop...


# **Stemming:**

Stemming is the process of reducing a word to its ROOT word

In [157]:
port_stem = PorterStemmer()

In [158]:
def stemming(content):
  stemmed_content = re.sub('[^a-zA-Z]', ' ', content)
  stemmed_content = stemmed_content.lower()
  stemmed_content = stemmed_content.split()
  stemmed_content = [port_stem.stem(word) for word in stemmed_content if not word in stopwords.words('english')]
  stemmed_content = ' '.join(stemmed_content)
  return stemmed_content


In [159]:
df['content'] = df['content'].apply(stemming)

In [160]:
df['content']

Unnamed: 0,content
0,news donald trump send embarrass new year eve ...
1,news drunk brag trump staffer start russian co...
2,news sheriff david clark becom internet joke t...
3,news trump obsess even obama name code websit ...
4,news pope franci call donald trump christma sp...
...,...
44893,worldnew fulli commit nato back new u approach...
44894,worldnew lexisnexi withdrew two product chines...
44895,worldnew minsk cultur hub becom author
44896,worldnew vatican upbeat possibl pope franci vi...


In [161]:
X = df['content'].values
Y = df['label'].values

In [162]:
print(X)

['news donald trump send embarrass new year eve messag disturb'
 'news drunk brag trump staffer start russian collus investig'
 'news sheriff david clark becom internet joke threaten poke peopl eye'
 ... 'worldnew minsk cultur hub becom author'
 'worldnew vatican upbeat possibl pope franci visit russia'
 'worldnew indonesia buy billion worth russian jet']


In [163]:
print(Y)

[1 1 1 ... 0 0 0]


In [164]:
Y.shape

(43987,)

In [165]:
# converting the textual data to numerical data
vectorizer = TfidfVectorizer()
vectorizer.fit(X)

X = vectorizer.transform(X)

In [166]:
print(X)

<Compressed Sparse Row sparse matrix of dtype 'float64'
	with 453375 stored elements and shape (43987, 13152)>
  Coords	Values
  (0, 3325)	0.4123077385047449
  (0, 3389)	0.2707759939900968
  (0, 3711)	0.3624006033185456
  (0, 3929)	0.46535759669684695
  (0, 7304)	0.3495647010553634
  (0, 7835)	0.23340958840723372
  (0, 7843)	0.11097649560814782
  (0, 10299)	0.35305651814735967
  (0, 11960)	0.11798229192842588
  (0, 13050)	0.2850101310161106
  (1, 1409)	0.39150903104238577
  (1, 2280)	0.4100683177539828
  (1, 3506)	0.4513150456787095
  (1, 5960)	0.3111294031612042
  (1, 7843)	0.10948543591444287
  (1, 9956)	0.2784602752816421
  (1, 10986)	0.4021649313680382
  (1, 11019)	0.3365498091986899
  (1, 11960)	0.11639710364958018
  (2, 996)	0.2883444361360382
  (2, 2135)	0.369926010932355
  (2, 2870)	0.3354951615066583
  (2, 4047)	0.2886078812777899
  (2, 5931)	0.2930535548131616
  (2, 6165)	0.31644588427279147
  :	:
  (43983, 9032)	0.3838855840970475
  (43983, 12065)	0.27439043360049953
  (4398

## Splitting the dataset to training & test data:

In [168]:
X_train, X_test , Y_train, Y_test = train_test_split(X, Y, test_size = 0.2, stratify = Y, random_state= 2)

## Trainig the Model: Logistic Regression

In [170]:
model = LogisticRegression()

In [171]:
model.fit(X_train,Y_train)

## Evaluation

In [172]:
# accuracy score on the training data
X_train_prediction = model.predict(X_train)
training_data_accuracy = accuracy_score(X_train_prediction ,Y_train)

In [173]:
print('Accuracy score of the trainig data',training_data_accuracy )

Accuracy score of the trainig data 0.9999431640569496


In [175]:
# accuracy score on the testing data
X_test_prediction = model.predict(X_test)
test_data_accuracy = accuracy_score(X_test_prediction ,Y_test)
print('Accuracy score of the test data:', test_data_accuracy)

Accuracy score of the test data: 0.9998863378040463


### Making a predictive System

In [179]:
X_new = X_test[0]
prediction = model.predict(X_new)
print(prediction)

if (prediction==0):
  print('The news is real')
else:
    print('The news is fake')

[0]
The news is real


In [180]:
print(Y_test[0])

0
