# Importing Dependencies

In [1]:
import pandas as pd
import re
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer

### Data Loading 

In [2]:
df = pd.read_csv("news.csv")

### read first 5 rows

In [3]:
df.head()

Unnamed: 0.1,Unnamed: 0,title,text,label
0,8476,You Can Smell Hillary’s Fear,"Daniel Greenfield, a Shillman Journalism Fello...",FAKE
1,10294,Watch The Exact Moment Paul Ryan Committed Pol...,Google Pinterest Digg Linkedin Reddit Stumbleu...,FAKE
2,3608,Kerry to go to Paris in gesture of sympathy,U.S. Secretary of State John F. Kerry said Mon...,REAL
3,10142,Bernie supporters on Twitter erupt in anger ag...,"— Kaydee King (@KaydeeKing) November 9, 2016 T...",FAKE
4,875,The Battle of New York: Why This Primary Matters,It's primary day in New York and front-runners...,REAL


In [4]:
df.columns

Index(['Unnamed: 0', 'title', 'text', 'label'], dtype='object')

### Rename the "Unnamed: 0" to "number"

In [5]:
df.rename(columns={'Unnamed: 0':'number'},inplace=True)

In [6]:
df.head()

Unnamed: 0,number,title,text,label
0,8476,You Can Smell Hillary’s Fear,"Daniel Greenfield, a Shillman Journalism Fello...",FAKE
1,10294,Watch The Exact Moment Paul Ryan Committed Pol...,Google Pinterest Digg Linkedin Reddit Stumbleu...,FAKE
2,3608,Kerry to go to Paris in gesture of sympathy,U.S. Secretary of State John F. Kerry said Mon...,REAL
3,10142,Bernie supporters on Twitter erupt in anger ag...,"— Kaydee King (@KaydeeKing) November 9, 2016 T...",FAKE
4,875,The Battle of New York: Why This Primary Matters,It's primary day in New York and front-runners...,REAL


In [7]:
df.tail()

Unnamed: 0,number,title,text,label
6330,4490,State Department says it can't find emails fro...,The State Department told the Republican Natio...,REAL
6331,8062,The ‘P’ in PBS Should Stand for ‘Plutocratic’ ...,The ‘P’ in PBS Should Stand for ‘Plutocratic’ ...,FAKE
6332,8622,Anti-Trump Protesters Are Tools of the Oligarc...,Anti-Trump Protesters Are Tools of the Oligar...,FAKE
6333,4021,"In Ethiopia, Obama seeks progress on peace, se...","ADDIS ABABA, Ethiopia —President Obama convene...",REAL
6334,4330,Jeb Bush Is Suddenly Attacking Trump. Here's W...,Jeb Bush Is Suddenly Attacking Trump. Here's W...,REAL


### Checking features name

In [8]:
df.columns

Index(['number', 'title', 'text', 'label'], dtype='object')

### Checking shape of dataset i.e (rows,columns)

In [9]:
df.shape

(6335, 4)

so, we have 6335 rows and 4 columns

### Checking Null values in dataset

In [10]:
df.isnull().sum()

number    0
title     0
text      0
label     0
dtype: int64

There is no any null value present in dataset

### Information of dataset

In [11]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6335 entries, 0 to 6334
Data columns (total 4 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   number  6335 non-null   int64 
 1   title   6335 non-null   object
 2   text    6335 non-null   object
 3   label   6335 non-null   object
dtypes: int64(1), object(3)
memory usage: 198.1+ KB


## value counts of "label"

In [12]:
df['label'].value_counts()

label
REAL    3171
FAKE    3164
Name: count, dtype: int64

# Feature Engineering

In [13]:
# Dropping irrelevant columns to retain only necessary data 
df = df.drop(columns=['number', 'title'])

In [14]:
df.head()

Unnamed: 0,text,label
0,"Daniel Greenfield, a Shillman Journalism Fello...",FAKE
1,Google Pinterest Digg Linkedin Reddit Stumbleu...,FAKE
2,U.S. Secretary of State John F. Kerry said Mon...,REAL
3,"— Kaydee King (@KaydeeKing) November 9, 2016 T...",FAKE
4,It's primary day in New York and front-runners...,REAL


### Stemming -> converting the words to its root form

In [15]:
port_steam = PorterStemmer() 

In [16]:
def stemming(content):
    text = content.lower() 
    text = re.sub('[^a-zA-Z]',' ',content) # Replace all non-alphabetic characters in 'content' with space
    text = text.split()  # Converting to list
    text = [port_steam.stem(word) for word in text if not word in stopwords.words("english")] # Stem each word in 'text' also removing stopwords from the English language
    text = " ".join(text)
    return text

### Applying stemming to 'text' column while adding stemmed content to the 'stemmed_text'

In [17]:
df['stemmed_text'] = df['text'].apply(stemming)

In [18]:
df.head()

Unnamed: 0,text,label,stemmed_text
0,"Daniel Greenfield, a Shillman Journalism Fello...",FAKE,daniel greenfield shillman journal fellow free...
1,Google Pinterest Digg Linkedin Reddit Stumbleu...,FAKE,googl pinterest digg linkedin reddit stumbleup...
2,U.S. Secretary of State John F. Kerry said Mon...,REAL,u s secretari state john f kerri said monday s...
3,"— Kaydee King (@KaydeeKing) November 9, 2016 T...",FAKE,kayde king kaydeek novemb the lesson tonight d...
4,It's primary day in New York and front-runners...,REAL,it primari day new york front runner hillari c...


In [19]:
# Encoding
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()

In [20]:
df['encoded_label'] = le.fit_transform(df['label'])

In [21]:
df.head()

Unnamed: 0,text,label,stemmed_text,encoded_label
0,"Daniel Greenfield, a Shillman Journalism Fello...",FAKE,daniel greenfield shillman journal fellow free...,0
1,Google Pinterest Digg Linkedin Reddit Stumbleu...,FAKE,googl pinterest digg linkedin reddit stumbleup...,0
2,U.S. Secretary of State John F. Kerry said Mon...,REAL,u s secretari state john f kerri said monday s...,1
3,"— Kaydee King (@KaydeeKing) November 9, 2016 T...",FAKE,kayde king kaydeek novemb the lesson tonight d...,0
4,It's primary day in New York and front-runners...,REAL,it primari day new york front runner hillari c...,1


#### FAKE -> 0
#### REAL -> 1

## Splitting the data into training data & test data

In [22]:
X = df['stemmed_text']
Y = df['encoded_label']

In [23]:
from sklearn.model_selection import train_test_split
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size= 0.2, stratify=Y, random_state=42)

In [24]:
print("Shapes:- ", X.shape, X_train.shape, X_test.shape)

Shapes:-  (6335,) (5068,) (1267,)


In [25]:
print("Shapes:- ", Y.shape, Y_train.shape, Y_test.shape)

Shapes:-  (6335,) (5068,) (1267,)


## vectorizing the text

In [26]:
from sklearn.feature_extraction.text import TfidfVectorizer
vector = TfidfVectorizer()

In [27]:
X_train = vector.fit_transform(X_train)
X_test = vector.transform(X_test)

In [28]:
print("Shapes:- ", X.shape, X_train.shape, X_test.shape)

Shapes:-  (6335,) (5068, 39914) (1267, 39914)


In [29]:
print("Shapes:- ", Y.shape, Y_train.shape, Y_test.shape)

Shapes:-  (6335,) (5068,) (1267,)


# Model Training

In [30]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC

### Initilizing the models

In [31]:
dt_model = DecisionTreeClassifier()
log_model = LogisticRegression()
svm_model = SVC(kernel='linear',random_state= 42, probability=True)

In [32]:
# Fit the DecisionTreeClassifier model on data
dt_model.fit(X_train,Y_train)

In [33]:
# Fit the LogisticRegression model on data
log_model.fit(X_train,Y_train)

In [34]:
# Fit the SVM model on data
svm_model.fit(X_train,Y_train)

#### importing metrics 

In [35]:
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report

## DecisionTreeClassifier

In [36]:
Y_pred_dt = dt_model.predict(X_test)
print(f"Accuracy Score: {accuracy_score(Y_test,Y_pred_dt):.2f}")
print(f"Confusion Matrix:- ")
confusion_matrix(Y_test,Y_pred_dt)

Accuracy Score: 0.82
Confusion Matrix:- 


array([[514, 119],
       [114, 520]])

Summary of Confusion Matrix:
- **51 is the True Negative (TN)**: The model **correctly predicted** the **negative class** (i.e., label 0) 51 times.
- **122 is the False Positive (FP)**: The model **incorrectly predicted** the **positive class** (i.e., label 1) when it should have been negative 122 times.
- **117 is the False Negative (FN)**: The model **incorrectly predicted** the **negative class** (i.e., label 0) when it should have been positive 117 times.
- **517 is the True Positive (TP)**: The model **correctly predicted** the **positive class** (i.e., label 1) 517 times.


In [37]:
print("Classification Report:- ")
print(classification_report(Y_test,Y_pred_dt))

Classification Report:- 
              precision    recall  f1-score   support

           0       0.82      0.81      0.82       633
           1       0.81      0.82      0.82       634

    accuracy                           0.82      1267
   macro avg       0.82      0.82      0.82      1267
weighted avg       0.82      0.82      0.82      1267



## LogisticRegression

In [38]:
Y_pred_log  = log_model.predict(X_test) 
print(f"Accuracy Score: {accuracy_score(Y_test, Y_pred_log):.2f}")
print(f"Confusion Matrix:- ")
confusion_matrix(Y_test,Y_pred_log)

Accuracy Score: 0.93
Confusion Matrix:- 


array([[602,  31],
       [ 55, 579]])

Summary of confusion Matrix:
- **602 is the True Negative (TN)**: The model **correctly predicted** the **negative class** (i.e., class 0) 602 times.
- **31 is the False Positive (FP)**: The model **incorrectly predicted** the **positive class** (i.e., class 1) when it should have been negative 31 times.
- **55 is the False Negative (FN)**: The model **incorrectly predicted** the **negative class** (i.e., class 0) when it should have been positive 55 times.
- **579 is the True Positive (TP)**: The model **correctly predicted** the **positive class** (i.e., class 1) 579 times.


In [39]:
print("Classification Report:- ")
print(classification_report(Y_test,Y_pred_log))

Classification Report:- 
              precision    recall  f1-score   support

           0       0.92      0.95      0.93       633
           1       0.95      0.91      0.93       634

    accuracy                           0.93      1267
   macro avg       0.93      0.93      0.93      1267
weighted avg       0.93      0.93      0.93      1267



## SVM

In [40]:
Y_pred_svm = svm_model.predict(X_test)
print(f"Accuracy Score: {accuracy_score(Y_test, Y_pred_svm):.2f}")
print(f"Confusion Matrix:- ")
confusion_matrix(Y_test,Y_pred_svm)

Accuracy Score: 0.94
Confusion Matrix:- 


array([[600,  33],
       [ 41, 593]])

Summary of Confusion Matrix:
- **600 is the True Negative (TN)**: The model **correctly predicted** the **negative class** (i.e., class 0) 600 times.
- **33 is the False Positive (FP)**: The model **incorrectly predicted** the **positive class** (i.e., class 1) when it should have been negative 33 times.
- **41 is the False Negative (FN)**: The model **incorrectly predicted** the **negative class** (i.e., class 0) when it should have been positive 41 times.
- **593 is the True Positive (TP)**: The model **correctly predicted** the **positive class** (i.e., class 1) 593 times.


In [41]:
print("Classification Report:- ")
print(classification_report(Y_test,Y_pred_svm))

Classification Report:- 
              precision    recall  f1-score   support

           0       0.94      0.95      0.94       633
           1       0.95      0.94      0.94       634

    accuracy                           0.94      1267
   macro avg       0.94      0.94      0.94      1267
weighted avg       0.94      0.94      0.94      1267



In [44]:
import joblib

joblib.dump(dt_model,"fake_news_detection_system_dt.pkl")
print(" 'fake_news_detection_system_df.pkl' exported successfully")

joblib.dump(log_model,"fake_news_detection_system_logistic.pkl")
print(" 'fake_news_detection_system_logistic.pkl' exported successfully")

joblib.dump(svm_model,"fake_news_detection_system_svm.pkl")
print(" 'fake_news_detection_system_svm.pkl' exported successfully")

 'fake_news_detection_system_df.pkl' exported successfully
 'fake_news_detection_system_logistic.pkl' exported successfully
 'fake_news_detection_system_svm.pkl' exported successfully


In [45]:
joblib.dump(vector,"vectorizer.pkl")
print(" 'vectorizer.pkl' exported successfully ")

 'vectorizer.pkl' exported successfully 


In [48]:
df.to_csv("news_training_dataset.csv",index=False)
print(" 'news_training_dataset.csv' exported successfully")

 'news_training_dataset.csv' exported successfully


In [49]:
# df1 = df.drop(columns=['label',	'stemmed_text',	'encoded_label'])

In [51]:
# df1.to_csv("news_content.csv",index=False)

## Accuracy Score of each models:-

- **DecisionTreeClassifier** :-  **81% accuracy**
- **LogisticRegression** :- **93% accuracy**
- **SVM** :- **94% accuracy**