### Import Libraries

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import re
import string
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score,classification_report

import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize

### Load Data

In [2]:
true_df = pd.read_csv("/kaggle/input/fake-and-real-news-dataset/Fake.csv")
fake_df = pd.read_csv("/kaggle/input/fake-and-real-news-dataset/True.csv")

#### Split Both dataframes into features and target 

In [3]:
true_df = true_df[["text"]]
true_df["target"] = 1


fake_df = fake_df[["text"]]
fake_df["target"] = 0


#### Concatenate true_df and fake_df

In [4]:
df = pd.concat([fake_df,true_df]).sample(frac=1).reset_index(drop=True)

In [5]:
df.head()

Unnamed: 0,text,target
0,Speaking to Bill Maher about her participation...,1
1,How much more criminal activity are American v...,1
2,Conservatives talk the talk but can they walk ...,1
3,"The Democratic candidates for the presidency, ...",1
4,Just imagine how many brilliant young American...,1


### Clean Data

#### Drop duplicates

In [6]:
print(df.duplicated().sum())
df = df.drop_duplicates()
print(df.duplicated().sum())

6251
0


In [7]:
df.isna().sum()

text      0
target    0
dtype: int64

In [8]:
X = df["text"]
y = df["target"]

#### Clean data with regular expressions

In [9]:
def standard(text):
    text = text.lower()
    text = re.sub('\[.*?\]', '', text)
    text = re.sub("\\W"," ",text) 
    text = re.sub('https?://\S+|www\.\S+', '', text)
    text = re.sub('<.*?>+', '', text)
    text = re.sub('[%s]' % re.escape(string.punctuation), '', text)
    text = re.sub('\n', '', text)
    text = re.sub('\w*\d\w*', '', text)    
    #text = re.sub(" reuters ","",text)
    text = re.sub("  "," ",text)

    return text

In [10]:
X = X.apply(standard)

In [11]:
X

0        speaking to bill maher about her participation...
1        how much more criminal activity are american v...
2        conservatives talk the talk but can they walk ...
3        the democratic candidates for the presidency b...
4        just imagine how many brilliant young american...
                               ...                        
44891    washington reuters  mexico s economy minister ...
44892    at the democratic presidential debate on thurs...
44893    havana reuters  u s secretary of state john ke...
44894    donald trump broke his twitter silence over ja...
44895    a terror attack on the eu capital of brussels ...
Name: text, Length: 38647, dtype: object

In [12]:
nltk.download("punkt_tab")
nltk.download("punkt")
nltk.download("stopwords")
nltk.download("wordnet")
nltk.download('omw-1.4')

[nltk_data] Downloading package punkt_tab to /usr/share/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package punkt to /usr/share/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /usr/share/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /usr/share/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /usr/share/nltk_data...


True

#### Tokenize and Lemmatize text

In [13]:
stop_words = set(stopwords.words("english"))
lemmatizer = WordNetLemmatizer()

def nltk_preprocess(text):
    tokens = word_tokenize(text)
    tokens = [t for t in tokens if t not in stop_words]
    tokens = [t for t in tokens if t.isalpha()]
    tokens = [lemmatizer.lemmatize(t) for t in tokens]
    return ' '.join(tokens)

In [14]:
X = X.apply(nltk_preprocess)

### Split Data

In [15]:
X_train, X_val, y_train, y_val = train_test_split(X,y,random_state=42,train_size=0.2)
X_train.shape,X_val.shape,y_train.shape,y_val.shape

((7729,), (30918,), (7729,), (30918,))

#### TF-IDF Vectorization

In [16]:
vectorized = TfidfVectorizer()
X_train_vectorized = vectorized.fit_transform(X_train)
X_val_vectorized = vectorized.transform(X_val)

### Fit Model

In [17]:
model = LogisticRegression(n_jobs=-1,C=1)
model.fit(X_train_vectorized, y_train)

In [18]:
prediction = model.predict(X_val_vectorized)

In [19]:
accuracy_score(y_val,prediction)

0.9758716605213791

In [20]:
from sklearn.metrics import confusion_matrix
confusion_matrix(y_val, prediction)

array([[16766,   238],
       [  508, 13406]])

#### Confusion Matrix & Classification Report

In [21]:
print(confusion_matrix(y_val, prediction))
print(classification_report(y_val, prediction))

[[16766   238]
 [  508 13406]]
              precision    recall  f1-score   support

           0       0.97      0.99      0.98     17004
           1       0.98      0.96      0.97     13914

    accuracy                           0.98     30918
   macro avg       0.98      0.97      0.98     30918
weighted avg       0.98      0.98      0.98     30918

