## Importing Libraries and Data sets

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings("ignore")
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split

In [2]:
df_real = pd.read_csv(r"True.csv",encoding='latin1',on_bad_lines='skip')
df_fake = pd.read_csv(r"Fake.csv",encoding='latin1',on_bad_lines='skip')

#### Adding labels to our datasetes 1 - for real news and 0 - for fake news,

In [3]:
df_real['Label'] = 1
df_fake['Label'] = 0

#### Cheacking data shapes

In [4]:
("Real data set: {}".format(df_real.shape) ,"Fake data set: {}".format(df_fake.shape))

('Real data set: (21416, 5)', 'Fake data set: (23481, 5)')

## Combining the data

In [5]:
df = pd.concat([df_real,df_fake])
df

Unnamed: 0,title,text,subject,date,Label
0,"As U.S. budget fight looms, Republicans flip t...",WASHINGTON (Reuters) - The head of a conservat...,politicsNews,"December 31, 2017",1
1,U.S. military to accept transgender recruits o...,WASHINGTON (Reuters) - Transgender people will...,politicsNews,"December 29, 2017",1
2,Senior U.S. Republican senator: 'Let Mr. Muell...,WASHINGTON (Reuters) - The special counsel inv...,politicsNews,"December 31, 2017",1
3,FBI Russia probe helped by Australian diplomat...,WASHINGTON (Reuters) - Trump campaign adviser ...,politicsNews,"December 30, 2017",1
4,Trump wants Postal Service to charge 'much mor...,SEATTLE/WASHINGTON (Reuters) - President Donal...,politicsNews,"December 29, 2017",1
...,...,...,...,...,...
23476,McPain: John McCain Furious That Iran Treated ...,21st Century Wire says As 21WIRE reported earl...,Middle-east,"January 16, 2016",0
23477,JUSTICE? Yahoo Settles E-mail Privacy Class-ac...,21st Century Wire says It s a familiar theme. ...,Middle-east,"January 16, 2016",0
23478,Sunnistan: US and Allied Safe Zone Plan to T...,Patrick Henningsen 21st Century WireRemember ...,Middle-east,"January 15, 2016",0
23479,How to Blow $700 Million: Al Jazeera America F...,21st Century Wire says Al Jazeera America will...,Middle-east,"January 14, 2016",0


In [6]:
df = df.sample(frac=1).reset_index(drop=True)
df


Unnamed: 0,title,text,subject,date,Label
0,Kenya not at risk of constitutional crisis ahe...,NAIROBI (Reuters) - Kenya will not face a cons...,worldnews,"September 22, 2017",1
1,Suicide bombers attack Damascus police center:...,BEIRUT (Reuters) - Three men blew themselves u...,worldnews,"October 11, 2017",1
2,Attack on Trump: Mitt Romney Just Awoke a Sle...,21st Century Wire says Did Mitt Romney just ma...,Middle-east,"March 6, 2016",0
3,"Under fire, Trump's attorney general removes h...",WASHINGTON (Reuters) - U.S. Attorney General J...,politicsNews,"March 2, 2017",1
4,GAME ON! UC Berkeley BANS Ann Coulter From Imm...,THE BIRTHPLACE OF FREE SPEECH IS NOW OFFICIALL...,left-news,"Apr 19, 2017",0
...,...,...,...,...,...
44892,Thailand's political activity ban stays for no...,BANGKOK (Reuters) - Thailand s military govern...,worldnews,"October 31, 2017",1
44893,"Trump adviser, on Moscow visit, dodges questio...",MOSCOW (Reuters) - A foreign-policy adviser to...,politicsNews,"July 7, 2016",1
44894,WATCH: Donald Trump Fits In Perfectly As Vill...,Donald Trump s statements are frightening to t...,News,"February 22, 2016",0
44895,"'Brexit not a game,' EU's Barnier says","LONDON (Reuters) - Brexit is not a game, the...",worldnews,"October 10, 2017",1


## Cleaning the data

In [7]:
df.shape

(44897, 5)

In [8]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 44897 entries, 0 to 44896
Data columns (total 5 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   title    44897 non-null  object
 1   text     44897 non-null  object
 2   subject  44897 non-null  object
 3   date     44897 non-null  object
 4   Label    44897 non-null  int64 
dtypes: int64(1), object(4)
memory usage: 1.7+ MB


#### Cheacking for null and duplicated values

In [9]:
df.isnull().sum()

title      0
text       0
subject    0
date       0
Label      0
dtype: int64

In [10]:
df.duplicated().sum()

209

In [11]:
df.drop_duplicates(inplace=True)

In [12]:
df.duplicated().sum()

0

#### For our Data `title` and `subject` and `Label` columns are important 

In [13]:
new_df = df[['text','subject','Label']]
new_df

Unnamed: 0,text,subject,Label
0,NAIROBI (Reuters) - Kenya will not face a cons...,worldnews,1
1,BEIRUT (Reuters) - Three men blew themselves u...,worldnews,1
2,21st Century Wire says Did Mitt Romney just ma...,Middle-east,0
3,WASHINGTON (Reuters) - U.S. Attorney General J...,politicsNews,1
4,THE BIRTHPLACE OF FREE SPEECH IS NOW OFFICIALL...,left-news,0
...,...,...,...
44892,BANGKOK (Reuters) - Thailand s military govern...,worldnews,1
44893,MOSCOW (Reuters) - A foreign-policy adviser to...,politicsNews,1
44894,Donald Trump s statements are frightening to t...,News,0
44895,"LONDON (Reuters) - Brexit is not a game, the...",worldnews,1


In [14]:
# new_df['complete_text'] = new_df['text'] + " " + new_df['subject']
new_df['complete_text'] = new_df['text']
new_df

Unnamed: 0,text,subject,Label,complete_text
0,NAIROBI (Reuters) - Kenya will not face a cons...,worldnews,1,NAIROBI (Reuters) - Kenya will not face a cons...
1,BEIRUT (Reuters) - Three men blew themselves u...,worldnews,1,BEIRUT (Reuters) - Three men blew themselves u...
2,21st Century Wire says Did Mitt Romney just ma...,Middle-east,0,21st Century Wire says Did Mitt Romney just ma...
3,WASHINGTON (Reuters) - U.S. Attorney General J...,politicsNews,1,WASHINGTON (Reuters) - U.S. Attorney General J...
4,THE BIRTHPLACE OF FREE SPEECH IS NOW OFFICIALL...,left-news,0,THE BIRTHPLACE OF FREE SPEECH IS NOW OFFICIALL...
...,...,...,...,...
44892,BANGKOK (Reuters) - Thailand s military govern...,worldnews,1,BANGKOK (Reuters) - Thailand s military govern...
44893,MOSCOW (Reuters) - A foreign-policy adviser to...,politicsNews,1,MOSCOW (Reuters) - A foreign-policy adviser to...
44894,Donald Trump s statements are frightening to t...,News,0,Donald Trump s statements are frightening to t...
44895,"LONDON (Reuters) - Brexit is not a game, the...",worldnews,1,"LONDON (Reuters) - Brexit is not a game, the..."


## Cleaning Text

In [15]:
# importing important libraries

import re
import nltk
from nltk.stem.porter import PorterStemmer
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

In [16]:
def clean_text(text):
    # Changing text to lower case
    text_clean = text.lower()
    # Removing unwanted characters but keeping spaces and words
    text_clean = re.sub(r'[^a-zA-Z\s]', '', text_clean)
    
    # Remove links
    text = re.sub(r'http\S+', '', text)    
    
    # Tokenizing the words
    tokenize = word_tokenize(text_clean)
    # Removing stopwords and stemming the words
    stemmer = PorterStemmer()
    stop_words = set(stopwords.words('english'))
    stemming = [stemmer.stem(word) for word in tokenize if word not in stop_words]
    cleaned_text = " ".join(stemming)
    
    return cleaned_text

In [17]:
new_df['clean_text'] = new_df['text'].apply(clean_text)

In [18]:
new_df

Unnamed: 0,text,subject,Label,complete_text,clean_text
0,NAIROBI (Reuters) - Kenya will not face a cons...,worldnews,1,NAIROBI (Reuters) - Kenya will not face a cons...,nairobi reuter kenya face constitut polit cris...
1,BEIRUT (Reuters) - Three men blew themselves u...,worldnews,1,BEIRUT (Reuters) - Three men blew themselves u...,beirut reuter three men blew near polic headqu...
2,21st Century Wire says Did Mitt Romney just ma...,Middle-east,0,21st Century Wire says Did Mitt Romney just ma...,st centuri wire say mitt romney make incred mi...
3,WASHINGTON (Reuters) - U.S. Attorney General J...,politicsNews,1,WASHINGTON (Reuters) - U.S. Attorney General J...,washington reuter us attorney gener jeff sessi...
4,THE BIRTHPLACE OF FREE SPEECH IS NOW OFFICIALL...,left-news,0,THE BIRTHPLACE OF FREE SPEECH IS NOW OFFICIALL...,birthplac free speech offici place oppress ber...
...,...,...,...,...,...
44892,BANGKOK (Reuters) - Thailand s military govern...,worldnews,1,BANGKOK (Reuters) - Thailand s military govern...,bangkok reuter thailand militari govern tuesda...
44893,MOSCOW (Reuters) - A foreign-policy adviser to...,politicsNews,1,MOSCOW (Reuters) - A foreign-policy adviser to...,moscow reuter foreignpolici advis us president...
44894,Donald Trump s statements are frightening to t...,News,0,Donald Trump s statements are frightening to t...,donald trump statement frighten see danger ris...
44895,"LONDON (Reuters) - Brexit is not a game, the...",worldnews,1,"LONDON (Reuters) - Brexit is not a game, the...",london reuter brexit game european union chief...


## Cheack through chatgpt if it is ok... if it is cleaning the data well

In [19]:
new_df['clean_text'][0]

'nairobi reuter kenya face constitut polit crisi even plan rerun presidenti elect set oct delay beyond end octob attorney gener said friday suprem court month annul presid uhuru kenyatta august elect win cite irregular order elect board organ new poll end octob kenyatta expect face opposit leader raila odinga odinga lead lawyer petit led invalid kenyatta reelect jame orengo said wednesday elect held end octob kenyatta term offic would ceas thrust countri deep constitut crisi govern offic legitim remain offic full forc constitut fresh elect complet new leader sworn githu muigai attorney gener told news confer delay poll delegitim constitut order day absolut chanc crisi around date elect take place insid elect cycl ad odinga nasa coalit said believ kenyatta term would end day suprem court rule sept nullifi august elect odinga said take part elect certain condit met includ remov elect board offici mose wetangula one nasa leader accus muigai misread constitut simpli mischiev wetangula told

In [20]:
new_df['clean_text'][0]

'nairobi reuter kenya face constitut polit crisi even plan rerun presidenti elect set oct delay beyond end octob attorney gener said friday suprem court month annul presid uhuru kenyatta august elect win cite irregular order elect board organ new poll end octob kenyatta expect face opposit leader raila odinga odinga lead lawyer petit led invalid kenyatta reelect jame orengo said wednesday elect held end octob kenyatta term offic would ceas thrust countri deep constitut crisi govern offic legitim remain offic full forc constitut fresh elect complet new leader sworn githu muigai attorney gener told news confer delay poll delegitim constitut order day absolut chanc crisi around date elect take place insid elect cycl ad odinga nasa coalit said believ kenyatta term would end day suprem court rule sept nullifi august elect odinga said take part elect certain condit met includ remov elect board offici mose wetangula one nasa leader accus muigai misread constitut simpli mischiev wetangula told

## Splitting the data

In [21]:
X = new_df['clean_text'].values
y = new_df['Label'].values

In [22]:
X

array(['nairobi reuter kenya face constitut polit crisi even plan rerun presidenti elect set oct delay beyond end octob attorney gener said friday suprem court month annul presid uhuru kenyatta august elect win cite irregular order elect board organ new poll end octob kenyatta expect face opposit leader raila odinga odinga lead lawyer petit led invalid kenyatta reelect jame orengo said wednesday elect held end octob kenyatta term offic would ceas thrust countri deep constitut crisi govern offic legitim remain offic full forc constitut fresh elect complet new leader sworn githu muigai attorney gener told news confer delay poll delegitim constitut order day absolut chanc crisi around date elect take place insid elect cycl ad odinga nasa coalit said believ kenyatta term would end day suprem court rule sept nullifi august elect odinga said take part elect certain condit met includ remov elect board offici mose wetangula one nasa leader accus muigai misread constitut simpli mischiev wetangu

In [23]:
y

array([1, 1, 0, ..., 0, 1, 1], dtype=int64)

In [24]:
# Fit the vectorizer only on the training data
vectorizer = TfidfVectorizer(max_features=5000, ngram_range=(1,2))
X = vectorizer.fit_transform(X)

# Split the data into training and test sets before vectorizing
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)


In [25]:
print(X_test)

  (0, 3172)	0.0870233321366273
  (0, 4375)	0.09346096639203631
  (0, 2799)	0.09045908249744908
  (0, 3166)	0.08209561296946435
  (0, 334)	0.17503811515449266
  (0, 703)	0.09841723900767932
  (0, 3880)	0.28113677121158004
  (0, 4827)	0.07738649065224525
  (0, 2244)	0.0756523049697635
  (0, 1842)	0.14536428853922656
  (0, 1878)	0.08190658188933861
  (0, 1103)	0.08590236682986224
  (0, 1075)	0.09241120237145782
  (0, 2057)	0.36896204324991955
  (0, 1021)	0.28169093245305904
  (0, 4608)	0.07007252706665879
  (0, 4851)	0.06731927206293617
  (0, 1214)	0.13635875693646982
  (0, 3247)	0.059799901631398054
  (0, 3028)	0.07105561013088026
  (0, 1355)	0.05223317869707448
  (0, 2490)	0.09301013465897397
  (0, 1268)	0.08492334952318409
  (0, 1795)	0.07777055523812831
  (0, 3892)	0.052080163230088766
  :	:
  (13406, 1401)	0.09496986213881191
  (13406, 791)	0.104907889536383
  (13406, 4221)	0.27760096752655555
  (13406, 3591)	0.11885028014378274
  (13406, 2574)	0.12738462661095182
  (13406, 2497)	0.0

## Model Selection

* **Here should understand the Various Classification models with default values from these models we can choose top 4 with Highest Accuracy score and proceed with HyperParameter Tuning**

In [26]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier,GradientBoostingClassifier, AdaBoostClassifier
from xgboost import XGBClassifier
from catboost import CatBoostClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, ConfusionMatrixDisplay, precision_score, f1_score,recall_score,roc_auc_score,roc_curve

In [27]:
def evaluate_clf(true, predicted):
    acc = accuracy_score(true, predicted)
    f1 = f1_score(true,predicted)
    precision = precision_score(true, predicted)
    recall = recall_score(true, predicted)
    roc_auc = roc_auc_score(true,predicted)


    return acc,f1,precision,recall,roc_auc

In [28]:
# Initializing models which are neccessery for our model selection

models = {
    "Random Forest": RandomForestClassifier(),
    "Decision Tree": DecisionTreeClassifier(),
    "Gradient Boosting": GradientBoostingClassifier(),
    "Logistic Regression": LogisticRegression(),
    #"K-Neighbors Classifier": KNeighborsClassifier(),
    "XGBClassifier": XGBClassifier(),
    "CatBoosting Classifier": CatBoostClassifier(verbose=False),
    "AdaBoost Classifier": AdaBoostClassifier()
}

In [42]:
models = {
    "K-Neighbors Classifier": KNeighborsClassifier(),
}

In [43]:
def evaluate_model(X_train, X_test, y_train, y_test, models):
    '''
    This function takes in X and y and models dictionary as input
    taking X and switching it into TFIDF vectors
    It splits the data into Train Test split
    Iterates through the given model dictionary and evaluates the metrics
    Returns: Dataframe which contains report of all models metrics with cost
    '''
    models_list = []
    accuracy_list = []
    auc = []

    for i in range(len(list(models))):
        model = list(models.values())[i]
        model.fit(X_train,y_train)

        # makeing prediction
        y_train_pred = model.predict(X_train)
        y_test_pred = model.predict(X_test)

        # training set performances
        model_train_accuracy,model_train_f1,model_train_precision,\
        model_train_recall, model_train_rocauc_score = evaluate_clf(y_train, y_train_pred)

        # test set performances
        model_test_accuracy,model_test_f1,model_test_precision,\
        model_test_recall, model_test_rocauc_score = evaluate_clf(y_test, y_test_pred)

        print(list(models.keys())[i])
        models_list.append(list(models.keys())[i])

        

        print('Model performance for Training set')
        print("- Accuracy: {:.4f}".format(model_train_accuracy))
        print('- F1 score: {:.4f}'.format(model_train_f1)) 
        print('- Precision: {:.4f}'.format(model_train_precision))
        print('- Recall: {:.4f}'.format(model_train_recall))
        print('- Roc Auc Score: {:.4f}'.format(model_train_rocauc_score))

        print('----------------------------------')

        print('Model performance for Test set')
        print('- Accuracy: {:.4f}'.format(model_test_accuracy))
        accuracy_list.append(model_test_accuracy)
        print('- F1 score: {:.4f}'.format(model_test_f1))
        print('- Precision: {:.4f}'.format(model_test_precision))
        print('- Recall: {:.4f}'.format(model_test_recall))
        print('- Roc Auc Score: {:.4f}'.format(model_test_rocauc_score))
        auc.append(model_test_rocauc_score)
        print('='*35)
        print('\n')

    report = pd.DataFrame(list(zip(models_list, accuracy_list)), columns= ['Model Name', 'Accuracy']).sort_values(by = ['Accuracy'], ascending=False)

    return report

In [44]:
base_report = evaluate_model(X_train, X_test, y_train, y_test, models)

K-Neighbors Classifier
Model performance for Training set
- Accuracy: 0.7578
- F1 score: 0.6670
- Precision: 0.9554
- Recall: 0.5124
- Roc Auc Score: 0.7454
----------------------------------
Model performance for Test set
- Accuracy: 0.7160
- F1 score: 0.5960
- Precision: 0.9274
- Recall: 0.4391
- Roc Auc Score: 0.7039




In [45]:
base_report

Unnamed: 0,Model Name,Accuracy
0,K-Neighbors Classifier,0.715969


## XGBClassifier accuracy is highest among other classifiers

In [46]:
from sklearn.model_selection import cross_val_score, KFold

# Initialize the model
model = AdaBoostClassifier()

# Set up k-fold cross-validation
k = 5  # Number of folds
kf = KFold(n_splits=k, shuffle=True, random_state=42)

# Perform cross-validation
scores = cross_val_score(model, X, y, cv=kf)

# Print cross-validation scores
print(f"Cross-validation scores: {scores}")
print(f"Mean cross-validation score: {np.mean(scores)}")

Cross-validation scores: [0.99474155 0.99574849 0.99653166 0.99653127 0.99451718]
Mean cross-validation score: 0.9956140310684776


In [47]:
model = XGBClassifier()
model.fit(X_train,y_train)

In [48]:
train_predict = model.predict(X_train)
print(accuracy_score(y_train,train_predict))

1.0


In [49]:
test_predict = model.predict(X_test)
print(accuracy_score(y_test,test_predict))

0.9972402476318342


In [50]:
def predict_fake_or_real(text):
    cleaned_text = clean_text(text)
    
    text_tfidf = vectorizer.transform([cleaned_text])
    
    prediction = model.predict(text_tfidf)

    # Map prediction to label
    label = "fake" if prediction[0] == 0 else "real"
    
    return label
# Example usage:
input_text = 'iran say bias french stanc threaten middl east stabil worldnew'


prediction = predict_fake_or_real(input_text)
print("Prediction:", prediction)

Prediction: fake


In [51]:
new_df[new_df['Label'] == 1]['clean_text']

0        nairobi reuter kenya face constitut polit cris...
1        beirut reuter three men blew near polic headqu...
3        washington reuter us attorney gener jeff sessi...
6        milan reuter pope franci sunday call world lea...
7        london reuter britain princ georg greatgrandso...
                               ...                        
44891    moscow reuter russian presid vladimir putin su...
44892    bangkok reuter thailand militari govern tuesda...
44893    moscow reuter foreignpolici advis us president...
44895    london reuter brexit game european union chief...
44896    beirut reuter usback syrian democrat forc sdf ...
Name: clean_text, Length: 21210, dtype: object

In [52]:
new_df['clean_text'][1]

'beirut reuter three men blew near polic headquart central damascu wednesday kill two peopl injur six other state media said cite interior ministri islam state claim respons attack second month two suicid bomber tri storm polic center clash guard deton explos devic outsid khalid bin alwalid street damascu polic chief said polic forc chase third attack blew nearbi entranc cloth market investig ongo find came polic chief mohammad kheir ismail told state tv outsid headquart issu control islam state statement said three suicid bomber attack polic center machin gun explos belt islam state also claim respons earlier month similar suicid bomb attack polic station anoth part damascu peopl report dead damascu enjoy rel secur syria sixyear civil war rage nearbi across countri sever attack hit capit recent year includ car bomb kill peopl juli islam state tahrir alsham led milit formerli link al qaeda claim separ suicid blast kill score peopl damascu previous desper suicid attempt come respons vic

In [53]:
new_df[new_df['Label'] == 0]['clean_text']

2        st centuri wire say mitt romney make incred mi...
4        birthplac free speech offici place oppress ber...
5        annual white hous easter egg hunt presid trump...
8        hill releas controversi comment sore loser rin...
9        amateur presid donald trump sent threaten twee...
                               ...                        
44882    donald trump piss new york time mistak bit ass...
44883                                                     
44884    disrespect teacher sherman jr senior high scho...
44889    donald trump disastr decis take unit state par...
44894    donald trump statement frighten see danger ris...
Name: clean_text, Length: 23478, dtype: object

In [54]:
new_df['clean_text'][0]

'nairobi reuter kenya face constitut polit crisi even plan rerun presidenti elect set oct delay beyond end octob attorney gener said friday suprem court month annul presid uhuru kenyatta august elect win cite irregular order elect board organ new poll end octob kenyatta expect face opposit leader raila odinga odinga lead lawyer petit led invalid kenyatta reelect jame orengo said wednesday elect held end octob kenyatta term offic would ceas thrust countri deep constitut crisi govern offic legitim remain offic full forc constitut fresh elect complet new leader sworn githu muigai attorney gener told news confer delay poll delegitim constitut order day absolut chanc crisi around date elect take place insid elect cycl ad odinga nasa coalit said believ kenyatta term would end day suprem court rule sept nullifi august elect odinga said take part elect certain condit met includ remov elect board offici mose wetangula one nasa leader accus muigai misread constitut simpli mischiev wetangula told

In [55]:
new_df.columns

Index(['text', 'subject', 'Label', 'complete_text', 'clean_text'], dtype='object')