In [1]:
import pandas as pd 
# Load the fake/real news dataset from a CSV file (path is relative to the
# notebook's working directory).
df = pd.read_csv("Fake_Real_Data.csv")
# Preview the first rows to confirm which columns were read.
df.head()

Unnamed: 0,Text,label
0,Top Trump Surrogate BRUTALLY Stabs Him In The...,Fake
1,U.S. conservative leader optimistic of common ...,Real
2,"Trump proposes U.S. tax overhaul, stirs concer...",Real
3,Court Forces Ohio To Allow Millions Of Illega...,Fake
4,Democrats say Trump agrees to work on immigrat...,Real


In [2]:
# (rows, columns) of the loaded frame
df.shape

(9900, 2)

In [4]:
# Check for class imbalance: count how many rows carry each label
df['label'].value_counts()

label
Fake    5000
Real    4900
Name: count, dtype: int64

In [8]:
# Encode the string labels as integers for the classifiers: Fake -> 0, Real -> 1.
df['label_num'] = df['label'].map({'Fake': 0, 'Real': 1})
# Series.map yields NaN for any label that is missing from the dict; fail fast
# here instead of letting NaN targets reach the train/test split and model fit.
assert df['label_num'].notna().all(), "unmapped label values found in df['label']"
df.head()

Unnamed: 0,Text,label,label_num
0,Top Trump Surrogate BRUTALLY Stabs Him In The...,Fake,0
1,U.S. conservative leader optimistic of common ...,Real,1
2,"Trump proposes U.S. tax overhaul, stirs concer...",Real,1
3,Court Forces Ohio To Allow Millions Of Illega...,Fake,0
4,Democrats say Trump agrees to work on immigrat...,Real,1


In [9]:
import spacy
# Load the spaCy "en_core_web_lg" pipeline; its document vectors are used as
# the text features in the cells below.
nlp = spacy.load("en_core_web_lg")

In [11]:
# Sanity check: run one sample headline through the pipeline and inspect the
# shape of the resulting document vector.
doc = nlp("Top Trump Surrogate BRUTALLY Stabs Him In")
doc.vector.shape

(300,)

In [13]:
# Create a new column that stores the spaCy document vector for each text.
# nlp.pipe streams the texts through the pipeline in batches, which is the
# recommended way to process many documents and avoids the per-row call
# overhead of Series.apply(lambda x: nlp(x).vector). The vectors are the same.
# The comprehension variable does not leak, so the global `doc` is untouched.
df['vector'] = pd.Series(
    [text_doc.vector for text_doc in nlp.pipe(df['Text'])],
    index=df.index,  # keep row alignment explicit
)

In [14]:
# Confirm the 'vector' column now sits alongside the text and label columns.
df.head()

Unnamed: 0,Text,label,label_num,vector
0,Top Trump Surrogate BRUTALLY Stabs Him In The...,Fake,0,"[-0.6759837, 1.4263071, -2.318466, -0.451093, ..."
1,U.S. conservative leader optimistic of common ...,Real,1,"[-1.8355803, 1.3101058, -2.4919677, 1.0268308,..."
2,"Trump proposes U.S. tax overhaul, stirs concer...",Real,1,"[-1.9851209, 0.14389805, -2.4221718, 0.9133005..."
3,Court Forces Ohio To Allow Millions Of Illega...,Fake,0,"[-2.7812982, -0.16120885, -1.609772, 1.3624227..."
4,Democrats say Trump agrees to work on immigrat...,Real,1,"[-2.2010763, 0.9961637, -2.4088492, 1.128273, ..."


In [21]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    df['vector'].values, df['label_num'], test_size=0.2, random_state=98
)

In [22]:
# X_train is a 1-D NumPy array whose elements are themselves NumPy vectors
# (an array of arrays), not a 2-D matrix. Show only the first two elements:
# that is enough to see the nesting without flooding the cell output.
X_train[:2]

array([array([-1.6052418 ,  0.67032284, -1.9033786 ,  0.38504857,  3.441447  ,
               0.20944585,  0.0059732 ,  3.5965939 , -0.41371474, -1.0985126 ,
               4.7261887 ,  1.0981082 , -3.2211878 ,  0.33545378,  0.649115  ,
               1.1635342 ,  1.1370496 , -0.4613987 , -1.2778668 , -0.9091688 ,
               0.6667164 , -0.7565664 , -0.6853123 , -0.21836425, -0.6291739 ,
              -1.2228858 , -1.8228883 , -0.3579995 , -0.60482174,  0.6597367 ,
               0.9672441 , -0.0536291 , -0.6991517 , -0.5075506 , -2.4084322 ,
              -1.0284741 , -0.70299256,  0.70164376,  1.2168417 ,  0.6972662 ,
               0.30661315,  0.14317988, -0.46401045,  0.3936543 , -1.9006711 ,
               0.8572577 ,  1.0290703 , -2.4141116 , -1.1864084 ,  1.9615806 ,
              -0.9855822 ,  0.704382  ,  0.67901635, -3.788922  , -0.7551517 ,
               0.702619  , -0.26371425,  0.70975596,  0.8767877 , -0.20113534,
               0.69811416, -0.7391904 , -0.1428171 ,

In [23]:
import numpy as np
# Stack the per-document vectors so that each split becomes a single 2-D array
X_train_2D, X_test_2D = (np.stack(vectors) for vectors in (X_train, X_test))

In [25]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.preprocessing import MinMaxScaler

# MultinomialNB does not accept negative feature values, so the embeddings are
# min-max scaled first. The scaler is fitted on the training split only and the
# same fitted scaler is then applied to both splits.
scaler = MinMaxScaler().fit(X_train_2D)
scaled_train_embed = scaler.transform(X_train_2D)
scaled_test_embed = scaler.transform(X_test_2D)

# Train the Naive Bayes classifier on the scaled training embeddings
clf = MultinomialNB()
clf.fit(scaled_train_embed, y_train)

In [27]:
from sklearn.metrics import classification_report
# Evaluate performance on the held-out test split
y_pred = clf.predict(scaled_test_embed)

# classification_report returns a plain string; print it so the table renders
# with real line breaks instead of being displayed as an escaped '\n' repr.
print(classification_report(y_test, y_pred))

'              precision    recall  f1-score   support\n\n           0       0.95      0.94      0.95      1009\n           1       0.94      0.95      0.94       971\n\n    accuracy                           0.95      1980\n   macro avg       0.95      0.95      0.95      1980\nweighted avg       0.95      0.95      0.95      1980\n'

In [32]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report

# Train a Random Forest on the unscaled embeddings (X_train_2D).
# NOTE: this rebinds `clf`, replacing the MultinomialNB model fitted earlier.
# random_state fixes the forest's internal randomness so the scores below are
# reproducible across re-runs (98 matches the seed used for the train/test split).
clf = RandomForestClassifier(n_estimators=100, random_state=98)

# Fit the model with X_train_2D and y_train
clf.fit(X_train_2D, y_train)

# Get the predictions for X_test_2D and store them in y_pred
y_pred = clf.predict(X_test_2D)

# Print the classification report
print(classification_report(y_test, y_pred))


              precision    recall  f1-score   support

           0       1.00      0.99      0.99      1009
           1       0.99      1.00      0.99       971

    accuracy                           0.99      1980
   macro avg       0.99      0.99      0.99      1980
weighted avg       0.99      0.99      0.99      1980

