In [1]:
import pandas as pd

In [2]:
# Load the labelled news dataset from the CSV next to the notebook and
# preview its first rows. Each row holds a text and its 'Fake'/'Real' label.
df = pd.read_csv("Fake_Real_Data.csv")
df.head()

Unnamed: 0,Text,label
0,Top Trump Surrogate BRUTALLY Stabs Him In The...,Fake
1,U.S. conservative leader optimistic of common ...,Real
2,"Trump proposes U.S. tax overhaul, stirs concer...",Real
3,Court Forces Ohio To Allow Millions Of Illega...,Fake
4,Democrats say Trump agrees to work on immigrat...,Real


In [3]:
# Dataset size as (rows, columns).
df.shape

(9900, 2)

In [4]:
# Class balance: how many rows carry each label.
df['label'].value_counts()

label
Fake    5000
Real    4900
Name: count, dtype: int64

In [5]:
# Encode the string labels as integers for scikit-learn: Fake -> 0, Real -> 1.
label_to_num = {'Fake': 0, 'Real': 1}
df['label_num'] = df['label'].map(label_to_num)

# Series.map turns any label missing from the mapping into NaN (and silently
# makes the column float), so fail loudly here instead of during model fitting.
assert df['label_num'].notna().all(), "unexpected value in 'label' column"
df.head()

Unnamed: 0,Text,label,label_num
0,Top Trump Surrogate BRUTALLY Stabs Him In The...,Fake,0
1,U.S. conservative leader optimistic of common ...,Real,1
2,"Trump proposes U.S. tax overhaul, stirs concer...",Real,1
3,Court Forces Ohio To Allow Millions Of Illega...,Fake,0
4,Democrats say Trump agrees to work on immigrat...,Real,1


In [6]:
import spacy

In [7]:
# Load spaCy's large English pipeline. It ships with pretrained word vectors,
# which the following cells read through doc.vector to embed each text.
nlp = spacy.load("en_core_web_lg")

In [8]:
# Embed every text as a single dense vector (doc.vector).
# nlp.pipe streams the texts through the pipeline in batches, which is
# considerably faster on thousands of documents than calling nlp() once per
# row via Series.apply; each text still yields the same doc.vector.
df['vector'] = [doc.vector for doc in nlp.pipe(df['Text'])]
df.head()

Unnamed: 0,Text,label,label_num,vector
0,Top Trump Surrogate BRUTALLY Stabs Him In The...,Fake,0,"[-0.6759837, 1.4263071, -2.318466, -0.451093, ..."
1,U.S. conservative leader optimistic of common ...,Real,1,"[-1.8355803, 1.3101058, -2.4919677, 1.0268308,..."
2,"Trump proposes U.S. tax overhaul, stirs concer...",Real,1,"[-1.9851209, 0.14389805, -2.4221718, 0.9133005..."
3,Court Forces Ohio To Allow Millions Of Illega...,Fake,0,"[-2.7812982, -0.16120885, -1.609772, 1.3624227..."
4,Democrats say Trump agrees to work on immigrat...,Real,1,"[-2.2010763, 0.9961637, -2.4088492, 1.128273, ..."


In [9]:
 from sklearn.model_selection import train_test_split

In [10]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# reproducible. Features are the per-text vectors, targets the 0/1 labels.
X_train, X_test, y_train, y_test = train_test_split(
    df['vector'].values,
    df['label_num'],
    test_size=0.2,
    random_state=78,
)

In [11]:
# Number of training samples. The array is still 1-D here because every
# element is itself a whole vector.
X_train.shape

(7920,)

In [12]:
# Number of held-out test samples (also a 1-D array of vectors).
X_test.shape

(1980,)

In [13]:
import numpy as np

In [14]:
# Each split is a 1-D array of equal-length vectors; stack them row-wise into
# 2-D (n_samples, n_features) arrays, the layout scikit-learn estimators expect.
X_train_2d, X_test_2d = (np.stack(split) for split in (X_train, X_test))

In [16]:
# Inspect the stacked training matrix (one row per training sample).
X_train_2d

array([[-2.2497077 ,  0.52243954, -1.6384922 , ..., -1.3262682 ,
        -1.6803137 ,  0.8599544 ],
       [-2.0666502 ,  0.6998031 , -1.9547237 , ..., -0.32801354,
        -2.9841065 ,  0.9544358 ],
       [-2.0507264 ,  0.5579803 , -2.1618252 , ..., -0.73270726,
        -2.2503192 ,  0.7855397 ],
       ...,
       [-1.0758398 ,  1.2097131 , -2.631059  , ...,  0.00690351,
        -3.8614478 ,  0.8534115 ],
       [-1.4785788 ,  1.0047289 , -2.6164465 , ..., -0.45229265,
        -2.826094  ,  1.2613564 ],
       [-1.7543446 ,  0.6456195 , -1.690278  , ..., -0.10018943,
        -2.5131733 ,  0.90269184]], dtype=float32)

## Classifying the document vectors with ML models (Multinomial Naive Bayes, then KNN)

In [17]:
from sklearn.preprocessing import MinMaxScaler
from sklearn.naive_bayes import MultinomialNB

In [18]:
# MultinomialNB does not accept negative feature values, so rescale every
# vector dimension into the 0-to-1 range with MinMaxScaler first.
scaler = MinMaxScaler()

In [20]:
# Learn the per-feature min/max from the training split only, then apply that
# same scaling to both splits so no test-set information leaks into the fit.
scaler.fit(X_train_2d)
X_train_scaled = scaler.transform(X_train_2d)
X_test_scaled = scaler.transform(X_test_2d)

In [21]:
# Train a Multinomial Naive Bayes classifier on the scaled (non-negative)
# training vectors; fit() returns the estimator itself, so chain it directly.
clf = MultinomialNB().fit(X_train_scaled, y_train)
clf

In [22]:
# Predict labels for the held-out test set, using the scaled test features
# to match what the Naive Bayes model was trained on.
y_pred = clf.predict(X_test_scaled)

In [23]:
from sklearn.metrics import classification_report

In [24]:
# Per-class precision / recall / F1 for the Naive Bayes predictions.
# Class 0 is 'Fake' and class 1 is 'Real' (see the label_num mapping).
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.96      0.94      0.95      1028
           1       0.94      0.96      0.95       952

    accuracy                           0.95      1980
   macro avg       0.95      0.95      0.95      1980
weighted avg       0.95      0.95      0.95      1980



In [26]:
from sklearn.neighbors import KNeighborsClassifier

In [27]:
# Second model: k-nearest neighbours (k=5, Euclidean distance).
# NOTE: this rebinds `clf`, so the MultinomialNB model trained earlier is no
# longer reachable under that name from here on.
clf = KNeighborsClassifier(n_neighbors = 5, metric = 'euclidean')

In [28]:
# Fit KNN on the raw stacked vectors (X_train_2d), not the min-max scaled ones.
clf.fit(X_train_2d, y_train)

In [29]:
# Predict on the raw test vectors to match the KNN training input.
# This overwrites the Naive Bayes predictions previously stored in y_pred.
y_pred = clf.predict(X_test_2d)

In [30]:
# Per-class precision / recall / F1 for the KNN predictions
# (class 0 is 'Fake', class 1 is 'Real').
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.99      0.99      0.99      1028
           1       0.99      0.99      0.99       952

    accuracy                           0.99      1980
   macro avg       0.99      0.99      0.99      1980
weighted avg       0.99      0.99      0.99      1980



Testing the KNN classifier on a new, unseen news sentence

In [36]:
# Run a single hand-written news sentence through the same spaCy pipeline
# that produced the training vectors.
doc = nlp("Thousands of people in Lebanon, including Palestinian refugees, continued to flee the widening conflict in the region")

In [39]:
# Document vector of the sample sentence. Despite the name `y`, this is a
# feature vector (model input), not a target label.
y = doc.vector

In [46]:
# predict() expects a 2-D (n_samples, n_features) array, so wrap the single
# vector as one row; -1 lets NumPy infer the feature count.
doc1 = np.reshape(np.array(y), (1, -1))

In [47]:
# Sanity check: one sample (row) by the number of vector dimensions.
doc1.shape

(1, 300)

In [48]:
# As the notebook is written, `clf` is the KNN model at this point (it was
# rebound after the Naive Bayes run), and KNN was trained on unscaled vectors,
# so the raw sentence vector is passed without min-max scaling.
pred = clf.predict(doc1)

In [49]:
# Prints the numeric class: 1 corresponds to 'Real' and 0 to 'Fake'
# under the label_num mapping.
print(pred)

[1]
