In [1]:
#word vectors occupy lot of space. Hence en_core_web_sm model do not have them included
#In order to download
#word vectors you need to install large or medium english model. we will install large model
#the command for large model to download  is python -m spacy download en_core_web_lg

import pandas as pd
#read the dataset 
df=pd.read_csv("Fake_Real_Data.csv")
df.head()

Unnamed: 0,Text,label
0,Top Trump Surrogate BRUTALLY Stabs Him In The...,Fake
1,U.S. conservative leader optimistic of common ...,Real
2,"Trump proposes U.S. tax overhaul, stirs concer...",Real
3,Court Forces Ohio To Allow Millions Of Illega...,Fake
4,Democrats say Trump agrees to work on immigrat...,Real


In [2]:
df.shape

(9900, 2)

In [3]:
df.label.value_counts()

Fake    5000
Real    4900
Name: label, dtype: int64

In [4]:
df['label_num']=df['label'].map({'Fake':0,'Real':1})
df.head()

Unnamed: 0,Text,label,label_num
0,Top Trump Surrogate BRUTALLY Stabs Him In The...,Fake,0
1,U.S. conservative leader optimistic of common ...,Real,1
2,"Trump proposes U.S. tax overhaul, stirs concer...",Real,1
3,Court Forces Ohio To Allow Millions Of Illega...,Fake,0
4,Democrats say Trump agrees to work on immigrat...,Real,1


In [5]:
# from spacy.cli import download
# download("en_core_web_sm")
#downloading the en_core_web_lg model of spacy
import spacy
nlp=spacy.load("en_core_web_lg")


In [6]:
doc=nlp("Top Trump Surrogate BRUTALLY Stabs Him ")
doc.vector.shape

(300,)

In [7]:
doc=nlp("Top Trump Surrogate BRUTALLY Stabs Him ")

for token in doc:
    print(token.text,"vector:",token.has_vector,"out of vocablary:",token.is_oov)

Top vector: True out of vocablary: False
Trump vector: True out of vocablary: False
Surrogate vector: True out of vocablary: False
BRUTALLY vector: False out of vocablary: True
Stabs vector: True out of vocablary: False
Him vector: True out of vocablary: False


In [8]:
base_token=nlp("bread")
base_token.vector.shape
doc=nlp("bread sandwitch burger car tiger human wheat")
for token in doc:
    print(f"{token.text}<-->{base_token.text}",token.similarity(base_token))

bread<-->bread 1.0
sandwitch<-->bread 0.0
burger<-->bread 0.4752069113758708
car<-->bread 0.06451533308853552
tiger<-->bread 0.04764611675903374
human<-->bread 0.2151154210812192
wheat<-->bread 0.6150360888607199


  print(f"{token.text}<-->{base_token.text}",token.similarity(base_token))


In [9]:
def print_similarity(base_word,word_to_compare):
    base_token=nlp(base_word)
    doc=nlp(word_to_compare)
    for token in doc:
        print(f"{token.text}<<--->>{base_token.text}:",token.similarity(base_token))
    

In [10]:
print_similarity("iphone","apple samsung iphone dog kitten")

apple<<--->>iphone: 0.4387907748060368
samsung<<--->>iphone: 0.670859081425417
iphone<<--->>iphone: 1.0
dog<<--->>iphone: 0.08211864228011527
kitten<<--->>iphone: 0.10222318459666081


In [11]:
king=nlp.vocab["king"].vector
queen=nlp.vocab["queen"].vector
woman=nlp.vocab['woman'].vector
man=nlp.vocab['man'].vector

result=king-man+woman
from sklearn.metrics.pairwise import cosine_similarity
cosine_similarity([result],[queen])

array([[0.6178014]], dtype=float32)

In [12]:
#text classification
#this will take some time(approx 15 min)
df['vector']=df['Text'].apply(lambda text: nlp(text).vector)

In [13]:
from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test=train_test_split(df.vector.values,
                df.label_num,
                test_size=0.2,
                random_state=2022)

In [14]:
X_train.shape

(7920,)

In [15]:
X_test.shape

(1980,)

In [16]:
import numpy as np
X_train_2d=np.stack(X_train)
X_test_2d=np.stack(X_test)

In [17]:
from sklearn.naive_bayes import MultinomialNB
#our vector is in -ve also so multinominalNb doesn't support -ve values

from sklearn.preprocessing import MinMaxScaler
scaler=MinMaxScaler()

scaled_train_embeded=scaler.fit_transform(X_train_2d)
scaled_test_embedded=scaler.fit_transform(X_test_2d)

clf=MultinomialNB()
clf.fit(scaled_train_embeded,y_train)
#our vector is in -ve also so multinominalNb doesn't support -ve values


MultinomialNB()

In [18]:
clf.predict(scaled_test_embedded)

array([0, 0, 0, ..., 0, 0, 0], dtype=int64)

In [19]:
from sklearn.metrics import classification_report
y_pred=clf.predict(scaled_test_embedded)
print(classification_report(y_test,y_pred))


              precision    recall  f1-score   support

           0       0.95      0.94      0.95      1024
           1       0.94      0.95      0.94       956

    accuracy                           0.94      1980
   macro avg       0.94      0.95      0.94      1980
weighted avg       0.95      0.94      0.94      1980



In [20]:
#using knn classifier
from sklearn.neighbors import KNeighborsClassifier
#creating a knn model object
clf=KNeighborsClassifier(n_neighbors=5,metric='euclidean')

#fit with all_train_embedding and y_train
clf.fit(scaled_train_embeded,y_train)

#model prediction
y_pred=clf.predict(scaled_test_embedded)
print(y_pred)
#classification report
print(classification_report(y_test,y_pred))

[0 0 0 ... 0 0 0]
              precision    recall  f1-score   support

           0       0.99      0.99      0.99      1024
           1       0.99      0.99      0.99       956

    accuracy                           0.99      1980
   macro avg       0.99      0.99      0.99      1980
weighted avg       0.99      0.99      0.99      1980

