# NLP: Text Classification using Spacy word Embeddings

# end-to-end project: fake and real news data

In [1]:
import pandas as pd


In [2]:
df = pd.read_csv("fake_or_real_news.csv")


In [3]:
print(df.shape)

(6335, 4)


In [4]:
df.head()

Unnamed: 0.1,Unnamed: 0,title,text,label
0,8476,You Can Smell Hillary’s Fear,"Daniel Greenfield, a Shillman Journalism Fello...",FAKE
1,10294,Watch The Exact Moment Paul Ryan Committed Pol...,Google Pinterest Digg Linkedin Reddit Stumbleu...,FAKE
2,3608,Kerry to go to Paris in gesture of sympathy,U.S. Secretary of State John F. Kerry said Mon...,REAL
3,10142,Bernie supporters on Twitter erupt in anger ag...,"— Kaydee King (@KaydeeKing) November 9, 2016 T...",FAKE
4,875,The Battle of New York: Why This Primary Matters,It's primary day in New York and front-runners...,REAL


In [5]:
print(df['label'].unique())

['FAKE' 'REAL']


In [6]:
df.head()

Unnamed: 0.1,Unnamed: 0,title,text,label
0,8476,You Can Smell Hillary’s Fear,"Daniel Greenfield, a Shillman Journalism Fello...",FAKE
1,10294,Watch The Exact Moment Paul Ryan Committed Pol...,Google Pinterest Digg Linkedin Reddit Stumbleu...,FAKE
2,3608,Kerry to go to Paris in gesture of sympathy,U.S. Secretary of State John F. Kerry said Mon...,REAL
3,10142,Bernie supporters on Twitter erupt in anger ag...,"— Kaydee King (@KaydeeKing) November 9, 2016 T...",FAKE
4,875,The Battle of New York: Why This Primary Matters,It's primary day in New York and front-runners...,REAL


In [7]:
df = df[['text','label']]

In [8]:
df.head()

Unnamed: 0,text,label
0,"Daniel Greenfield, a Shillman Journalism Fello...",FAKE
1,Google Pinterest Digg Linkedin Reddit Stumbleu...,FAKE
2,U.S. Secretary of State John F. Kerry said Mon...,REAL
3,"— Kaydee King (@KaydeeKing) November 9, 2016 T...",FAKE
4,It's primary day in New York and front-runners...,REAL


In [9]:
print(df.shape)

(6335, 2)


In [10]:
#check for imbalance dataset
# check the distribution of data in our dataset
df['label'].value_counts()

label
REAL    3171
FAKE    3164
Name: count, dtype: int64

In [11]:
df['label'] = df['label'].str.lower()

# Create the 'label_num' column using the updated mapping dictionary
df['label_num'] = df['label'].map({'fake': 0, 'real': 1})

# Verify the result
print(df.head())

                                                text label  label_num
0  Daniel Greenfield, a Shillman Journalism Fello...  fake          0
1  Google Pinterest Digg Linkedin Reddit Stumbleu...  fake          0
2  U.S. Secretary of State John F. Kerry said Mon...  real          1
3  — Kaydee King (@KaydeeKing) November 9, 2016 T...  fake          0
4  It's primary day in New York and front-runners...  real          1


In [12]:
# import spacy
# python -m spacy download en_core_web_lg/
# nlp = spacy.load("en_core_web_lg")

In [13]:
import spacy

In [14]:
nlp = spacy.load("en_core_web_lg")

In [15]:
df['vector'] = df['text'].apply(lambda text: nlp(text).vector)

In [16]:
len(df)

6335

In [17]:
df.head()

Unnamed: 0,text,label,label_num,vector
0,"Daniel Greenfield, a Shillman Journalism Fello...",fake,0,"[-1.3751823, 1.3421849, -2.3666484, 0.12908486..."
1,Google Pinterest Digg Linkedin Reddit Stumbleu...,fake,0,"[-1.7449774, 0.93961924, -2.024867, 0.42536643..."
2,U.S. Secretary of State John F. Kerry said Mon...,real,1,"[-1.9426425, 1.0062195, -1.9992222, 0.20469022..."
3,"— Kaydee King (@KaydeeKing) November 9, 2016 T...",fake,0,"[-1.9125352, -0.1481846, -1.1432766, 0.6861217..."
4,It's primary day in New York and front-runners...,real,1,"[-1.8516092, 1.3163909, -2.1726575, 1.2286776,..."


In [18]:
from sklearn.model_selection import train_test_split

X_train , X_test, y_train, y_test = train_test_split(df.vector.values, df.label_num, test_size=0.3, random_state=42
                                                     )

In [19]:
X_train.shape

(4434,)

In [20]:
X_test.shape

(1901,)

In [21]:
import numpy as np

X_train_stack = np.stack(X_train)
X_test_stack = np.stack(X_test)



In [22]:
X_test_stack.shape

(1901, 300)

In [23]:
# from sklearn.naive_bayes import MultinomialNB

# model = MultinomialNB()

# model.fit(X_train,y_train)

## It will show an error because of negative value so we need to scale the value

In [24]:
from sklearn.naive_bayes import MultinomialNB

model = MultinomialNB()

from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
X_train_stack_sc = scaler.fit_transform(X_train_stack)
X_test_stack_sc = scaler.fit_transform(X_test_stack)


model.fit(X_train_stack_sc,y_train)


In [27]:
y_pred = model.predict(X_test_stack_sc)
y_pred

array([0, 0, 0, ..., 0, 0, 1], dtype=int64)

In [28]:
y_test

1357    0
2080    0
2718    0
812     0
4886    0
       ..
332     0
833     0
5189    0
5290    1
2927    1
Name: label_num, Length: 1901, dtype: int64

In [29]:
from sklearn.metrics import accuracy_score

accuracy_score(y_test,y_pred)

0.667017359284587

In [30]:
from sklearn.metrics import classification_report
print(classification_report(y_test,y_pred))

              precision    recall  f1-score   support

           0       0.61      0.98      0.75       968
           1       0.94      0.34      0.50       933

    accuracy                           0.67      1901
   macro avg       0.77      0.66      0.63      1901
weighted avg       0.77      0.67      0.63      1901



In [31]:
from sklearn.metrics import confusion_matrix
confusion_matrix(y_test,y_pred)

array([[948,  20],
       [613, 320]], dtype=int64)

In [32]:
model = MultinomialNB(alpha=0.1)

In [26]:

model.fit(X_train_stack_sc,y_train)

### Gensim

In [33]:
!pip install gensim

Collecting gensim
  Obtaining dependency information for gensim from https://files.pythonhosted.org/packages/ad/97/b8253236dfedb9094f4273393a3fd03997da81f27f15822e56128da894ae/gensim-4.3.2-cp311-cp311-win_amd64.whl.metadata
  Downloading gensim-4.3.2-cp311-cp311-win_amd64.whl.metadata (8.5 kB)
Downloading gensim-4.3.2-cp311-cp311-win_amd64.whl (24.0 MB)
   ---------------------------------------- 0.0/24.0 MB ? eta -:--:--
   ---------------------------------------- 0.0/24.0 MB ? eta -:--:--
   ---------------------------------------- 0.0/24.0 MB ? eta -:--:--
   ---------------------------------------- 0.0/24.0 MB 220.2 kB/s eta 0:01:49
   ---------------------------------------- 0.0/24.0 MB 245.8 kB/s eta 0:01:38
   ---------------------------------------- 0.1/24.0 MB 598.8 kB/s eta 0:00:40
   ---------------------------------------- 0.2/24.0 MB 1.0 MB/s eta 0:00:24
    --------------------------------------- 0.5/24.0 MB 1.9 MB/s eta 0:00:13
   - --------------------------------------

In [34]:
# Gensim to conduct word2vec
import gensim.downloader as api 

In [35]:
wv = api.load('word2vec-google-news-300')



In [36]:
wv.similarity("good","great")

0.729151

In [37]:
wv.similarity("profit","loss")

0.34199455

In [38]:
wv.most_similar("good")

[('great', 0.7291510105133057),
 ('bad', 0.7190051078796387),
 ('terrific', 0.6889115571975708),
 ('decent', 0.6837348341941833),
 ('nice', 0.6836092472076416),
 ('excellent', 0.644292950630188),
 ('fantastic', 0.6407778263092041),
 ('better', 0.6120728850364685),
 ('solid', 0.5806034803390503),
 ('lousy', 0.576420247554779)]

In [39]:
wv.most_similar("queen")

[('queens', 0.739944338798523),
 ('princess', 0.7070532441139221),
 ('king', 0.6510956883430481),
 ('monarch', 0.6383602023124695),
 ('very_pampered_McElhatton', 0.6357026696205139),
 ('Queen', 0.6163407564163208),
 ('NYC_anglophiles_aflutter', 0.6060680150985718),
 ('Queen_Consort', 0.5923796892166138),
 ('princesses', 0.5908074975013733),
 ('royal', 0.5637185573577881)]

In [40]:
wv.most_similar(positive=['king','queen'], negative=['men'])

[('monarch', 0.6135684847831726),
 ('princess', 0.5597445964813232),
 ('prince', 0.5126818418502808),
 ('kings', 0.511933445930481),
 ('royal', 0.5101755857467651),
 ('queens', 0.4952612817287445),
 ('Queen_Consort', 0.488391250371933),
 ('Eugene_Ionesco_absurdist_comedy', 0.4882142245769501),
 ('Makobo_Modjadji', 0.4837609529495239),
 ('crown_prince', 0.48335814476013184)]

In [41]:
wv.most_similar(positive=['king','queen'], negative=['men'],topn=3)


[('monarch', 0.6135684847831726),
 ('princess', 0.5597445964813232),
 ('prince', 0.5126818418502808)]

In [42]:
wv.doesnt_match(["facebook","cat","dog","hours"])

'facebook'

## Glove


In [None]:
# 