# NLP Live Class - PWSkills
# Mentor - Dr Ayan Debnath
## class on 27th Dec 2023

# Word Embeddings using spaCy
# End-to-end project: Word Embeddings


In [None]:
# Install spaCy first if the runtime does not already provide it:
# !pip install spacy
import spacy
# Fetch the en_core_web_lg pipeline package so that spacy.load("en_core_web_lg") can find it.
!python -m spacy download en_core_web_lg

In [None]:
import spacy

# !pip install spacy
# pip install -U spacy

In [None]:
!python -m spacy download en_core_web_lg

In [None]:
nlp = spacy.load("en_core_web_lg")

In [None]:
# Three ordinary words plus one made-up string, to contrast known and unknown tokens.
doc = nlp("dog cat banana jkfgsjg")

# For every token, report whether it has a vector and whether it is flagged out-of-vocabulary.
for token in doc:
  print(token.text, "vector:", token.has_vector, "OOV:", token.is_oov)

dog vector: True OOV: False
cat vector: True OOV: False
banana vector: True OOV: False
jkfgsjg vector: False OOV: True


In [None]:
# dog
doc[0].vector

In [None]:
doc[0].vector.shape

In [None]:
# cat
doc[1].vector

In [None]:
# jkfgsjg
doc[3].vector

In [None]:
base_token = nlp("sandwich")
base_token.vector.shape

(300,)

In [None]:
# Candidate words to compare against `base_token` ("sandwich", built in the previous cell).
# Note: this rebinds the notebook-global `doc` used by earlier cells.
doc = nlp("bread sandwich burger car tiger human rice food")

# Print each candidate's similarity score to the base word.
for token in doc:
  print(f"{token.text} <-> {base_token.text}:", token.similarity(base_token))

bread <-> sandwich: 0.6341067177397708
sandwich <-> sandwich: 0.9999999823146835
burger <-> sandwich: 0.7900927756758959
car <-> sandwich: 0.1825516115154349
tiger <-> sandwich: 0.11234834480413391
human <-> sandwich: -0.042971604865238676
rice <-> sandwich: 0.48384042420843604
food <-> sandwich: 0.4348063691643022


In [None]:
def similar_words(base_word, word_to_compare):
    """Print the similarity of every token in ``word_to_compare`` to ``base_word``.

    Both strings are processed with the notebook-level ``nlp`` pipeline.
    ``word_to_compare`` may contain several words (e.g. "apple samsung iphone");
    one line is printed per resulting token. Nothing is returned.
    """
    base_token = nlp(base_word)
    for token in nlp(word_to_compare):
        print(f"{token.text} <-> {base_token.text}:", token.similarity(base_token))


In [None]:
similar_words("iphone", "apple samsung iphone dog cat banana")

apple <-> iphone: 0.4387907401919904
samsung <-> iphone: 0.670859081425417
iphone <-> iphone: 1.000000072144752
dog <-> iphone: 0.08211864228011527
cat <-> iphone: 0.11430551522997741
banana <-> iphone: 0.191085460006539


In [None]:
# Look up the raw vocabulary vectors for the four words of the classic analogy.
king, man, woman, queen = (
    nlp.vocab[word].vector for word in ("king", "man", "woman", "queen")
)

# Vector arithmetic: king - man + woman is expected to land close to queen.
result = king - man + woman

In [None]:
from sklearn.metrics.pairwise import cosine_similarity

# cosine_similarity expects 2-D inputs, so each single vector is wrapped in a list.
# The result is a 1x1 matrix holding the similarity between `result` and `queen`.
cosine_similarity([result], [queen])

array([[0.61780137]], dtype=float32)

In [None]:
# Country/capital analogy: France - Paris + Berlin = Germany
france, paris, berlin, germany = (
    nlp.vocab[word].vector for word in ("France", "Paris", "Berlin", "Germany")
)

# Reuses the notebook-global `result` from the king/queen cell.
result = france - paris + berlin

# 1x1 similarity matrix between the analogy result and the vector for Germany.
cosine_similarity([result], [germany])

array([[0.8555054]], dtype=float32)

# NLP: Text Classification using spaCy Word Embeddings

# End-to-end project: fake and real news data

In [None]:
# Mount Google Drive at /content/drive so the dataset stored there can be read.
# This only works inside Google Colab (google.colab is not available elsewhere).
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import os
# Folder on the mounted Drive that holds the dataset; adjust to your own layout.
root_dir = "/content/drive/My Drive/Colab Notebooks/"
# Make it the working directory so later cells can refer to files by bare name.
os.chdir(root_dir)

In [None]:
import pandas as pd
# Read the dataset with pandas. The bare filename is resolved against the current
# working directory, which the previous cell set to the Drive folder.
filename = "fake_and_real_news.csv"
df = pd.read_csv(filename)

# Alternative that does not rely on the working directory: build the full path
# from the folder and the filename explicitly.
# filepath = "/content/drive/My Drive/Colab Notebooks/"
# filename = "fake_and_real_news.csv"
# import os
# os.path.join(filepath,filename)
# df = pd.read_csv(os.path.join(filepath,filename))  #local

In [None]:
#print dataframe
print(df.head(10))

                                                Text label
0   Top Trump Surrogate BRUTALLY Stabs Him In The...  Fake
1  U.S. conservative leader optimistic of common ...  Real
2  Trump proposes U.S. tax overhaul, stirs concer...  Real
3   Court Forces Ohio To Allow Millions Of Illega...  Fake
4  Democrats say Trump agrees to work on immigrat...  Real
5  France says pressure needed to stop North Kore...  Real
6  Trump on Twitter (August 8): Opioid crisis, No...  Real
7   BUSTED: Trump Supporter Used Poll Watcher Cre...  Fake
8  Fatal Niger operation sparks calls for public ...  Real
9  Trump says he has 'great heart' for immigrant ...  Real


In [None]:
print(df.shape)

(9900, 2)


In [None]:
# Check for class imbalance: count how many rows carry each value of `label`.
df['label'].value_counts()

Fake    5000
Real    4900
Name: label, dtype: int64

In [None]:
# Encode the string labels as integers for the classifier: Fake -> 0, Real -> 1.
df['label_num'] = df['label'].map({'Fake': 0, 'Real': 1})

In [None]:
print(df.head(10))

                                                Text label  label_num
0   Top Trump Surrogate BRUTALLY Stabs Him In The...  Fake          0
1  U.S. conservative leader optimistic of common ...  Real          1
2  Trump proposes U.S. tax overhaul, stirs concer...  Real          1
3   Court Forces Ohio To Allow Millions Of Illega...  Fake          0
4  Democrats say Trump agrees to work on immigrat...  Real          1
5  France says pressure needed to stop North Kore...  Real          1
6  Trump on Twitter (August 8): Opioid crisis, No...  Real          1
7   BUSTED: Trump Supporter Used Poll Watcher Cre...  Fake          0
8  Fatal Niger operation sparks calls for public ...  Real          1
9  Trump says he has 'great heart' for immigrant ...  Real          1


In [None]:
# Repeats the spaCy setup from the word-embedding section so this project section
# can be run on its own in a fresh runtime.
import spacy
!python -m spacy download en_core_web_lg
nlp = spacy.load("en_core_web_lg")

In [None]:
# Convert each article's text to a single document vector.
# nlp.pipe processes the texts as a batched stream and yields one Doc per text, in
# order. That is considerably more efficient than calling nlp(text) row by row
# through .apply, and the resulting vectors are the same.
df['vector'] = [doc.vector for doc in nlp.pipe(df['Text'])]

In [None]:
len(df)

9900

In [None]:
print(df.head())

                                                Text label  label_num  \
0   Top Trump Surrogate BRUTALLY Stabs Him In The...  Fake          0   
1  U.S. conservative leader optimistic of common ...  Real          1   
2  Trump proposes U.S. tax overhaul, stirs concer...  Real          1   
3   Court Forces Ohio To Allow Millions Of Illega...  Fake          0   
4  Democrats say Trump agrees to work on immigrat...  Real          1   

                                              vector  
0  [-0.6759837, 1.4263071, -2.318466, -0.451093, ...  
1  [-1.8355803, 1.3101058, -2.4919677, 1.0268308,...  
2  [-1.9851209, 0.14389805, -2.4221718, 0.9133005...  
3  [-2.7812982, -0.16120885, -1.609772, 1.3624227...  
4  [-2.2010763, 0.9961637, -2.4088492, 1.128273, ...  


In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing; the fixed random_state makes the split repeatable.
# Features are the per-document vectors, targets are the numeric labels.
X_train, X_test, y_train, y_test = train_test_split(
    df["vector"].values,
    df["label_num"],
    test_size=0.2,
    random_state=42,
)

In [None]:
X_train.shape

(7920,)

In [None]:
X_test.shape

(1980,)

In [None]:
import numpy as np

# Each split is a 1-D object array whose elements are individual vectors.
# Stack them into proper 2-D (n_samples, n_features) matrices for scikit-learn.
X_train_stack, X_test_stack = (np.stack(split) for split in (X_train, X_test))


In [None]:
print(X_train_stack.shape)
print(X_test_stack.shape)

(7920, 300)
(1980, 300)


In [None]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.preprocessing import MinMaxScaler

# MultinomialNB does not accept negative feature values, and embedding components
# can be negative, so rescale every dimension to the [0, 1] range first.
# The scaler is fitted on the training matrix only and then applied to the test matrix.
scaler = MinMaxScaler()
X_train_stack_sc = scaler.fit_transform(X_train_stack)
X_test_stack_sc = scaler.transform(X_test_stack)

# Train the classifier (default smoothing) on the scaled training features.
model = MultinomialNB()
model.fit(X_train_stack_sc, y_train)

In [None]:
y_pred = model.predict(X_test_stack_sc)
y_pred

array([0, 1, 1, ..., 0, 1, 1])

In [None]:
y_test

8432    0
5680    1
4767    1
9218    1
621     0
       ..
9500    1
5858    1
7442    0
2846    1
1468    1
Name: label_num, Length: 1980, dtype: int64

In [None]:
from sklearn.metrics import accuracy_score
# Fraction of test rows whose predicted label matches the true label.
accuracy_score(y_test, y_pred)

0.9474747474747475

In [None]:
from sklearn.metrics import classification_report
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.95      0.94      0.95       973
           1       0.94      0.96      0.95      1007

    accuracy                           0.95      1980
   macro avg       0.95      0.95      0.95      1980
weighted avg       0.95      0.95      0.95      1980



In [None]:
from sklearn.metrics import confusion_matrix
# Rows are the true labels and columns the predicted labels, in label order (0 = Fake, 1 = Real).
confusion_matrix(y_test, y_pred)



array([[913,  60],
       [ 44, 963]])

In [None]:
# Refit with a smaller smoothing parameter (alpha = 0.1). This rebinds the
# notebook-global `model`, replacing the default-alpha classifier fitted above.
model = MultinomialNB(alpha = 0.1)
model.fit(X_train_stack_sc, y_train)

In [None]:
import numpy as np

# Sweep the smoothing parameter and record the test accuracy obtained for each value.
# np.arange excludes the stop value, so the grid is 0.0, 0.1, ..., 0.9.
sc = []
for alpha in np.arange(0,1,0.1):
  model = MultinomialNB(alpha = alpha)
  # Fit on the stacked, min-max-scaled matrix: the raw X_train is a 1-D object array
  # of unscaled vectors, which MultinomialNB cannot be trained on.
  model.fit(X_train_stack_sc, y_train)
  # Score this model's own predictions. A separate name is used so the `y_pred`
  # produced by the earlier evaluation cells is not overwritten.
  y_pred_alpha = model.predict(X_test_stack_sc)
  sc1 = accuracy_score(y_test, y_pred_alpha)
  sc.append(sc1)

In [None]:
# Use gensim's downloader to obtain pretrained word2vec vectors.
import gensim.downloader as api
# This is a huge model (~1.6 GB), so the first load takes a while.

wv = api.load('word2vec-google-news-300')

In [None]:
wv.similarity("great", "good")

0.729151

In [None]:
wv.similarity("profit", "loss")

0.34199455

In [None]:
wv.most_similar("good", topn = 3)

[('great', 0.7291510105133057),
 ('bad', 0.7190051078796387),
 ('terrific', 0.6889115571975708)]

In [None]:
wv.most_similar("Dog")

[('Puppy', 0.7045740485191345),
 ('dog', 0.6978708505630493),
 ('Dogs', 0.694400429725647),
 ('dogs', 0.6479228734970093),
 ('Pet', 0.622725784778595),
 ('Canine', 0.6188512444496155),
 ('Poodle', 0.6097930669784546),
 ('Pooches', 0.6097025275230408),
 ('Cat', 0.6061107516288757),
 ('Golden_Retriever', 0.5951071977615356)]

In [None]:
# King - man + woman = Queen

wv.most_similar(positive = ['King', 'woman'], negative = ['man'])

[('Queen', 0.5515626668930054),
 ('Oprah_BFF_Gayle', 0.47597548365592957),
 ('Geoffrey_Rush_Exit', 0.46460166573524475),
 ('Princess', 0.4533674716949463),
 ('Yvonne_Stickney', 0.4507041573524475),
 ('L._Bonauto', 0.4422135353088379),
 ('gal_pal_Gayle', 0.4408389925956726),
 ('Alveda_C.', 0.4402790665626526),
 ('Tupou_V.', 0.4373864233493805),
 ('K._Letourneau', 0.4351031482219696)]

In [None]:
# France - Paris + Berlin = Germany
wv.most_similar(positive = ['France', 'Berlin'], negative = ['Paris'], topn = 3)

[('Germany', 0.7901254892349243),
 ('Austria', 0.6026812195777893),
 ('German', 0.6004959940910339)]

In [None]:
wv.doesnt_match(["facebook", "google", "cat", "microsoft"])

'cat'

In [None]:
wv.doesnt_match(["facebook", "mouse", "cat", "dog"])

'facebook'

In [None]:
# GloVe: load a second set of pretrained vectors through the same gensim downloader.
# NOTE(review): the name suggests Twitter-trained, 25-dimensional vectors; confirm against the gensim-data catalogue.
glv = api.load('glove-twitter-25')



In [None]:
print(glv.most_similar("good"))
print(glv.most_similar("profit"))


[('too', 0.9648017287254333), ('day', 0.9533665180206299), ('well', 0.9503170847892761), ('nice', 0.9438973665237427), ('better', 0.9425962567329407), ('fun', 0.9418926239013672), ('much', 0.9413353800773621), ('this', 0.9387555122375488), ('hope', 0.9383506774902344), ('great', 0.9378516674041748)]
[('revenue', 0.9264836311340332), ('income', 0.9258537292480469), ('profits', 0.9127985239028931), ('trading', 0.9102286696434021), ('cost', 0.8918555974960327), ('rates', 0.8884955644607544), ('boost', 0.8879926800727844), ('increase', 0.8826181888580322), ('investment', 0.8813685774803162), ('minimum', 0.880243182182312)]


In [None]:
glv.doesnt_match(["facebook", "google", "cat", "microsoft"])

'cat'

In [None]:
glv.doesnt_match("facebook google cat microsoft".split())

'cat'