In [16]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [17]:
import pandas as pd
import re

In [18]:
dataset = pd.read_csv("/content/1M_text_data.csv")

In [19]:
dataset.loc[len(dataset)-300, 'text']

'Readers, however, have the advantage of having an order easy. MP3 players, however, have the advantage of having easy controls. 0'

In [20]:
def clean_data(text):
  """Normalize one raw text row before tokenization.

  Lower-cases the text, replaces every whitespace character that is
  immediately followed by a run of digits (together with those digits, e.g. a
  trailing " 0") with a single space, and strips leading/trailing whitespace.
  Digits that are not preceded by whitespace (e.g. "mp3") are left alone.

  Args:
    text: The raw string. Non-string values (such as the float NaN pandas uses
      for an empty CSV cell) are treated as empty text.

  Returns:
    The cleaned, lower-cased string; '' for non-string input.
  """
  if not isinstance(text, str):
    # NaN / None have no .lower(); treat a missing cell as empty text.
    return ''
  text = text.lower()
  # Raw string: "\s" inside a normal literal is an invalid escape sequence.
  text = re.sub(r"\s[0-9]+", ' ', text)
  return text.strip()

In [21]:
# Clean the whole 'text' column in one pass. Reading and writing each row
# through `.loc` inside a Python loop is far slower on a frame of this size.
dataset['text'] = dataset['text'].map(clean_data)

In [22]:
dataset.loc[len(dataset)-300, 'text']

'readers, however, have the advantage of having an order easy. mp3 players, however, have the advantage of having easy controls.'

In [23]:
# Tokenize every cleaned row on whitespace: `text` is a list of token lists,
# one inner list per row of dataset['text'].
text = [row.split() for row in dataset['text']]

In [24]:
dataset.loc[0, 'text']

'as she translates from one language to another , she tries to find the appropriate wording and context in english that would correspond to the work in spanish her poems and stories started to have differing meanings in their respective languages .'

In [25]:
text[:2]

[['as',
  'she',
  'translates',
  'from',
  'one',
  'language',
  'to',
  'another',
  ',',
  'she',
  'tries',
  'to',
  'find',
  'the',
  'appropriate',
  'wording',
  'and',
  'context',
  'in',
  'english',
  'that',
  'would',
  'correspond',
  'to',
  'the',
  'work',
  'in',
  'spanish',
  'her',
  'poems',
  'and',
  'stories',
  'started',
  'to',
  'have',
  'differing',
  'meanings',
  'in',
  'their',
  'respective',
  'languages',
  '.'],
 ['bdsm',
  'is',
  'solely',
  'based',
  'on',
  'consensual',
  'activities',
  ',',
  'and',
  'based',
  'on',
  'its',
  'system',
  'and',
  'laws',
  ',',
  'the',
  'concepts',
  'presented',
  'by',
  'de',
  'sade',
  'are',
  'not',
  'agreed',
  'upon',
  'the',
  'bdsm',
  'culture',
  ',',
  'even',
  'though',
  'they',
  'are',
  'sadistic',
  'in',
  'nature',
  '.']]

In [26]:
len(text)

1004809

In [27]:
type(text)

list

In [28]:
import itertools
from collections import Counter


def total_word_freq(df):
    """Count token occurrences across a tokenized corpus.

    Args:
        df: Iterable of token sequences (e.g. a list of lists of words).

    Returns:
        A list of ``(word, count)`` tuples sorted by count, most frequent
        first. Words with equal counts keep the order in which they were
        first seen.
    """
    # Feed the lazy chain straight into Counter so the flattened corpus is
    # never materialized as a second in-memory list.
    counts = Counter(itertools.chain.from_iterable(df))
    return counts.most_common()

In [29]:
vocab = total_word_freq(text)

In [30]:
len(vocab)

557293

In [31]:
vocab[:20]

[('the', 1972814),
 (',', 1799946),
 ('and', 1009815),
 ('.', 989872),
 ('of', 928224),
 ('in', 892480),
 ('a', 685601),
 ('to', 599434),
 ('is', 387376),
 ('was', 384182),
 ('for', 253433),
 ('-', 252612),
 ('as', 246762),
 ('on', 239966),
 ('by', 216946),
 (')', 216522),
 ('(', 216486),
 ('he', 214112),
 ('with', 210246),
 ('s', 203625)]

In [32]:
len(text)

1004809

In [33]:
temp_text = text[:100000]

In [34]:
from gensim.models import Word2Vec
# Train skip-gram (sg=1) embeddings on temp_text (the first 100,000 tokenized
# rows): 150-dimensional vectors, context window of 5, and tokens seen fewer
# than 5 times are dropped from the vocabulary (min_count=5).
# NOTE(review): no `seed` is passed and workers=3, so the learned vectors and
# any similarity scores derived from them presumably vary between runs — confirm.
model = Word2Vec(temp_text, min_count=5, vector_size= 150, workers=3, window =5, sg = 1)

In [35]:
unique_word_set = list(set(itertools.chain.from_iterable(temp_text)))

In [36]:
len(list(itertools.chain.from_iterable(temp_text)))

3290906

In [37]:
len(unique_word_set)

133458

In [38]:
model.wv['usa']

array([ 0.22296686, -0.31874025, -0.06112695, -0.20265666, -0.26753092,
       -0.0705813 ,  0.06146783,  0.07365308,  0.19051048,  0.01670003,
        0.44779512, -0.34610093, -0.16857114,  0.17267251, -0.27873546,
        0.35612884,  0.13980235,  0.26565483, -0.42984217,  0.23039801,
        0.08523444, -0.32039425,  0.18561332,  0.1783503 , -0.25610873,
       -0.02068506,  0.03264967,  0.1631113 ,  0.03569774, -0.14282   ,
       -0.3062895 , -0.14138497,  0.06804765, -0.09837397, -0.09548946,
        0.09006158,  0.14488457,  0.24952617,  0.26270643, -0.18763593,
        0.15081002, -0.22199169,  0.15160087,  0.00334152, -0.20068525,
       -0.11945431,  0.01565848,  0.2496184 , -0.04777727,  0.126607  ,
       -0.0291119 ,  0.04758203,  0.19196011,  0.01084789, -0.09202332,
       -0.1957692 , -0.11764976, -0.417695  , -0.039868  , -0.03820822,
       -0.00598139, -0.20065603, -0.1681458 , -0.2812024 , -0.13682453,
       -0.23939025, -0.44569153,  0.09733038, -0.32614473, -0.12

In [39]:
import numpy as np
from gensim import matutils
def similarity_cosine(vec1, vec2):
    """Return the cosine similarity of two vectors.

    Each input is scaled to unit length with gensim's ``matutils.unitvec`` and
    the dot product of the two scaled vectors is returned.
    """
    unit1 = matutils.unitvec(vec1)
    unit2 = matutils.unitvec(vec2)
    return np.dot(unit1, unit2)

In [40]:
similarity_cosine(model.wv['usa'], model.wv['dollar'])

0.46438578

In [41]:
similarity_cosine(model.wv['usa'], model.wv['nasa'])

0.42917553

In [42]:
similarity_cosine(model.wv['bangladesh'], model.wv['dhaka'])

0.76403755

In [43]:
model.wv.most_similar('usa')[:10]

[('americas', 0.7367455363273621),
 ('minneapolis', 0.7358592748641968),
 ('slovakia', 0.7353056073188782),
 ('u.k.', 0.7321981191635132),
 ('helsinki', 0.7321476936340332),
 ('venezuela', 0.7306073307991028),
 ('canada', 0.7265208959579468),
 ('shanghai', 0.725293755531311),
 ('uzbekistan', 0.723888635635376),
 ('zambia', 0.7213911414146423)]

In [44]:
model.wv.most_similar('football')[:10]

[('soccer', 0.8209167122840881),
 ('basketball', 0.8149868845939636),
 ('handball', 0.777335524559021),
 ('hockey', 0.7746652364730835),
 ('linebacker', 0.7648254036903381),
 ('cornerback', 0.7600213289260864),
 ('hc', 0.7587793469429016),
 ('a-league', 0.7482083439826965),
 ('rugby', 0.7468588352203369),
 ('canadiens', 0.7458853721618652)]

In [45]:
from gensim.models import Word2Vec
# Retrain and rebind `model`, replacing the previously trained one: CBOW
# (sg=0), 300-dimensional vectors, and min_count=1 so every token occurring in
# temp_text gets a vector. Code that reads `model` after this cell sees this
# model, so scores are not comparable with those computed before it.
model = Word2Vec(temp_text, min_count=1,vector_size= 300,workers=2, window =5, sg = 0)

In [46]:
similarity_cosine(model.wv['bangladesh'], model.wv['dhaka'])

0.8974245

In [47]:
similarity_cosine(model.wv['usa'], model.wv['nasa'])

0.3671931

In [48]:
model.wv.most_similar(positive=['woman', 'king'], negative=['man'])[0][0]

'queen'

In [49]:
model.wv.most_similar(positive=['india', 'usa'], negative=['bangladesh'])

[('australia', 0.8404116630554199),
 ('canada', 0.8118502497673035),
 ('europe', 0.8108347654342651),
 ('uk', 0.7487799525260925),
 ('japan', 0.743815004825592),
 ('ireland', 0.7189956307411194),
 ('america', 0.7171960473060608),
 ('africa', 0.7124066948890686),
 ('asia', 0.7048414349555969),
 ('south', 0.6976654529571533)]

In [50]:
def find_similarity(text1, text2):
  """Cosine similarity between two sentences using summed word vectors.

  Each sentence is cleaned with clean_data, split on whitespace, and
  represented by the sum of the `model.wv` vectors of its tokens. Tokens that
  are missing from the model's vocabulary are skipped instead of raising
  KeyError.

  Args:
    text1: First sentence (raw string).
    text2: Second sentence (raw string).

  Returns:
    The cosine similarity of the two summed vectors, or 0.0 when either
    sentence contains no token known to the model (including empty input).
  """
  def _sentence_vector(sentence):
    # Sum of the vectors of the in-vocabulary tokens; None if there are none.
    known = [model.wv[word] for word in clean_data(sentence).split()
             if word in model.wv]
    # Built-in sum starts from 0 and adds left to right, so the accumulation
    # order is unchanged for fully in-vocabulary sentences.
    return sum(known) if known else None

  vector1 = _sentence_vector(text1)
  vector2 = _sentence_vector(text2)
  if vector1 is None or vector2 is None:
    # Nothing to compare: a bare integer 0 placeholder cannot be normalized
    # by similarity_cosine.
    return 0.0
  return similarity_cosine(vector1, vector2)

In [51]:
text1 = "Messi is a football player"
text2 = "You are beautiful and nice"

find_similarity(text1, text2)

0.32839355

In [52]:
text1 = "Messi is a football player"
text2 = "Messi has a great career"

find_similarity(text1, text2)

0.6265736

In [53]:
text1 = "Stop"
text2 = "Go as far as you can"

find_similarity(text1, text2)

0.6627214

In [54]:
text1 = "let us go"
text2 = "Go as far as you can"

find_similarity(text1, text2)

0.7056734