# **Exploring Text Similarity with Word2Vec**

**Our goal is to process text files, tokenize sentences, and train a Word2Vec model to understand word relationships.**

In [None]:
from google.colab import files



In [None]:

import os

# Show the entries of the working directory (the corpus is read from here later on)
print(os.listdir("."))


['.config', 'Friends_Transcript.txt', 'sample_data']


In [None]:
!pip install nltk gensim
import nltk
nltk.download('punkt')
import os
import gensim
from nltk.tokenize import sent_tokenize
from gensim.utils import simple_preprocess




# Build the training corpus: `story` is a list of sentences, each sentence a list
# of lower-cased word tokens produced by gensim's simple_preprocess.
story = []

# Sentences of the most recently read file. Initialised here so that the
# inspection cell below still works when no transcript is found.
raw_sentences = []

# Directory containing the text files
directory = "."  # Uses current directory

# Iterate in sorted order: os.listdir returns entries in arbitrary order, which
# would make the corpus order differ between runs.
for filename in sorted(os.listdir(directory)):
    filepath = os.path.join(directory, filename)

    # Only plain-text transcripts belong in the corpus; any other file in the
    # directory would either fail to decode as UTF-8 or add noise to the corpus.
    if not (os.path.isfile(filepath) and filename.lower().endswith(".txt")):
        continue

    with open(filepath, 'r', encoding='utf-8') as f:
        corpus = f.read()

    # The file is already closed here; tokenizing does not need the handle.
    raw_sentences = sent_tokenize(corpus)

    for sent in raw_sentences:
        processed_sentence = simple_preprocess(sent)
        # Sentences can tokenize to nothing; skip those so they are not
        # counted as training examples.
        if processed_sentence:
            story.append(processed_sentence)

# Fail here with a clear message instead of later, during vocabulary building.
assert story, f"no sentences were read from the .txt files in {directory!r}"




[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


In [None]:
# Preview only the first few sentences; displaying the whole list floods the notebook output.
raw_sentences[:10]

['THE ONE WHERE MONICA GETS A NEW ROOMATE (THE PILOT-THE UNCUT VERSION)\nWritten by: Marta Kauffman & David Crane\n[Scene: Central Perk, Chandler, Joey, Phoebe, and Monica are there.]',
 "Monica: There's nothing to tell!",
 "He's just some guy I work with!",
 "Joey: C'mon, you're going out with the guy!",
 "There's gotta be something wrong with him!",
 'Chandler: All right Joey, be nice.',
 'So does he have a hump?',
 'A hump and a hairpiece?',
 'Phoebe: Wait, does he eat chalk?',
 '(They all stare, bemused.)',
 "Phoebe: Just, 'cause, I don't want her to go through what I went through with Carl- oh!",
 'Monica: Okay, everybody relax.',
 'This is not even a date.',
 "It's just two people going out to dinner and- not having sex.",
 'Chandler: Sounds like a date to me.',
 "[Time Lapse]\nChandler: Alright, so I'm back in high school, I'm standing in the middle of the cafeteria, and I realize I am totally naked.",
 'All: Oh, yeah.',
 'Had that dream.',
 "Chandler: Then I look down, and I re

In [None]:
# Preview only the first few tokenized sentences; the full corpus is far too large to display.
story[:5]

[['the',
  'one',
  'where',
  'monica',
  'gets',
  'new',
  'roomate',
  'the',
  'pilot',
  'the',
  'uncut',
  'version',
  'written',
  'by',
  'marta',
  'kauffman',
  'david',
  'crane',
  'scene',
  'central',
  'perk',
  'chandler',
  'joey',
  'phoebe',
  'and',
  'monica',
  'are',
  'there'],
 ['monica', 'there', 'nothing', 'to', 'tell'],
 ['he', 'just', 'some', 'guy', 'work', 'with'],
 ['joey', 'mon', 'you', 're', 'going', 'out', 'with', 'the', 'guy'],
 ['there', 'gotta', 'be', 'something', 'wrong', 'with', 'him'],
 ['chandler', 'all', 'right', 'joey', 'be', 'nice'],
 ['so', 'does', 'he', 'have', 'hump'],
 ['hump', 'and', 'hairpiece'],
 ['phoebe', 'wait', 'does', 'he', 'eat', 'chalk'],
 ['they', 'all', 'stare', 'bemused'],
 ['phoebe',
  'just',
  'cause',
  'don',
  'want',
  'her',
  'to',
  'go',
  'through',
  'what',
  'went',
  'through',
  'with',
  'carl',
  'oh'],
 ['monica', 'okay', 'everybody', 'relax'],
 ['this', 'is', 'not', 'even', 'date'],
 ['it',
  'just',
 

In [None]:
# Create the Word2Vec model without a corpus; the vocabulary is built and the
# model is trained in the cells that follow.
model = gensim.models.Word2Vec(
    workers=4,    # number of worker threads used for training
    min_count=2,  # ignore words that appear fewer than two times in the corpus
    window=10,    # context window: up to 10 words on each side of the target word
)

In [None]:
# Build the model's vocabulary from the tokenized sentences in `story`; this runs before training.
model.build_vocab(story)




In [None]:
# Train on the same sentences, taking the example count and the number of
# epochs from the values already stored on the model.
model.train(
    story,
    total_examples=model.corpus_count,
    epochs=model.epochs,
)

(3042559, 4254775)

In [None]:
# Words the trained model ranks as most similar to "ross" (top 10, the library default)
model.wv.most_similar("ross", topn=10)

[('joey', 0.8753852248191833),
 ('chandler', 0.8309726119041443),
 ('phoebe', 0.8145551085472107),
 ('monica', 0.8107759952545166),
 ('rachel', 0.7853626012802124),
 ('emily', 0.6738545298576355),
 ('carol', 0.621108889579773),
 ('ben', 0.6137596964836121),
 ('mike', 0.609252393245697),
 ('susan', 0.6009106636047363)]

In [None]:
# Ask the model which of these words fits least well with the others
model.wv.doesnt_match(["joey", "rachel", "monica", "phoebe"])

'joey'

In [None]:
# Embedding vector the model learned for "joey"
model.wv.get_vector("joey")

array([-0.03982791,  0.0789012 ,  0.1661906 ,  0.46963394, -0.29475117,
       -0.40095693,  0.786658  , -0.5009704 , -0.90932506, -0.30557838,
        0.9463087 , -0.6026317 , -0.43420902,  0.53531754, -0.21395701,
       -0.38065207, -0.11991013,  0.68661976, -0.38052225, -0.67020065,
       -0.20620646,  0.30186856,  0.8353434 ,  0.46489197, -1.5154833 ,
        0.07942422,  0.2961073 ,  0.9189718 ,  0.1054296 , -0.55335456,
       -0.78774726, -0.21330383,  0.16962215, -0.21665248, -0.13063186,
        0.34075788,  0.44465983, -0.48888466, -1.7530037 ,  1.0281378 ,
       -0.19829395,  0.38645965, -0.04813492, -0.6074744 ,  0.62082636,
       -0.4044149 , -0.5054679 , -0.715272  ,  0.5955966 ,  0.33104038,
       -0.5811613 , -0.36103442, -0.13495027, -0.50109154, -1.2168103 ,
        0.11386879,  0.1091212 , -0.3130813 , -0.01365322, -1.2520393 ,
       -0.36569327, -0.78526646,  0.34110406,  0.00380628, -1.6474432 ,
        0.24407656, -1.2074506 ,  1.0360148 , -0.98179775,  0.61

In [None]:
# Similarity score between the vectors for "chandler" and "joey"
model.wv.similarity("chandler", "joey")

0.8440256