In [14]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


#Import Part

In [15]:
import pandas as pd
import re
import nltk
from gensim.models import Word2Vec
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords

In [16]:
nltk.download('punkt')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

### Load Data

In [17]:
data = pd.read_csv('TCP_sub.csv')
data.head()

Unnamed: 0,en_text,fa_text
0,raspy breathing .,صداي خر خر .
1,dad .,پدر .
2,maybe its the wind .,شايد صداي باد باشه .
3,no .,نه .
4,stop please stop .,دست نگه داريد خواهش ميکنم دست نگه داريد .


### Preprocessing

In [18]:
def preprocess_text(text):
  text = text.lower()
  text = re.sub(r'\d+','',text)
  text = re.sub(r'\W+',' ',text)
  text = re.sub(r'\b\w{1,2}\b','',text)
  text = re.sub(r'\s+',' ',text).strip()
  return text

In [19]:
data['processd_en_text'] = data['en_text'].apply(preprocess_text)

### Tokenization

In [20]:
data['tokenized_text'] = data['processd_en_text'].apply(word_tokenize)

### Remove Stop_Words

In [21]:
stop_words = set(stopwords.words('english'))
data['filtered_text'] = data['tokenized_text'].apply(lambda x: [word for word in x if word not in stop_words])

### Model Training

In [22]:
model = Word2Vec(sentences=data['filtered_text'], vector_size=100, window=5, min_count=2, workers=4)

### Display the Word Vector

In [23]:
word = 'dad'
if word in model.wv.key_to_index:
    print(f"Word2Vec for '{word}': {model.wv[word]}")
else:
    print(f"The word '{word}' is not in the vocabulary.")

Word2Vec for 'dad': [-0.7677301   0.31126297  0.15634693 -0.6997101   0.02911668 -0.8789145
  0.585458    1.1214032  -1.0580837  -1.1488873  -0.45518035 -1.3678554
  0.31312615 -0.47694826  0.06051734  0.01373634  0.5198448  -1.1833769
 -0.20763342 -0.5941297  -0.14750057  0.4613096  -0.16817202 -1.4618368
 -0.23682126  0.34794766 -0.58607334  0.06438003 -0.80511427  0.9396548
  0.99665767  0.1361699  -0.42931354 -0.7047573  -0.80089974  0.7175156
 -0.6887299  -0.47141573 -0.27362853 -0.6626105  -0.09908677 -0.67249787
  0.22649324  0.03056374  0.40037885 -0.4054342  -0.70756483 -0.57896394
 -0.22554809 -0.54521734  0.24540995 -0.44160858  0.1397341   0.06742144
 -0.30966935  0.10766704  0.45741984 -0.03508357 -0.8588546  -0.4067503
 -0.40105486 -0.97857964  0.5281523   0.330094   -0.7866828   0.81732273
  0.00468749  0.1993788  -0.4197264   0.5276412  -0.32318825  0.3651291
  0.9651705  -0.48043516  0.6205012   0.2113672   0.23307216 -0.29409468
 -0.33037513 -1.0234779  -0.8973463   0

In [24]:
model.save("word2vec.model")

## Similar Words

In [25]:
similar_words = model.wv.most_similar('dad', topn=5)
print(" silimal words 'dad':", similar_words)

 silimal words 'dad': [('mom', 0.9473050236701965), ('daddy', 0.8601670265197754), ('walt', 0.8335169553756714), ('mum', 0.8324417471885681), ('alex', 0.8322091102600098)]


### Create DataFrame

In [26]:
words = list(model.wv.index_to_key)
word_vectors = [model.wv[word] for word in words]
df_word_vectors = pd.DataFrame(word_vectors, index=words)

### Save the DataFrame as a CSV file

In [27]:
df_word_vectors.to_csv('word_vectors.csv')