## FastText Embeddings
- Code to train FastText embeddings on Bengali and Hindi code-mixed data.

In [2]:
!pip install fasttext

Collecting fasttext
[?25l  Downloading https://files.pythonhosted.org/packages/f8/85/e2b368ab6d3528827b147fdb814f8189acc981a4bc2f99ab894650e05c40/fasttext-0.9.2.tar.gz (68kB)
[K     |████████████████████████████████| 71kB 1.1MB/s eta 0:00:01
Building wheels for collected packages: fasttext
Failed to build fasttext
Installing collected packages: fasttext
  Running setup.py install for fasttext ... [?25ldone
[?25hSuccessfully installed fasttext-0.9.2


In [1]:
import fasttext
import pandas as pd

In [2]:
# Location of the Hindi code-mixed CSV, relative to the notebook's working directory.
data_path = "Data_Processed/Shared_Task_hin/all.csv"

In [3]:
# Load the whole CSV into memory.
df = pd.read_csv(data_path)

In [4]:
# Preview the first rows to check the columns that were read.
df.head()

Unnamed: 0,ID,Text,Label
0,1,Bollywood film dekhne ke samay logic ghar mein...,0
1,2,Chutiya movie...,0
2,3,Us jaat bnde ka khene ka matlab tha mar daluga...,0
3,4,@Feminism Is CANCER *un feminist yeh sahi hai ...,0
4,5,Amrit Anand अब तो जुड़े ही है उनको बोलो जुड़ने,0


In [5]:
# Only the text column is used from here on.
data = df['Text'].values

In [6]:
# Spot-check one raw sample before cleaning.
data[101]

'Agr tujhe Kabir Singh ka message smjh nahi aaya.....toh Joker khaa se smjh aa\ngayi...... Doctor m bhi hn....isse jaada bkchodi krte h hm😂'

## Clean Data

In [7]:
import re
import demoji

In [8]:
 def removeEmojis(text):
     """Return ``text`` after passing it through ``demoji.replace`` with an empty replacement."""
     # NOTE(review): this `def` starts with one stray leading space. IPython strips
     # it when running the cell, but plain Python would reject the indentation.
     without_emojis = demoji.replace(text, repl="")
     return without_emojis

In [9]:
def removeUrls(text):
    """Delete every run of non-whitespace characters that starts with ``http``.

    The match needs at least one character after ``http``, so a bare ``http``
    is left untouched. Whitespace around a removed URL is kept.
    """
    url_pattern = re.compile(r"http\S+")
    return url_pattern.sub("", text)

In [10]:
def removeSpecialChar(text):
    """Drop every character that is not an ASCII letter, a dot, a digit or whitespace.

    Punctuation other than ``.`` is removed, and so are letters outside
    ``a-z``/``A-Z`` (for example Devanagari script). Dots are kept because
    they are used later as sentence separators.

    Args:
        text: Input string.

    Returns:
        The filtered string.
    """
    # Raw string: `\d` and `\s` in a normal string literal are invalid escape
    # sequences, which newer Python versions warn about.
    res = re.sub(r'[^a-zA-Z.\d\s]', '', text)
    return res

In [11]:
def replaceMultipleDotsByOneDot(text):
    """Collapse each run of one or more consecutive dots into a single dot."""
    dot_run = re.compile(r'\.+')
    return dot_run.sub(".", text)

In [12]:
# Cleaning stages, applied to every sample in exactly this order.
cleaners = (replaceMultipleDotsByOneDot, removeEmojis, removeUrls, removeSpecialChar)

data_clean = []
for sample in data:
    for clean in cleaners:
        sample = clean(sample)
    data_clean.append(sample)

In [13]:
# Same sample as above, after cleaning.
data_clean[101]

'Agr tujhe Kabir Singh ka message smjh nahi aaya.toh Joker khaa se smjh aa\ngayi. Doctor m bhi hn.isse jaada bkchodi krte h hm'

## Tokenise

In [14]:
# Collects one token list per sentence, across all samples.
sent_tokens=[]

In [15]:
# Newlines count as sentence breaks too: turn them into dots, then split on dots.
# Empty sentences still contribute an (empty) token list.
# Re-running this cell appends to sent_tokens again, so reset the list first.
for sample in data_clean:
    sentences = sample.replace('\n', '.').split('.')
    sent_tokens.extend(
        # Split on single spaces and drop the empty strings left by repeated spaces.
        [word for word in sentence.split(' ') if word]
        for sentence in sentences
    )

In [16]:
# Number of sentence-level token lists vs. number of cleaned samples.
len(sent_tokens),len(data_clean)

(17815, 6181)

In [17]:
# Spot-check one tokenised sentence.
sent_tokens[1023]

['SAHI', 'HAI', 'AGAR', 'HUME', 'WORLD', 'K', 'SAATH', 'AAGE', 'BADHNA', 'HAI']

In [18]:
# Find the longest sentence (in tokens) and the index of its first occurrence.
maxV, maxI = 0, 0
for i, sent in enumerate(sent_tokens):
    n_tokens = len(sent)
    if n_tokens > maxV:
        maxV, maxI = n_tokens, i

In [19]:
# Longest sentence length (in tokens) and the index where it first occurs.
maxV,maxI

(20, 13663)

The maximum sentence length is now small (20 tokens), which is good: this is roughly the upper end of normal sentence lengths.

## Train

In [22]:
from gensim.models import FastText

In [None]:
# Train 300-dimensional vectors on the tokenised sentences (sg=1, window of 5, 4 workers).
# NOTE(review): `size=` is the pre-4.0 gensim keyword; gensim 4.x names it
# `vector_size` — confirm against the installed gensim version.
model = FastText(sent_tokens, size=300, window=5, workers=4,sg=1)

In [20]:
# Path used to save and reload the Hindi code-mixed model.
res_path = "Embeddings/hin_codemixed.model"

In [38]:
# Persist the trained model so it can be reloaded later with FastText.load.
model.save(res_path)

In [23]:
# Reload the saved model from disk; presumably this lets later cells run without retraining.
model = FastText.load(res_path)

In [40]:
# Sanity check: inspect the nearest neighbours of a common word.
model.wv.most_similar("ladki")

[('ladka', 0.9974102973937988),
 ('ladko', 0.9958645105361938),
 ('ladkio', 0.9957106113433838),
 ('ladkia', 0.9944829940795898),
 ('ladke', 0.9934585094451904),
 ('ladkiya', 0.9902150630950928),
 ('ladkon', 0.9894428849220276),
 ('ladkiyo', 0.9855499863624573),
 ('galat', 0.9852472543716431),
 ('sakta', 0.983690619468689)]

In [24]:
# res_path is rebound here: it now points to the word2vec-format export, not the .model file.
res_path = "Embeddings/hin_codemixed.vec"

In [25]:
# Export the word vectors in word2vec format.
model.wv.save_word2vec_format(res_path)

## Bengali

In [43]:
# Switch to the Bengali code-mixed data.
# data_path, df and data are rebound, so the Hindi frame is no longer reachable under these names.
data_path = "Data_Processed/Shared_Task_iben/all.csv"
df = pd.read_csv(data_path)
data = df['Text'].values

In [44]:
# Same cleaning order as for the Hindi data, innermost call first:
# collapse dot runs -> strip emojis -> strip URLs -> strip special characters.
data_clean = [
    removeSpecialChar(removeUrls(removeEmojis(replaceMultipleDotsByOneDot(sample))))
    for sample in data
]

In [45]:
# One token list per sentence: newlines are treated as sentence breaks, sentences
# are split on '.', and words are split on single spaces with empty strings dropped.
# Empty sentences still yield an (empty) token list.
sent_tokens = [
    [word for word in sentence.split(' ') if word]
    for sample in data_clean
    for sentence in sample.replace('\n', '.').split('.')
]

In [46]:
# Number of sentence-level token lists vs. number of cleaned Bengali samples.
len(sent_tokens),len(data_clean)

(9936, 5971)

In [47]:
# Spot-check one tokenised Bengali sentence.
sent_tokens[1023]

['Ranu', 'mondol', 'kore', 'shudhu', 'gondogol']

In [48]:
# Longest sentence (in tokens) and the index of its first occurrence.
# Both stay 0 when there are no sentences or every sentence is empty.
lengths = [len(sent) for sent in sent_tokens]
maxV = max(lengths, default=0)
maxI = lengths.index(maxV) if maxV > 0 else 0

In [49]:
# Longest sentence length (in tokens) and the index where it first occurs.
maxV,maxI

(20, 408)

In [50]:
# Train the Bengali model with the same hyper-parameters as the Hindi one.
# NOTE(review): `size=` is the pre-4.0 gensim keyword; gensim 4.x names it
# `vector_size` — confirm against the installed gensim version.
model = FastText(sent_tokens, size=300, window=5, workers=4,sg=1)

In [51]:
# Path used to save the Bengali code-mixed model.
res_path = "Embeddings/iben_codemixed.model"

In [52]:
# Persist the trained Bengali model.
model.save(res_path)

In [26]:
# Reload the saved Bengali model and export its word vectors in word2vec format.
# Note that res_path is rebound inside this cell: afterwards it points to the
# .vec export, not to the .model file.
res_path = "Embeddings/iben_codemixed.model"
model = FastText.load(res_path)
res_path = "Embeddings/iben_codemixed.vec"
model.wv.save_word2vec_format(res_path)

## Encode

In [128]:
def encode_codemixed(self,data,max_len,model):
    """Encode each text row as the mean of its word vectors.

    Each row is split on whitespace, every token is lower-cased and looked up
    in ``model.wv``, and the row is represented by the average of the vectors
    found. Tokens whose lookup raises ``KeyError`` contribute a zero vector.

    Args:
        self: Unused. Kept so existing positional calls keep working.
        data: Iterable of strings, one text per row.
        max_len: Unused. Kept so existing positional calls keep working.
        model: Object exposing ``model.wv[word]`` lookups. When ``model.wv``
            has a ``vector_size`` attribute it defines the vector dimension;
            otherwise 300 is assumed.

    Returns:
        ``np.ndarray`` with one float64 row per input text. A row without any
        token is encoded as the zero vector.
    """
    dim = getattr(model.wv, "vector_size", 300)
    new_data = []

    for row in data:
        vectors = []
        # split() without an argument breaks on any run of whitespace, so
        # repeated spaces no longer produce empty "words" that would count as
        # unknown tokens and pull the mean towards zero.
        for word in row.split():
            try:
                vectors.append(model.wv[word.lower()])
            except KeyError:
                # unknown word
                vectors.append(np.zeros(dim))
        if vectors:
            # Average once over the collected vectors (in float64) instead of
            # growing an array with np.append on every token.
            new_data.append(np.mean(np.asarray(vectors, dtype=np.float64), axis=0))
        else:
            new_data.append(np.zeros(dim))
    return np.array(new_data)

In [4]:
# Load the Bengali model from its native save file. The path is spelled out
# because, when the notebook runs top to bottom, res_path was last bound to
# the word2vec-format .vec export, which FastText.load cannot read.
model = FastText.load("Embeddings/iben_codemixed.model")

In [8]:
# Reload the raw Bengali text to encode. This is the uncleaned 'Text' column.
data_path = "Data_Processed/Shared_Task_iben/all.csv"
df = pd.read_csv(data_path)
data = df['Text'].values

In [13]:
import numpy as np

In [129]:
# 'hi' only fills the function's first positional slot (`self`); 128 is passed as max_len.
# NOTE(review): `data` is the raw text here, not the cleaned text the
# embeddings were trained on — confirm this is intended.
val = encode_codemixed('hi',data,128,model)

In [131]:
# Expect one encoded vector per input row.
val.shape,len(data)

((5971, 300), 5971)