# Importing the dataset

In [1]:
import string
import time


# ~~~~~~~~~~~~~~~IMPORTING THE DATASET~~~~~~~~~~~~~~~~
start = time.process_time() 
print("Reading the file .......")
f = open("../input/nepdata/clean.txt" , encoding= 'utf-8' , buffering= 10000)
lines = f.read().strip().split(u"।")
sentences = [sentence.translate(str.maketrans('', '', string.punctuation)) for sentence in lines]
f.close()
print(f"Total number of lines in text file {len(sentences)}")
print(f"Time required to read the file {time.process_time() - start}")

Reading the file .......
Total number of lines in text file 5891518
Time required to read the file 101.750362739


# Processing Dataset for Training

In [2]:
!pip install snowballstemmer

Collecting snowballstemmer
  Downloading snowballstemmer-2.1.0-py2.py3-none-any.whl (93 kB)
[K     |████████████████████████████████| 93 kB 665 kB/s 
[?25hInstalling collected packages: snowballstemmer
Successfully installed snowballstemmer-2.1.0


In [3]:
# ~~~~~~~~~~~~~~ Getting the dataset ready for training word2vec model ~~~~~~~~~~
import re
import snowballstemmer 
mainlist = list()
class Main_Data_list:
    def __init__(self, dataset):
        self.dataset = dataset
        self.stop_word_list = []
        self.mainlist = []
        
        a_file = open("../input/stopwords/stopwords.txt", "r" ,encoding= 'utf-8')
        for line in a_file:
            stripped_line = line.strip()
            self.stop_word_list.append(stripped_line)
        a_file.close()
        
        self.stemmer = snowballstemmer.NepaliStemmer()
        
        
    def simple_tokenizer(self,text) -> list:
        
        line = re.sub('[।]',"", text)
        
        devanagari_range = r'[\u0900-\u097F\\]'
        def getDevanagariCharCount(token):
            return len(list(filter(lambda char: re.match(devanagari_range, char), (char for char in token))))
        def isDevanagari(token):
            return True if getDevanagariCharCount(token) >= len(token)/2 else False 

        tokens = list(filter(lambda t: isDevanagari(t), line.split(" ")))
        return tokens

    def get(self):
        for i,line in enumerate(self.dataset[0:2000000]):
            
            wordsList = self.simple_tokenizer(line)
            words = [w for w in wordsList if not w in self.stop_word_list]
            words  = self.stemmer.stemWords(words)
            if len(words) > 3:
                self.mainlist.append(words)
            if i % 100000 == 0:
                print(f"DONE FOR {i/100000} LAKHS LINES")
        return self.mainlist
                
final = Main_Data_list(sentences)
mainlist = final.get()

DONE FOR 0.0 LAKHS LINES
DONE FOR 1.0 LAKHS LINES
DONE FOR 2.0 LAKHS LINES
DONE FOR 3.0 LAKHS LINES
DONE FOR 4.0 LAKHS LINES
DONE FOR 5.0 LAKHS LINES
DONE FOR 6.0 LAKHS LINES
DONE FOR 7.0 LAKHS LINES
DONE FOR 8.0 LAKHS LINES
DONE FOR 9.0 LAKHS LINES
DONE FOR 10.0 LAKHS LINES
DONE FOR 11.0 LAKHS LINES
DONE FOR 12.0 LAKHS LINES
DONE FOR 13.0 LAKHS LINES
DONE FOR 14.0 LAKHS LINES
DONE FOR 15.0 LAKHS LINES
DONE FOR 16.0 LAKHS LINES
DONE FOR 17.0 LAKHS LINES
DONE FOR 18.0 LAKHS LINES
DONE FOR 19.0 LAKHS LINES


# Training

In [4]:
import gensim

model = gensim.models.Word2Vec(
    vector_size = 200 ,
    window=  5,
    min_count=2,
    workers= 4
)

model.build_vocab(mainlist, progress_per=1000 )

model.train(mainlist, total_examples= model.corpus_count, epochs= model.epochs)

(103822776, 122202960)

# Testing

In [5]:
model.wv.most_similar('ठमेल')

[('लेकसाइड', 0.7085685729980469),
 ('जमल', 0.7031379342079163),
 ('बानेश्वर', 0.6600849628448486),
 ('सामाखुसी', 0.6546829342842102),
 ('न्युरोड', 0.6507934927940369),
 ('गोंगबु', 0.6498370170593262),
 ('बागबजार', 0.6398636102676392),
 ('कलं', 0.6395446062088013),
 ('हाइसन्चो\n', 0.6294294595718384),
 ('घण्टाघर', 0.6282877922058105)]

In [6]:
model.wv.most_similar('चितवन')

[('\nचितवन', 0.6697115898132324),
 ('भरतपुर', 0.6169254183769226),
 ('रूपन्देही', 0.5702300071716309),
 ('सौराहा', 0.5635057687759399),
 ('रत्ननगर', 0.5632916688919067),
 ('पोखरा', 0.5580011010169983),
 ('कास्', 0.5571740865707397),
 ('काभ्रे', 0.553877055644989),
 ('पटिहानी', 0.548678994178772),
 ('नारायणगढ', 0.5401238799095154)]

In [7]:
model.save("nepaliW2V_5Million.model")