In [3]:
# Input data files are available in the "../input/" directory.
# Any results you write to the current directory are saved as output.

import os
import pandas as pd
from tqdm.auto import tqdm
import re
import pickle
import time, datetime
from nltk import ngrams
from collections import Counter, defaultdict

tqdm.pandas()
DIRECTORY = "E:/Scrapped-Data/TED-Talks"

def timestamp():
    ts = time.time()
    st = datetime.datetime.fromtimestamp(ts).strftime('%H:%M:%S:')
    return st
print(timestamp(), 'Libraries Imported')

NOT_WORDS_REGEX = re.compile(r"[^a-zA-Z]")
EXTRAWHITESPACES_REGEX = re.compile(r"\s+")

data = ' '.join(pd.read_csv(DIRECTORY+'/transcripts.csv')['transcript'].progress_apply(lambda x : EXTRAWHITESPACES_REGEX.sub(' ',
                                                                                NOT_WORDS_REGEX.sub(' ', x))).values)
print(timestamp(), 'Data Read', len(data))

class LanguageModel():
    def __init__(self, n, pad_left=True, pad_right=True, left_pad_symbol='', right_pad_symbol=''):
        self.model = defaultdict(lambda: defaultdict(lambda: 0))
        self.n = n
        self.pad_left= pad_left
        self.pad_right= pad_right
        self.left_pad_symbol= left_pad_symbol 
        self.right_pad_symbol= right_pad_symbol
        
        
    def train(self, dataString):
        word_grams = ngrams(dataString.split(), self.n+1,
                            pad_left=self.pad_left, pad_right=self.pad_right, 
                            left_pad_symbol=self.left_pad_symbol, right_pad_symbol=self.right_pad_symbol)
        for w in word_grams:
            self.model[w[:-1]][w[-1]] += 1
        for w1 in self.model:
            total_count = float(sum(self.model[w1].values()))
            for w2 in self.model[w1]:
                self.model[w1][w2] /= total_count
    
    def predict(self, query):
        try:
            return dict(self.model[query])
        except Exception as e:
            print(e)
            return ''
        
mypybot = LanguageModel(n=3)
mypybot.train(data)
print(timestamp(), 'Model Trained')
s = pickle.dumps(mypybot)
print(timestamp(), 'Model Pickled')

22:49:20: Libraries Imported


HBox(children=(IntProgress(value=0, max=2467), HTML(value='')))


22:49:28: Data Read 27313338
22:49:59: Model Trained


AttributeError: Can't pickle local object 'LanguageModel.__init__.<locals>.<lambda>'

In [4]:
import dill

In [None]:
dill.dump(mypybot, open('model.pkl', 'wb'))