In [1]:
# Input data files are available in the "../input/" directory.
# Any results you write to the current directory are saved as output.

import os
import pandas as pd
from tqdm.auto import tqdm
import re
import pickle
import time, datetime
from nltk import ngrams
from collections import Counter, defaultdict

def timestamp():
    ts = time.time()
    st = datetime.datetime.fromtimestamp(ts).strftime('%H:%M:%S:')
    return st
print(timestamp(), 'Libraries Imported')

with open('../input/python_code.txt','r') as f:
    data = f.read()
print(timestamp(), 'File Read', len(data))

class LanguageModel():
    def __init__(self, n, pad_left=True, pad_right=True, left_pad_symbol='', right_pad_symbol=''):
        self.model = defaultdict(lambda: defaultdict(lambda: 0))
        self.n = n
        self.pad_left= pad_left
        self.pad_right= pad_right
        self.left_pad_symbol= left_pad_symbol 
        self.right_pad_symbol= right_pad_symbol
        
        
    def train(self, dataString):
        word_grams = ngrams(dataString.split(), self.n+1,
                            pad_left=self.pad_left, pad_right=self.pad_right, 
                            left_pad_symbol=self.left_pad_symbol, right_pad_symbol=self.right_pad_symbol)
        for w in word_grams:
            self.model[w[:-1]][w[-1]] += 1
        for w1 in self.model:
            total_count = float(sum(self.model[w1].values()))
            for w2 in self.model[w1]:
                self.model[w1][w2] /= total_count
    
    def predict(self, query):
        try:
            return dict(self.model[query])
        except Exception as e:
            print(e)
            return ''

mypybot = LanguageModel(n=3)
mypybot.train(data)
print(timestamp(), 'Model Trained')
s = pickle.dumps(mypybot)
print(timestamp(), 'Model Pickled')

14:20:43: Libraries Imported


In [2]:
DOCSTRING_REGEX = re.compile(r"\"\"\"(.|\n)*?\"\"\"")
COMMENT_REGEX = re.compile(r"#.*")
NOT_WORDS_REGEX = re.compile(r"[^a-zA-Z]")
EXTRAWHITESPACES_REGEX = re.compile(r"\s+")

PYTHON_DIR = 'D:\\Users\\Ritvik\\Anaconda3\\envs\\datascience\\lib'
count = 0
with open('E:/Scrapped-Data/MyPyBot/python_code.txt','a') as input_f:
    for path, directories, files in tqdm(os.walk(PYTHON_DIR)):
        for file in files:
            if file.endswith('.py'):
                try:
                    with open(os.path.join(path, file), 'r') as data_f:
                        contents = DOCSTRING_REGEX.sub('', data_f.read())
                        contents = COMMENT_REGEX.sub('', contents)
                        contents = NOT_WORDS_REGEX.sub(' ', contents)
                    input_f.write(EXTRAWHITESPACES_REGEX.sub(' ', contents))
                    input_f.write('\n')
                    count += 1
                except Exception as e:
                    print(os.path.join(path, file) ,str(e))
                    pass
                    
print(timestamp(), 'Data Extracted', count) 

HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))

D:\Users\Ritvik\Anaconda3\envs\datascience\lib\functools.py 'charmap' codec can't decode byte 0x81 in position 301: character maps to <undefined>
D:\Users\Ritvik\Anaconda3\envs\datascience\lib\shlex.py 'charmap' codec can't decode byte 0x81 in position 1434: character maps to <undefined>
D:\Users\Ritvik\Anaconda3\envs\datascience\lib\sre_compile.py 'charmap' codec can't decode byte 0x90 in position 1139: character maps to <undefined>
D:\Users\Ritvik\Anaconda3\envs\datascience\lib\distutils\tests\test_archive_util.py 'charmap' codec can't decode byte 0x81 in position 2764: character maps to <undefined>
D:\Users\Ritvik\Anaconda3\envs\datascience\lib\distutils\tests\test_log.py 'charmap' codec can't decode byte 0x8d in position 1113: character maps to <undefined>
D:\Users\Ritvik\Anaconda3\envs\datascience\lib\distutils\tests\test_msvccompiler.py 'charmap' codec can't decode byte 0x81 in position 3315: character maps to <undefined>
D:\Users\Ritvik\Anaconda3\envs\datascience\lib\pydoc_data\

D:\Users\Ritvik\Anaconda3\envs\datascience\lib\site-packages\pandas\tests\io\json\test_readlines.py 'charmap' codec can't decode byte 0x9d in position 872: character maps to <undefined>
D:\Users\Ritvik\Anaconda3\envs\datascience\lib\site-packages\pandas\tests\io\msgpack\test_pack.py 'charmap' codec can't decode byte 0x81 in position 813: character maps to <undefined>
D:\Users\Ritvik\Anaconda3\envs\datascience\lib\site-packages\pandas\tests\io\parser\python_parser_only.py 'charmap' codec can't decode byte 0x9d in position 2790: character maps to <undefined>
D:\Users\Ritvik\Anaconda3\envs\datascience\lib\site-packages\pandas\tests\io\parser\test_read_fwf.py 'charmap' codec can't decode byte 0x9d in position 2864: character maps to <undefined>
D:\Users\Ritvik\Anaconda3\envs\datascience\lib\site-packages\pandas\tests\io\parser\usecols.py 'charmap' codec can't decode byte 0x81 in position 12643: character maps to <undefined>
D:\Users\Ritvik\Anaconda3\envs\datascience\lib\site-packages\panda

D:\Users\Ritvik\Anaconda3\envs\datascience\lib\site-packages\spacy\lang\fa\tokenizer_exceptions.py 'charmap' codec can't decode byte 0x81 in position 1851: character maps to <undefined>
D:\Users\Ritvik\Anaconda3\envs\datascience\lib\site-packages\spacy\lang\fa\lemmatizer\_adjectives.py 'charmap' codec can't decode byte 0x81 in position 81: character maps to <undefined>
D:\Users\Ritvik\Anaconda3\envs\datascience\lib\site-packages\spacy\lang\fa\lemmatizer\_adjectives_exc.py 'charmap' codec can't decode byte 0x81 in position 986: character maps to <undefined>
D:\Users\Ritvik\Anaconda3\envs\datascience\lib\site-packages\spacy\lang\fa\lemmatizer\_lemma_rules.py 'charmap' codec can't decode byte 0x9d in position 1098: character maps to <undefined>
D:\Users\Ritvik\Anaconda3\envs\datascience\lib\site-packages\spacy\lang\fa\lemmatizer\_nouns.py 'charmap' codec can't decode byte 0x81 in position 344: character maps to <undefined>
D:\Users\Ritvik\Anaconda3\envs\datascience\lib\site-packages\spacy