# Data Preparation
Creates text data from the python libraries installed on your local system

Just change the PYTHON_DIR accordingly

In [2]:
import re, os
from tqdm.auto import tqdm
import time, datetime

def timestamp():
    ts = time.time()
    st = datetime.datetime.fromtimestamp(ts).strftime('%H:%M:%S:')
    return st
print(timestamp(), 'Libraries Imported')

DOCSTRING_REGEX = re.compile(r"\"\"\"(.|\n)*?\"\"\"")
COMMENT_REGEX = re.compile(r"#.*")
NOT_WORDS_REGEX = re.compile(r"[^a-zA-Z]")
EXTRAWHITESPACES_REGEX = re.compile(r"\s+")

# virtual environments and installed packages
PYTHON_DIR = ['D:\\Users\\Ritvik\\Anaconda3\\envs\\datascience\\lib', 'D:\\Users\\Ritvik\\Anaconda3\\envs\\nlp_course\\lib',
             'D:\\Users\\Ritvik\\Anaconda3\\envs\\Pyradox\\lib', 'D://Users//Ritvik//Anaconda3//envs\\tensorflow\\lib', 
             'D:\\Users\\Ritvik\\Anaconda3\\envs\\tfdeeplearning\\lib', 'D:\\Users\\Ritvik\\Anaconda3\\pkgs']
unicode_count = 0
code = []

for DIR in [PYTHON_DIR[0]]:
    for path, directories, files in tqdm(os.walk(DIR)):
        for file in files:
            if file.endswith('.py'):
                try:
                    with open(os.path.join(path, file), 'r') as data_f:
                        contents = DOCSTRING_REGEX.sub(' ', data_f.read())
                        contents = COMMENT_REGEX.sub(' ', contents)
                        contents = NOT_WORDS_REGEX.sub(' ', contents)
                        contents = EXTRAWHITESPACES_REGEX.sub(' ', contents)
                        code.append(contents)
                except UnicodeDecodeError:
                    unicode_count += 1
                except Exception as e:
                    print(os.path.join(path, file) ,str(e))

print(timestamp(), 'Data Extracted')                    
print('Files read successfully', len(code))
code = list(set(code))
print('Unique files', len(code))
with open('E:/Scrapped-Data/MyPyBot/python_code.txt','w') as input_f:                        
    input_f.write('\n'.join(code))                    
print(timestamp(), 'Data Saved')

16:05:19: Libraries Imported


HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))


16:10:25: Data Extracted
Files read successfully 14676
Unique files 13357
16:10:31: Data Saved


# Language Model 

Trained on 
1. [Kaggle Kernel](https://www.kaggle.com/ritvik1909/mypybot) using data from the first directory only
2. [Google Colaoratory](https://colab.research.google.com/drive/1dqALTey2BD-jdxRFBDzJXp0Krw7_mIYy) using data from all directories


In [None]:
import os
import pandas as pd
from tqdm.auto import tqdm
import re
import dill
import time, datetime
from nltk import ngrams
from collections import Counter, defaultdict

def timestamp():
    ts = time.time()
    st = datetime.datetime.fromtimestamp(ts).strftime('%H:%M:%S:')
    return st
print(timestamp(), 'Libraries Imported')

with open('../input/python_code.txt','r') as f:
    data = f.read()
print(timestamp(), 'File Read', len(data))

class LanguageModel():
    def __init__(self, n, pad_left=True, pad_right=True, left_pad_symbol='', right_pad_symbol=''):
        self.model = defaultdict(lambda: defaultdict(lambda: 0))
        self.n = n
        self.pad_left= pad_left
        self.pad_right= pad_right
        self.left_pad_symbol= left_pad_symbol 
        self.right_pad_symbol= right_pad_symbol
        
    def train(self, dataString):
        word_grams = ngrams(dataString.split(), self.n+1,
                            pad_left=self.pad_left, pad_right=self.pad_right, 
                            left_pad_symbol=self.left_pad_symbol, right_pad_symbol=self.right_pad_symbol)
        for w in word_grams:
            self.model[w[:-1]][w[-1]] += 1
        for w1 in self.model:
            total_count = float(sum(self.model[w1].values()))
            for w2 in self.model[w1]:
                self.model[w1][w2] /= total_count
        droplist = []
        for w1 in self.model:
            for w2 in self.model[w1]:
                if self.model[w1][w2] < 0.1:
                    droplist.append((w1, w2))
        for w1, w2 in droplist:
            del self.model[w1][w2]
        del droplist
    def predict(self, query):
        try:
            DOCSTRING_REGEX = re.compile(r"\"\"\"(.|\n)*?\"\"\"")
            COMMENT_REGEX = re.compile(r"#.*")
            NOT_WORDS_REGEX = re.compile(r"[^a-zA-Z]")
            EXTRAWHITESPACES_REGEX = re.compile(r"\s+")
            query = DOCSTRING_REGEX.sub(' ', query)
            query = COMMENT_REGEX.sub(' ', query)
            query = NOT_WORDS_REGEX.sub(' ', query)
            query = EXTRAWHITESPACES_REGEX.sub(' ', query)
            if len(query.split()) >= 3:
                query = tuple(query.split()[-3:])
            else:
                query = tuple((['', '', '']+query.split())[-3:])
            return dict(self.model[query])
        except Exception as e:
            print(e)
            return ''

mypybot = LanguageModel(n=3)
mypybot.train(data)
print(timestamp(), 'Model Trained')
# dill.dump(mypybot, open('Model.pkl', 'wb'))
# print(timestamp(), 'Model Pickled')
print(mypybot.predict(('import numpy as')))
print(mypybot.predict(('import matplotlib.pyplot as')))