################################################################################
# 1. Preparations

In [None]:
%pip install levenshtein

In [None]:
# Project-relative folder of this notebook; it is inserted into the Colab working-directory path below.
QPATH = "Quantlet/Data_preprocessing"

In [None]:
# PREPARE WORKING DIRECTORY

import os
import sys

# True when the Google Colab module is already loaded in this interpreter.
IN_COLAB = 'google.colab' in sys.modules

# On Colab switch into the project folder under /content/drive; otherwise stay in the current directory.
os.chdir(
    f'/content/drive/MyDrive/ColabNotebooks/IRTG/Encode_the_Qode/Encode-the-Qode/{QPATH}'
    if IN_COLAB
    else './'
)

# Make modules located in ../src importable from this notebook.
sys.path.append('../src')

In [None]:
# PACKAGES

import pickle
import json
import re
import sys
from IPython.display import display

from tqdm import tqdm
import pandas as pd
import numpy as np


import matplotlib.pyplot as plt
import seaborn as sns

import nltk
nltk.download('punkt')

from tqdm import tqdm
tqdm.pandas()
import preprocessing_utils

import importlib
importlib.reload(preprocessing_utils)
from preprocessing_utils import *

from sklearn.model_selection import train_test_split
from Levenshtein import distance

# SETTINGS

import warnings
# Suppress FutureWarning messages for the rest of the session.
warnings.simplefilter(action='ignore', category=FutureWarning)

# Do not truncate long cell contents when DataFrames are displayed.
pd.set_option('display.max_colwidth', None)

# 2. Processing

In [None]:
# Load the pickled DataFrame of parsed Quantlets.
# NOTE: pickle executes arbitrary code on load; only open files from a trusted source.
with open('../../data/preprocessed/Quantlet/Parsed_Qs_with_code_25062023.pkl', mode='rb') as pkl_handle:
    df = pickle.load(pkl_handle)

In [None]:
# Seed passed as random_state to the train/validation/test splits below.
RS = 42

In [None]:
# Keep only the rows whose metainfo file is not the 'empty' placeholder.
df = df.loc[df['metainfo_file'].ne('empty')]
print(df.shape)

In [None]:
# Parse metainfo file
# Skipped when the frame already carries a 'Keywords' column (i.e. it was parsed before).
if 'Keywords' not in df.columns:
    meta_info = pd.DataFrame(columns=['Quantlet', 'Description', 'Keywords', 'Other'])

    # parse_meta is applied to every row; its expanded result fills these five columns.
    meta_info[['Quantlet', 'Description', 'Keywords', 'Authors', 'Other']] = df.apply(
        parse_meta,
        axis='columns',
        result_type='expand',
    )

    # Store every parsed field as text.
    meta_info = meta_info.astype(str)

    df = pd.concat([df, meta_info], axis=1)

    # Drop the raw metainfo and the columns that are not used further on.
    for obsolete_column in ('metainfo_file', 'Other', 'script_name', 'script_name_no_ext'):
        del df[obsolete_column]

In [None]:
# A Quantlet has multiple scripts when its code_script contains nested lists.
df['multiple_scripts'] = df['code_script'].apply(
    lambda scripts: any(isinstance(part, list) for part in scripts)
)

# code_script_joined always holds a list of script strings (one entry per script).
df['code_script_joined'] = ''
df.loc[df['multiple_scripts'].eq(True), 'code_script_joined'] = (
    df.loc[df['multiple_scripts'].eq(True), 'code_script']
    .apply(lambda scripts: [''.join(script) for script in scripts])
)
df.loc[df['multiple_scripts'].ne(True), 'code_script_joined'] = (
    df.loc[df['multiple_scripts'].ne(True), 'code_script']
    .apply(lambda script: [''.join(script)])
)

# Number of scripts plus character and word counts of the description.
df['scr_n'] = df['code_script_joined'].apply(len)
df['description_len'] = df['Description'].apply(len)
df['description_n_words'] = df['Description'].apply(lambda text: len(text.split()))

df = df.reset_index(drop=True)

In [None]:
# Placeholder columns; explode_code_and_lang fills them per script and
# falls back to code_script / type_script where they stay NaN.
df['main_script'] = np.nan
df['main_type_script'] = np.nan

In [None]:
def explode_code_and_lang(df):
    """Return a new frame with one row per script.

    Rows flagged with ``multiple_scripts`` are repeated once for every entry
    of their ``code_script`` list; each copy gets that entry in
    ``main_script`` and the matching entry of ``type_script`` in
    ``main_type_script``. All other rows are carried over once, and their
    ``main_script`` / ``main_type_script`` are filled from ``code_script`` /
    ``type_script``.

    Rows are collected in a list and the frame is built once, because
    ``DataFrame.append`` no longer exists in pandas 2.0 and re-copied the
    whole frame on every call.

    Args:
        df: DataFrame with the columns ``multiple_scripts``, ``code_script``,
            ``type_script``, ``main_script`` and ``main_type_script``.

    Returns:
        The exploded DataFrame with a fresh 0..n-1 index.
    """
    print(f'Shape before exploding scripts: {df.shape}')

    exploded_rows = []
    for _, row in tqdm(df.iterrows()):
        # Explicit comparison on purpose: a NaN flag must not count as True.
        if row['multiple_scripts'] == True:
            for i, script in enumerate(row['code_script']):
                # Work on a copy so every script gets its own row object.
                script_row = row.copy()
                script_row['main_script'] = script
                script_row['main_type_script'] = row['type_script'][i]
                exploded_rows.append(script_row)
        else:
            exploded_rows.append(row)

    # Passing the columns keeps the layout even when there are no rows.
    new_df = pd.DataFrame(exploded_rows, columns=df.columns)

    new_df['main_script'] = new_df['main_script'].fillna(new_df['code_script'])
    new_df['main_type_script'] = new_df['main_type_script'].fillna(new_df['type_script'])

    new_df = new_df.reset_index(drop=True)
    print(f'Shape after exploding scripts: {new_df.shape}')
    return new_df


In [None]:
df_long = explode_code_and_lang(df)

# Join the pieces of every script into one string; empty scripts become NaN.
df_long['code_script'] = df_long['main_script'].progress_apply(
    lambda pieces: np.nan if len(pieces) == 0 else ''.join(pieces)
)
df_long['type_script'] = df_long['main_type_script']

# Remove the helper columns that were only needed while exploding.
df_long = df_long.drop(columns=['main_type_script', 'main_script', 'code_script_joined'])

# Keep only rows that have both a description and code.
df_long = df_long.dropna(subset=['Description', 'code_script'])

In [None]:
def add_docstring_comment_tags_py(string):
    """Wrap Python docstrings and ``#`` comments in marker tags.

    Triple-quoted strings become ``<DOCSTR START>...<DOCSTR END>`` followed
    by a newline; everything after a run of ``#`` up to the end of the line
    becomes ``<COMMENT S> ... <COMMENT E>``. Carriage returns are removed
    first.

    Both kinds are handled in a single left-to-right pass so that a ``#``
    inside a docstring is not tagged as a comment. Earlier versions passed
    ``re.DOTALL`` as the ``count`` argument of ``re.sub`` (limiting the
    result to 16 replacements) and used ``#*``, which also matched lines
    without any comment marker.

    Limitation: a ``#`` inside an ordinary string literal is still treated
    as the start of a comment.

    Args:
        string: Python source code.

    Returns:
        The source code with tagged docstrings and comments.
    """
    result = string.replace('\r', '')
    # First alternative: triple-quoted block; second: comment up to end of line.
    pattern = re.compile(
        r'("""|\'\'\')(?P<doc>.*?)\1|#+(?P<com>[^\n]*)',
        re.DOTALL,
    )

    def _tag(match):
        doc = match.group('doc')
        if doc is not None:
            return f'<DOCSTR START>{doc}<DOCSTR END>\n'
        return f"<COMMENT S> {match.group('com')} <COMMENT E>"

    return pattern.sub(_tag, result)

def add_docstring_comment_tags_r(string):
    """Wrap R documentation blocks and ``#`` comments in marker tags.

    A block delimited by two lines consisting of ``#'`` becomes
    ``<DOCSTR START>...<DOCSTR END>`` followed by a newline; everything
    after a run of ``#`` up to the end of the line becomes
    ``<COMMENT S> ... <COMMENT E>``. Carriage returns are removed first.

    Both kinds are handled in a single left-to-right pass. Earlier versions
    tagged comments first, which rewrote the ``#'`` delimiters so the block
    pattern could never match; they also passed ``re.DOTALL`` as the
    ``count`` argument of ``re.sub`` (limiting the result to 16
    replacements) and used ``#*``, which matched lines without any marker.

    Limitation: a ``#`` inside a string literal is still treated as the
    start of a comment.

    Args:
        string: R source code.

    Returns:
        The source code with tagged documentation blocks and comments.
    """
    result = string.replace('\r', '')
    # First alternative: #'-delimited block; second: comment up to end of line.
    pattern = re.compile(
        r"#'\n(?P<doc>.*?)\n#'|#+(?P<com>[^\n]*)",
        re.DOTALL,
    )

    def _tag(match):
        doc = match.group('doc')
        if doc is not None:
            return f'<DOCSTR START>{doc}<DOCSTR END>\n'
        return f"<COMMENT S> {match.group('com')} <COMMENT E>"

    return pattern.sub(_tag, result)

def add_docstring_comment_tags_matlab(string):
    """Wrap MATLAB block comments and ``%`` comments in marker tags.

    A block delimited by ``%{`` and ``%}`` lines becomes
    ``<DOCSTR START>...<DOCSTR END>`` followed by a newline; everything
    after a run of ``%`` up to the end of the line becomes
    ``<COMMENT S> ... <COMMENT E>``. Carriage returns are removed first.

    Both kinds are handled in a single left-to-right pass. Earlier versions
    tagged comments first, which rewrote the ``%{`` / ``%}`` delimiters so
    the block pattern could never match; they also passed ``re.DOTALL`` as
    the ``count`` argument of ``re.sub`` (limiting the result to 16
    replacements), used ``%*`` (which matched lines without any marker and
    even empty strings) and appended a newline that the pattern had not
    consumed. Line breaks are now left untouched, as in the Python and R
    variants.

    Limitation: a ``%`` inside a string literal (e.g. a format string) is
    still treated as the start of a comment.

    Args:
        string: MATLAB source code.

    Returns:
        The source code with tagged block comments and comments.
    """
    result = string.replace('\r', '')
    # First alternative: %{ ... %} block; second: comment up to end of line.
    pattern = re.compile(
        r"%\{\n(?P<doc>.*?)\n%\}|%+(?P<com>[^\n]*)",
        re.DOTALL,
    )

    def _tag(match):
        doc = match.group('doc')
        if doc is not None:
            return f'<DOCSTR START>{doc}<DOCSTR END>\n'
        return f"<COMMENT S> {match.group('com')} <COMMENT E>"

    return pattern.sub(_tag, result)

def add_docstring_comment_tags(string, lang):
    """Tag docstrings and comments in ``string`` according to its language.

    Args:
        string: Source code to process.
        lang: Language code: ``'py'`` (Python), ``'m'`` (MATLAB) or
            ``'r'`` (R).

    Returns:
        The source code with docstrings and comments wrapped in tags.

    Raises:
        ValueError: If ``lang`` is not one of the supported codes
            (previously this surfaced as an ``UnboundLocalError``).
    """
    if lang == 'py':
        return add_docstring_comment_tags_py(string)
    if lang == 'm':
        return add_docstring_comment_tags_matlab(string)
    if lang == 'r':
        return add_docstring_comment_tags_r(string)
    raise ValueError(f"Unsupported language {lang!r}; expected 'py', 'm' or 'r'")

In [None]:
# Script types stored as a list and notebook scripts ('ipynb') are both relabelled as 'py'.
df_long.loc[df_long['type_script'].map(lambda kind: isinstance(kind, list)), 'type_script'] = 'py'
df_long.loc[df_long['type_script'].eq('ipynb'), 'type_script'] = 'py'

In [None]:
# ANALYZE LENGTH OF THE CODE SNIPPET

# Length of every script in characters, before any cleaning step below.
df_long['code_len'] = df_long['code_script'].progress_apply(len)

In [None]:
# remove duplicate lines
def remove_dup_lines(row):
    """Drop every line that already occurred earlier in the snippet.

    Only the first occurrence of each line is kept (this includes blank
    lines) and the original order is preserved. ``dict.fromkeys`` does the
    de-duplication in one pass instead of scanning a list for every line.

    Args:
        row: Code snippet as a single newline-separated string.

    Returns:
        The snippet without repeated lines.
    """
    return '\n'.join(dict.fromkeys(row.split('\n')))

In [None]:
# Drop repeated lines from every script and record the new length in characters.
df_long['code_script'] = df_long['code_script'].progress_apply(remove_dup_lines)
df_long['new_len'] = df_long['code_script'].progress_apply(len)

In [None]:
def remove_too_similar_line(row, inf_gain=0.4):
    """Drop lines that barely differ from the previously kept line.

    A line is kept when its Levenshtein distance to the last kept line,
    divided by the length of that last kept line, is at least ``inf_gain``.
    The first line is always kept.

    The divisor is clamped to 1 so that an empty previously kept line (e.g.
    a blank line in the code) no longer raises ``ZeroDivisionError``: after
    a blank line any non-empty line is kept and a further blank line is
    dropped.

    Args:
        row: Code snippet as a single newline-separated string.
        inf_gain: Minimum relative edit distance a line needs to be kept.

    Returns:
        The snippet without near-duplicate consecutive lines.
    """
    code_lines = row.split('\n')
    # str.split always returns at least one element, so this keeps line 0.
    cleaned_up = code_lines[:1]
    for code_line in code_lines[1:]:
        previous = cleaned_up[-1]
        relative_change = distance(code_line, previous) / max(len(previous), 1)
        if relative_change >= inf_gain:
            cleaned_up.append(code_line)
    return '\n'.join(cleaned_up)

def remove_too_similar_token(row, inf_gain=0.4):
    """Drop tokens that barely differ from the previously kept token.

    Every line is split on whitespace. Within a line the first token is
    always kept; a later token is kept when its Levenshtein distance to the
    last kept token, divided by that token's length, is at least
    ``inf_gain``. Kept tokens are re-joined with single spaces.

    Args:
        row: Code snippet as a single newline-separated string.
        inf_gain: Minimum relative edit distance a token needs to be kept.

    Returns:
        The snippet with near-duplicate neighbouring tokens removed.
    """
    def _thin_out(tokens):
        # Filter one line's tokens and return them joined by spaces.
        kept = tokens[:1]
        for candidate in tokens[1:]:
            last_kept = kept[-1]
            if distance(candidate, last_kept) / len(last_kept) >= inf_gain:
                kept.append(candidate)
        return ' '.join(kept)

    return '\n'.join(_thin_out(line.split()) for line in row.split('\n'))

In [None]:
# Drop near-duplicate consecutive lines and record the new length in characters.
# NOTE: this overwrites the 'new_len' value stored after duplicate-line removal.
df_long['code_script'] = df_long['code_script'].progress_apply(remove_too_similar_line)
df_long['new_len'] = df_long['code_script'].progress_apply(len)

In [None]:
# Drop near-duplicate neighbouring tokens and record the new length in characters.
df_long['code_script'] = df_long['code_script'].progress_apply(remove_too_similar_token)
df_long['new_len2'] = df_long['code_script'].progress_apply(len)

In [None]:
def cut_300(row, max_tokens=2500):
    """Truncate a snippet to its first ``max_tokens`` whitespace tokens.

    Despite the function name, the default limit is 2500 tokens; the limit
    is now a parameter instead of a hard-coded constant. All whitespace
    (including line breaks) is collapsed to single spaces.

    Args:
        row: Code snippet as a string.
        max_tokens: Maximum number of tokens to keep.

    Returns:
        The kept tokens joined by single spaces.
    """
    return ' '.join(row.split()[:max_tokens])

In [None]:
#df_long['code_script'] = df_long['code_script'].progress_apply(cut_300)

In [None]:
def greedy_clean(code_snippet):
    """Reduce a snippet to its words of more than two characters.

    Every run of non-word characters is replaced by a space, then only
    tokens longer than two characters are kept. The pattern is a raw string
    because ``'\\W'`` in a normal string literal is an invalid escape
    sequence that Python warns about.

    Args:
        code_snippet: Code snippet as a string.

    Returns:
        The remaining words joined by single spaces.
    """
    words_only = re.sub(r'\W+', ' ', code_snippet)
    return ' '.join(word for word in words_only.split() if len(word) > 2)

In [None]:
# When True, every script is reduced to plain words by greedy_clean before the data is split.
CLEAN_ALL = True

In [None]:
# Optional aggressive cleaning: strip non-word characters and tokens of at most two characters.
if CLEAN_ALL:
    df_long['code_script'] = df_long['code_script'].progress_apply(greedy_clean)

In [None]:
# Split on Quantlet names, so all scripts of one Quantlet end up in the same subset:
# 10% of the names go to test, then 10% of the remaining names go to validation.
labelled_qs, test_qs = train_test_split(
    list(df_long['Quantlet'].unique()),
    test_size=0.1,
    random_state=RS,
)
train_qs, val_qs = train_test_split(labelled_qs, test_size=0.1, random_state=RS)

# Select the rows belonging to each group of Quantlet names.
train, val, test = (
    df_long[df_long['Quantlet'].isin(set(names))]
    for names in (train_qs, val_qs, test_qs)
)

In [None]:
# Write the three splits to CSV without the index column.
for split_df, csv_path in (
    (train, '../../data/preprocessed/Quantlet/train_df_hard_clean.csv'),
    (val, '../../data/preprocessed/Quantlet/val_df_hard_clean.csv'),
    (test, '../../data/preprocessed/Quantlet/test_df_hard_clean.csv'),
):
    split_df.to_csv(csv_path, index=False)

In [None]:
# Size of the training split and the relative share of each script type.
print(train.shape)
print(train['type_script'].value_counts(normalize=True))

In [None]:
# Size of the validation split and the relative share of each script type.
print(val.shape)
print(val['type_script'].value_counts(normalize=True))

In [None]:
# Size of the test split and the relative share of each script type.
print(test.shape)
print(test['type_script'].value_counts(normalize=True))

In [None]:
# Reload the splits from disk, replacing the in-memory frames created above.
# NOTE(review): these are the *_df_aut.csv files, not the *_df_hard_clean.csv files
# written earlier in this notebook, while the JSON saved at the end is named
# *_hard_clean.json - confirm this is the intended input.
train = pd.read_csv('../../data/preprocessed/Quantlet/train_df_aut.csv')
val   = pd.read_csv('../../data/preprocessed/Quantlet/val_df_aut.csv')
test  = pd.read_csv('../../data/preprocessed/Quantlet/test_df_aut.csv')

In [None]:
# Switches for prefixing each code snippet with its Quantlet name ('Repo: ...')
# and/or its authors ('Author: ...') further below.
add_repo = False
add_aut  = False

In [None]:
# FIX NA
# Rows of the test split with a missing Quantlet name get this fixed name.
test.loc[test['Quantlet'].isna(), 'Quantlet'] = 'XFGexp_rtn_SRM_2d_DOENST RUN'

# Missing authors are labelled 'Unknown' in every split.
for split_df in (train, val, test):
    split_df['Authors'] = split_df['Authors'].fillna('Unknown')

In [None]:
# Optionally prepend 'Repo: <Quantlet> ; ' and/or 'Author: <Authors> ; ' to every code snippet.
if add_repo or add_aut:
    for split_df in (train, val, test):
        prefix = ''
        if add_repo:
            prefix = prefix + 'Repo: ' + split_df['Quantlet'] + ' ; '
        if add_aut:
            prefix = prefix + 'Author: ' + split_df['Authors'] + ' ; '
        split_df.loc[:, 'code_script'] = prefix + split_df['code_script']

# 3. Save the Data

In [None]:
def _to_dataset_json(split_df):
    """Pack the code/description pairs of one split into the dataset layout."""
    return {
        'version': '0.6',
        'data': [
            {'input_sequence': code, 'output_sequence': description}
            for code, description in zip(split_df['code_script'], split_df['Description'])
        ],
    }


train_dataset_json = _to_dataset_json(train)
val_dataset_json = _to_dataset_json(val)
test_dataset_json = _to_dataset_json(test)

# Write one JSON file per split.
for dataset_json, json_path in (
    (train_dataset_json, '../../data/preprocessed/Quantlet/train_dataset_hard_clean.json'),
    (val_dataset_json, '../../data/preprocessed/Quantlet/val_dataset_hard_clean.json'),
    (test_dataset_json, '../../data/preprocessed/Quantlet/test_dataset_hard_clean.json'),
):
    with open(json_path, 'w') as f:
        json.dump(dataset_json, f)