################################################################################
# 1. Preparations

In [None]:
%pip install levenshtein

In [None]:
# Folder of this notebook inside the project checkout; it is appended to the
# Google Drive project path when the working directory is set on Colab.
QPATH = "Quantlet/3-data-preprocessing"

In [None]:
# PREPARE WORKING DIRECTORY

import os
import sys

# The notebook counts as running on Colab when the google.colab module is loaded.
IN_COLAB = 'google.colab' in sys.modules

# On Colab switch into the notebook folder of the Drive checkout;
# anywhere else stay in the current directory.
os.chdir(
    f'/content/drive/MyDrive/ColabNotebooks/IRTG/Encode_the_Qode/Encode-the-Qode/{QPATH}'
    if IN_COLAB
    else './'
)

#sys.path.append('../src')

In [None]:
# PACKAGES

import pickle
import json
import re
import sys
from IPython.display import display

from tqdm import tqdm
import pandas as pd
import numpy as np


import matplotlib.pyplot as plt
import seaborn as sns

import nltk
nltk.download('punkt')

from tqdm import tqdm
tqdm.pandas()

import importlib
import preprocessing_utils
importlib.reload(preprocessing_utils)
from preprocessing_utils import *

#import importlib
#importlib.reload(preprocessing_utils)
#from preprocessing_utils import *

from sklearn.model_selection import train_test_split
from Levenshtein import distance

# SETTINGS

import warnings

# Do not print FutureWarning messages.
warnings.simplefilter('ignore', FutureWarning)

# Never truncate the content of wide columns when a frame is displayed.
pd.options.display.max_colwidth = None

# 2. Processing

In [None]:
# Load the parsed Quantlet dump into `df`.
# NOTE: unpickling executes arbitrary code, so only load files from a trusted source.
with open('../../data/preprocessed/Quantlet/Parsed_Qs_with_code_25062023.pkl', mode='rb') as pkl_handle:
    df = pickle.load(pkl_handle)

In [None]:
# Seed passed as random_state to both train_test_split calls.
RS = 42
# When True, greedy_clean is additionally applied to every code script.
CLEAN_ALL = False

In [None]:
# Keep only the rows whose metainfo file is not marked as 'empty'.
has_metainfo = df['metainfo_file'] != 'empty'
df = df[has_metainfo]
print(df.shape)

# Parse metainfo file (skipped when the parsed columns already exist)
if 'Keywords' not in df.columns:
    meta_info = pd.DataFrame(columns=['Quantlet', 'Description', 'Keywords', 'Other'])

    # parse_meta is applied row by row; its expanded result fills these fields.
    parsed_fields = ['Quantlet', 'Description', 'Keywords', 'Authors', 'Other']
    meta_info[parsed_fields] = df.apply(
        parse_meta,
        axis='columns',
        result_type='expand',
    )

    # Store every parsed field as a string.
    for field in meta_info.columns:
        meta_info[field] = meta_info[field].astype(str)

    df = pd.concat([df, meta_info], axis=1)

    # Drop the raw metainfo and the columns that are not used further on.
    for obsolete in ('metainfo_file', 'Other', 'script_name', 'script_name_no_ext'):
        del df[obsolete]

In [None]:
# PREPARE THE SCRIPT
# A row holds several scripts when at least one element of code_script is itself a list.
df['multiple_scripts'] = df['code_script'].apply(lambda x: any(isinstance(i, list) for i in x))
# code_script_joined becomes a list of strings, one string per script:
#  - several scripts: every element of code_script is joined separately
#  - single script:   the whole code_script is joined into a one-element list
df['code_script_joined'] = ''
df.loc[df['multiple_scripts']==True, 'code_script_joined'] = df.loc[df['multiple_scripts']==True, 'code_script'].apply(lambda x: [''.join(code_script) for code_script in x])
df.loc[df['multiple_scripts']!=True, 'code_script_joined'] = df.loc[df['multiple_scripts']!=True, 'code_script'].apply(lambda x: [''.join(x)])
# Number of scripts per row, plus description length in characters and in
# whitespace-separated words.
df['scr_n'] = df['code_script_joined'].apply(len)
df['description_len'] = df['Description'].apply(len)
df['description_n_words'] = df['Description'].apply(lambda x: len(x.split()))
df = df.reset_index(drop=True)

# Placeholder columns; presumably filled by explode_code_and_lang below - TODO confirm.
df['main_script'] = np.nan
df['main_type_script'] = np.nan


# CREATE ONE LONG FORMAT DATASET
# one script - one description

df_long = explode_code_and_lang(df)

# Join main_script into one string; an empty main_script becomes NaN.
# NOTE(review): len() raises on a NaN placeholder, so this assumes
# explode_code_and_lang sets main_script on every row - confirm.
df_long['code_script'] = df_long['main_script'].progress_apply(lambda x: ''.join(x) if len(x)>0 else np.nan)
df_long['type_script'] = df_long['main_type_script']

# The helper columns are no longer needed once code_script/type_script exist.
del df_long['main_type_script']
del df_long['main_script']
del df_long['code_script_joined']

# Keep only rows that have both a description and a script.
df_long = df_long[df_long['Description'].notna()]
df_long = df_long[df_long['code_script'].notna()]

# PREPARE IPYNB
# Rows whose type_script is a list, and rows typed 'ipynb', are relabelled as 'py'.
df_long.loc[df_long['type_script'].apply(lambda x: isinstance(x, list)), 'type_script'] = 'py'
df_long.loc[df_long['type_script']=='ipynb', 'type_script'] = 'py'

In [None]:
# ANALYZE LENGTH OF THE CODE SNIPPET
# Character count of every script before any cleaning step.
df_long['code_len'] = df_long['code_script'].progress_apply(len)

# REMOVE CODE LINE DUPLICATES
# No length is recorded after this step: the value previously stored in
# 'new_len' here was overwritten by the next step before anything read it.
df_long['code_script'] = df_long['code_script'].progress_apply(remove_dup_lines)

# REMOVE TOO SIMILAR LINES
# we want to get as much information
df_long['code_script'] = df_long['code_script'].progress_apply(remove_too_similar_line)
# Character count after both line-level cleaning steps.
df_long['new_len'] = df_long['code_script'].progress_apply(len)

# REMOVE TOO SIMILAR TOKENS
df_long['code_script'] = df_long['code_script'].progress_apply(remove_too_similar_token)
# Character count after the token-level cleaning step.
df_long['new_len2'] = df_long['code_script'].progress_apply(len)

In [None]:
# ADD REPO INFORMATION
# The repo is the first path component that follows 'QuantLet/' in folder_name.
path_after_root = df_long['folder_name'].str.split('QuantLet/', expand=True)[1]
df_long['repo'] = path_after_root.str.split('/', expand=True)[0]

In [None]:
#df_long['code_script'] = df_long['code_script'].progress_apply(cut_300)

In [None]:
# Apply greedy_clean to every code script, but only when CLEAN_ALL is set.
if CLEAN_ALL:
    df_long['code_script'] = df_long['code_script'].progress_apply(greedy_clean)


In [None]:
# SPLIT THE DATA
# The split is drawn over unique Quantlet names, so all rows that share a
# Quantlet end up in the same subset.
all_quantlets = list(df_long['Quantlet'].unique())

# First hold out 10% of the Quantlets for testing ...
labelled_qs, test_qs = train_test_split(all_quantlets, test_size=0.1, random_state=RS)
# ... then hold out 10% of the remainder for validation.
train_qs, val_qs = train_test_split(labelled_qs, test_size=0.1, random_state=RS)


def _rows_of(quantlet_names):
    """Return the rows of df_long whose Quantlet is one of `quantlet_names`."""
    return df_long[df_long['Quantlet'].isin(set(quantlet_names))]


train = _rows_of(train_qs)
val = _rows_of(val_qs)
test = _rows_of(test_qs)

In [None]:
# Write every split to its CSV file (without the index column).
csv_targets = (
    (train, '../../data/preprocessed/Quantlet/train_df_20230922.csv'),
    (val, '../../data/preprocessed/Quantlet/val_df_20230922.csv'),
    (test, '../../data/preprocessed/Quantlet/test_df_20230922.csv'),
)
for split_df, csv_path in csv_targets:
    split_df.to_csv(csv_path, index=False)


# Report the size of every split and its relative share of script types.
for split_df, _ in csv_targets:
    print(split_df.shape)
    print(split_df['type_script'].value_counts(normalize=True))

In [None]:
# Read the three splits back from their CSV files.
train, val, test = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../../data/preprocessed/Quantlet/train_df_20230922.csv',
        '../../data/preprocessed/Quantlet/val_df_20230922.csv',
        '../../data/preprocessed/Quantlet/test_df_20230922.csv',
    )
)

In [None]:
# Switches that select which metadata is prepended to each code script.
add_repo = False      # prepend 'Repo: <repo>'
add_quantlet = False  # prepend 'Quantlet: <Quantlet>'
add_aut  = True       # prepend 'Author: <Authors>'

In [None]:
# FIX NA
# Rows of the test split without a Quantlet name get this fixed name.
# NOTE(review): the name is hard-coded and only applied to the test split;
# presumably it belongs to one specific row - confirm if the split changes.
quantlet_missing = test['Quantlet'].isna()
test.loc[quantlet_missing, 'Quantlet'] = 'XFGexp_rtn_SRM_2d_DOENST RUN'

# Missing authors are labelled 'Unknown' in every split.
for split_df in (train, val, test):
    split_df['Authors'] = split_df['Authors'].fillna('Unknown')

In [None]:
# Prepend the selected metadata to the code script of every split.
#
# Only one branch is applied: all three fields when every flag is set,
# otherwise the first enabled field in the order repo, Quantlet, author.
#
# Fixes compared with the previous version of the all-fields branch:
#  - val and test took their Quantlet names from `train`; string
#    concatenation of Series aligns on the index, so this produced wrong or
#    missing values. Each split now uses its own columns.
#  - the ' ; ' separator was missing after the repo and duplicated after the
#    Quantlet; every field is now followed by exactly one ' ; '.
for split_df in (train, val, test):
    if add_repo and add_aut and add_quantlet:
        split_df.loc[:, 'code_script'] = (
            'Repo: ' + split_df['repo'] + ' ; '
            + 'Quantlet: ' + split_df['Quantlet'] + ' ; '
            + 'Author: ' + split_df['Authors'] + ' ; '
            + split_df['code_script']
        )
    elif add_repo:
        split_df.loc[:, 'code_script'] = 'Repo: ' + split_df['repo'] + ' ; ' + split_df['code_script']
    elif add_quantlet:
        split_df.loc[:, 'code_script'] = 'Quantlet: ' + split_df['Quantlet'] + ' ; ' + split_df['code_script']
    elif add_aut:
        split_df.loc[:, 'code_script'] = 'Author: ' + split_df['Authors'] + ' ; ' + split_df['code_script']

# 3. Save the Data

In [None]:
def _to_seq2seq(frame):
    """Pair each code script (input) with its description (output), row by row."""
    records = [
        {'input_sequence': source, 'output_sequence': target}
        for source, target in zip(frame['code_script'], frame['Description'])
    ]
    return {'version': '1.0', 'data': records}


train_dataset_json = _to_seq2seq(train)
val_dataset_json = _to_seq2seq(val)
test_dataset_json = _to_seq2seq(test)

# Write each dataset to its own JSON file.
json_targets = (
    (train_dataset_json, '../../data/preprocessed/Quantlet/train_dataset_a_20230922.json'),
    (val_dataset_json, '../../data/preprocessed/Quantlet/val_dataset_a_20230922.json'),
    (test_dataset_json, '../../data/preprocessed/Quantlet/test_dataset_a_20230922.json'),
)
for dataset, json_path in json_targets:
    with open(json_path, 'w') as f:
        json.dump(dataset, f)