################################################################################
# 1. Preparations

In [None]:
%pip install levenshtein

In [None]:
# Location of this notebook's Quantlet folder inside the repository; it is
# interpolated into the Colab working-directory path in the next cell.
QPATH = "Quantlet/3-data-preprocessing"

In [None]:
# PREPARE WORKING DIRECTORY

import os
import sys

# A Colab session has the `google.colab` module loaded; use that as the switch.
IN_COLAB = 'google.colab' in sys.modules

# On Colab, move into this Quantlet's folder under /content/drive/MyDrive;
# anywhere else, stay in the current directory.
os.chdir(
    f'/content/drive/MyDrive/ColabNotebooks/IRTG/Encode_the_Qode/Encode-the-Qode/{QPATH}'
    if IN_COLAB
    else './'
)

#sys.path.append('../src')

In [None]:
# PACKAGES

import pickle
import json
import re
import sys
from IPython.display import display

from tqdm import tqdm
import pandas as pd
import numpy as np


import matplotlib.pyplot as plt
import seaborn as sns

import nltk
# Download the NLTK 'punkt' resource.
# NOTE(review): presumably required by helpers in preprocessing_utils - confirm.
nltk.download('punkt')

from tqdm import tqdm
# Registers `progress_apply` on pandas objects; the cleaning cells further
# down rely on it.
tqdm.pandas()

import importlib
import preprocessing_utils
# Reload so that edits made to preprocessing_utils.py are picked up without
# restarting the kernel, then re-import its names into the notebook namespace.
importlib.reload(preprocessing_utils)
# NOTE(review): the star import is presumably where parse_meta,
# remove_dup_lines, remove_too_similar_line, remove_too_similar_token and
# greedy_clean come from - none of them is defined in this notebook.
from preprocessing_utils import *

#import importlib
#importlib.reload(preprocessing_utils)
#from preprocessing_utils import *

from sklearn.model_selection import train_test_split
from Levenshtein import distance

# SETTINGS

import warnings

# Silence FutureWarning messages so they do not clutter the notebook output.
warnings.filterwarnings('ignore', category=FutureWarning)

# Let pandas print up to 500 characters per cell before truncating.
pd.options.display.max_colwidth = 500

# 2. Processing

In [None]:
# Load the pickled Quantlet DataFrame.
# An earlier run read '../../data/preprocessed/Quantlet/Parsed_Qs_with_code_25062023.pkl'
# instead of the reduced file used here.
# NOTE: pickle executes arbitrary code on load - only open files you produced yourself.
with open('../../data/preprocessed/Quantlet/Qs_reduced_23092023.pkl', 'rb') as pkl_handle:
    df = pickle.load(pkl_handle)

In [None]:
# Seed passed as `random_state` to both train_test_split calls and to the
# shuffle of `full_train`.
RS = 42
# When True, `code_script` is additionally passed through `greedy_clean`
# before the data is split.
CLEAN_ALL = False

In [None]:
# Keep only rows that actually have a metainfo file ('empty' marks a missing one).
df = df[df.metainfo_file!='empty']
print(df.shape)

# Parse metainfo file
# Skipped when the frame already carries the parsed columns (e.g. a pickle
# that was saved after this step).
if 'Keywords' not in df.columns:
    # parse_meta is applied row by row; result_type='expand' spreads the five
    # values it returns over five columns, which are named positionally here.
    # Building the frame straight from the apply result keeps df's index (needed
    # for the axis=1 concat below) and avoids filling a pre-declared empty frame
    # whose column list did not even mention 'Authors'.
    meta_info = df.apply(parse_meta, axis='columns', result_type='expand')
    meta_info.columns = ['Quantlet', 'Description', 'Keywords', 'Authors', 'Other']

    # Everything downstream treats these fields as text (len / split).
    meta_info = meta_info.astype(str)

    df = pd.concat([df, meta_info], axis=1)

    # Drop the raw metainfo and helper columns that are no longer needed.
    del df['metainfo_file']
    del df['Other']
    del df['script_name_no_ext']

In [None]:
# PREPARE THE SCRIPT
def _join_nonempty_lines(lines):
    """Drop zero-length entries from `lines` and join the rest with single spaces."""
    return ' '.join([line for line in lines if len(line) > 0])


# Collapse each script from a sequence of lines into one space-separated string.
df['code_script'] = df['code_script'].apply(_join_nonempty_lines)


In [None]:
# Size features: characters in the script, characters and
# whitespace-separated words in the description.
df['scr_n'] = df['code_script'].map(len)
df['description_len'] = df['Description'].map(len)
df['description_n_words'] = df['Description'].map(lambda text: len(text.split()))

# Give the frame a clean 0..n-1 index, then continue on an independent copy.
df = df.reset_index(drop=True)
df_long = df.copy()

In [None]:
# ADD REPO INFORMATION
# The repo name is the first path component that follows 'QuantLet/' in folder_name.
_path_after_root = df_long['folder_name'].str.split('QuantLet/', expand=True)[1]
df_long['repo'] = _path_after_root.str.split('/', n=1, expand=True)[0]

In [None]:
# Number of rows per repository.
#
# Planned grouping of entries into 4 groups:
#   - no neighbors
#   - less than 5 neighbors
#   - between 5 and 10 neighbors
#   - more than 10 neighbors
# NOTE(review): "neighbors" presumably means other entries from the same
# repo - confirm before implementing the grouping.
df_long['repo'].value_counts()

In [None]:
# ANALYZE LENGTH OF THE CODE SNIPPET
# Length of the script before any de-duplication.
df_long['code_len'] = df_long['code_script'].progress_apply(len)

# Cleaning passes, applied in order; after each pass the current script length
# is stored in the named column:
#   1. remove duplicated code lines
#   2. remove lines that are too similar to each other
#   3. remove tokens that are too similar to each other
# NOTE(review): passes 1 and 2 both write 'new_len', so only the length after
# pass 2 survives - confirm that is intended.
_cleaning_passes = (
    (remove_dup_lines, 'new_len'),
    (remove_too_similar_line, 'new_len'),
    (remove_too_similar_token, 'new_len2'),
)
for _clean_fn, _len_col in _cleaning_passes:
    df_long['code_script'] = df_long['code_script'].progress_apply(_clean_fn)
    df_long[_len_col] = df_long['code_script'].progress_apply(len)

# Discard scripts that ended up empty after cleaning and renumber the rows.
df_long = df_long[df_long['new_len2'] != 0].reset_index(drop=True)

In [None]:
#df_long['code_script'] = df_long['code_script'].progress_apply(cut_300)

In [None]:
# Optional extra cleaning pass, switched on by the CLEAN_ALL flag.
# NOTE(review): greedy_clean is not defined in this notebook - presumably it
# comes from preprocessing_utils; confirm what exactly it strips.
if CLEAN_ALL:
    df_long['code_script'] = df_long['code_script'].progress_apply(greedy_clean)

In [None]:
# SPLIT THE DATA
# The split is done on unique Quantlet names rather than on rows, so all rows
# that share a Quantlet name land in exactly one partition.
unique_quantlets = list(df_long['Quantlet'].unique())

# 10% of the Quantlets are held out for testing; of the remaining (labelled)
# ones, another 10% go to validation. Both splits are seeded with RS.
labelled_qs, test_qs = train_test_split(unique_quantlets,
                                        test_size=0.1,
                                        random_state=RS)
train_qs, val_qs = train_test_split(labelled_qs,
                                    test_size=0.1,
                                    random_state=RS)

train = df_long[df_long['Quantlet'].isin(set(train_qs))].reset_index(drop=True)
val   = df_long[df_long['Quantlet'].isin(set(val_qs))].reset_index(drop=True)
test  = df_long[df_long['Quantlet'].isin(set(test_qs))].reset_index(drop=True)

# full_train = all labelled data (train + val), shuffled.
# The held-out test rows must NOT be part of it: this line previously
# concatenated train and test, which leaked the test set into the data used
# for final training and left the validation rows unused.
full_train = pd.concat([train, val], axis=0).sample(frac=1, random_state=RS).reset_index(drop=True)

In [None]:
# Persist every partition as CSV (without the index column).
_csv_dir = '../../data/preprocessed/Quantlet'
for _split_name, _split_df in (('full_train', full_train),
                               ('train', train),
                               ('val', val),
                               ('test', test)):
    _split_df.to_csv(f'{_csv_dir}/{_split_name}_df_20230923.csv', index=False)


# Sanity check: size of each partition and its relative mix of script types.
for _split_df in (train, val, test):
    print(_split_df.shape)
    print(_split_df['type_script'].value_counts(normalize=True))

In [None]:
# BUILD THE JSON DATASETS FOR EVERY CONTEXT MODE
#
# For each mode the four CSV partitions are re-read, the code is optionally
# prefixed with repo and/or author context, and the (code, description) pairs
# are written to '<data dir>/<MODE>/<split>_dataset_20230923.json'.

_DATA_DIR = '../../data/preprocessed/Quantlet'


def _add_context(frame, mode):
    """Prefix frame['code_script'] in place with the context selected by `mode`.

    mode is one of 'no_context' (leave the code untouched), 'repo', 'author'
    or 'both'. Any other value raises ValueError.
    """
    if mode == 'no_context':
        return
    if mode == 'repo':
        prefix = 'Repo: ' + frame['repo'] + ' ; '
    elif mode == 'author':
        prefix = 'Author: ' + frame['Authors'] + ' ; '
    elif mode == 'both':
        # Exactly one ' ; ' between the two parts (the separator used to be
        # emitted twice here, unlike in the single-context modes).
        prefix = 'Repo: ' + frame['repo'] + ' ; ' + 'Author: ' + frame['Authors'] + ' ; '
    else:
        raise ValueError(f'Unknown context mode: {mode!r}')
    frame['code_script'] = prefix + frame['code_script']


def _to_dataset_json(frame):
    """Return the {'version', 'data'} dict with one input/output pair per row."""
    return {'version': '1.0',
            'data': [{'input_sequence': code, 'output_sequence': description}
                     for code, description in zip(frame['code_script'], frame['Description'])]}


for MODE in ['no_context', 'author', 'repo', 'both']:
    # Re-read on every iteration: the context prefix is written into the
    # frames, so each mode has to start from the untouched CSV content.
    full_train = pd.read_csv(f'{_DATA_DIR}/full_train_df_20230923.csv')
    train = pd.read_csv(f'{_DATA_DIR}/train_df_20230923.csv')
    val   = pd.read_csv(f'{_DATA_DIR}/val_df_20230923.csv')
    test  = pd.read_csv(f'{_DATA_DIR}/test_df_20230923.csv')

    # FIX NA
    # Rows of the test split whose Quantlet was read back as NaN get this name.
    test.loc[test['Quantlet'].isna(), 'Quantlet'] = 'XFGexp_rtn_SRM_2d_DOENST RUN'

    # full_train is treated exactly like the other partitions: it used to be
    # exported without the Authors fix and without any context prefix, so it
    # did not match train/val/test in the 'author', 'repo' and 'both' modes.
    for _frame in (full_train, train, val, test):
        _frame['Authors'] = _frame['Authors'].fillna('Unknown')
        _add_context(_frame, MODE)

    #elif add_quantlet:
    #   train.loc[:,'code_script'] = 'Quantlet: ' + train['Quantlet'] + ' ; ' + train['code_script']
    #   val.loc[:,'code_script']   = 'Quantlet: ' + val['Quantlet'] + ' ; ' + val['code_script']
    #   test.loc[:,'code_script'] = 'Quantlet: ' + test['Quantlet'] + ' ; ' + test['code_script']

    train_dataset_json = _to_dataset_json(train)
    val_dataset_json = _to_dataset_json(val)
    full_train_dataset_json = _to_dataset_json(full_train)
    test_dataset_json = _to_dataset_json(test)

    # Create the per-mode folder on first use instead of failing with
    # FileNotFoundError when it does not exist yet.
    _out_dir = f'{_DATA_DIR}/{MODE}'
    os.makedirs(_out_dir, exist_ok=True)

    for _split_name, _payload in (('full_train', full_train_dataset_json),
                                  ('train', train_dataset_json),
                                  ('val', val_dataset_json),
                                  ('test', test_dataset_json)):
        with open(f'{_out_dir}/{_split_name}_dataset_20230923.json', 'w') as f:
            json.dump(_payload, f)