################################################################################
# 1. Preparations

In [7]:
%%capture
%pip install levenshtein

In [8]:
QPATH = "Quantlet/3-data-preprocessing"

In [9]:
# PREPARE WORKING DIRECTORY
import re
import random
import sys

IN_COLAB = "google.colab" in sys.modules

import os

if IN_COLAB:
    os.chdir(
        f"/content/drive/MyDrive/ColabNotebooks/IRTG/Encode_the_Qode/Encode-the-Qode/{QPATH}"
    )
else:
    os.chdir("./")

# sys.path.append('../src')

In [10]:
%%capture
# PACKAGES

import pickle
import json
import re
import sys
from IPython.display import display

from tqdm import tqdm
import pandas as pd
import numpy as np


import matplotlib.pyplot as plt
import seaborn as sns

import nltk

nltk.download("punkt")

from tqdm import tqdm

tqdm.pandas()

import importlib
import preprocessing_utils

importlib.reload(preprocessing_utils)
from preprocessing_utils import *

from sklearn.model_selection import train_test_split
from Levenshtein import distance

# SETTINGS

import warnings

warnings.simplefilter(action="ignore", category=FutureWarning)

pd.set_option("display.max_colwidth", 500)

# 2. Processing

In [11]:
DATE = "20231119_normal"
RS = 111

In [15]:
DATE1 = "20231104"
with open(
    f"../../data/preprocessed/Quantlet/{DATE1}/Quantlets_{DATE1}.pkl", "rb"
) as file:
    df = pickle.load(file)

In [16]:
df = df_metainfo_parse(df=df,
                    prepare_script=True,
                    remove_other=True,
                    remove_empty=False)

df = clean_up(df)
print(df.shape)

(5018, 6)
(5017, 12)


100%|██████████| 5017/5017 [00:29<00:00, 169.65it/s]
100%|██████████| 5017/5017 [00:57<00:00, 87.41it/s]   
100%|██████████| 5017/5017 [00:00<00:00, 12277.91it/s]
100%|██████████| 5017/5017 [00:01<00:00, 3179.84it/s]
100%|██████████| 5017/5017 [00:00<00:00, 933866.91it/s]

(5009, 13)





In [17]:
df['script_name_no_ext'] = df.script_name.str.split('.', expand=True)[0]
df['main_script'] = df['script_name_no_ext']==df['Quantlet']
df = df.loc[df['main_script']==True, :]

In [18]:
# ADDITIONAL PREPROCESSING OF DESCRIPTIONS

# remove parentheses
df.Description = df.Description.str.replace(r"\(.+?\)", "", regex=True)

# remove URL
df.Description = df.Description.str.replace(
r"""(?i)\b((?:https?:(?:/{1,3}|[a-z0-9%])|[a-z0-9.\-]+[.](?:com|net|org|edu|gov|mil|aero|asia|biz|cat|coop|info|int|jobs|mobi|museum|name|post|pro|tel|travel|xxx|ac|ad|ae|af|ag|ai|al|am|an|ao|aq|ar|as|at|au|aw|ax|az|ba|bb|bd|be|bf|bg|bh|bi|bj|bm|bn|bo|br|bs|bt|bv|bw|by|bz|ca|cc|cd|cf|cg|ch|ci|ck|cl|cm|cn|co|cr|cs|cu|cv|cx|cy|cz|dd|de|dj|dk|dm|do|dz|ec|ee|eg|eh|er|es|et|eu|fi|fj|fk|fm|fo|fr|ga|gb|gd|ge|gf|gg|gh|gi|gl|gm|gn|gp|gq|gr|gs|gt|gu|gw|gy|hk|hm|hn|hr|ht|hu|id|ie|il|im|in|io|iq|ir|is|it|je|jm|jo|jp|ke|kg|kh|ki|km|kn|kp|kr|kw|ky|kz|la|lb|lc|li|lk|lr|ls|lt|lu|lv|ly|ma|mc|md|me|mg|mh|mk|ml|mm|mn|mo|mp|mq|mr|ms|mt|mu|mv|mw|mx|my|mz|na|nc|ne|nf|ng|ni|nl|no|np|nr|nu|nz|om|pa|pe|pf|pg|ph|pk|pl|pm|pn|pr|ps|pt|pw|py|qa|re|ro|rs|ru|rw|sa|sb|sc|sd|se|sg|sh|si|sj|Ja|sk|sl|sm|sn|so|sr|ss|st|su|sv|sx|sy|sz|tc|td|tf|tg|th|tj|tk|tl|tm|tn|to|tp|tr|tt|tv|tw|tz|ua|ug|uk|us|uy|uz|va|vc|ve|vg|vi|vn|vu|wf|ws|ye|yt|yu|za|zm|zw)/)(?:[^\s()<>{}\[\]]+|\([^\s()]*?\([^\s()]+\)[^\s()]*?\)|\([^\s]+?\))+(?:\([^\s()]*?\([^\s()]+\)[^\s()]*?\)|\([^\s]+?\)|[^\s`!()\[\]{};:'".,<>?«»“”‘’])|(?:(?<!@)[a-z0-9]+(?:[.\-][a-z0-9]+)*[.](?:com|net|org|edu|gov|mil|aero|asia|biz|cat|coop|info|int|jobs|mobi|museum|name|post|pro|tel|travel|xxx|ac|ad|ae|af|ag|ai|al|am|an|ao|aq|ar|as|at|au|aw|ax|az|ba|bb|bd|be|bf|bg|bh|bi|bj|bm|bn|bo|br|bs|bt|bv|bw|by|bz|ca|cc|cd|cf|cg|ch|ci|ck|cl|cm|cn|co|cr|cs|cu|cv|cx|cy|cz|dd|de|dj|dk|dm|do|dz|ec|ee|eg|eh|er|es|et|eu|fi|fj|fk|fm|fo|fr|ga|gb|gd|ge|gf|gg|gh|gi|gl|gm|gn|gp|gq|gr|gs|gt|gu|gw|gy|hk|hm|hn|hr|ht|hu|id|ie|il|im|in|io|iq|ir|is|it|je|jm|jo|jp|ke|kg|kh|ki|km|kn|kp|kr|kw|ky|kz|la|lb|lc|li|lk|lr|ls|lt|lu|lv|ly|ma|mc|md|me|mg|mh|mk|ml|mm|mn|mo|mp|mq|mr|ms|mt|mu|mv|mw|mx|my|mz|na|nc|ne|nf|ng|ni|nl|no|np|nr|nu|nz|om|pa|pe|pf|pg|ph|pk|pl|pm|pn|pr|ps|pt|pw|py|qa|re|ro|rs|ru|rw|sa|sb|sc|sd|se|sg|sh|si|sj|Ja|sk|sl|sm|sn|so|sr|ss|st|su|sv|sx|sy|sz|tc|td|tf|tg|th|tj|tk|tl|tm|tn|to|tp|tr|tt|tv|tw|tz|ua|ug|uk|us|uy|uz|va|vc|ve|vg|vi|vn|vu|wf|ws|ye|yt|yu|za|zm|zw)\b/?(?!@)))""",
"",
regex=True)

# ADDITIONAL PREPROCESSING OF CODE
df.code_script = df.code_script.str.replace(r"#", "", regex=True)
df.loc[df.type_script == "m", "code_script"] = df.loc[df.type_script == "m", "code_script"].str.replace(r"\%", " ", regex=True)

df.loc[df.type_script == "r", "code_script"] = df.loc[df.type_script == "r", "code_script"].str.replace(r"\$", " ", regex=True)

# remove the same sign repeated more than 4 times
df.code_script = df.code_script.str.replace(r"(.)\1{4,}", r"\1", regex=True)

In [19]:
df['Description_ID'] = df.groupby('Description').ngroup()

In [20]:
df["Q_ID"] = df.index

folder_to_save = f"../../data/preprocessed/Quantlet/{DATE}/"
if not os.path.exists(folder_to_save):
    os.mkdir(folder_to_save)

#df.to_csv(f'{folder_to_save}full_{DATE}.csv', index=False)

In [21]:
#df = pd.read_csv(f'{folder_to_save}full_{DATE}.csv')

In [22]:
def shuffle_tokens(code_snippet):
    cs_list = code_snippet.split()
    shuffled_list = random.sample(cs_list, len(cs_list)) 
    return ' '.join(shuffled_list)

def remove_parentheses(code_snippet):
    regex = r'\w+'
    return ' '.join(re.findall(regex, code_snippet))

def remove_non_text(code_snippet):
    regex = r'\w+'
    cs_list = re.findall(regex, code_snippet)
    cs_list = [word for word in cs_list if (len(word)>1) and not word.isdigit()]
    return ' '.join(cs_list)

In [23]:
# CHANGE THE ORDER OF TOKENS
# Random Order
#df.code_script = df.code_script.apply(shuffle_tokens)

# Operands removal
#df.code_script = df.code_script.apply(remove_parentheses)

In [24]:
# SPLIT THE DATA GROUP QUANTLET
labelled_descr_id, test_descr_id = train_test_split(list(df.Description_ID.unique()),
                test_size=0.1,
                random_state=RS)
train_descr_id, val_descr_id = train_test_split(labelled_descr_id,
                test_size=0.1,
                random_state=RS)

In [25]:
full_train = df.loc[df.Description_ID.isin(labelled_descr_id)]
train = df.loc[df.Description_ID.isin(train_descr_id)]
val = df.loc[df.Description_ID.isin(val_descr_id)]
test = df.loc[df.Description_ID.isin(test_descr_id)]

In [27]:
save_datasets(full_train, train, val, test, "20231119_normal", RS, 'code_script', False, True)    

(2735, 17)
r     0.562706
m     0.231444
py    0.205850
Name: type_script, dtype: float64
(305, 17)
r     0.544262
py    0.229508
m     0.226230
Name: type_script, dtype: float64
(327, 17)
r     0.584098
m     0.247706
py    0.168196
Name: type_script, dtype: float64
(2735, 17)
r     1539
m      633
py     563
Name: type_script, dtype: int64
(305, 17)
r     166
py     70
m      69
Name: type_script, dtype: int64
(327, 17)
r     191
m      81
py     55
Name: type_script, dtype: int64


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_column(loc, value, pi)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train["Authors"] = train["Authors"].fillna("Unknown")
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  val["Authors"] = val["Authors"].fillna("Unknown")
A value is trying to be set on a copy of a slice from a D

In [28]:
# create bootstrap
SIZE = test.shape[0]
indices = range(SIZE)
N_SAMPLES = 35

for sample in tqdm(range(1, N_SAMPLES)):
    np.random.seed(sample)
    sample_idx = np.random.choice(indices, size=SIZE, replace=True)
    sample_df = test.iloc[sample_idx, : ].reset_index(drop=True)
    sample_df.to_csv(f'../../data/preprocessed/Quantlet/20231119_normal/test_df_sample_{sample}.csv', index=False)

    group_test = sample_df
    test_dataset_json = {'version' : sample,
                        'data' : [{'input_sequence'  : group_test['code_script'].iloc[i],
                                'output_sequence'  : group_test['Description'].iloc[i]} for i in range(group_test.shape[0])]}
    with open(f'../../data/preprocessed/Quantlet/20231119_normal/test_dataset_sample_{sample}.json', 'w') as f:
        json.dump(test_dataset_json, f)

100%|██████████| 34/34 [00:01<00:00, 20.84it/s]
