################################################################################
# 1. Preparations

In [3]:
%%capture
%pip install levenshtein

In [4]:
# Sub-path of this notebook's folder; it is appended to the Colab Drive
# project path when the working directory is selected.
QPATH = "Quantlet/3-data-preprocessing"

In [5]:
# PREPARE WORKING DIRECTORY

import os
import sys

# The notebook counts as running on Colab when the google.colab module is loaded.
IN_COLAB = "google.colab" in sys.modules

# On Colab switch to the project folder on the mounted Drive; otherwise
# "./" keeps the current directory.
working_dir = (
    f"/content/drive/MyDrive/ColabNotebooks/IRTG/Encode_the_Qode/Encode-the-Qode/{QPATH}"
    if IN_COLAB
    else "./"
)
os.chdir(working_dir)

# sys.path.append('../src')

In [6]:
%%capture
# PACKAGES

import pickle
import json
import re
import sys
from IPython.display import display

from tqdm import tqdm
import pandas as pd
import numpy as np


import matplotlib.pyplot as plt
import seaborn as sns

import nltk

nltk.download("punkt")

from tqdm import tqdm

tqdm.pandas()

import importlib
import preprocessing_utils

importlib.reload(preprocessing_utils)
from preprocessing_utils import *

from sklearn.model_selection import train_test_split
from Levenshtein import distance

# SETTINGS

import warnings

warnings.simplefilter(action="ignore", category=FutureWarning)

pd.set_option("display.max_colwidth", 500)

# 2. Processing

In [7]:
# Date tag (presumably YYYYMMDD of the data snapshot) interpolated into the
# data paths used in this notebook.
DATE = "20231104"
# Seed passed as random_state to the train/val/test split and to the
# few-shot sampling; it is also handed to save_datasets.
RS = 111

In [8]:
# Load the pickled Quantlet frame for the configured DATE.
# NOTE(review): pickle.load executes arbitrary code from the file, so only
# load pickles produced by this project's own pipeline.
quantlets_pickle_path = f"../../data/preprocessed/Quantlet/{DATE}/Quantlets_{DATE}.pkl"
with open(quantlets_pickle_path, "rb") as pickle_handle:
    df = pickle.load(pickle_handle)

In [9]:
# Parse the Quantlet meta information; the three flags are forwarded
# unchanged to df_metainfo_parse (presumably from preprocessing_utils).
df = df_metainfo_parse(
    df=df,
    prepare_script=True,
    remove_other=True,
    remove_empty=False,
)

# Run the project's clean-up step and report the resulting frame size.
df = clean_up(df)
print(df.shape)

(5018, 6)
(5017, 12)


100%|██████████| 5017/5017 [00:39<00:00, 125.48it/s]
100%|██████████| 5017/5017 [01:04<00:00, 77.71it/s]  
100%|██████████| 5017/5017 [00:00<00:00, 9213.52it/s] 
100%|██████████| 5017/5017 [00:01<00:00, 2572.92it/s]
100%|██████████| 5017/5017 [00:00<00:00, 399559.92it/s]

(5009, 13)





In [10]:
# Keep only the main script of each Quantlet: the script whose file name
# (the text before the first ".") equals the Quantlet name.
# NOTE(review): splitting on the first "." also truncates script names that
# themselves contain dots, so such scripts never match -- confirm intended.
df['script_name_no_ext'] = df.script_name.str.split('.', expand=True)[0]
df['main_script'] = df['script_name_no_ext'] == df['Quantlet']

# .copy() makes the filtered frame independent of the original one, so
# assigning columns on it later does not raise SettingWithCopyWarning.
df = df.loc[df['main_script'], :].copy()

In [11]:
# ADDITIONAL PREPROCESSING OF DESCRIPTIONS

# drop every parenthesised fragment "(...)" together with its parentheses
# (shortest match, at least one character inside)
parenthesised_text = r"\(.+?\)"
df["Description"] = df["Description"].str.replace(parenthesised_text, "", regex=True)

# remove URLs from the descriptions
# The case-insensitive pattern has two alternatives: (1) text starting with
# http(s): or with a "<domain>.<tld>/" prefix, followed by a path that may
# contain balanced parentheses, and (2) a bare "<domain>.<tld>" that is not
# part of an e-mail address. Every match is replaced by the empty string.
df.Description = df.Description.str.replace(
r"""(?i)\b((?:https?:(?:/{1,3}|[a-z0-9%])|[a-z0-9.\-]+[.](?:com|net|org|edu|gov|mil|aero|asia|biz|cat|coop|info|int|jobs|mobi|museum|name|post|pro|tel|travel|xxx|ac|ad|ae|af|ag|ai|al|am|an|ao|aq|ar|as|at|au|aw|ax|az|ba|bb|bd|be|bf|bg|bh|bi|bj|bm|bn|bo|br|bs|bt|bv|bw|by|bz|ca|cc|cd|cf|cg|ch|ci|ck|cl|cm|cn|co|cr|cs|cu|cv|cx|cy|cz|dd|de|dj|dk|dm|do|dz|ec|ee|eg|eh|er|es|et|eu|fi|fj|fk|fm|fo|fr|ga|gb|gd|ge|gf|gg|gh|gi|gl|gm|gn|gp|gq|gr|gs|gt|gu|gw|gy|hk|hm|hn|hr|ht|hu|id|ie|il|im|in|io|iq|ir|is|it|je|jm|jo|jp|ke|kg|kh|ki|km|kn|kp|kr|kw|ky|kz|la|lb|lc|li|lk|lr|ls|lt|lu|lv|ly|ma|mc|md|me|mg|mh|mk|ml|mm|mn|mo|mp|mq|mr|ms|mt|mu|mv|mw|mx|my|mz|na|nc|ne|nf|ng|ni|nl|no|np|nr|nu|nz|om|pa|pe|pf|pg|ph|pk|pl|pm|pn|pr|ps|pt|pw|py|qa|re|ro|rs|ru|rw|sa|sb|sc|sd|se|sg|sh|si|sj|Ja|sk|sl|sm|sn|so|sr|ss|st|su|sv|sx|sy|sz|tc|td|tf|tg|th|tj|tk|tl|tm|tn|to|tp|tr|tt|tv|tw|tz|ua|ug|uk|us|uy|uz|va|vc|ve|vg|vi|vn|vu|wf|ws|ye|yt|yu|za|zm|zw)/)(?:[^\s()<>{}\[\]]+|\([^\s()]*?\([^\s()]+\)[^\s()]*?\)|\([^\s]+?\))+(?:\([^\s()]*?\([^\s()]+\)[^\s()]*?\)|\([^\s]+?\)|[^\s`!()\[\]{};:'".,<>?«»“”‘’])|(?:(?<!@)[a-z0-9]+(?:[.\-][a-z0-9]+)*[.](?:com|net|org|edu|gov|mil|aero|asia|biz|cat|coop|info|int|jobs|mobi|museum|name|post|pro|tel|travel|xxx|ac|ad|ae|af|ag|ai|al|am|an|ao|aq|ar|as|at|au|aw|ax|az|ba|bb|bd|be|bf|bg|bh|bi|bj|bm|bn|bo|br|bs|bt|bv|bw|by|bz|ca|cc|cd|cf|cg|ch|ci|ck|cl|cm|cn|co|cr|cs|cu|cv|cx|cy|cz|dd|de|dj|dk|dm|do|dz|ec|ee|eg|eh|er|es|et|eu|fi|fj|fk|fm|fo|fr|ga|gb|gd|ge|gf|gg|gh|gi|gl|gm|gn|gp|gq|gr|gs|gt|gu|gw|gy|hk|hm|hn|hr|ht|hu|id|ie|il|im|in|io|iq|ir|is|it|je|jm|jo|jp|ke|kg|kh|ki|km|kn|kp|kr|kw|ky|kz|la|lb|lc|li|lk|lr|ls|lt|lu|lv|ly|ma|mc|md|me|mg|mh|mk|ml|mm|mn|mo|mp|mq|mr|ms|mt|mu|mv|mw|mx|my|mz|na|nc|ne|nf|ng|ni|nl|no|np|nr|nu|nz|om|pa|pe|pf|pg|ph|pk|pl|pm|pn|pr|ps|pt|pw|py|qa|re|ro|rs|ru|rw|sa|sb|sc|sd|se|sg|sh|si|sj|Ja|sk|sl|sm|sn|so|sr|ss|st|su|sv|sx|sy|sz|tc|td|tf|tg|th|tj|tk|tl|tm|tn|to|tp|tr|tt|tv|tw|tz|ua|ug|uk|us|uy|uz|va|vc|ve|vg|vi|vn|vu|wf|ws|ye|yt|yu|za|zm|zw)\b/?(?!@)))""",
"",
regex=True)

# ADDITIONAL PREPROCESSING OF CODE

# strip every "#" character from all scripts
df["code_script"] = df["code_script"].str.replace(r"#", "", regex=True)

# scripts of type "m": replace each "%" by a blank
is_m_script = df["type_script"] == "m"
df.loc[is_m_script, "code_script"] = df.loc[is_m_script, "code_script"].str.replace(
    r"\%", " ", regex=True
)

# scripts of type "r": replace each "$" by a blank
is_r_script = df["type_script"] == "r"
df.loc[is_r_script, "code_script"] = df.loc[is_r_script, "code_script"].str.replace(
    r"\$", " ", regex=True
)

# collapse any character repeated five or more times in a row into a single occurrence
df["code_script"] = df["code_script"].str.replace(r"(.)\1{4,}", r"\1", regex=True)

In [12]:
# Number the groups of identical descriptions; rows that share a
# Description receive the same integer Description_ID.
description_groups = df.groupby('Description')
df['Description_ID'] = description_groups.ngroup()

In [13]:
# Disabled experiment (zero-shot classification of descriptions): the code is
# kept inside a string literal, so nothing below is executed; the cell merely
# echoes the string as its output.
'''from transformers import pipeline
classifier = pipeline("zero-shot-classification",
                        model="facebook/bart-large-mnli")
candidate_labels = ['project-level', 'codesnippet-level']
descriptions = df.Description.unique()
descriptions[0]
classifier("Plots the power curves by 2SQR(1), 2SQR(2) and IVX-QR in simulation.'", candidate_labels)'''

'from transformers import pipeline\nclassifier = pipeline("zero-shot-classification",\n                        model="facebook/bart-large-mnli")\ncandidate_labels = [\'project-level\', \'codesnippet-level\']\ndescriptions = df.Description.unique()\ndescriptions[0]\nclassifier("Plots the power curves by 2SQR(1), 2SQR(2) and IVX-QR in simulation.\'", candidate_labels)'

In [14]:
# Disabled step (chunking of code_script via chunk_code): the code is kept
# inside a string literal, so nothing below is executed; the cell merely
# echoes the string as its output.
'''# CHUNKING
df[['chunk_ids', 'chunks']] = df.apply(lambda x: chunk_code(x['code_script'], chunk_size=250), axis=1, result_type='expand')
chunks_df = df.apply(lambda x: chunk_code(x['code_script'], chunk_size=250), axis=1, result_type='expand')
chunks_df.columns = ['chunk_ids', 'chunks']
df[['chunk_ids', 'chunks']]  = chunks_df[['chunk_ids', 'chunks']] 
df = df.explode('chunk_ids').reset_index(drop=True)
df['chunks'] = df.apply(lambda x: x['chunks'][x['chunk_ids']], axis=1)
df.shape'''

"# CHUNKING\ndf[['chunk_ids', 'chunks']] = df.apply(lambda x: chunk_code(x['code_script'], chunk_size=250), axis=1, result_type='expand')\nchunks_df = df.apply(lambda x: chunk_code(x['code_script'], chunk_size=250), axis=1, result_type='expand')\nchunks_df.columns = ['chunk_ids', 'chunks']\ndf[['chunk_ids', 'chunks']]  = chunks_df[['chunk_ids', 'chunks']] \ndf = df.explode('chunk_ids').reset_index(drop=True)\ndf['chunks'] = df.apply(lambda x: x['chunks'][x['chunk_ids']], axis=1)\ndf.shape"

In [15]:
# Store the current index labels in a Q_ID column.
df["Q_ID"] = df.index

folder_to_save = f"../../data/preprocessed/Quantlet/{DATE}/"
# makedirs(..., exist_ok=True) also creates missing parent folders and does
# not fail when the folder already exists, so no separate existence check
# (with its check-then-create race) is needed.
os.makedirs(folder_to_save, exist_ok=True)

#df.to_csv(f'{folder_to_save}full_{DATE}.csv', index=False)

In [16]:
# Disabled reload of the full CSV export:
#df = pd.read_csv(f'{folder_to_save}full_{DATE}.csv')

# Disabled step: join the code of all scripts of a folder into one string.
# Kept as a string literal, so it is never executed.
'''code_list = df.groupby(['folder_name'])['code_script'].apply(list)
df['code_script'] = df['folder_name'].map(code_list)
df['code_script'] = df['code_script'].apply(lambda x: '\n\n'.join(x))'''

# Disabled step: join the script types of a folder into one string.
# Kept as a string literal, so it is never executed; being the last
# expression of the cell, it is echoed as the cell output.
'''code_list = df.groupby(['folder_name'])['type_script'].apply(list)
df['type_script'] = df['folder_name'].map(code_list)
df['type_script'] = df['type_script'].apply(lambda x: ' '.join(x))'''

"code_list = df.groupby(['folder_name'])['type_script'].apply(list)\ndf['type_script'] = df['folder_name'].map(code_list)\ndf['type_script'] = df['type_script'].apply(lambda x: ' '.join(x))"

In [17]:
# Disabled step (de-duplicated df_reduced frame): the code is kept inside a
# string literal, so nothing below is executed; the cell merely echoes the
# string as its output.
'''print(df[['code_script', 'Quantlet', 'Description', 'Description_ID', 'Q_ID', 'folder_name', 'repo', 'Authors']].shape)

df_reduced = df[['code_script', 'Quantlet', 'Description', 'Description_ID', 'type_script', 'folder_name', 'repo', 'Authors']].drop_duplicates(['Quantlet', 'Description', 'Description_ID', 'folder_name', 'repo', 'Authors'])
df_reduced.shape'''

"print(df[['code_script', 'Quantlet', 'Description', 'Description_ID', 'Q_ID', 'folder_name', 'repo', 'Authors']].shape)\n\ndf_reduced = df[['code_script', 'Quantlet', 'Description', 'Description_ID', 'type_script', 'folder_name', 'repo', 'Authors']].drop_duplicates(['Quantlet', 'Description', 'Description_ID', 'folder_name', 'repo', 'Authors'])\ndf_reduced.shape"

In [18]:
# SPLIT THE DATA BY DESCRIPTION GROUP
# The split is done on Description_ID values rather than on rows, so all rows
# sharing a description end up in the same subset.
unique_description_ids = list(df["Description_ID"].unique())
split_kwargs = {"test_size": 0.1, "random_state": RS}

# first hold out 10% of the IDs for testing ...
labelled_descr_id, test_descr_id = train_test_split(unique_description_ids, **split_kwargs)
# ... then hold out 10% of the remaining IDs for validation
train_descr_id, val_descr_id = train_test_split(labelled_descr_id, **split_kwargs)

In [19]:
# Materialise the row subsets that belong to each group of Description_IDs
# (full_train = train + validation IDs).
full_train, train, val, test = (
    df.loc[df["Description_ID"].isin(descr_ids)]
    for descr_ids in (labelled_descr_id, train_descr_id, val_descr_id, test_descr_id)
)

In [40]:
# Number of test rows per script type
test["type_script"].value_counts()

r     191
m      81
py     55
Name: type_script, dtype: int64

In [21]:
# Length of every training description in whitespace-separated tokens,
# followed by its summary statistics.
c_len_t = train["Description"].map(lambda text: len(text.split()))
c_len_t.describe()

count    2735.000000
mean       25.179159
std        20.542373
min         1.000000
25%        13.000000
50%        19.000000
75%        29.000000
max       175.000000
Name: Description, dtype: float64

In [None]:
# Hand the four splits to the project helper save_datasets together with the
# date tag and the seed.
# NOTE(review): 'code_script' is presumably the input column, and the meaning
# of the two positional booleans (False, True) is not visible here -- confirm
# against preprocessing_utils.save_datasets.
save_datasets(full_train, train, val, test, DATE, RS, 'code_script', False, True)    

In [32]:
# create bootstrap resamples of the test split
SIZE = test.shape[0]
indices = range(SIZE)
N_SAMPLES = 35

# NOTE(review): range(1, N_SAMPLES) yields the sample ids 1..N_SAMPLES-1, so
# 34 resamples are written, not 35. Left unchanged so that the set of files
# on disk stays the same -- confirm whether 35 were intended.
for sample in tqdm(range(1, N_SAMPLES)):
    # A dedicated RandomState seeded with the sample id draws the same indices
    # as np.random.seed(sample) followed by np.random.choice, but leaves
    # NumPy's global random state untouched.
    rng = np.random.RandomState(sample)
    sample_idx = rng.choice(indices, size=SIZE, replace=True)
    sample_df = test.iloc[sample_idx, :].reset_index(drop=True)
    sample_df.to_csv(f'../../data/preprocessed/Quantlet/{DATE}/test_df_sample_{sample}.csv', index=False)

    # PROGRAMMING LANGUAGE: one JSON file per script type in the resample
    for type_script in sample_df.type_script.unique():
        group_test = sample_df.loc[sample_df.type_script == type_script, :]
        # iterate both columns in parallel instead of indexing row by row
        test_dataset_json = {
            'version': type_script,
            'data': [
                {'input_sequence': code, 'output_sequence': description}
                for code, description in zip(group_test['code_script'], group_test['Description'])
            ],
        }
        with open(f'../../data/preprocessed/Quantlet/{DATE}/test_dataset_{type_script}_sample_{sample}.json', 'w') as f:
            json.dump(test_dataset_json, f)

100%|██████████| 34/34 [00:02<00:00, 13.67it/s]


In [33]:
# create bootstrap resamples of the validation split
SIZE = val.shape[0]
indices = range(SIZE)
N_SAMPLES = 35

# NOTE(review): range(1, N_SAMPLES) yields the sample ids 1..N_SAMPLES-1, so
# 34 resamples are written, not 35. Left unchanged so that the set of files
# on disk stays the same -- confirm whether 35 were intended.
for sample in tqdm(range(1, N_SAMPLES)):
    # A dedicated RandomState seeded with the sample id draws the same indices
    # as np.random.seed(sample) followed by np.random.choice, but leaves
    # NumPy's global random state untouched.
    rng = np.random.RandomState(sample)
    sample_idx = rng.choice(indices, size=SIZE, replace=True)
    sample_df = val.iloc[sample_idx, :].reset_index(drop=True)
    sample_df.to_csv(f'../../data/preprocessed/Quantlet/{DATE}/val_df_sample_{sample}.csv', index=False)

    # PROGRAMMING LANGUAGE: one JSON file per script type in the resample
    for type_script in sample_df.type_script.unique():
        group_val = sample_df.loc[sample_df.type_script == type_script, :]
        # iterate both columns in parallel instead of indexing row by row
        val_dataset_json = {
            'version': type_script,
            'data': [
                {'input_sequence': code, 'output_sequence': description}
                for code, description in zip(group_val['code_script'], group_val['Description'])
            ],
        }
        with open(f'../../data/preprocessed/Quantlet/{DATE}/val_dataset_{type_script}_sample_{sample}.json', 'w') as f:
            json.dump(val_dataset_json, f)

100%|██████████| 34/34 [00:02<00:00, 14.01it/s]


In [32]:
# FEW SHOT RANDOM

In [None]:
# Draw 35 training rows per script type (seeded with RS) as few-shot examples.
few_shot = train.groupby('type_script').sample(n=35, random_state=RS)
few_shot_random_ids = few_shot.Description_ID

# Description_IDs that occur in train but were not drawn for the few-shot set.
not_few_shot_random_ids = set(train.Description_ID.values) - set(few_shot_random_ids.values)

In [37]:
# Few-shot training frame: drop every train row whose Description_ID was not
# drawn, i.e. keep all rows that share a description with a few-shot example.
in_few_shot_train = ~train.Description_ID.isin(not_few_shot_random_ids)
few_shot_train_df = train.copy(deep=True).loc[in_few_shot_train]
print(few_shot_train_df.shape)

# Same filter on full_train. IDs that never occurred in train are not part of
# not_few_shot_random_ids, so those rows are kept as well.
# NOTE(review): presumably intentional (few-shot examples plus validation
# rows) -- confirm.
in_few_shot_full_train = ~full_train.Description_ID.isin(not_few_shot_random_ids)
few_shot_full_train_df = full_train.copy(deep=True).loc[in_few_shot_full_train]
print(few_shot_full_train_df.shape)

(195, 17)
(500, 17)


In [38]:
# Pair every few-shot training script with its description and write the
# pairs to a single JSON file.
fs_train_records = [
    {'input_sequence': code, 'output_sequence': description}
    for code, description in zip(few_shot_train_df['code_script'], few_shot_train_df['Description'])
]
fs_train_dataset_json = {'version': '0', 'data': fs_train_records}

with open(f'../../data/preprocessed/Quantlet/{DATE}/fs_train_dataset_sample_0.json', 'w') as f:
    json.dump(fs_train_dataset_json, f)

In [39]:
# Pair every script of the few-shot full-train frame with its description and
# write the pairs to a single JSON file.
fs_full_train_records = [
    {'input_sequence': code, 'output_sequence': description}
    for code, description in zip(few_shot_full_train_df['code_script'], few_shot_full_train_df['Description'])
]
fs_full_train_dataset_json = {'version': '0', 'data': fs_full_train_records}

with open(f'../../data/preprocessed/Quantlet/{DATE}/fs_full_train_dataset_sample_0.json', 'w') as f:
    json.dump(fs_full_train_dataset_json, f)