################################################################################
# 1. Preparations

In [24]:
%%capture
%pip install levenshtein

In [25]:
QPATH = "Quantlet/3-data-preprocessing"

In [26]:
# PREPARE WORKING DIRECTORY

import sys

IN_COLAB = "google.colab" in sys.modules

import os

if IN_COLAB:
    os.chdir(
        f"/content/drive/MyDrive/ColabNotebooks/IRTG/Encode_the_Qode/Encode-the-Qode/{QPATH}"
    )
else:
    os.chdir("./")

# sys.path.append('../src')

In [213]:
%%capture
# PACKAGES

import pickle
import json
import re
import sys
from IPython.display import display

from tqdm import tqdm
import pandas as pd
import numpy as np


import matplotlib.pyplot as plt
import seaborn as sns

import nltk

nltk.download("punkt")

from tqdm import tqdm

tqdm.pandas()

import importlib
import preprocessing_utils

importlib.reload(preprocessing_utils)
from preprocessing_utils import *

from sklearn.model_selection import train_test_split
from Levenshtein import distance

# SETTINGS

import warnings

warnings.simplefilter(action="ignore", category=FutureWarning)

pd.set_option("display.max_colwidth", 500)

# 2. Processing

In [186]:
DATE = "20231027"
RS = 111

In [187]:
with open(
    f"../../data/preprocessed/Quantlet/20231027/Quantlets_20231027.pkl", "rb"
) as file:
    df = pickle.load(file)

In [188]:
df = df_metainfo_parse(df=df,
                    prepare_script=True,
                    remove_other=True,
                    remove_empty=False)

df = clean_up(df)
print(df.shape)

(5018, 6)
(5017, 12)


100%|██████████| 5017/5017 [00:34<00:00, 144.80it/s]
100%|██████████| 5017/5017 [00:59<00:00, 84.22it/s]  
100%|██████████| 5017/5017 [00:00<00:00, 9289.75it/s] 
100%|██████████| 5017/5017 [00:01<00:00, 2516.96it/s]
100%|██████████| 5017/5017 [00:00<00:00, 859311.63it/s]

(5009, 13)





In [190]:
# ADDITIONAL PREPROCESSING OF DESCRIPTIONS

# remove parentheses
df.Description = df.Description.str.replace(r"\(.+?\)", "", regex=True)

# remove URL
df.Description = df.Description.str.replace(
r"""(?i)\b((?:https?:(?:/{1,3}|[a-z0-9%])|[a-z0-9.\-]+[.](?:com|net|org|edu|gov|mil|aero|asia|biz|cat|coop|info|int|jobs|mobi|museum|name|post|pro|tel|travel|xxx|ac|ad|ae|af|ag|ai|al|am|an|ao|aq|ar|as|at|au|aw|ax|az|ba|bb|bd|be|bf|bg|bh|bi|bj|bm|bn|bo|br|bs|bt|bv|bw|by|bz|ca|cc|cd|cf|cg|ch|ci|ck|cl|cm|cn|co|cr|cs|cu|cv|cx|cy|cz|dd|de|dj|dk|dm|do|dz|ec|ee|eg|eh|er|es|et|eu|fi|fj|fk|fm|fo|fr|ga|gb|gd|ge|gf|gg|gh|gi|gl|gm|gn|gp|gq|gr|gs|gt|gu|gw|gy|hk|hm|hn|hr|ht|hu|id|ie|il|im|in|io|iq|ir|is|it|je|jm|jo|jp|ke|kg|kh|ki|km|kn|kp|kr|kw|ky|kz|la|lb|lc|li|lk|lr|ls|lt|lu|lv|ly|ma|mc|md|me|mg|mh|mk|ml|mm|mn|mo|mp|mq|mr|ms|mt|mu|mv|mw|mx|my|mz|na|nc|ne|nf|ng|ni|nl|no|np|nr|nu|nz|om|pa|pe|pf|pg|ph|pk|pl|pm|pn|pr|ps|pt|pw|py|qa|re|ro|rs|ru|rw|sa|sb|sc|sd|se|sg|sh|si|sj|Ja|sk|sl|sm|sn|so|sr|ss|st|su|sv|sx|sy|sz|tc|td|tf|tg|th|tj|tk|tl|tm|tn|to|tp|tr|tt|tv|tw|tz|ua|ug|uk|us|uy|uz|va|vc|ve|vg|vi|vn|vu|wf|ws|ye|yt|yu|za|zm|zw)/)(?:[^\s()<>{}\[\]]+|\([^\s()]*?\([^\s()]+\)[^\s()]*?\)|\([^\s]+?\))+(?:\([^\s()]*?\([^\s()]+\)[^\s()]*?\)|\([^\s]+?\)|[^\s`!()\[\]{};:'".,<>?«»“”‘’])|(?:(?<!@)[a-z0-9]+(?:[.\-][a-z0-9]+)*[.](?:com|net|org|edu|gov|mil|aero|asia|biz|cat|coop|info|int|jobs|mobi|museum|name|post|pro|tel|travel|xxx|ac|ad|ae|af|ag|ai|al|am|an|ao|aq|ar|as|at|au|aw|ax|az|ba|bb|bd|be|bf|bg|bh|bi|bj|bm|bn|bo|br|bs|bt|bv|bw|by|bz|ca|cc|cd|cf|cg|ch|ci|ck|cl|cm|cn|co|cr|cs|cu|cv|cx|cy|cz|dd|de|dj|dk|dm|do|dz|ec|ee|eg|eh|er|es|et|eu|fi|fj|fk|fm|fo|fr|ga|gb|gd|ge|gf|gg|gh|gi|gl|gm|gn|gp|gq|gr|gs|gt|gu|gw|gy|hk|hm|hn|hr|ht|hu|id|ie|il|im|in|io|iq|ir|is|it|je|jm|jo|jp|ke|kg|kh|ki|km|kn|kp|kr|kw|ky|kz|la|lb|lc|li|lk|lr|ls|lt|lu|lv|ly|ma|mc|md|me|mg|mh|mk|ml|mm|mn|mo|mp|mq|mr|ms|mt|mu|mv|mw|mx|my|mz|na|nc|ne|nf|ng|ni|nl|no|np|nr|nu|nz|om|pa|pe|pf|pg|ph|pk|pl|pm|pn|pr|ps|pt|pw|py|qa|re|ro|rs|ru|rw|sa|sb|sc|sd|se|sg|sh|si|sj|Ja|sk|sl|sm|sn|so|sr|ss|st|su|sv|sx|sy|sz|tc|td|tf|tg|th|tj|tk|tl|tm|tn|to|tp|tr|tt|tv|tw|tz|ua|ug|uk|us|uy|uz|va|vc|ve|vg|vi|vn|vu|wf|ws|ye|yt|yu|za|zm|zw)\b/?(?!@)))""",
"",
regex=True)

# ADDITIONAL PREPROCESSING OF CODE
df.code_script = df.code_script.str.replace(r"#", "", regex=True).str.replace(r"\n", " ", regex=True)
df.loc[df.type_script == "m", "code_script"] = df.loc[df.type_script == "m", "code_script"].str.replace(r"\%", " ", regex=True)

df.loc[df.type_script == "r", "code_script"] = df.loc[df.type_script == "r", "code_script"].str.replace(r"\$", " ", regex=True)

# remove the same sign repeated more than 4 times
df.code_script = df.code_script.str.replace(r"(.)\1{4,}", r"\1", regex=True)
df.code_script = df.code_script.str.replace("\s{2,}", "", regex=True).str.strip()

In [191]:
# CHUNKING
df[['chunk_ids', 'chunks']] = df.apply(lambda x: chunk_code(x['code_script'], chunk_size=250), axis=1, result_type='expand')
chunks_df = df.apply(lambda x: chunk_code(x['code_script'], chunk_size=250), axis=1, result_type='expand')
chunks_df.columns = ['chunk_ids', 'chunks']
df[['chunk_ids', 'chunks']]  = chunks_df[['chunk_ids', 'chunks']] 
df = df.explode('chunk_ids').reset_index(drop=True)
df['chunks'] = df.apply(lambda x: x['chunks'][x['chunk_ids']], axis=1)
df.shape

In [199]:
df['Description_ID'] = df.groupby('Description').ngroup()

In [203]:
df["Q_ID"] = df.index

df.to_csv(f"../../data/preprocessed/Quantlet/{DATE}/full_{DATE}.csv", index=False)

In [204]:
# SPLIT THE DATA GROUP QUANTLET
labelled_descr_id, test_descr_id = train_test_split(list(df.Description_ID.unique()),
                test_size=0.1,
                random_state=RS)
train_descr_id, val_descr_id = train_test_split(labelled_descr_id,
                test_size=0.1,
                random_state=RS)

In [211]:
full_train = df.loc[df.Description_ID.isin(labelled_descr_id)]
train = df.loc[df.Description_ID.isin(train_descr_id)]
val = df.loc[df.Description_ID.isin(val_descr_id)]
test = df.loc[df.Description_ID.isin(test_descr_id)]

In [214]:
save_datasets(full_train, train, val, test, DATE, RS, 'chunks')    

(9679, 17)
r     0.541275
py    0.285257
m     0.173468
Name: type_script, dtype: float64
(877, 17)
r     0.567845
py    0.239453
m     0.192702
Name: type_script, dtype: float64
(852, 17)
r     0.557512
py    0.272300
m     0.170188
Name: type_script, dtype: float64
(9679, 17)
r     5239
py    2761
m     1679
Name: type_script, dtype: int64
(877, 17)
r     498
py    210
m     169
Name: type_script, dtype: int64
(852, 17)
r     475
py    232
m     145
Name: type_script, dtype: int64


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_column(loc, value, pi)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train["Authors"] = train["Authors"].fillna("Unknown")
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  val["Authors"] = val["Authors"].fillna("Unknown")
A value is trying to be set on a copy of a slice from a D