################################################################################
# 1. Preparations

In [25]:
%%capture
%pip install levenshtein

In [26]:
# Sub-path that is appended to the Colab project root when the working directory is set.
QPATH = "Quantlet/3-data-preprocessing"

In [27]:
# PREPARE WORKING DIRECTORY

import os
import sys

# True only when the google.colab module is already loaded in this kernel.
IN_COLAB = "google.colab" in sys.modules

# On Colab, move into the project folder under /content/drive; anywhere else
# stay where we are (chdir("./") leaves the current directory unchanged).
os.chdir(
    f"/content/drive/MyDrive/ColabNotebooks/IRTG/Encode_the_Qode/Encode-the-Qode/{QPATH}"
    if IN_COLAB
    else "./"
)

# sys.path.append('../src')

In [30]:
%%capture
# PACKAGES

import pickle
import json
import re
import sys
from IPython.display import display

from tqdm import tqdm
import pandas as pd
import numpy as np


import matplotlib.pyplot as plt
import seaborn as sns

import nltk

nltk.download("punkt")

from tqdm import tqdm

tqdm.pandas()

import importlib
import preprocessing_utils

importlib.reload(preprocessing_utils)
from preprocessing_utils import *

from sklearn.model_selection import train_test_split
from Levenshtein import distance

# SETTINGS

import warnings

warnings.simplefilter(action="ignore", category=FutureWarning)

pd.set_option("display.max_colwidth", 500)

[nltk_data] Downloading package punkt to
[nltk_data]     /home/RDC/zinovyee.hub/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


# 2. Processing

In [32]:
# Snapshot tag interpolated into the output paths under data/preprocessed/Quantlet/.
DATE = "20231027"
# Seed passed as random_state to train_test_split and forwarded to save_datasets.
RS = 111

In [35]:
# Load the parsed Quantlet snapshot for DATE.
# The path is built from DATE (instead of a hard-coded "20231027") so that it
# stays in sync with the export paths used further down in this notebook.
# NOTE: pickle.load can execute arbitrary code -- only load files you trust.
with open(
    f"../../data/preprocessed/Quantlet/{DATE}/Quantlets_{DATE}.pkl", "rb"
) as file:
    df = pickle.load(file)

In [36]:
# Expand the loaded frame into df_long with the project helper
# (presumably star-imported from preprocessing_utils -- confirm).
df_long = df_metainfo_parse(
    df=df,
    prepare_script=True,
    remove_other=True,
    remove_empty=False,
)

(5018, 6)
(5017, 12)


In [37]:
# Run the project helper clean_up over df_long and rebind the name to its result,
# then print the resulting shape.
# NOTE(review): the cell feeds df_long back into itself, so re-running it applies
# clean_up to already-cleaned data. The saved output of this cell also ends in a
# KeyboardInterrupt at ~41% -- re-run on a fresh kernel to confirm it completes.
df_long = clean_up(df_long)
print(df_long.shape)

100%|██████████| 5017/5017 [00:00<00:00, 350316.69it/s]
 41%|████▏     | 2073/5017 [00:40<00:57, 51.54it/s]  


KeyboardInterrupt: 

In [20]:
def _n_tokens(script):
    """Return the number of whitespace-separated tokens in ``script``."""
    return len(script.split())


# Token count of every code script, stored in the "len" column.
df_long["len"] = df_long["code_script"].map(_n_tokens)

In [None]:
# TODO(review): an unfinished function definition stood here (a bare `def`
# keyword in a never-executed cell). On its own it is a SyntaxError that stops
# "Restart & Run All", so it is parked as a comment until the helper is written.
# def

In [24]:
# Display the chunk_id column.
# NOTE(review): no visible cell creates "chunk_id" -- presumably it is added by
# clean_up or by a cell that was removed; confirm it exists on a fresh kernel.
df_long["chunk_id"]

(3446, 16)

In [11]:
# Number of "."-separated pieces per description (count of "." plus one).
n_sentences = df_long["Description"].progress_apply(
    lambda descr: descr.count(".") + 1
)

100%|██████████| 4828/4828 [00:00<00:00, 356083.27it/s]


In [12]:
# ADDITIONAL PREPROCESSING OF DESCRIPTIONS

# Drop every "(...)" group, brackets included; the non-greedy match stops at
# the first closing parenthesis.
df_long["Description"] = df_long["Description"].str.replace(
    r"\(.+?\)",
    "",
    regex=True,
)

# Remove URLs from the descriptions.
# The case-insensitive ((?i)) pattern has two alternatives: links that start
# with http(s): or with a host name followed by a listed top-level domain and
# "/", and bare domain names ending in one of the listed top-level domains
# (not adjacent to "@"). Every match is replaced by the empty string.
df_long.Description = df_long.Description.str.replace(
r"""(?i)\b((?:https?:(?:/{1,3}|[a-z0-9%])|[a-z0-9.\-]+[.](?:com|net|org|edu|gov|mil|aero|asia|biz|cat|coop|info|int|jobs|mobi|museum|name|post|pro|tel|travel|xxx|ac|ad|ae|af|ag|ai|al|am|an|ao|aq|ar|as|at|au|aw|ax|az|ba|bb|bd|be|bf|bg|bh|bi|bj|bm|bn|bo|br|bs|bt|bv|bw|by|bz|ca|cc|cd|cf|cg|ch|ci|ck|cl|cm|cn|co|cr|cs|cu|cv|cx|cy|cz|dd|de|dj|dk|dm|do|dz|ec|ee|eg|eh|er|es|et|eu|fi|fj|fk|fm|fo|fr|ga|gb|gd|ge|gf|gg|gh|gi|gl|gm|gn|gp|gq|gr|gs|gt|gu|gw|gy|hk|hm|hn|hr|ht|hu|id|ie|il|im|in|io|iq|ir|is|it|je|jm|jo|jp|ke|kg|kh|ki|km|kn|kp|kr|kw|ky|kz|la|lb|lc|li|lk|lr|ls|lt|lu|lv|ly|ma|mc|md|me|mg|mh|mk|ml|mm|mn|mo|mp|mq|mr|ms|mt|mu|mv|mw|mx|my|mz|na|nc|ne|nf|ng|ni|nl|no|np|nr|nu|nz|om|pa|pe|pf|pg|ph|pk|pl|pm|pn|pr|ps|pt|pw|py|qa|re|ro|rs|ru|rw|sa|sb|sc|sd|se|sg|sh|si|sj|Ja|sk|sl|sm|sn|so|sr|ss|st|su|sv|sx|sy|sz|tc|td|tf|tg|th|tj|tk|tl|tm|tn|to|tp|tr|tt|tv|tw|tz|ua|ug|uk|us|uy|uz|va|vc|ve|vg|vi|vn|vu|wf|ws|ye|yt|yu|za|zm|zw)/)(?:[^\s()<>{}\[\]]+|\([^\s()]*?\([^\s()]+\)[^\s()]*?\)|\([^\s]+?\))+(?:\([^\s()]*?\([^\s()]+\)[^\s()]*?\)|\([^\s]+?\)|[^\s`!()\[\]{};:'".,<>?«»“”‘’])|(?:(?<!@)[a-z0-9]+(?:[.\-][a-z0-9]+)*[.](?:com|net|org|edu|gov|mil|aero|asia|biz|cat|coop|info|int|jobs|mobi|museum|name|post|pro|tel|travel|xxx|ac|ad|ae|af|ag|ai|al|am|an|ao|aq|ar|as|at|au|aw|ax|az|ba|bb|bd|be|bf|bg|bh|bi|bj|bm|bn|bo|br|bs|bt|bv|bw|by|bz|ca|cc|cd|cf|cg|ch|ci|ck|cl|cm|cn|co|cr|cs|cu|cv|cx|cy|cz|dd|de|dj|dk|dm|do|dz|ec|ee|eg|eh|er|es|et|eu|fi|fj|fk|fm|fo|fr|ga|gb|gd|ge|gf|gg|gh|gi|gl|gm|gn|gp|gq|gr|gs|gt|gu|gw|gy|hk|hm|hn|hr|ht|hu|id|ie|il|im|in|io|iq|ir|is|it|je|jm|jo|jp|ke|kg|kh|ki|km|kn|kp|kr|kw|ky|kz|la|lb|lc|li|lk|lr|ls|lt|lu|lv|ly|ma|mc|md|me|mg|mh|mk|ml|mm|mn|mo|mp|mq|mr|ms|mt|mu|mv|mw|mx|my|mz|na|nc|ne|nf|ng|ni|nl|no|np|nr|nu|nz|om|pa|pe|pf|pg|ph|pk|pl|pm|pn|pr|ps|pt|pw|py|qa|re|ro|rs|ru|rw|sa|sb|sc|sd|se|sg|sh|si|sj|Ja|sk|sl|sm|sn|so|sr|ss|st|su|sv|sx|sy|sz|tc|td|tf|tg|th|tj|tk|tl|tm|tn|to|tp|tr|tt|tv|tw|tz|ua|ug|uk|us|uy|uz|va|vc|ve|vg|vi|vn|vu|wf|ws|ye|yt|yu|za|zm|zw)\b/?(?!@)))""",
"",
regex=True)

# ADDITIONAL PREPROCESSING OF CODE

# Remove every "#" and flatten each script onto one line (newline -> space).
df_long["code_script"] = (
    df_long["code_script"]
    .str.replace("#", "", regex=False)
    .str.replace("\n", " ", regex=False)
)

# Blank out "%" in scripts typed "m" and "$" in scripts typed "r"
# (presumably MATLAB comment markers and R's "$" accessor -- confirm).
is_m_script = df_long["type_script"] == "m"
df_long.loc[is_m_script, "code_script"] = df_long.loc[
    is_m_script, "code_script"
].str.replace("%", " ", regex=False)

is_r_script = df_long["type_script"] == "r"
df_long.loc[is_r_script, "code_script"] = df_long.loc[
    is_r_script, "code_script"
].str.replace("$", " ", regex=False)

# Collapse any character repeated five or more times in a row to one occurrence.
df_long["code_script"] = df_long["code_script"].str.replace(
    r"(.)\1{4,}", r"\1", regex=True
)

# Squeeze the remaining whitespace runs to a single space and trim the ends.
# The replacement used to be "", which glued neighbouring tokens together
# ("a  b" -> "ab") and disagreed with the line above, where long runs keep one
# character. The pattern is now a raw string, because "\s" in a normal string
# literal is an invalid escape sequence.
df_long["code_script"] = (
    df_long["code_script"].str.replace(r"\s{2,}", " ", regex=True).str.strip()
)

# For each description, join its first two "."-separated pieces with a space.
# NOTE(review): the result is not assigned, so df_long["Description"] is left
# unchanged; as the last expression of the cell it is only displayed. Confirm
# whether the truncated text was meant to be stored.
df_long["Description"].progress_apply(lambda descr: " ".join(descr.split(".")[:2]))

In [13]:
# Keep the current index as an explicit Q_ID column; the export below is
# written without the index.
df_long["Q_ID"] = df_long.index

full_csv_path = f"../../data/preprocessed/Quantlet/{DATE}/full_{DATE}.csv"
df_long.to_csv(full_csv_path, index=False)

In [14]:
# One value per row, produced by the project helper combine_url (row-wise apply
# with a progress bar), stored in the "url" column.
row_urls = df_long.progress_apply(combine_url, axis=1)
df_long["url"] = row_urls

100%|██████████| 4828/4828 [00:00<00:00, 42519.27it/s]


In [15]:
# Write the Description/url pairs, with the index as the first column.
annotation_csv_path = (
    f"../../data/preprocessed/Quantlet/{DATE}/Description_annotation_{DATE}.csv"
)
annotation_view = df_long[["Description", "url"]]
annotation_view.to_csv(annotation_csv_path, index=True)

In [17]:
# SPLIT THE DATA GROUP QUANTLET

# Both splits hold out 10% and share the same seed: first carve the test set
# off the full frame, then carve the validation set off what remains.
split_kwargs = {"test_size": 0.1, "random_state": RS}

labelled_qs, test = train_test_split(df_long, **split_kwargs)
train, val = train_test_split(labelled_qs, **split_kwargs)

# Hand the three partitions to the project helper save_datasets.
save_datasets(train=train, val=val, test=test, DATE=DATE, RS=RS)

(3910, 17)
r     0.460614
m     0.287724
py    0.251662
Name: type_script, dtype: float64
(435, 17)
r     0.491954
m     0.268966
py    0.239080
Name: type_script, dtype: float64
(483, 17)
r     0.463768
m     0.279503
py    0.256729
Name: type_script, dtype: float64
(3910, 17)
r     1801
m     1125
py     984
Name: type_script, dtype: int64
(435, 17)
r     214
m     117
py    104
Name: type_script, dtype: int64
(483, 17)
r     224
m     135
py    124
Name: type_script, dtype: int64
