################################################################################
# 1. Preparations

In [1]:
%%capture
# Install the package behind the `Levenshtein` import used by this notebook;
# the %%capture cell magic above hides pip's output.
%pip install levenshtein

In [2]:
# Load the lab_black IPython extension
# NOTE(review): presumably only auto-formats cells and does not affect results — confirm
%load_ext lab_black

In [3]:
# Folder of this notebook relative to the project root; it is appended to the
# Colab project path when the working directory is set.
QPATH = "Quantlet/3-data-preprocessing"

In [60]:
# PREPARE WORKING DIRECTORY

import os
import sys

# True when the google.colab package is already loaded in this interpreter
IN_COLAB = "google.colab" in sys.modules

# On Colab switch into the notebook's folder of the project under /content/drive;
# anywhere else "./" keeps the current directory unchanged.
_target_dir = (
    f"/content/drive/MyDrive/ColabNotebooks/IRTG/Encode_the_Qode/Encode-the-Qode/{QPATH}"
    if IN_COLAB
    else "./"
)
os.chdir(_target_dir)

# sys.path.append('../src')

In [61]:
# PACKAGES

import pickle
import json
import re
import sys
from IPython.display import display

from tqdm import tqdm
import pandas as pd
import numpy as np


import matplotlib.pyplot as plt
import seaborn as sns

import nltk

# Fetch the "punkt" NLTK resource (reported as already up-to-date when present)
nltk.download("punkt")

# NOTE(review): duplicate of the tqdm import above; harmless but redundant
from tqdm import tqdm

# Register tqdm with pandas so that `progress_apply` becomes available
tqdm.pandas()

import importlib
import preprocessing_utils

# Reload before the star import so edits to the local module are picked up
# without restarting the kernel.
# NOTE(review): df_metainfo_parse, clean_up, combine_url and save_datasets used
# later are presumably provided by this star import — confirm.
importlib.reload(preprocessing_utils)
from preprocessing_utils import *

from sklearn.model_selection import train_test_split
from Levenshtein import distance

# SETTINGS

import warnings

# Silence FutureWarning messages for the rest of the session
warnings.simplefilter(action="ignore", category=FutureWarning)

# Let displayed DataFrame cells show up to 500 characters before truncation
pd.set_option("display.max_colwidth", 500)

[nltk_data] Downloading package punkt to
[nltk_data]     /home/RDC/zinovyee.hub/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


# 2. Processing

In [3]:
# Run tag: names the output folder/files written below and gates the extra
# description cleaning step.
DATE = "20231021"
# Seed passed as random_state to both train_test_split calls
RS = 111

In [4]:
# Load the pickled input frame. The input folder is the fixed 20231014 snapshot
# and does not follow DATE.
# NOTE(security): pickle.load can execute arbitrary code embedded in the file —
# only open snapshots produced by this project's own pipeline.
with open(
    "../../data/preprocessed/Quantlet/20231014/Quantlets_20231014.pkl", "rb"
) as file:
    df = pickle.load(file)

In [5]:
# Build df_long from the loaded frame with the project helper.
# NOTE(review): the meaning of prepare_script / remove_other is defined in
# preprocessing_utils (presumably) — confirm there.
df_long = df_metainfo_parse(df=df, prepare_script=True, remove_other=True)

(4837, 6)
(4836, 12)


In [6]:
# Number of rows per repository
df_long.repo.value_counts()

# Planned split of the data into 4 groups by neighbor count
# NOTE(review): "neighbors" is not defined anywhere in this notebook — confirm the meaning

#   1. no neighbors
#   2. fewer than 5 neighbors
#   3. between 5 and 10 neighbors
#   4. more than 10 neighbors

STF-ToDo                 368
SFE                      290
MVA-ToDo                 249
MVA                      225
STF                      224
                        ... 
SVCJ_MC                    1
DAIIkmeansEM               1
CardSpentplot              1
network_BTC_exchanges      1
Disaster                   1
Name: repo, Length: 327, dtype: int64

In [7]:
# Run the project clean-up helper and report the resulting shape.
# df_long is reassigned, so re-running this cell feeds the already-cleaned
# frame back into clean_up.
df_long = clean_up(df_long)
print(df_long.shape)

100%|████████████████████████████████████████████████████████████████████████████| 4836/4836 [00:00<00:00, 341131.08it/s]
100%|████████████████████████████████████████████████████████████████████████████████| 4836/4836 [01:06<00:00, 72.99it/s]
100%|████████████████████████████████████████████████████████████████████████████| 4836/4836 [00:00<00:00, 864863.95it/s]
100%|██████████████████████████████████████████████████████████████████████████████| 4836/4836 [00:00<00:00, 9476.51it/s]
100%|████████████████████████████████████████████████████████████████████████████| 4836/4836 [00:00<00:00, 437449.41it/s]
100%|██████████████████████████████████████████████████████████████████████████████| 4836/4836 [00:01<00:00, 3060.48it/s]
100%|████████████████████████████████████████████████████████████████████████████| 4836/4836 [00:00<00:00, 758494.28it/s]

(4828, 15)





In [46]:
# TLD alternatives used by both branches of the URL pattern below. Kept in one
# constant so the two branches cannot drift apart (the previous inline copy of
# the second list was split by a line break in the middle of "vc").
_TLD_ALTERNATION = (
    "com|net|org|edu|gov|mil|aero|asia|biz|cat|coop|info|int|jobs|mobi|museum|"
    "name|post|pro|tel|travel|xxx|"
    "ac|ad|ae|af|ag|ai|al|am|an|ao|aq|ar|as|at|au|aw|ax|az|"
    "ba|bb|bd|be|bf|bg|bh|bi|bj|bm|bn|bo|br|bs|bt|bv|bw|by|bz|"
    "ca|cc|cd|cf|cg|ch|ci|ck|cl|cm|cn|co|cr|cs|cu|cv|cx|cy|cz|"
    "dd|de|dj|dk|dm|do|dz|"
    "ec|ee|eg|eh|er|es|et|eu|"
    "fi|fj|fk|fm|fo|fr|"
    "ga|gb|gd|ge|gf|gg|gh|gi|gl|gm|gn|gp|gq|gr|gs|gt|gu|gw|gy|"
    "hk|hm|hn|hr|ht|hu|"
    "id|ie|il|im|in|io|iq|ir|is|it|"
    "je|jm|jo|jp|"
    "ke|kg|kh|ki|km|kn|kp|kr|kw|ky|kz|"
    "la|lb|lc|li|lk|lr|ls|lt|lu|lv|ly|"
    "ma|mc|md|me|mg|mh|mk|ml|mm|mn|mo|mp|mq|mr|ms|mt|mu|mv|mw|mx|my|mz|"
    "na|nc|ne|nf|ng|ni|nl|no|np|nr|nu|nz|"
    "om|"
    "pa|pe|pf|pg|ph|pk|pl|pm|pn|pr|ps|pt|pw|py|"
    "qa|"
    "re|ro|rs|ru|rw|"
    "sa|sb|sc|sd|se|sg|sh|si|sj|Ja|sk|sl|sm|sn|so|sr|ss|st|su|sv|sx|sy|sz|"
    "tc|td|tf|tg|th|tj|tk|tl|tm|tn|to|tp|tr|tt|tv|tw|tz|"
    "ua|ug|uk|us|uy|uz|"
    "va|vc|ve|vg|vi|vn|vu|"
    "wf|ws|"
    "ye|yt|yu|"
    "za|zm|zw"
)

# Case-insensitive URL pattern assembled from its parts; apart from the repaired
# "vc" entry it is the same expression as the former single-literal version.
_URL_PATTERN = "".join(
    [
        # branch 1: "http(s):" prefix, or "domain.tld/" ...
        r"(?i)\b((?:https?:(?:/{1,3}|[a-z0-9%])|[a-z0-9.\-]+[.](?:",
        _TLD_ALTERNATION,
        r")/)",
        # ... followed by the rest of the URL, allowing balanced parentheses
        r"(?:[^\s()<>{}\[\]]+|\([^\s()]*?\([^\s()]+\)[^\s()]*?\)|\([^\s]+?\))+",
        # ... and not ending in trailing punctuation
        r"""(?:\([^\s()]*?\([^\s()]+\)[^\s()]*?\)|\([^\s]+?\)|[^\s`!()\[\]{};:'".,<>?«»“”‘’])""",
        # branch 2: bare "domain.tld" (optional trailing slash) that is neither
        # preceded nor followed by "@"
        r"|(?:(?<!@)[a-z0-9]+(?:[.\-][a-z0-9]+)*[.](?:",
        _TLD_ALTERNATION,
        r")\b/?(?!@)))",
    ]
)

if DATE == "20231021":
    # ADDITIONAL PREPROCESSING OF DESCRIPTIONS

    # remove parentheses together with their content (non-greedy; "." does not
    # cross line breaks, so only parentheses closed on the same line are removed)
    df_long["Description"] = df_long["Description"].str.replace(
        r"\(.+?\)", "", regex=True
    )

    # remove URLs and bare domain names
    df_long["Description"] = df_long["Description"].str.replace(
        _URL_PATTERN, "", regex=True
    )

    # TODO: reduce the amount of sentences (not implemented in this cell)

In [52]:
# Keep the current index as an explicit identifier column, because the CSV
# below is written without the index.
df_long["Q_ID"] = df_long.index

# Create the dated output folder on first use: to_csv does not create missing
# parent directories and fails if the folder is absent.
output_dir = f"../../data/preprocessed/Quantlet/{DATE}"
os.makedirs(output_dir, exist_ok=True)

df_long.to_csv(f"{output_dir}/full_{DATE}.csv", index=False)

In [53]:
# CLEAN DESCRIPTIONS

In [54]:
# Fill the "url" column by calling the project helper combine_url once per row
# (axis=1); progress_apply is the tqdm-enabled apply and shows a progress bar.
df_long["url"] = df_long.progress_apply(combine_url, axis=1)

100%|█████████████████████████████████████████████████████████████████████████████| 4828/4828 [00:00<00:00, 40856.80it/s]


In [55]:
# Write the description text and its url to the annotation CSV, keeping the
# index as the first column.
annotation_path = (
    f"../../data/preprocessed/Quantlet/{DATE}/Description_annotation_{DATE}.csv"
)
annotation_columns = ["Description", "url"]
df_long[annotation_columns].to_csv(annotation_path, index=True)

In [56]:
# Sanity check: show the rows whose description is an empty string
empty_description_mask = df_long["Description"].eq("")
df_long.loc[empty_description_mask]

Unnamed: 0,folder_name,code_script,type_script,script_name,Quantlet,Description,Keywords,Authors,scr_n,description_len,description_n_words,repo,code_len,new_len,new_len2,n_sentences,Q_ID,url


In [58]:
# SPLIT THE DATA GROUP QUANTLET
# The split is drawn over unique Quantlet values, so all rows sharing a
# Quantlet end up in the same subset.

quantlet_names = list(df_long.Quantlet.unique())

# hold out 10% of the Quantlets for test, then 10% of the remainder for validation
labelled_qs, test_qs = train_test_split(
    quantlet_names, test_size=0.1, random_state=RS
)
train_qs, val_qs = train_test_split(labelled_qs, test_size=0.1, random_state=RS)


def _rows_of(quantlets):
    """Return the rows of df_long whose Quantlet is in `quantlets`, re-indexed from 0."""
    membership = df_long["Quantlet"].isin(set(quantlets))
    return df_long[membership].reset_index(drop=True)


train = _rows_of(train_qs)
val = _rows_of(val_qs)
test = _rows_of(test_qs)

save_datasets(train=train, val=val, test=test, DATE=DATE, RS=RS)