################################################################################
# 1. Preparations

In [49]:
%%capture
%pip install levenshtein

In [50]:
%load_ext lab_black

The lab_black extension is already loaded. To reload it, use:
  %reload_ext lab_black


In [51]:
# Location of this notebook's folder inside the repository; it is interpolated
# into the Google Drive working-directory path when the notebook runs on Colab.
QPATH = "Quantlet/3-data-preprocessing"

In [52]:
# PREPARE WORKING DIRECTORY

import os
import sys

# The "google.colab" module is only loaded inside a Colab session, so its
# presence in sys.modules is used as the environment switch.
IN_COLAB = "google.colab" in sys.modules

# On Colab, move into the notebook's folder under the Google Drive path;
# anywhere else the current directory is kept ("./" is a no-op chdir).
os.chdir(
    f"/content/drive/MyDrive/ColabNotebooks/IRTG/Encode_the_Qode/Encode-the-Qode/{QPATH}"
    if IN_COLAB
    else "./"
)

# sys.path.append('../src')

In [53]:
# PACKAGES
# All third-party and project imports for the notebook, followed by display and
# warning settings. Statement order is kept as-is because several lines have
# side effects (download, pandas patching, module reload).

import pickle
import json
import re
import sys
from IPython.display import display

from tqdm import tqdm
import pandas as pd
import numpy as np


import matplotlib.pyplot as plt
import seaborn as sns

import nltk

# Fetch the "punkt" resource; NLTK skips the download when it is already present.
nltk.download("punkt")

# NOTE(review): duplicate of the tqdm import above; harmless but redundant.
from tqdm import tqdm

# Registers the progress_* methods on pandas objects (progress_apply is used
# further down in this notebook).
tqdm.pandas()

import importlib
import preprocessing_utils

# Re-import the helper module so edits to preprocessing_utils are picked up
# without restarting the kernel.
importlib.reload(preprocessing_utils)
# NOTE(review): the star import hides where names come from; df_metainfo_parse,
# clean_up and combine_url used below are presumably provided here — confirm.
from preprocessing_utils import *

from sklearn.model_selection import train_test_split
from Levenshtein import distance

# SETTINGS

import warnings

# Silence FutureWarning messages for the whole session.
warnings.simplefilter(action="ignore", category=FutureWarning)

# Show up to 500 characters per cell when DataFrames are displayed.
pd.set_option("display.max_colwidth", 500)

[nltk_data] Downloading package punkt to
[nltk_data]     /home/RDC/zinovyee.hub/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


# 2. Processing

In [54]:
# Snapshot identifier (looks like YYYYMMDD); it selects the data folder and is
# embedded in every input and output file name below.
DATE = "20231014"
# Seed passed as random_state to the train/val/test splits and to the shuffle.
RS = 42

In [55]:
# Load the pickled Quantlet table for the selected snapshot.
# NOTE(review): pickle.load can execute arbitrary code while deserialising;
# only point this at files produced by this project's own pipeline.
quantlets_pkl_path = f"../../data/preprocessed/Quantlet/{DATE}/Quantlets_{DATE}.pkl"
with open(quantlets_pkl_path, "rb") as file:
    df = pickle.load(file)

In [56]:
# Build the working frame `df_long` from the loaded table.
# NOTE(review): df_metainfo_parse is not defined in this notebook (presumably it
# comes from the preprocessing_utils star import); confirm there what
# prepare_script and remove_other do and which shapes it prints.
df_long = df_metainfo_parse(df=df, prepare_script=True, remove_other=True)

(4837, 6)
(4836, 12)


In [57]:
# Number of rows per repository, most frequent first.
#
# Grouping noted by the author (not implemented in this cell) — 4 groups:
#   - no neighbors
#   - less than 5 neighbors
#   - between 5 and 10 neighbors
#   - more than 10 neighbors
# NOTE(review): "neighbors" presumably means other entries of the same repo — confirm.
df_long["repo"].value_counts()

STF-ToDo                 368
SFE                      290
MVA-ToDo                 249
MVA                      225
STF                      224
                        ... 
SVCJ_MC                    1
DAIIkmeansEM               1
CardSpentplot              1
network_BTC_exchanges      1
Disaster                   1
Name: repo, Length: 327, dtype: int64

In [58]:
# Run the project's clean_up step and rebind df_long to its result, then print
# the resulting shape.
# NOTE(review): clean_up is defined outside this notebook; because df_long is
# overwritten, re-running this cell applies clean_up to already-cleaned data —
# confirm that is harmless.
df_long = clean_up(df_long)
print(df_long.shape)

100%|███████████████████████████████████| 4836/4836 [00:00<00:00, 596122.20it/s]
100%|███████████████████████████████████████| 4836/4836 [01:12<00:00, 66.47it/s]
100%|███████████████████████████████████| 4836/4836 [00:00<00:00, 901736.20it/s]
100%|█████████████████████████████████████| 4836/4836 [00:00<00:00, 9295.47it/s]
100%|███████████████████████████████████| 4836/4836 [00:00<00:00, 715700.02it/s]
100%|█████████████████████████████████████| 4836/4836 [00:01<00:00, 3349.81it/s]
100%|███████████████████████████████████| 4836/4836 [00:00<00:00, 918477.37it/s]

(4828, 15)





In [59]:
# Keep the current row index as an explicit Q_ID column, since the CSV below is
# written with index=False and would otherwise lose it.
df_long["Q_ID"] = df_long.index

# Write the complete table for this snapshot.
df_long.to_csv(f"../../data/preprocessed/Quantlet/{DATE}/full_{DATE}.csv", index=False)

In [60]:
# CLEAN DESCRIPTIONS

In [61]:
# Compute one "url" value per row with combine_url, showing a tqdm progress bar
# (progress_apply is available because tqdm.pandas() was called earlier).
# NOTE(review): combine_url is defined outside this notebook — confirm which
# columns of the row it reads.
df_long["url"] = df_long.progress_apply(combine_url, axis=1)

100%|████████████████████████████████████| 4828/4828 [00:00<00:00, 40154.71it/s]


In [62]:
# Export only the Description and url columns for this snapshot. Unlike the
# other exports, index=True keeps the row index in the file (presumably so the
# rows can be matched back to df_long — confirm).
df_long[["Description", "url"]].to_csv(
    f"../../data/preprocessed/Quantlet/{DATE}/Description_annotation_{DATE}.csv",
    index=True,
)

In [63]:
# Sanity check: display the rows whose Description is an empty string
# (the saved output of this cell shows no such rows).
df_long[df_long.Description == ""]

Unnamed: 0,folder_name,code_script,type_script,script_name,Quantlet,Description,Keywords,Authors,scr_n,description_len,description_n_words,repo,code_len,new_len,new_len2,Q_ID,url


In [64]:
# SPLIT THE DATA
# The split is done on unique Quantlet names rather than on rows, so every row
# of a given Quantlet ends up in exactly one of train / val / test.
quantlet_names = list(df_long["Quantlet"].unique())

# 10% of the Quantlets are held out for test; 10% of the remainder for validation.
labelled_qs, test_qs = train_test_split(
    quantlet_names, test_size=0.1, random_state=RS
)
train_qs, val_qs = train_test_split(labelled_qs, test_size=0.1, random_state=RS)

# Select the rows belonging to each group of Quantlets and renumber them from 0.
train, val, test = (
    df_long[df_long["Quantlet"].isin(set(group_qs))].reset_index(drop=True)
    for group_qs in (train_qs, val_qs, test_qs)
)

# Train and validation rows combined, shuffled reproducibly with the shared seed.
full_train = pd.concat([train, val], axis=0)
full_train = full_train.sample(frac=1, random_state=RS).reset_index(drop=True)

In [65]:
# Persist every split as CSV (without the index); all files share one naming
# scheme and are written in the order full_train, train, val, test.
frames_to_save = {
    "full_train": full_train,
    "train": train,
    "val": val,
    "test": test,
}
for split_label, split_frame in frames_to_save.items():
    split_frame.to_csv(
        f"../../data/preprocessed/Quantlet/{DATE}/{split_label}_df_{DATE}_sample0.csv",
        index=False,
    )


# Report the size and script-type mix of train / val / test:
# first pass prints relative shares, second pass prints absolute counts.
for as_share in (True, False):
    for split_frame in (train, val, test):
        print(split_frame.shape)
        print(split_frame["type_script"].value_counts(normalize=as_share))

(3966, 17)
r     0.458396
m     0.289460
py    0.252143
Name: type_script, dtype: float64
(432, 17)
r     0.472222
py    0.275463
m     0.252315
Name: type_script, dtype: float64
(430, 17)
r     0.504651
m     0.279070
py    0.216279
Name: type_script, dtype: float64
(3966, 17)
r     1818
m     1148
py    1000
Name: type_script, dtype: int64
(432, 17)
r     204
py    119
m     109
Name: type_script, dtype: int64
(430, 17)
r     217
m     120
py     93
Name: type_script, dtype: int64


In [66]:
def _add_context_header(frame, mode):
    """Prefix ``frame["code_script"]`` in place with the context line for ``mode``.

    Args:
        frame: DataFrame with a ``code_script`` column, plus a ``repo`` column
            when ``mode == "repo"`` or an ``Authors`` column when
            ``mode == "author"``.
        mode: ``"repo"`` or ``"author"``; any other value (e.g. ``"no_context"``)
            leaves the frame unchanged.

    Returns:
        The same ``frame`` object that was passed in.
    """
    if mode == "repo":
        frame.loc[:, "code_script"] = (
            "# repo: " + frame["repo"] + "\n " + frame["code_script"]
        )
    elif mode == "author":
        frame.loc[:, "code_script"] = (
            "# author: " + frame["Authors"] + "\n " + frame["code_script"]
        )
    return frame


def _as_dataset_json(frame):
    """Return the ``{"version", "data"}`` payload built from one split.

    Each row becomes one record whose ``input_sequence`` is the row's
    ``code_script`` and whose ``output_sequence`` is its ``Description``.
    """
    return {
        "version": "3.0",
        "data": [
            {"input_sequence": code, "output_sequence": description}
            for code, description in zip(
                frame["code_script"].tolist(), frame["Description"].tolist()
            )
        ],
    }


# Write one JSON dataset per split for every context mode:
#   no_context - code only
#   author     - code prefixed with a "# author: ..." line
#   repo       - code prefixed with a "# repo: ..." line
for MODE in ["no_context", "author", "repo"]:
    # Re-read the CSVs on every pass so each mode starts from unprefixed code.
    full_train = pd.read_csv(
        f"../../data/preprocessed/Quantlet/{DATE}/full_train_df_{DATE}_sample0.csv"
    )
    train = pd.read_csv(
        f"../../data/preprocessed/Quantlet/{DATE}/train_df_{DATE}_sample0.csv"
    )
    val = pd.read_csv(
        f"../../data/preprocessed/Quantlet/{DATE}/val_df_{DATE}_sample0.csv"
    )
    test = pd.read_csv(
        f"../../data/preprocessed/Quantlet/{DATE}/test_df_{DATE}_sample0.csv"
    )

    # FIX NA
    # Rows whose Quantlet name came back from the CSV as missing get an explicit
    # placeholder name (NOTE(review): presumably the name was parsed as NaN by
    # read_csv — confirm).
    test.loc[test["Quantlet"].isna(), "Quantlet"] = "XFGexp_rtn_SRM_2d_DOENST RUN"

    # Output order matches the original cell: full_train, train, val, test.
    frames_by_split = {
        "full_train": full_train,
        "train": train,
        "val": val,
        "test": test,
    }

    # The per-mode folder may not exist yet; open(..., "w") would fail without it.
    mode_dir = f"../../data/preprocessed/Quantlet/{DATE}/{MODE}"
    os.makedirs(mode_dir, exist_ok=True)

    for split_label, split_frame in frames_by_split.items():
        # Missing authors would turn the whole concatenated string into NaN.
        split_frame["Authors"] = split_frame["Authors"].fillna("Unknown")

        # Every split, including full_train, gets the same context header.
        # Previously full_train was written without it, so its JSON was
        # identical across modes and inconsistent with the prefixed test inputs.
        _add_context_header(split_frame, MODE)

        with open(
            f"{mode_dir}/{split_label}_dataset_{DATE}_sample0.json",
            "w",
        ) as f:
            json.dump(_as_dataset_json(split_frame), f)