################################################################################
# 1. Preparations

In [1]:
%%capture
%pip install levenshtein

In [2]:
QPATH = "Quantlet/3-data-preprocessing"

In [3]:
# PREPARE WORKING DIRECTORY

import sys

IN_COLAB = "google.colab" in sys.modules

import os

if IN_COLAB:
    os.chdir(
        f"/content/drive/MyDrive/ColabNotebooks/IRTG/Encode_the_Qode/Encode-the-Qode/{QPATH}"
    )
else:
    os.chdir("./")

# sys.path.append('../src')

In [4]:
%%capture
# PACKAGES

import pickle
import json
import re
import sys
from IPython.display import display

from tqdm import tqdm
import pandas as pd
import numpy as np


import matplotlib.pyplot as plt
import seaborn as sns

import nltk

nltk.download("punkt")

from tqdm import tqdm

tqdm.pandas()

import importlib
import preprocessing_utils

importlib.reload(preprocessing_utils)
from preprocessing_utils import *

from sklearn.model_selection import train_test_split
from Levenshtein import distance

# SETTINGS

import warnings

warnings.simplefilter(action="ignore", category=FutureWarning)

pd.set_option("display.max_colwidth", 500)

# 2. Processing

In [5]:
DATE = "20231104"
RS = 111

In [6]:
with open(
    f"../../data/preprocessed/Quantlet/20231027/Quantlets_20231027.pkl", "rb"
) as file:
    df = pickle.load(file)

In [7]:
df = df_metainfo_parse(df=df,
                    prepare_script=True,
                    remove_other=True,
                    remove_empty=False)

df = clean_up(df)
print(df.shape)

(5018, 6)
(5017, 12)


100%|██████████| 5017/5017 [00:35<00:00, 141.89it/s]
100%|██████████| 5017/5017 [00:58<00:00, 86.44it/s]  
100%|██████████| 5017/5017 [00:00<00:00, 11552.66it/s]
100%|██████████| 5017/5017 [00:01<00:00, 2921.56it/s]
100%|██████████| 5017/5017 [00:00<00:00, 347058.04it/s]

(5009, 13)





In [8]:
df['script_name_no_ext'] = df.script_name.str.split('.', expand=True)[0]
df['main_script'] = df['script_name_no_ext']==df['Quantlet']
df = df.loc[df['main_script']==True, :]

In [9]:
# ADDITIONAL PREPROCESSING OF DESCRIPTIONS

# remove parentheses
df.Description = df.Description.str.replace(r"\(.+?\)", "", regex=True)

# remove URL
df.Description = df.Description.str.replace(
r"""(?i)\b((?:https?:(?:/{1,3}|[a-z0-9%])|[a-z0-9.\-]+[.](?:com|net|org|edu|gov|mil|aero|asia|biz|cat|coop|info|int|jobs|mobi|museum|name|post|pro|tel|travel|xxx|ac|ad|ae|af|ag|ai|al|am|an|ao|aq|ar|as|at|au|aw|ax|az|ba|bb|bd|be|bf|bg|bh|bi|bj|bm|bn|bo|br|bs|bt|bv|bw|by|bz|ca|cc|cd|cf|cg|ch|ci|ck|cl|cm|cn|co|cr|cs|cu|cv|cx|cy|cz|dd|de|dj|dk|dm|do|dz|ec|ee|eg|eh|er|es|et|eu|fi|fj|fk|fm|fo|fr|ga|gb|gd|ge|gf|gg|gh|gi|gl|gm|gn|gp|gq|gr|gs|gt|gu|gw|gy|hk|hm|hn|hr|ht|hu|id|ie|il|im|in|io|iq|ir|is|it|je|jm|jo|jp|ke|kg|kh|ki|km|kn|kp|kr|kw|ky|kz|la|lb|lc|li|lk|lr|ls|lt|lu|lv|ly|ma|mc|md|me|mg|mh|mk|ml|mm|mn|mo|mp|mq|mr|ms|mt|mu|mv|mw|mx|my|mz|na|nc|ne|nf|ng|ni|nl|no|np|nr|nu|nz|om|pa|pe|pf|pg|ph|pk|pl|pm|pn|pr|ps|pt|pw|py|qa|re|ro|rs|ru|rw|sa|sb|sc|sd|se|sg|sh|si|sj|Ja|sk|sl|sm|sn|so|sr|ss|st|su|sv|sx|sy|sz|tc|td|tf|tg|th|tj|tk|tl|tm|tn|to|tp|tr|tt|tv|tw|tz|ua|ug|uk|us|uy|uz|va|vc|ve|vg|vi|vn|vu|wf|ws|ye|yt|yu|za|zm|zw)/)(?:[^\s()<>{}\[\]]+|\([^\s()]*?\([^\s()]+\)[^\s()]*?\)|\([^\s]+?\))+(?:\([^\s()]*?\([^\s()]+\)[^\s()]*?\)|\([^\s]+?\)|[^\s`!()\[\]{};:'".,<>?«»“”‘’])|(?:(?<!@)[a-z0-9]+(?:[.\-][a-z0-9]+)*[.](?:com|net|org|edu|gov|mil|aero|asia|biz|cat|coop|info|int|jobs|mobi|museum|name|post|pro|tel|travel|xxx|ac|ad|ae|af|ag|ai|al|am|an|ao|aq|ar|as|at|au|aw|ax|az|ba|bb|bd|be|bf|bg|bh|bi|bj|bm|bn|bo|br|bs|bt|bv|bw|by|bz|ca|cc|cd|cf|cg|ch|ci|ck|cl|cm|cn|co|cr|cs|cu|cv|cx|cy|cz|dd|de|dj|dk|dm|do|dz|ec|ee|eg|eh|er|es|et|eu|fi|fj|fk|fm|fo|fr|ga|gb|gd|ge|gf|gg|gh|gi|gl|gm|gn|gp|gq|gr|gs|gt|gu|gw|gy|hk|hm|hn|hr|ht|hu|id|ie|il|im|in|io|iq|ir|is|it|je|jm|jo|jp|ke|kg|kh|ki|km|kn|kp|kr|kw|ky|kz|la|lb|lc|li|lk|lr|ls|lt|lu|lv|ly|ma|mc|md|me|mg|mh|mk|ml|mm|mn|mo|mp|mq|mr|ms|mt|mu|mv|mw|mx|my|mz|na|nc|ne|nf|ng|ni|nl|no|np|nr|nu|nz|om|pa|pe|pf|pg|ph|pk|pl|pm|pn|pr|ps|pt|pw|py|qa|re|ro|rs|ru|rw|sa|sb|sc|sd|se|sg|sh|si|sj|Ja|sk|sl|sm|sn|so|sr|ss|st|su|sv|sx|sy|sz|tc|td|tf|tg|th|tj|tk|tl|tm|tn|to|tp|tr|tt|tv|tw|tz|ua|ug|uk|us|uy|uz|va|vc|ve|vg|vi|vn|vu|wf|ws|ye|yt|yu|za|zm|zw)\b/?(?!@)))""",
"",
regex=True)

# ADDITIONAL PREPROCESSING OF CODE
df.code_script = df.code_script.str.replace(r"#", "", regex=True)
df.loc[df.type_script == "m", "code_script"] = df.loc[df.type_script == "m", "code_script"].str.replace(r"\%", " ", regex=True)

df.loc[df.type_script == "r", "code_script"] = df.loc[df.type_script == "r", "code_script"].str.replace(r"\$", " ", regex=True)

# remove the same sign repeated more than 4 times
df.code_script = df.code_script.str.replace(r"(.)\1{4,}", r"\1", regex=True)
df.code_script = df.code_script.str.replace("\s{2,}", "", regex=True).str.strip()

In [10]:
df.code_script

3       Plot energy production of all relevant prosumers in testing periodAuthor: Michael KostmannLoad packages\npackages = c("cowplot",\n"purrr")\ninvisible(lapply(packages, library, character.only = TRUE))Source user-defined functions\nfunctions = c(" FUN get Targets. R",\n" FUN generate Prices. R",\n" FUN blind Auction. R")\ninvisible(lapply(functions, source))Function for easy string pasting\n"%&%" = function(x, y) {paste(x, y, sep = "")}Specify paths to directories containing consumer and prosu...
5       Save data glimpse of energy smart meter recordingsAuthor: Michael KostmannLoad packages\npackages = c("data.table",\n"lubridate",\n"tidyverse",\n"tibbletime")\ninvisible(lapply(packages, library, character.only = TRUE))Function for easy string pasting\n"%&%" = function(x, y) {paste(x, y, sep = "")}Specify datasets to load\ndataset ids = c("consumer/consumer-056",\n"prosumer/producer-089")Loop over datasets specified in datasets ids\nfor(i in dataset ids) {Load raw data from csv-f

In [11]:
df['Description_ID'] = df.groupby('Description').ngroup()

In [12]:
'''from transformers import pipeline
classifier = pipeline("zero-shot-classification",
                        model="facebook/bart-large-mnli")
candidate_labels = ['project-level', 'codesnippet-level']
descriptions = df.Description.unique()
descriptions[0]
classifier("Plots the power curves by 2SQR(1), 2SQR(2) and IVX-QR in simulation.'", candidate_labels)'''

'from transformers import pipeline\nclassifier = pipeline("zero-shot-classification",\n                        model="facebook/bart-large-mnli")\ncandidate_labels = [\'project-level\', \'codesnippet-level\']\ndescriptions = df.Description.unique()\ndescriptions[0]\nclassifier("Plots the power curves by 2SQR(1), 2SQR(2) and IVX-QR in simulation.\'", candidate_labels)'

In [13]:
'''# CHUNKING
df[['chunk_ids', 'chunks']] = df.apply(lambda x: chunk_code(x['code_script'], chunk_size=250), axis=1, result_type='expand')
chunks_df = df.apply(lambda x: chunk_code(x['code_script'], chunk_size=250), axis=1, result_type='expand')
chunks_df.columns = ['chunk_ids', 'chunks']
df[['chunk_ids', 'chunks']]  = chunks_df[['chunk_ids', 'chunks']] 
df = df.explode('chunk_ids').reset_index(drop=True)
df['chunks'] = df.apply(lambda x: x['chunks'][x['chunk_ids']], axis=1)
df.shape'''

"# CHUNKING\ndf[['chunk_ids', 'chunks']] = df.apply(lambda x: chunk_code(x['code_script'], chunk_size=250), axis=1, result_type='expand')\nchunks_df = df.apply(lambda x: chunk_code(x['code_script'], chunk_size=250), axis=1, result_type='expand')\nchunks_df.columns = ['chunk_ids', 'chunks']\ndf[['chunk_ids', 'chunks']]  = chunks_df[['chunk_ids', 'chunks']] \ndf = df.explode('chunk_ids').reset_index(drop=True)\ndf['chunks'] = df.apply(lambda x: x['chunks'][x['chunk_ids']], axis=1)\ndf.shape"

In [14]:
df["Q_ID"] = df.index

folder_to_save = f"../../data/preprocessed/Quantlet/{DATE}/"
if not os.path.exists(folder_to_save):
    os.mkdir(folder_to_save)

df.to_csv(f'{folder_to_save}full_{DATE}.csv', index=False)

In [15]:
df = pd.read_csv(f'{folder_to_save}full_{DATE}.csv')

'''code_list = df.groupby(['folder_name'])['code_script'].apply(list)
df['code_script'] = df['folder_name'].map(code_list)
df['code_script'] = df['code_script'].apply(lambda x: '\n\n'.join(x))'''

'''code_list = df.groupby(['folder_name'])['type_script'].apply(list)
df['type_script'] = df['folder_name'].map(code_list)
df['type_script'] = df['type_script'].apply(lambda x: ' '.join(x))'''

"code_list = df.groupby(['folder_name'])['type_script'].apply(list)\ndf['type_script'] = df['folder_name'].map(code_list)\ndf['type_script'] = df['type_script'].apply(lambda x: ' '.join(x))"

In [16]:
'''print(df[['code_script', 'Quantlet', 'Description', 'Description_ID', 'Q_ID', 'folder_name', 'repo', 'Authors']].shape)

df_reduced = df[['code_script', 'Quantlet', 'Description', 'Description_ID', 'type_script', 'folder_name', 'repo', 'Authors']].drop_duplicates(['Quantlet', 'Description', 'Description_ID', 'folder_name', 'repo', 'Authors'])
df_reduced.shape'''

"print(df[['code_script', 'Quantlet', 'Description', 'Description_ID', 'Q_ID', 'folder_name', 'repo', 'Authors']].shape)\n\ndf_reduced = df[['code_script', 'Quantlet', 'Description', 'Description_ID', 'type_script', 'folder_name', 'repo', 'Authors']].drop_duplicates(['Quantlet', 'Description', 'Description_ID', 'folder_name', 'repo', 'Authors'])\ndf_reduced.shape"

In [17]:
# SPLIT THE DATA GROUP QUANTLET
labelled_descr_id, test_descr_id = train_test_split(list(df.Description_ID.unique()),
                test_size=0.1,
                random_state=RS)
train_descr_id, val_descr_id = train_test_split(labelled_descr_id,
                test_size=0.1,
                random_state=RS)

In [18]:
full_train = df.loc[df.Description_ID.isin(labelled_descr_id)]
train = df.loc[df.Description_ID.isin(train_descr_id)]
val = df.loc[df.Description_ID.isin(val_descr_id)]
test = df.loc[df.Description_ID.isin(test_descr_id)]

In [19]:
save_datasets(full_train, train, val, test, DATE, RS, 'code_script', False, True)    

(2735, 17)
r     0.562706
m     0.231444
py    0.205850
Name: type_script, dtype: float64
(305, 17)
r     0.544262
py    0.229508
m     0.226230
Name: type_script, dtype: float64
(327, 17)
r     0.584098
m     0.247706
py    0.168196
Name: type_script, dtype: float64
(2735, 17)
r     1539
m      633
py     563
Name: type_script, dtype: int64
(305, 17)
r     166
py     70
m      69
Name: type_script, dtype: int64
(327, 17)
r     191
m      81
py     55
Name: type_script, dtype: int64


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_column(loc, value, pi)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train["Authors"] = train["Authors"].fillna("Unknown")
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  val["Authors"] = val["Authors"].fillna("Unknown")
A value is trying to be set on a copy of a slice from a D