################################################################################
# 1. Preparations

In [1]:
# Project sub-directory; interpolated into the Colab working-directory path in the next cell.
QPATH = "Quantlet/code_description2project_description"

In [2]:
# PREPARE WORKING DIRECTORY
# On Colab, switch to the project folder below /content/drive; anywhere else the
# current directory is kept.

import os
import sys

IN_COLAB = 'google.colab' in sys.modules

_workdir = (
    f'/content/drive/MyDrive/ColabNotebooks/IRTG/Encode_the_Qode/Encode-the-Qode/{QPATH}'
    if IN_COLAB
    else './'
)
os.chdir(_workdir)

# Let modules located in ../src be imported by the cells below.
sys.path.append('../src')

In [3]:
# PACKAGES
# NOTE(review): `sys` and `tqdm` are each imported twice in this cell (`sys` also
# in the previous cell); the repeats are harmless but redundant.

import pickle
import json
import re
import sys
from IPython.display import display

from tqdm import tqdm
import pandas as pd
import numpy as np


import matplotlib.pyplot as plt
import seaborn as sns

import nltk
# Fetch the NLTK 'punkt' package (no-op when it is already up to date).
nltk.download('punkt')

from tqdm import tqdm
# Register tqdm with pandas; this provides the `progress_apply` used later on.
tqdm.pandas()
import preprocessing_utils

import importlib
# Reload so that edits to preprocessing_utils are picked up without a kernel restart.
importlib.reload(preprocessing_utils)
# NOTE(review): star import — helpers called later without a module prefix
# (e.g. parse_meta) presumably come from here; verify.
from preprocessing_utils import *

from sklearn.model_selection import train_test_split

# SETTINGS

import warnings
# Silence FutureWarning messages for the rest of the notebook.
warnings.simplefilter(action='ignore', category=FutureWarning)

# Show full cell contents instead of truncating long strings in DataFrame output.
pd.set_option('display.max_colwidth', None)

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


# 2. Processing

In [4]:
# Load the pickled table of parsed Quantlets (used as a DataFrame below).
# NOTE(review): pickle.load can execute arbitrary code while unpickling — only
# open files that come from a trusted source.
with open('../../data/preprocessed/Quantlet/Parsed_Qs_with_code_25062023.pkl', 'rb') as file:
  df = pickle.load(file)

In [5]:
# Random seed handed to both train_test_split calls further down.
RS = 42

In [6]:
# Keep only the entries whose metainfo file is not flagged as 'empty'
has_metainfo = df['metainfo_file'] != 'empty'
df = df[has_metainfo]
print(df.shape)

(4856, 6)


In [7]:
# Parse metainfo file
# Guarded by the presence of 'Keywords': once the parsed columns exist,
# re-running this cell does nothing.
if 'Keywords' not in df.columns:
  meta_cols = ['Quantlet', 'Description', 'Keywords', 'Other']

  # parse_meta is applied row-wise; its result is expanded into one column per field
  parsed = df.apply(parse_meta, axis='columns', result_type='expand')

  meta_info = pd.DataFrame(columns=meta_cols)
  meta_info[meta_cols] = parsed

  # Every parsed field is stored as text
  meta_info = meta_info.astype(str)

  df = pd.concat([df, meta_info], axis=1)

  # Columns that are no longer needed once the metainfo is parsed
  df = df.drop(columns=['metainfo_file', 'Other', 'script_name', 'script_name_no_ext'])

In [8]:
# A row holds several scripts when any element of its code_script is itself a list
is_multi = df['code_script'].apply(
    lambda scripts: any(isinstance(part, list) for part in scripts)
)
df['multiple_scripts'] = is_multi

# Join the pieces of every script into one string. Single scripts are wrapped in
# a one-element list, so the column always holds a list of strings.
df['code_script_joined'] = pd.Series(
    [
        [''.join(script) for script in scripts] if multi else [''.join(scripts)]
        for scripts, multi in zip(df['code_script'], is_multi)
    ],
    index=df.index,
    dtype=object,
)

# Simple size statistics: number of scripts, description length in characters and words
df['scr_n'] = df['code_script_joined'].map(len)
df['description_len'] = df['Description'].map(len)
df['description_n_words'] = df['Description'].map(lambda text: len(text.split()))

df = df.reset_index(drop=True)

In [9]:
# Empty placeholder columns; explode_code_and_lang fills them per script
for placeholder in ('main_script', 'main_type_script'):
    df[placeholder] = np.nan

In [10]:
def explode_code_and_lang(df):
    """Return a long-format copy of ``df`` with one row per script.

    Rows whose ``multiple_scripts`` flag is true are repeated once for every
    entry of ``code_script``; the i-th copy receives that entry in
    ``main_script`` and ``type_script[i]`` in ``main_type_script``. All other
    rows are copied unchanged. Afterwards, missing ``main_script`` /
    ``main_type_script`` values are filled from ``code_script`` /
    ``type_script``, and the index is reset.

    The input frame is not modified. The shape before and after is printed.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``multiple_scripts``, ``code_script`` and
        ``type_script``.

    Returns
    -------
    pandas.DataFrame
        The exploded frame with a fresh ``RangeIndex``.
    """
    print(f'Shape before exploding scripts: {df.shape}')

    # Collect plain dict records and build the frame once at the end:
    # DataFrame.append no longer exists in pandas 2.x, and growing a frame row
    # by row is quadratic in the number of rows.
    records = []
    for _, row in tqdm(df.iterrows(), total=len(df)):
        base = row.to_dict()
        # `== True` (rather than truthiness) keeps NaN flags on the single-script path.
        if row['multiple_scripts'] == True:
            for i, script in enumerate(row['code_script']):
                records.append(
                    dict(base,
                         main_script=script,
                         main_type_script=row['type_script'][i])
                )
        else:
            records.append(base)

    # Keep the input's columns (and order); make sure the two output columns
    # exist even if the caller did not pre-create them.
    columns = list(df.columns)
    for extra in ('main_script', 'main_type_script'):
        if extra not in columns:
            columns.append(extra)
    new_df = pd.DataFrame(records, columns=columns)

    # Cast to object first so list/str values can be filled into columns that
    # would otherwise be all-NaN floats.
    new_df['main_script'] = new_df['main_script'].astype(object).fillna(new_df['code_script'])
    new_df['main_type_script'] = new_df['main_type_script'].astype(object).fillna(new_df['type_script'])

    new_df = new_df.reset_index(drop=True)
    print(f'Shape after exploding scripts: {new_df.shape}')
    return new_df


In [11]:
def _script_text(pieces):
    """Concatenate the pieces of one script; an empty script becomes NaN."""
    if len(pieces) > 0:
        return ''.join(pieces)
    return np.nan


df_long = explode_code_and_lang(df)

# Replace the per-row script pieces with their joined text and adopt the per-script type
df_long['code_script'] = df_long['main_script'].progress_apply(_script_text)
df_long['type_script'] = df_long['main_type_script']

# Helper columns are not needed beyond this point
df_long = df_long.drop(columns=['main_type_script', 'main_script', 'code_script_joined'])

# Keep only rows that have both a description and a non-empty script.
# NOTE(review): 'Description' was cast to str when the metainfo was parsed, so a
# missing description would presumably be a string such as 'nan' and pass this
# filter — TODO confirm.
df_long = df_long.dropna(subset=['Description', 'code_script'])

Shape before exploding scripts: (4856, 13)


4856it [00:24, 196.05it/s]


Shape after exploding scripts: (6743, 13)


100%|██████████| 6743/6743 [00:00<00:00, 44541.34it/s]


In [12]:
def add_docstring_comment_tags_py(string):
    """Mark docstrings and comment lines of a Python script with tags.

    * Every triple-quoted string (``\"\"\"`` or ``'''``) is replaced by
      ``<DOCSTR START>body<DOCSTR END>`` followed by a newline.
    * Every line whose first non-blank character is ``#`` becomes
      ``<COMMENT S> text <COMMENT E>``; the leading ``#`` characters are
      dropped and the line's indentation is kept.
    * All other lines, including code with a trailing ``#`` comment, are
      left untouched. Carriage returns are removed first.

    Parameters
    ----------
    string : str
        Source code of the script.

    Returns
    -------
    str
        The tagged source code.
    """
    text = string.replace('\r', '')

    # One combined pattern applied in a single left-to-right pass: text that
    # was consumed as a docstring is never re-scanned for comment lines.
    # (The previous two-pass version also passed re.DOTALL as the *count*
    # argument of re.sub, which limited tagging to the first 16 matches.)
    pattern = re.compile(
        r'("""|\'\'\')(?P<doc>.*?)\1'                # triple-quoted string
        r'|^(?P<indent>[ \t]*)#+(?P<com>[^\n]*)$',   # whole-line comment
        re.DOTALL | re.MULTILINE,
    )

    def _tag(match):
        doc = match.group('doc')
        if doc is not None:
            return f"<DOCSTR START>{doc}<DOCSTR END>\n"
        return f"{match.group('indent')}<COMMENT S> {match.group('com')} <COMMENT E>"

    return pattern.sub(_tag, text)

def add_docstring_comment_tags_r(string):
    """Mark documentation blocks and comment lines of an R script with tags.

    * A block delimited by ``#'`` at the end of a line and the next ``#'`` at
      the start of a line is replaced by ``<DOCSTR START>body<DOCSTR END>``
      followed by a newline.
    * Every other line whose first non-blank character is ``#`` becomes
      ``<COMMENT S> text <COMMENT E>``; the leading ``#`` characters are
      dropped and the line's indentation is kept.
    * All other lines, including code with a trailing ``#`` comment, are
      left untouched. Carriage returns are removed first.

    Parameters
    ----------
    string : str
        Source code of the script.

    Returns
    -------
    str
        The tagged source code.
    """
    text = string.replace('\r', '')

    # Single pass with one combined pattern. The negative lookahead keeps the
    # comment branch from swallowing a line that opens a documentation block,
    # so the block branch can still match it. (Previously comments were tagged
    # first, which rewrote the #' delimiters, and re.DOTALL was passed as the
    # *count* argument of re.sub, limiting tagging to the first 16 matches.)
    pattern = re.compile(
        r"#'\n(?P<doc>.*?)\n#'"                                      # documentation block
        r"|^(?P<indent>[ \t]*)(?!#'\n.*?\n#')#+(?P<com>[^\n]*)$",    # whole-line comment
        re.DOTALL | re.MULTILINE,
    )

    def _tag(match):
        doc = match.group('doc')
        if doc is not None:
            return f"<DOCSTR START>{doc}<DOCSTR END>\n"
        return f"{match.group('indent')}<COMMENT S> {match.group('com')} <COMMENT E>"

    return pattern.sub(_tag, text)

def add_docstring_comment_tags_matlab(string):
    """Mark block comments and comment lines of a MATLAB script with tags.

    * A ``%{`` ... ``%}`` block (opener at the end of a line, closer at the
      start of a line) is replaced by ``<DOCSTR START>body<DOCSTR END>``
      followed by a newline.
    * Every other line whose first non-blank character is ``%`` becomes
      ``<COMMENT S> text <COMMENT E>``; the leading ``%`` characters are
      dropped and the line's indentation is kept.
    * All other lines (e.g. code containing ``%`` in a format string) are
      left untouched and no extra newlines are inserted. Carriage returns are
      removed first.

    Parameters
    ----------
    string : str
        Source code of the script.

    Returns
    -------
    str
        The tagged source code.
    """
    text = string.replace('\r', '')

    # Single pass with one combined pattern. The negative lookahead keeps the
    # comment branch from swallowing a line that opens a %{ ... %} block.
    # (The previous comment pattern also matched empty strings and appended a
    # newline per match, and re.DOTALL was passed as the *count* argument of
    # re.sub, limiting tagging to the first 16 matches.)
    pattern = re.compile(
        r"%\{\n(?P<doc>.*?)\n%\}"                                       # block comment
        r"|^(?P<indent>[ \t]*)(?!%\{\n.*?\n%\})%+(?P<com>[^\n]*)$",     # whole-line comment
        re.DOTALL | re.MULTILINE,
    )

    def _tag(match):
        doc = match.group('doc')
        if doc is not None:
            return f"<DOCSTR START>{doc}<DOCSTR END>\n"
        return f"{match.group('indent')}<COMMENT S> {match.group('com')} <COMMENT E>"

    return pattern.sub(_tag, text)

def add_docstring_comment_tags(string, lang):
    """Tag docstrings and comments in ``string`` using the rules for ``lang``.

    Parameters
    ----------
    string : str
        Source code of the script.
    lang : str
        Script type: ``'py'`` (Python), ``'m'`` (MATLAB) or ``'r'`` (R).

    Returns
    -------
    str
        The tagged source code produced by the language-specific tagger.

    Raises
    ------
    ValueError
        If ``lang`` is not one of the supported script types. (Previously this
        case surfaced as an UnboundLocalError on ``result``.)
    """
    if lang == 'py':
        return add_docstring_comment_tags_py(string)
    if lang == 'm':
        return add_docstring_comment_tags_matlab(string)
    if lang == 'r':
        return add_docstring_comment_tags_r(string)
    raise ValueError(f"Unsupported script type {lang!r}; expected 'py', 'm' or 'r'")

In [13]:
# Rows whose type is still a list are treated as Python ...
list_typed = df_long['type_script'].map(lambda kind: isinstance(kind, list))
df_long.loc[list_typed, 'type_script'] = 'py'

# ... and so are notebooks
from_notebook = df_long['type_script'] == 'ipynb'
df_long.loc[from_notebook, 'type_script'] = 'py'

In [14]:
# Tag comments and docstrings of every script according to its script type
df_long['code_script'] = [
    add_docstring_comment_tags(code, kind)
    for code, kind in zip(df_long['code_script'], df_long['type_script'])
]

In [15]:
# Split on Quantlet names rather than rows, so that all scripts of one Quantlet
# land in the same subset: 10% test, then 10% of the remainder for validation.
quantlet_names = list(df_long['Quantlet'].unique())

labelled_qs, test_qs = train_test_split(quantlet_names, test_size=0.1, random_state=RS)
train_qs, val_qs = train_test_split(labelled_qs, test_size=0.1, random_state=RS)


def _rows_of(quantlets):
    """Rows of df_long whose Quantlet is one of ``quantlets``."""
    return df_long[df_long['Quantlet'].isin(set(quantlets))]


train = _rows_of(train_qs)
val   = _rows_of(val_qs)
test  = _rows_of(test_qs)

In [16]:
# Size of the training set and its share of each script type
print(train.shape, train['type_script'].value_counts(normalize=True), sep='\n')

(5537, 10)
r     0.413220
m     0.322016
py    0.264764
Name: type_script, dtype: float64


In [17]:
# Size of the validation set and its share of each script type
print(val.shape, val['type_script'].value_counts(normalize=True), sep='\n')

(527, 10)
r     0.514231
m     0.254269
py    0.231499
Name: type_script, dtype: float64


In [18]:
# Size of the test set and its share of each script type
print(test.shape, test['type_script'].value_counts(normalize=True), sep='\n')

(667, 10)
m     0.382309
r     0.374813
py    0.242879
Name: type_script, dtype: float64


# 3. Save the Data

In [19]:
def _to_dataset_json(frame):
    """Build the JSON payload for one subset.

    Each row becomes one record with the tagged code as 'input_sequence' and
    the description as 'output_sequence'.
    """
    return {
        'version': '0.1.0',
        'data': [
            {'input_sequence': code, 'output_sequence': description}
            for code, description in zip(frame['code_script'], frame['Description'])
        ],
    }


train_dataset_json = _to_dataset_json(train)
val_dataset_json = _to_dataset_json(val)
test_dataset_json = _to_dataset_json(test)

# Each subset is written to its own file. The validation file used to receive
# the *training* payload; it now gets the validation payload.
# NOTE(review): the validation file name ('val_dataset_json.json') does not
# follow the '*_dataset_descr.json' pattern of the other two. It is left as is
# in case downstream code reads that path — confirm before renaming.
_outputs = (
    ('../../data/preprocessed/Quantlet/train_dataset_descr.json', train_dataset_json),
    ('../../data/preprocessed/Quantlet/val_dataset_json.json', val_dataset_json),
    ('../../data/preprocessed/Quantlet/test_dataset_descr.json', test_dataset_json),
)

for _path, _payload in _outputs:
    with open(_path, 'w') as f:
        json.dump(_payload, f)