In [2]:
%%capture
%pip install transformers[torch]
%pip install -q sentencepiece
%pip install datasets==2.13.1

In [3]:
import pandas as pd
import json
import random
import numpy as np
from tqdm import tqdm

In [4]:
QPATH = "Quantlet/3-data-preprocessing"
import sys
IN_COLAB = 'google.colab' in sys.modules

import os
if IN_COLAB:
  os.chdir(f'/content/drive/MyDrive/ColabNotebooks/IRTG/Encode_the_Qode/Encode-the-Qode/{QPATH}')

In [5]:
train = pd.read_csv('../../data/preprocessed/Quantlet/train_df_with_domain_20231002.csv')
val = pd.read_csv('../../data/preprocessed/Quantlet/val_df_with_domain_20231002.csv')
test = pd.read_csv('../../data/preprocessed/Quantlet/test_df_with_domain_20231002.csv')

In [6]:
import collections
import nltk
nltk.download('stopwords')

from nltk.stem import PorterStemmer
from nltk.corpus import stopwords
ps = PorterStemmer()
import re
from tqdm import tqdm
tqdm.pandas()

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [19]:
tokens_descriptions = []

train['Description'].apply(lambda title: tokens_descriptions.extend(re.findall(r"(?u)\b\w\w+\b", title.lower())))

tokens_descriptions = [token for token in tokens_descriptions if not token.isnumeric()]

# using Counter to find frequency of elements
frequency = dict(collections.Counter(tokens_descriptions))

# printing the frequency
freq_df = pd.DataFrame(data={'token': frequency.keys(), 'freq': frequency.values()})
freq_df = freq_df.sort_values('freq', ascending=False)
freq_df = freq_df[~freq_df.token.isin(stopwords.words('english'))]
freq_df = freq_df.loc[freq_df.freq>4]

tokens_descriptions = freq_df.copy()
tokens_descriptions.token = tokens_descriptions.token.apply(lambda x: ps.stem(x))
tokens_descriptions = tokens_descriptions.groupby('token', as_index=False).sum().sort_values('freq', ascending=False)

In [21]:
def calc_domain2(dataset):
    dataset['code_script_prep'] = dataset['code_script'].progress_apply(lambda x : [ps.stem(token.lower()) for token in re.sub('\W+', ' ', x).strip().split() if token.lower() not in stopwords.words('english')])
    dataset['code_script_prep'] = dataset['code_script_prep'].apply(lambda x: [token for token in x if len(token)>2])
    dataset['code_script_prep_len'] = dataset['code_script_prep'].apply(len)
    dataset['code_script_domain'] = dataset['code_script_prep'].progress_apply(lambda x : [token for token in x if token in set(tokens_descriptions.token.values)])
    dataset['code_script_domain_len'] = dataset['code_script_domain'].apply(len)
    dataset['domain_ratio'] = round(dataset['code_script_domain_len'] / dataset['code_script_prep_len'], 3)
    return dataset

In [23]:
train = calc_domain2(train)

100%|██████████| 3933/3933 [04:56<00:00, 13.25it/s]
100%|██████████| 3933/3933 [02:35<00:00, 25.33it/s]


In [27]:
test = calc_domain2(test)

100%|██████████| 461/461 [00:22<00:00, 20.28it/s]
100%|██████████| 461/461 [00:13<00:00, 32.96it/s]


In [28]:
val['id']  = val.index
test['id'] = test.index

In [29]:
q33 = train.domain_ratio.quantile(q=0.33)
q66 = train.domain_ratio.quantile(q=0.66)

In [30]:
def domain_group(domain_ratio, q33, q66):
  group = 0
  if (domain_ratio >= q33) & (domain_ratio < q66):
    group = 1
  elif domain_ratio>=q66:
    group = 2
  return group

In [31]:
train['domain_group'] = train['domain_ratio'].apply(lambda x : domain_group(x, q33, q66))
val['domain_group']   = val['domain_ratio'].apply(lambda x : domain_group(x, q33, q66))
test['domain_group']  = test['domain_ratio'].apply(lambda x : domain_group(x, q33, q66))

In [33]:
print(train['domain_group'].value_counts())
print(val['domain_group'].value_counts())
print(test['domain_group'].value_counts())

2    1341
1    1309
0    1283
Name: domain_group, dtype: int64
2    291
1    101
0     43
Name: domain_group, dtype: int64
2    185
1    142
0    134
Name: domain_group, dtype: int64


### DOMAIN GROUPS

In [34]:
for domain_group in val.domain_group.unique():
  group_val = val.loc[val.domain_group == domain_group, : ]
  val_dataset_json = {'version' : str(domain_group),
                      'data' : [{'input_sequence'  : group_val['code_script'].iloc[i],
                                'output_sequence'  : group_val['Description'].iloc[i]} for i in range(group_val.shape[0])]}
  with open(f'../../data/preprocessed/Quantlet/20231013/val_dataset_{domain_group}_20231013_sample0.json', 'w') as f:
    json.dump(val_dataset_json, f)

In [35]:
for domain_group in test.domain_group.unique():
  group_test = test.loc[test.domain_group == domain_group, : ]
  test_dataset_json = {'version' : str(domain_group),
                      'data' : [{'input_sequence'  : group_test['code_script'].iloc[i],
                                'output_sequence'  : group_test['Description'].iloc[i]} for i in range(group_test.shape[0])]}
  with open(f'../../data/preprocessed/Quantlet/20231013/test_dataset_{domain_group}_20231013_sample0.json', 'w') as f:
    json.dump(test_dataset_json, f)

### PROGRAMMING LANGUAGES

In [36]:
for type_script in val.type_script.unique():
  group_val = val.loc[val.type_script == type_script, : ]
  val_dataset_json = {'version' : type_script,
                      'data' : [{'input_sequence'  : group_val['code_script'].iloc[i],
                                'output_sequence'  : group_val['Description'].iloc[i]} for i in range(group_val.shape[0])]}
  with open(f'../../data/preprocessed/Quantlet/20231013/val_dataset_{type_script}_20231013_sample0.json', 'w') as f:
    json.dump(val_dataset_json, f)

In [37]:
for type_script in test.type_script.unique():
  group_test = test.loc[test.type_script == type_script, : ]
  test_dataset_json = {'version' : type_script,
                      'data' : [{'input_sequence'  : group_test['code_script'].iloc[i],
                                'output_sequence'  : group_test['Description'].iloc[i]} for i in range(group_test.shape[0])]}
  with open(f'../../data/preprocessed/Quantlet/20231013/test_dataset_{type_script}_20231013_sample0.json', 'w') as f:
    json.dump(test_dataset_json, f)

In [38]:
# create bootstrap
SIZE = test.shape[0]
indices = range(SIZE)
N_SAMPLES = 35

for sample in tqdm(range(1, N_SAMPLES)):
    np.random.seed(sample)
    sample_idx = np.random.choice(indices, size=SIZE, replace=True)
    sample_df = test.iloc[sample_idx, : ].reset_index(drop=True)
    sample_df.to_csv(f'../../data/preprocessed/Quantlet/20231013/test_df_with_domain_20231013_sample{sample}.csv', index=False)


    # DOMAIN
    for domain_group in sample_df.domain_group.unique():
      group_test = sample_df.loc[sample_df.domain_group == domain_group, : ]
      test_dataset_json = {'version' : str(domain_group),
                          'data' : [{'input_sequence'  : group_test['code_script'].iloc[i],
                                    'output_sequence'  : group_test['Description'].iloc[i]} for i in range(group_test.shape[0])]}
      with open(f'../../data/preprocessed/Quantlet/20231013/test_dataset_{domain_group}_20231013_sample{sample}.json', 'w') as f:
        json.dump(test_dataset_json, f)

    # PROGRAMMING LANGUAGE
    for type_script in sample_df.type_script.unique():
      group_test = sample_df.loc[sample_df.type_script == type_script, : ]
      test_dataset_json = {'version' : type_script,
                          'data' : [{'input_sequence'  : group_test['code_script'].iloc[i],
                                    'output_sequence'  : group_test['Description'].iloc[i]} for i in range(group_test.shape[0])]}
      with open(f'../../data/preprocessed/Quantlet/20231013/test_dataset_{type_script}_20231013_sample{sample}.json', 'w') as f:
        json.dump(test_dataset_json, f)

100%|██████████| 34/34 [02:23<00:00,  4.21s/it]
