In [None]:
# Mount Google Drive only when the Colab runtime is importable, so that
# "Restart & Run All" also works on a local kernel (the setup cell below
# already distinguishes Colab from non-Colab through IN_COLAB).
try:
    from google.colab import drive
except ImportError:
    drive = None  # not running in Colab; there is nothing to mount
else:
    drive.mount('/content/drive')

In [None]:
# Optional protobuf pin, currently disabled.
#%pip install protobuf==3.20.1
# Install the packages needed for the summarization model into the kernel's
# environment (-q keeps pip's output quiet).
# NOTE(review): no versions are pinned here, so a later re-run may resolve to
# different releases -- consider pinning once a working combination is known.
%pip install -q transformers sentencepiece

In [None]:
import os
import sys

# Sub-path of this notebook inside the project checkout; it is appended to the
# Drive root in the chdir call below.
QPATH = "Quantlet/Create_description"

# True when the google.colab module is already loaded in this kernel.
IN_COLAB = 'google.colab' in sys.modules

if IN_COLAB:
    # Work from the notebook's own folder on the mounted Drive so that the
    # relative paths used throughout the notebook resolve.
    os.chdir(f'/content/drive/MyDrive/ColabNotebooks/IRTG/Encode_the_Qode/Encode-the-Qode/{QPATH}')

# Make modules under ../src (relative to the working directory) importable.
sys.path.append('../src')

In [None]:
import importlib
import json
import os
import pickle
import re
import sys

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import torch
from IPython.display import display
from torch.utils.data import Dataset, DataLoader
from tqdm import tqdm
from transformers import AutoTokenizer, AutoModelWithLMHead, SummarizationPipeline

In [None]:
class QuantletDataset(Dataset):
    """Dataset view over a DataFrame of code scripts.

    Each item is the list of lines of one script, taken from the
    ``code_script`` column of the wrapped DataFrame.
    """

    def __init__(self, dataframe):
        """Store a reference to ``dataframe`` (no copy is made)."""
        self.dataframe = dataframe

    def __len__(self):
        """Return the number of rows in the wrapped DataFrame."""
        return len(self.dataframe)

    def __getitem__(self, idx):
        """Return the lines of the script at row position ``idx``.

        The script text is split on the two-character sequence backslash + n
        (an escaped newline, not a real line break), and pieces of length 0
        or 1 are dropped.
        """
        script = self.dataframe.iloc[idx]['code_script']
        kept_lines = []
        # NOTE(review): the separator is a literal backslash followed by "n";
        # presumably the CSV stores newlines in escaped form -- confirm.
        for piece in script.split('\\n'):
            if len(piece) > 1:
                kept_lines.append(piece)
        return kept_lines

    def show_df(self):
        """Show the wrapped DataFrame via ``display`` and return that call's result."""
        return display(self.dataframe)

In [None]:
# NOTE(review): repeats the QPATH assignment from the path-setup cell near the
# top of the notebook with the same value; this cell looks redundant.
QPATH = "Quantlet/Create_description"

In [None]:
# When True, the generation loops below strip every character that is not a
# letter, digit or whitespace from each summary, collapse whitespace runs, and
# drop summaries that end up empty. When False, the raw pipeline output is kept.
CLEAN_UP = True

In [None]:
# Preprocessed train / validation / test splits, in that order.
_split_csv_paths = (
    '../../data/preprocessed/Quantlet/train_df.csv',
    '../../data/preprocessed/Quantlet/val_df.csv',
    '../../data/preprocessed/Quantlet/test_df.csv',
)

# reset_index() keeps the previous index as an extra "index" column and gives
# each frame a fresh 0..n-1 index.
train_df, val_df, test_df = (
    pd.read_csv(csv_path).reset_index() for csv_path in _split_csv_paths
)

In [None]:
# Hugging Face checkpoint used for both the model weights and the tokenizer.
MODEL_NAME = "SEBIS/code_trans_t5_base_source_code_summarization_python_multitask"

# Run on the first CUDA device when one is present; otherwise fall back to the
# CPU (-1) instead of failing on machines without a GPU.
PIPELINE_DEVICE = 0 if torch.cuda.is_available() else -1

# NOTE(review): AutoModelWithLMHead is a deprecated alias in recent
# transformers releases; it is kept here because it is what the notebook imports.
pipeline = SummarizationPipeline(
    model=AutoModelWithLMHead.from_pretrained(MODEL_NAME),
    # skip_special_tokens is passed to the tokenizer's from_pretrained call,
    # not to the pipeline itself.
    tokenizer=AutoTokenizer.from_pretrained(MODEL_NAME, skip_special_tokens=False),
    device=PIPELINE_DEVICE,
)

In [None]:
# Wrap each split in a QuantletDataset so it can be fed to a DataLoader.
torch_train_df, torch_val_df, torch_test_df = (
    QuantletDataset(split_df) for split_df in (train_df, val_df, test_df)
)

In [None]:
# Generate one list of natural-language summaries per training script.
# The result is a list of {batch_idx: [summary, ...]} dicts, one per sample.
data_loader = DataLoader(torch_train_df,
                         batch_size=1,
                         shuffle=False)

nl_description_list_train = []
failed_batches_train = []  # (batch_idx, repr(error)) for samples that raised

# tqdm wraps the loader itself (not the enumerate generator) so the progress
# bar can show the total number of samples.
for batch_idx, samples in enumerate(tqdm(data_loader, desc='train')):
    try:
        # Each element of `samples` is indexed with [0] to get one code line --
        # presumably because batch_size=1 collation wraps every line in a
        # length-1 container (TODO confirm).
        nl_description = pipeline([codeline[0] for codeline in samples])

        if CLEAN_UP:
            nl_description = [re.sub(r'[^a-zA-Z0-9\s]', '', summary['summary_text']) for summary in nl_description]
            nl_description = [re.sub(r'\s+', ' ', summary).strip() for summary in nl_description]
            nl_description = [summary for summary in nl_description if len(summary) > 0]
    except Exception as exc:
        # Best effort: a failing sample yields an empty description list, but
        # the failure is recorded, and KeyboardInterrupt / SystemExit are no
        # longer swallowed as they were by the bare `except:`.
        failed_batches_train.append((batch_idx, repr(exc)))
        nl_description = []
    nl_description_list_train.append({batch_idx: nl_description})

if failed_batches_train:
    print(f'{len(failed_batches_train)} of {len(nl_description_list_train)} train samples could not be summarized')

In [None]:
# Generate one list of natural-language summaries per validation script.
# The result is a list of {batch_idx: [summary, ...]} dicts, one per sample.
data_loader = DataLoader(torch_val_df,
                         batch_size=1,
                         shuffle=False)

nl_description_list_val = []
failed_batches_val = []  # (batch_idx, repr(error)) for samples that raised

# tqdm wraps the loader itself (not the enumerate generator) so the progress
# bar can show the total number of samples.
for batch_idx, samples in enumerate(tqdm(data_loader, desc='val')):
    try:
        # Each element of `samples` is indexed with [0] to get one code line --
        # presumably because batch_size=1 collation wraps every line in a
        # length-1 container (TODO confirm).
        nl_description = pipeline([codeline[0] for codeline in samples])

        if CLEAN_UP:
            nl_description = [re.sub(r'[^a-zA-Z0-9\s]', '', summary['summary_text']) for summary in nl_description]
            nl_description = [re.sub(r'\s+', ' ', summary).strip() for summary in nl_description]
            nl_description = [summary for summary in nl_description if len(summary) > 0]
    except Exception as exc:
        # Best effort: a failing sample yields an empty description list, but
        # the failure is recorded, and KeyboardInterrupt / SystemExit are no
        # longer swallowed as they were by the bare `except:`.
        failed_batches_val.append((batch_idx, repr(exc)))
        nl_description = []
    nl_description_list_val.append({batch_idx: nl_description})

if failed_batches_val:
    print(f'{len(failed_batches_val)} of {len(nl_description_list_val)} val samples could not be summarized')

In [None]:
# Generate one list of natural-language summaries per test script.
# The result is a list of {batch_idx: [summary, ...]} dicts, one per sample.
data_loader = DataLoader(torch_test_df,
                         batch_size=1,
                         shuffle=False)

nl_description_list_test = []
failed_batches_test = []  # (batch_idx, repr(error)) for samples that raised

# tqdm wraps the loader itself (not the enumerate generator) so the progress
# bar can show the total number of samples.
for batch_idx, samples in enumerate(tqdm(data_loader, desc='test')):
    try:
        # Each element of `samples` is indexed with [0] to get one code line --
        # presumably because batch_size=1 collation wraps every line in a
        # length-1 container (TODO confirm).
        nl_description = pipeline([codeline[0] for codeline in samples])

        if CLEAN_UP:
            nl_description = [re.sub(r'[^a-zA-Z0-9\s]', '', summary['summary_text']) for summary in nl_description]
            nl_description = [re.sub(r'\s+', ' ', summary).strip() for summary in nl_description]
            nl_description = [summary for summary in nl_description if len(summary) > 0]
    except Exception as exc:
        # Best effort: a failing sample yields an empty description list, but
        # the failure is recorded, and KeyboardInterrupt / SystemExit are no
        # longer swallowed as they were by the bare `except:`.
        failed_batches_test.append((batch_idx, repr(exc)))
        nl_description = []
    nl_description_list_test.append({batch_idx: nl_description})

if failed_batches_test:
    print(f'{len(failed_batches_test)} of {len(nl_description_list_test)} test samples could not be summarized')

In [None]:
# Sanity check: number of entries collected for the test split (one entry is
# appended per DataLoader batch in the loop above).
len(nl_description_list_test)

In [None]:
# Persist the generated descriptions of every split as a pickle file.
_description_dumps = (
    ('../../data/preprocessed/Quantlet/Descriptions_Qs_train_27072023.pkl', nl_description_list_train),
    ('../../data/preprocessed/Quantlet/Descriptions_Qs_val_27072023.pkl', nl_description_list_val),
    ('../../data/preprocessed/Quantlet/Descriptions_Qs_test_27072023.pkl', nl_description_list_test),
)

for dump_path, descriptions in _description_dumps:
    # Binary write mode is required by pickle.
    with open(dump_path, 'wb') as out_file:
        pickle.dump(descriptions, out_file)