In [28]:
%pip install transformers==4.28.1
%pip install datasets
%pip install sentencepiece
%pip install rouge_score
%pip install tabulate

Note: you may need to restart the kernel to use updated packages.


In [2]:
import numpy as np
import pandas as pd
import re
import os
import sys
import json 
import ast

sys.path.append('../../Quantlet/Create_description/')

import importlib
import preprocessing_utils
importlib.reload(preprocessing_utils)

import pickle

from sklearn.model_selection import train_test_split, KFold

In [3]:
# Random seed; passed as `random_state` to train_test_split so the folder split is reproducible.
RS = 42

In [4]:
# LOAD created descriptions and the parsed Quantlets
parsed_Qs_file = '../../data/preprocessed/Quantlet/Parsed_Qs_19032023.pkl'
dataset = preprocessing_utils.QuantletDataset(parsed_Qs_file)

# NOTE(security): pickle.load can execute arbitrary code embedded in the file;
# only load pickles that were produced by this project.
with open('../../data/preprocessed/Quantlet/Descriptions_Qs_19032023.pkl', 'rb') as f:
    descriptions = pickle.load(f)

# Identify the most common language in each folder containing metainfo file
# From here on `dataset` is the object stored in the wrapper's `parsed_Qs_file`
# attribute (used below like a pandas DataFrame), not the QuantletDataset itself.
dataset = dataset.parsed_Qs_file
# Relabel 'ipynb' as 'py' in the script type.
dataset.type_script = dataset.type_script.str.replace('ipynb', 'py')
# Per-folder mode of type_script, mapped back onto every row of that folder.
# NOTE(review): pd.Series.mode returns all tied values, so a folder with a tie
# may not end up with a single language label — confirm this is acceptable.
dataset['most_commen_lang'] = dataset.folder_name.map(dataset.groupby(['folder_name'])['type_script'].agg(pd.Series.mode))
dataset['most_commen_lang'] = dataset['most_commen_lang'].astype(str)

# create the additional index to merge description back to the dataset
dataset['desc_idx'] = dataset.index

In [5]:
# Distinct metainfo entries, compared by their string form.
unique_metainfo_files = dataset['metainfo_file'].astype(str).unique()

In [6]:
# Work on a copy so the helper columns below are not added to `dataset`.
tocheck_data = dataset.copy()
tocheck_data['metainfo_file'] = tocheck_data['metainfo_file'].astype(str)
tocheck_data['empty'] = tocheck_data['metainfo_file'].eq('empty').astype(int)

# Share of rows per folder whose metainfo is the literal string 'empty'.
# Bracket access is used because the column name `empty` collides with the
# DataFrame.empty property.
empty_by_folder = tocheck_data.groupby('folder_name')['empty']
n_empty_rows = tocheck_data['folder_name'].map(empty_by_folder.sum())
n_folder_rows = tocheck_data['folder_name'].map(empty_by_folder.count())
tocheck_data['empty_ratio'] = n_empty_rows / n_folder_rows

# identify the repos that do not have metainfo files at all
q_no_meta = tocheck_data.loc[tocheck_data['empty_ratio'] == 1, 'folder_name'].unique()

In [7]:
# One row per (folder, most common language) pair, without the folders that
# have no metainfo file at all.
folder_langs = dataset[['folder_name', 'most_commen_lang']].drop_duplicates().reset_index(drop=True)
Q_lang = folder_langs.loc[~folder_langs['folder_name'].isin(q_no_meta)]

In [8]:
# Split the folder table 80/20, stratified by the folder's most common language.
labelled, test = train_test_split(
    Q_lang,
    test_size=0.2,
    random_state=RS,
    stratify=Q_lang['most_commen_lang'],
)
print(labelled.shape, test.shape)

(2376, 2) (595, 2)


In [9]:
# Inspect one raw metainfo entry (row position 8374) to see how it is stored.
dataset.metainfo_file.iloc[8374]

'{\'Name of Quantlet\': \'DataExpo2013_QuachSymanzikForsgren\', \'Published in\': \'Computational Statistics, Data Expo 2013 Special Issue, Volume 31, Issue 3, 2016\', \'Description\': \'This repository contains supplementary material for the paper "Soul of the Community: An Attempt to Assess Attachment to a Community" to enable the reader to reproduce the analysis.\', \'Keywords\': \'Machine Learning, Statistical Learning, Data Mining, Predictive Analytics, Random Forests, Archetypes, Knight Foundation\', \'See also\': \'Other articles in Data Expo 2013 Special Issue, accessible at https://github.com/COSTDataExpo2013\', \'Author\': \'Anna Quach, Juergen Symanzik, Nicole Forsgren\', \'Submitted\': \'August 27, 2015\', \'Datafile\': \'knightfoundation2008sotcdata.por, knightfoundation2009sotcdata.por, knightfoundation2010sotcdata.por\', \'Main function\': \'Read_Me.R\', \'Example\': [{1: \'archetype_convex_hull_2008.png -- Fig. 7: Graphical representation of the three archetype solution

In [10]:
def get_metainfo_field(row, field='keywords'):
    """Return one field of a metainfo record stored as a Python-literal string.

    Args:
        row: String representation of a metainfo dict.
        field: Lower-case name of the field to look up; the record's string
            keys are lower-cased before the lookup.

    Returns:
        The value stored under ``field``, or the sentinel string ``'empty'``
        when ``row`` cannot be parsed, does not parse to a dict, or does not
        contain the field.
    """
    try:
        parsed = ast.literal_eval(row)
    except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
        # These are the errors literal_eval raises for malformed input; a
        # bare `except:` would also hide KeyboardInterrupt and real bugs.
        return 'empty'

    # A valid literal that is not a dict (list, number, ...) has no fields.
    if not isinstance(parsed, dict):
        return 'empty'

    # Only string keys can be lower-cased; other keys are kept untouched.
    normalized = {
        (key.lower() if isinstance(key, str) else key): value
        for key, value in parsed.items()
    }
    return normalized.get(field, 'empty')
    
def set_keywords_to_list(row):
    """Normalize a keywords value to a list of keyword strings.

    Lists are returned unchanged, the ``'empty'`` sentinel becomes ``[]``,
    and any other value is split on commas with each piece stripped of
    surrounding whitespace.
    """
    if isinstance(row, list):
        return row
    if row == 'empty':
        return []
    pieces = row.split(',')
    return [piece.strip() for piece in pieces]
    
def get_keywords_len(row):
    """Return the number of keywords in ``row``; the ``'empty'`` sentinel counts as 0."""
    return 0 if row == 'empty' else len(row)

In [11]:
# extract keywords from metainfo file ('empty' when the field is unavailable)
dataset['keywords'] = dataset.metainfo_file.apply(get_metainfo_field)

# Drop rows whose keywords value is missing (None/NaN). Rows holding the
# 'empty' sentinel are kept and turned into [] below.
# The explicit .copy() makes the filtered frame independent, so the column
# assignments that follow do not write to a slice of the old frame
# (avoids pandas' SettingWithCopyWarning).
dataset = dataset[~dataset.keywords.isna()].copy()

# keywords to list
dataset['keywords'] = dataset.keywords.apply(set_keywords_to_list)



In [12]:
# number of keywords attached to each row
dataset['keywords_n'] = dataset.keywords.map(get_keywords_len)


In [13]:
# Partition the rows of `dataset` by the folder-level split they belong to.
no_meta_ds = dataset.query('folder_name in @q_no_meta')
labelled_ds = dataset.query('folder_name in @labelled.folder_name')
test_ds = dataset.query('folder_name in @test.folder_name')

print(labelled_ds.shape, test_ds.shape)
# Row-level proportions of the labelled and test parts.
n_labelled_rows, n_test_rows = len(labelled_ds), len(test_ds)
full_shape = n_labelled_rows + n_test_rows
print(n_labelled_rows / full_shape, n_test_rows / full_shape)

(3863, 9) (968, 9)
0.7996274063340923 0.20037259366590768


In [14]:
# extract descriptions for train, test, no meta datasets
# `desc_idx` holds the index each row had when the descriptions were created,
# so it is used to look the row's description up in `descriptions`.
descriptions_labelled = [descriptions[i] for i in labelled_ds.desc_idx.values]
# Fixed: the test descriptions must come from test_ds; they were previously
# built from labelled_ds and therefore duplicated the training descriptions.
descriptions_test = [descriptions[i] for i in test_ds.desc_idx.values]
descriptions_no_meta = [descriptions[i] for i in no_meta_ds.desc_idx.values]

In [15]:
# Replace every description dict by the list of its values, for all three
# splits at once. NOTE: this rebinds the same names, so the cell cannot be
# run twice in a row (lists have no .values()).
descriptions_labelled, descriptions_test, descriptions_no_meta = (
    [list(entry.values()) for entry in split]
    for split in (descriptions_labelled, descriptions_test, descriptions_no_meta)
)

In [29]:
import torch
import numpy as np
import datasets

from transformers import (
    AutoModelForSeq2SeqLM,
    AutoTokenizer,
    Seq2SeqTrainingArguments,
    Seq2SeqTrainer,
    DataCollatorForSeq2Seq
)

import nltk
from datetime import datetime

In [18]:
# Identifier of the pretrained checkpoint; handed to `from_pretrained` for both
# the model and the tokenizer.
model_name = "sshleifer/distilbart-xsum-12-3"

In [19]:
# Load the seq2seq model and its matching tokenizer from the same checkpoint
# name so the vocabulary and the model weights stay in sync.
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)

In [20]:
# tokenization
# Length limits for the encoder input and the decoder target.
# NOTE(review): presumably token counts passed as max_length when tokenizing;
# their use is not shown in these cells — confirm, especially the very short
# decoder limit.
encoder_max_length = 256  # demo
decoder_max_length = 6

In [26]:
def flatten(idx, description_ds, keywords_ds):
    """Pair the description and the keywords found at position ``idx``.

    ``description_ds`` is indexed with ``[idx]`` and ``keywords_ds`` with
    ``.iloc[idx]``; the two values are returned in a dict under the keys
    ``"code_description"`` and ``"keywords"``.
    """
    record = {}
    record["code_description"] = description_ds[idx]
    record["keywords"] = keywords_ds.iloc[idx]
    return record

In [30]:
from datasets import Dataset

In [None]:
def list2samples(idx, description_ds, keywords_ds):
    """Collect the non-empty (document, summary) pairs found at position ``idx``.

    The entries of ``description_ds[idx]`` and ``keywords_ds.iloc[idx]`` are
    paired positionally with ``zip``; pairs whose document part has length 0
    are skipped.

    Each part may be a single string or a list of strings. A string is added
    as one item; a list is added item by item. (Previously ``list += str``
    extended the result character by character.)

    Returns:
        dict with the keys ``"document"`` and ``"summary"``, each a flat list.
    """
    documents = []
    summaries = []
    for document, summary in zip(description_ds[idx], keywords_ds.iloc[idx]):
        if len(document) == 0:
            continue
        # Wrap plain strings so extend() adds them whole instead of per character.
        documents.extend([document] if isinstance(document, str) else document)
        summaries.extend([summary] if isinstance(summary, str) else summary)
    return {"document": documents, "summary": summaries}


# NOTE(review): `data` is not defined in any cell shown above, and no
# "article"/"url" columns are created there either — this looks carried over
# from another notebook; confirm the intended input before running.
# NOTE(review): flatten and list2samples are declared with the parameters
# (idx, description_ds, keywords_ds), but they are handed to .map here without
# any extra arguments — verify that the signatures match what .map passes.
# NOTE(review): these lines rebind `dataset`, which the earlier cells use for
# the Quantlet table.
dataset = data.map(flatten, remove_columns=["article", "url"])
dataset = dataset.map(list2samples, batched=True)

# Hold out 10% for validation. No seed is passed, so the split is presumably
# not reproducible across runs — confirm.
train_data_txt, validation_data_txt = dataset.train_test_split(test_size=0.1).values()