In [None]:
import ast
import pandas as pd
import numpy as np
from pathlib import Path

In [None]:
# Input locations: the two directories that are globbed for raw CSV files
# and the instructions JSON file.
context_raw_path, response_raw_path, dolly_instructions_path = map(
    Path,
    (
        './dolly_ggltrans/context_trans_raw/context_en',
        './dolly_ggltrans/response_trans_raw/response_en/',
        './dolly_ggltrans/dolly_instruction_en_translated.json',
    ),
)

In [None]:
# read data
def _read_csv_dir(directory):
    """Read every ``*.csv`` file directly inside ``directory`` into one frame.

    Files are read in sorted path order so that the row order of the
    concatenated frame is the same on every run (``Path.glob`` itself makes
    no ordering promise).

    Parameters
    ----------
    directory : pathlib.Path
        Directory that is searched (non-recursively) for ``*.csv`` files.

    Returns
    -------
    pandas.DataFrame
        The per-file frames stacked with ``pd.concat``; each file keeps its
        own index values.

    Raises
    ------
    ValueError
        If the directory contains no CSV file. ``pd.concat`` would raise the
        same exception type, but without saying which directory was empty.
    """
    csv_files = sorted(directory.glob('*.csv'))
    if not csv_files:
        raise ValueError(f'No CSV files found in {directory}')
    return pd.concat([pd.read_csv(csv_file) for csv_file in csv_files])


context_raw_df = _read_csv_dir(context_raw_path)
response_raw_df = _read_csv_dir(response_raw_path)
dolly_instructions_df = pd.read_json(dolly_instructions_path)

In [None]:
# 'Unnamed: 0' is the label pandas gives to a header-less CSV column
# (presumably the saved row index -- confirm against the raw files).
# Rename it to 'index', use it as the frame index, and keep only the raw
# translation column of each frame.
_index_rename = {'Unnamed: 0': 'index'}

context_raw_df = context_raw_df.rename(columns=_index_rename).set_index('index')
context_raw_df = context_raw_df[['context_en_trans']]

response_raw_df = response_raw_df.rename(columns=_index_rename).set_index('index')
response_raw_df = response_raw_df[['response_en_trans']]

In [None]:
# The raw translation cells hold Python literals serialised as text; parse
# each cell back into the object it spells out, first for the context frame
# and then for the response frame.
for _frame, _column in (
    (context_raw_df, 'context_en_trans'),
    (response_raw_df, 'response_en_trans'),
):
    _frame[_column] = _frame[_column].apply(ast.literal_eval)

In [None]:
# Transpose each column of parsed pairs into two parallel sequences and
# store them as separate columns (presumably a success flag and the
# translated text -- confirm against the raw data).
_context_flags, _context_texts = zip(*context_raw_df['context_en_trans'])
context_raw_df['context_success'] = _context_flags
context_raw_df['context_pl'] = _context_texts

_response_flags, _response_texts = zip(*response_raw_df['response_en_trans'])
response_raw_df['response_success'] = _response_flags
response_raw_df['response_pl'] = _response_texts

In [None]:
# Remove the raw translation column from each frame.
context_raw_df = context_raw_df.drop('context_en_trans', axis='columns')
response_raw_df = response_raw_df.drop('response_en_trans', axis='columns')

In [None]:
# Attach the context columns and then the response columns to the
# instructions frame. Both joins are index-to-index left joins, so every
# instruction row is kept even when no matching row exists on the right.
_index_left_join = dict(left_index=True, right_index=True, how='left')

dolly_interim = dolly_instructions_df.merge(context_raw_df, **_index_left_join)
dolly_trans = dolly_interim.merge(response_raw_df, **_index_left_join)

# Drop the leftover 'Unnamed: 0' column, then normalise every text field:
# trim surrounding whitespace and turn empty values into NaN.
dolly_trans = dolly_trans.drop(columns=['Unnamed: 0'])
trans_fields = [
    'instruction_en', 'context_en', 'response_en',
    'instruction_pl', 'context_pl', 'response_pl',
]
for field in trans_fields:
    dolly_trans[field] = (
        dolly_trans[field]
        .str.strip()
        # NaN -> '' so that missing and empty values share one representation
        .fillna('')
        # a value that is exactly "Zero" or "Null" is treated as empty
        .str.replace(r'^(Zero|Null)$', '', regex=True)
        # finally mark every empty value as missing
        .replace('', np.nan)
    )

## QA
# Number of missing values in each *_en text field ...
missing_instructions_en, missing_contexts_en, missing_responses_en = (
    dolly_trans[column].isna().sum()
    for column in ('instruction_en', 'context_en', 'response_en')
)

# ... and in each *_pl text field.
missing_instructions_pl, missing_contexts_pl, missing_responses_pl = (
    dolly_trans[column].isna().sum()
    for column in ('instruction_pl', 'context_pl', 'response_pl')
)

# How many more values are missing in the *_pl column than in its *_en
# counterpart (0 means the two columns have equally many gaps).
missing_change_instr = missing_instructions_pl - missing_instructions_en
missing_change_context = missing_contexts_pl - missing_contexts_en
missing_change_response = missing_responses_pl - missing_responses_en

print(missing_change_instr, missing_change_context, missing_change_response)

# For every success-flag column print its name, the sum of the flags, and
# that sum divided by the total number of rows in the merged frame.
_row_count = len(dolly_trans)
for field in ('instr_trans_success', 'context_success', 'response_success'):
    print(field)
    _flag_total = dolly_trans[field].sum()
    print(_flag_total)
    print(_flag_total / _row_count)

# Save the dataset: write the merged frame once as JSON and once as CSV.
dolly_trans.to_json(path_or_buf='./dolly_ggltrans/dolly_ggltranslated.json')
dolly_trans.to_csv(path_or_buf='./dolly_ggltrans/dolly_ggltranslated.csv')