In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split
import numpy as np
from statistics import mean
import regex as re
import csv
import os
import glob
from itertools import repeat

In [3]:
def get_file_names(filename):
    """Return every path matching the glob pattern ``filename``, sorted.

    Args:
        filename: A glob pattern such as ``'some/dir/*.txt'``.

    Returns:
        list[str]: The matching paths in ascending (lexicographic) order;
        an empty list when nothing matches.
    """
    return sorted(glob.glob(filename))

In [4]:
def get_split_data(filename):
    """Read a text file and return its lines.

    Args:
        filename: Path of the text file to read.

    Returns:
        list[str]: The file's lines in order. Each line keeps its trailing
        newline character, exactly as ``readlines()`` yields it.
    """
    # The context manager closes the handle even when reading raises.
    with open(filename, 'r') as handle:
        return handle.readlines()

In [10]:
def get_titles(title_files):
    """Read the full text of every title file.

    Args:
        title_files: Iterable of paths to title text files.

    Returns:
        list[str]: One entry per path, in input order, holding the file's
        complete contents (no stripping is applied).
    """
    title_list = []
    for path in title_files:
        # Close each file as soon as it is read instead of leaking the handle.
        with open(path, 'r') as handle:
            title_list.append(handle.read())
    return title_list

In [5]:
def format_single_data(data_files, title_files):
    """Turn two-column chart CSVs into one sentence-style string per chart.

    Each data file is paired positionally with a title file. The first CSV
    row supplies the x-axis and y-axis labels; every following row becomes
    the sentence ``'The <title> for <x_axis> <x> is <y> <y_axis>'``. The
    sentences of one chart are joined with ``'. '``.

    Args:
        data_files: Paths of CSV files whose rows have exactly two fields.
        title_files: Paths of the matching title text files, same order.

    Returns:
        list[str]: One formatted string per (data file, title file) pair.

    Raises:
        IndexError: If a CSV file contains no rows (no header to read).
        ValueError: If a CSV row does not have exactly two fields.
    """
    input_format_list = []
    for data_path, title_path in zip(data_files, title_files):
        # Both files are closed as soon as they are read.
        with open(title_path, 'r') as title_handle:
            title = title_handle.read()

        # newline='' is what the csv module documents for files given to reader().
        with open(data_path, 'r', newline='') as data_handle:
            rows = [(x, y) for x, y in csv.reader(data_handle, delimiter=',')]

        # The header row carries the axis labels; the rest are data points.
        x_axis, y_axis = rows[0]

        # csv.reader always yields strings, so no str() conversion is needed.
        sentences = [
            f'The {title} for {x_axis} {x} is {y} {y_axis}'
            for x, y in rows[1:]
        ]
        input_format_list.append('. '.join(sentences))
    return input_format_list

In [6]:
def check_str(x):
    """Return ``x`` unchanged when it is already a string, else ``str(x)``."""
    return x if isinstance(x, str) else str(x)

In [7]:
def format_multi_data(data_file,title_file):
    """Turn multi-column chart CSVs into one sentence-style string per chart.

    Each data file is paired positionally with a title file. For every CSV
    row, the first column yields ``'The <title> for <column> <value>'`` and
    each remaining column yields ``'The <title> for <column> is <value>'``.
    All fragments of one chart are joined with ``'. '``.

    Args:
        data_file: Paths of CSV files readable by ``pandas.read_csv``.
        title_file: Paths of the matching title text files, same order.

    Returns:
        list[str]: One formatted string per (data file, title file) pair.
    """
    input_format_list = []
    # Distinct names throughout: the loop variables holding the paths are
    # never rebound by the inner loops.
    for data_path, title_path in zip(data_file, title_file):
        with open(title_path, 'r') as title_handle:
            title = title_handle.read()

        df = pd.read_csv(data_path)
        columns = list(df.columns)
        # One list of values per column, transposed below into per-row tuples.
        column_values = [list(df[name]) for name in columns]

        sentences = []
        for row in zip(*column_values):
            # The first (column, value) pair identifies the row; the others
            # are reported as measurements.
            (key_name, key_value), *measurements = zip(columns, row)
            sentences.append('The ' + title + ' for ' + str(key_name) + ' ' + str(key_value))
            for name, value in measurements:
                sentences.append('The ' + title + ' for ' + str(name) + ' is ' + str(value))
        input_format_list.append('. '.join(sentences))

    return input_format_list

In [8]:
def get_captions(caption_files):
    """Read the full text of every caption file.

    Args:
        caption_files: Iterable of paths to caption text files.

    Returns:
        list[str]: One entry per path, in input order, holding the file's
        complete contents (no stripping is applied).
    """
    summaries_list = []
    for path in caption_files:
        # Close each file as soon as it is read instead of leaking the handle.
        with open(path, 'r') as handle:
            summaries_list.append(handle.read())
    return summaries_list

In [15]:
# Single-series charts: collect the title, data and caption files in sorted order.
single_titles_files = get_file_names('original/titles/*.txt')
single_data_files = get_file_names('original/data/*.csv')
single_caption_files = get_file_names('original/captions/*.txt')

# Rows are paired purely by sorted position and zip() silently truncates to the
# shortest input, so a missing file would misalign titles, data and captions.
# Fail loudly instead of building a corrupted frame.
if not (len(single_titles_files) == len(single_data_files) == len(single_caption_files)):
    raise ValueError(
        'Mismatched single-column inputs: '
        f'{len(single_titles_files)} titles, '
        f'{len(single_data_files)} data files, '
        f'{len(single_caption_files)} captions'
    )

single_titles = get_titles(single_titles_files)
formated_data = format_single_data(single_data_files, single_titles_files)
single_captions = get_captions(single_caption_files)
df_single = pd.DataFrame(
    list(zip(single_titles, formated_data, single_captions)),
    columns=["Title", "Data", "Summaries"],
)

In [23]:
# Multi-column charts: collect the title, data and caption files in sorted order.
multi_titles_files = get_file_names('original/multiColumn/titles/*.txt')
multi_data_files = get_file_names('original/multiColumn/data/*.csv')
multi_caption_files = get_file_names('original/multiColumn/captions/*.txt')

# Rows are paired purely by sorted position and zip() silently truncates to the
# shortest input, so a missing file would misalign titles, data and captions.
# Fail loudly instead of building a corrupted frame.
if not (len(multi_titles_files) == len(multi_data_files) == len(multi_caption_files)):
    raise ValueError(
        'Mismatched multi-column inputs: '
        f'{len(multi_titles_files)} titles, '
        f'{len(multi_data_files)} data files, '
        f'{len(multi_caption_files)} captions'
    )

multi_titles = get_titles(multi_titles_files)
multi_formated_data = format_multi_data(multi_data_files,multi_titles_files)
multi_captions = get_captions(multi_caption_files)
df_multi = pd.DataFrame(
    list(zip(multi_titles, multi_formated_data, multi_captions)),
    columns=["Title", "Data", "Summaries"],
)

In [26]:
# Stack the single- and multi-column frames under a fresh 0..n-1 index.
df_all = pd.concat([df_single, df_multi], ignore_index=True)

In [27]:
# Load the original summaries of each split, one line per example.
train_summary, val_summary, test_summary = map(
    get_split_data,
    (
        'preprocessed/train/trainOriginalSummary.txt',
        'preprocessed/valid/validOriginalSummary.txt',
        'preprocessed/test/testOriginalSummary.txt',
    ),
)

In [28]:
# Load the titles of each split, one line per example.
train_titles, val_titles, test_titles = map(
    get_split_data,
    (
        'preprocessed/train/trainTitle.txt',
        'preprocessed/valid/validTitle.txt',
        'preprocessed/test/testTitle.txt',
    ),
)

In [29]:
# Concatenate the three splits in train -> valid -> test order so that
# summaries and titles stay aligned by position.
summaries_pro = [*train_summary, *val_summary, *test_summary]
titles_pro = [*train_titles, *val_titles, *test_titles]

In [30]:
# Pair each preprocessed title with its summary, row by row.
df_pro = pd.DataFrame(
    list(zip(titles_pro, summaries_pro)),
    columns=["Title", "Summaries_processed"],
).reset_index(drop=True)

In [31]:
# Keep only the charts whose title also appears in the preprocessed splits.
df_combined = pd.merge(
    df_all,
    df_pro,
    on='Title',
    how='inner',
    suffixes=['', '_'],
    indicator=True,
)

In [32]:
# Drop the join key, the raw summaries and the merge indicator.
# `columns=` replaces the positional axis argument, which pandas deprecated
# (it emits a FutureWarning) and no longer accepts in 2.x.
df_combined = df_combined.drop(columns=['Title', 'Summaries', '_merge'])

  df_combined = df_combined.drop(['Title','Summaries','_merge'],1)


In [45]:
# Fractions of the combined data assigned to each split.
train_ratio, validation_ratio, test_ratio = 0.70, 0.15, 0.15

In [46]:
# Fixed seed so the split is identical on every Restart & Run All.
SPLIT_SEED = 42

# Build the hold-out fraction by adding the two ratios instead of using
# `1 - train_ratio`: 1 - 0.70 evaluates to 0.30000000000000004, and that
# excess can change the rounded hold-out size by one row.
holdout_ratio = validation_ratio + test_ratio

# First carve off the hold-out set, then divide it between validation and test.
train, holdout = train_test_split(
    df_combined, test_size=holdout_ratio, random_state=SPLIT_SEED
)
val, test = train_test_split(
    holdout, test_size=test_ratio / holdout_ratio, random_state=SPLIT_SEED
)

In [47]:
# Write each split as a tab-separated file without the index column.
# The validation file previously had its suffix and extension transposed
# ('val_c2t_small.csv_nli'); it now follows the same pattern as the others.
train.to_csv('train_c2t_small_nli.csv', sep='\t', index=False)
val.to_csv('val_c2t_small_nli.csv', sep='\t', index=False)
test.to_csv('test_c2t_small_nli.csv', sep='\t', index=False)