In [42]:
import pandas as pd
from sklearn.model_selection import train_test_split
import numpy as np
from statistics import mean
import regex as re
import csv
import os
import glob

In [43]:
def get_file_names(data_file, caption_file,title_file):
    """Expand three glob patterns into sorted lists of matching paths.

    Parameters
    ----------
    data_file, caption_file, title_file : str
        Glob patterns (e.g. ``'data/*.csv'``) handed to ``glob.glob``.

    Returns
    -------
    tuple of (list, list, list)
        The matches for the data, caption and title patterns, in that order.
        Each list is sorted with ``sorted``, i.e. by plain string comparison
        of the paths, and is empty when its pattern matches nothing.
    """
    patterns = (data_file, caption_file, title_file)
    # One sorted listing per pattern; callers pair the lists up by position.
    return tuple(sorted(glob.glob(pattern)) for pattern in patterns)

In [44]:
def format_single_data(data_files, encoding=None):
    """Linearise two-column CSV files into one text line per file.

    The first row of each file supplies the x and y axis labels; every later
    row supplies one ``x y`` value pair. The result for a file looks like
    ``' x-y labels <x> - <y>, x-y values <x1> <y1> , <x2> <y2>'``.

    Parameters
    ----------
    data_files : iterable of str
        Paths of the CSV files to read, processed in the given order.
    encoding : str, optional
        Text encoding passed to ``open``. ``None`` (the default) keeps the
        platform's default encoding, as before.

    Returns
    -------
    list of str
        One formatted string per input path, in input order.

    Raises
    ------
    ValueError
        If any row does not contain exactly two fields (this includes blank
        lines, which the csv reader yields as empty rows).
    IndexError
        If a file contains no rows at all.
    """
    input_format_list = list()
    for path in data_files:
        # `with` closes the handle even if parsing fails; newline='' lets the
        # csv module do its own line-ending handling, as its docs recommend.
        with open(path, 'r', newline='', encoding=encoding) as f:
            rows = [(x, y) for x, y in csv.reader(f, delimiter=',')]

        # First row holds the axis labels, the rest are the data points.
        x_axis, y_axis = rows[0]
        res = [' '.join(point) for point in rows[1:]]

        input_format = ' x-y labels ' + x_axis + ' - '+ y_axis + ', x-y values ' + ' , '.join(res)
        input_format_list.append(input_format)
    return input_format_list

In [45]:
def format_multi_data(data_files):
    """Linearise multi-column CSV files into one text line per file.

    Each file is loaded with ``pd.read_csv``. The column names become the
    label part and every data row becomes a space-separated group of values:
    ``' labels <c1> - <c2> - ... values <r1c1> <r1c2> ... , <r2c1> ...'``.

    Parameters
    ----------
    data_files : iterable of str
        Paths of the CSV files to read, processed in the given order.

    Returns
    -------
    list of str
        One formatted string per input path, in input order.
    """
    input_format_list = []
    for csv_path in data_files:
        frame = pd.read_csv(csv_path)
        # Pull every column out as a plain list, then transpose to rows.
        column_values = [list(frame[name]) for name in frame.columns]
        row_texts = [
            ' '.join(str(cell) for cell in row)
            for row in zip(*column_values)
        ]
        label_text = ' - '.join(list(frame.columns))
        input_format_list.append(
            ' labels ' + label_text + ' values ' + ' , '.join(row_texts)
        )

    return input_format_list

In [46]:
def get_captions(caption_files, encoding=None):
    """Read every caption file and return the contents as a list.

    Parameters
    ----------
    caption_files : iterable of str
        Paths of the caption text files, read in the given order.
    encoding : str, optional
        Text encoding passed to ``open``. ``None`` (the default) keeps the
        platform's default encoding, as before.

    Returns
    -------
    list of str
        The full text of each file, exactly as read (nothing is stripped),
        in input order.
    """
    summaries_list = list()
    for path in caption_files:
        # `with` guarantees the handle is closed before the next file opens.
        with open(path, 'r', encoding=encoding) as f:
            summaries_list.append(f.read())
    return summaries_list

In [47]:
def get_titles(title_files, encoding=None):
    """Read every title file and return the contents as a list.

    Parameters
    ----------
    title_files : iterable of str
        Paths of the title text files, read in the given order.
    encoding : str, optional
        Text encoding passed to ``open``. ``None`` (the default) keeps the
        platform's default encoding, as before.

    Returns
    -------
    list of str
        The full text of each file, exactly as read (nothing is stripped),
        in input order.
    """
    title_list = list()
    for path in title_files:
        # `with` guarantees the handle is closed before the next file opens.
        with open(path, 'r', encoding=encoding) as f:
            title_list.append(f.read())
    return title_list

In [48]:
def combine_title_data(data_formated, titles):
    """Prefix each formatted data string with its title.

    Parameters
    ----------
    data_formated : iterable of str
        Linearised data strings.
    titles : iterable of str
        Titles, paired with ``data_formated`` by position.

    Returns
    -------
    list of str
        ``'<title> <data>'`` for each pair. Pairing uses ``zip``, so the
        result is as long as the shorter of the two inputs.
    """
    return [
        ' '.join([title, formatted])
        for title, formatted in zip(titles, data_formated)
    ]

In [49]:
# Build the single-series (two-column) dataset.
# The three listings are each sorted and then paired purely by position.
# NOTE(review): assumes data/, captions/ and titles/ hold one identically
# named file per chart, so that position i refers to the same chart in all
# three lists — TODO confirm; zip() below silently stops at the shortest list.
data_files, caption_files,title_files = get_file_names('data/*.csv','captions/*.txt','titles/*.txt')
data_formated_list = format_single_data(data_files)
summaries_list = get_captions(caption_files)
titles_list = get_titles(title_files)
# Model input = title followed by the linearised data.
data_title_list = combine_title_data(data_formated_list,titles_list)
# Two columns: "Data" (title + data text) and "Summaries" (caption text).
df_data_single = pd.DataFrame(list(zip(data_title_list,summaries_list)), columns = ["Data", "Summaries"])

In [50]:
# Build the multi-column dataset from the multiColumn/ folders.
# This rebinds the intermediate names (data_files, summaries_list, ...), so
# after this cell they describe the multi-column files.
# NOTE(review): assumes the three multiColumn/ folders hold one identically
# named file per chart so that the sorted lists line up by position — TODO
# confirm; zip() below silently stops at the shortest list.
data_files, caption_files,title_files = get_file_names('multiColumn/data/*.csv','multiColumn/captions/*.txt','multiColumn/titles/*.txt')
data_formated_list = format_multi_data(data_files)
summaries_list = get_captions(caption_files)
titles_list = get_titles(title_files)
# Model input = title followed by the linearised data.
data_title_list = combine_title_data(data_formated_list,titles_list)
# Two columns: "Data" (title + data text) and "Summaries" (caption text).
df_data_multi = pd.DataFrame(list(zip(data_title_list,summaries_list)), columns = ["Data", "Summaries"])

In [32]:
# Preview the multi-column frame. Indexing with [0] would be a lookup of a
# column labelled 0, which does not exist here (the columns are "Data" and
# "Summaries") and raises KeyError, so display the frame itself.
df_data_multi

Unnamed: 0,Data,Summaries
0,Resident population of the United States by ra...,This graph shows the population of the U.S. by...
1,Number of the coronavirus ( COVID-19 ) cases a...,The south-western Indian state of Maharashtra ...
2,Average number of deaths from COVID-19 over th...,The seven-day average number of COVID-19 death...
3,Distribution of new confirmed coronavirus ( CO...,"Out of 6,937 new confirmed COVID-19 patients i..."
4,Preferred mobile messenger apps of smartphone ...,This statistic presents the preferred mobile m...
...,...,...
6938,Number of electoral votes from Georgia designa...,"As of 2020 , Georgia has taken part in every p..."
6939,"Employment rate in Sweden from 2009 to 2019 , ...","In 2010 , the employment rate in Sweden was do..."
6940,Value of motor vehicles imported and exported ...,The value of motor vehicle trading reached its...
6941,Digital and print advertising revenue of U.S. ...,The statistic shows annual digital and print a...


In [51]:
# Stack the single-series rows on top of the multi-column rows. Each input
# keeps its own row labels here, so labels repeat until the index is reset.
df_all = pd.concat([df_data_single,df_data_multi])

In [52]:
# Replace the repeated row labels with a fresh 0..n-1 index; drop=True
# discards the old labels instead of keeping them as a column.
df_all = df_all.reset_index(drop=True)

In [57]:
# Target share of df_all for each split; the three values sum to 1.0.
train_ratio = 0.70       # used for the first split (train vs. the rest)
validation_ratio = 0.15  # used, with test_ratio, to divide the rest
test_ratio = 0.15

In [59]:
# First split: keep `train_ratio` of the rows for training. `test` holds the
# whole held-out remainder (validation + test) at this point.
# A fixed random_state makes the shuffle, and therefore the exported files,
# reproducible across kernel restarts.
train, test = train_test_split(df_all, test_size=1 - train_ratio, random_state=42)

In [61]:
# Display the training split for a visual check.
train

Unnamed: 0,Data,Summaries
30230,Gold medal winning times in the Men 's and Wom...,The men 's 800m is classified as a middle-dist...
32475,Percentage of coronavirus ( COVID-19 ) deaths ...,"As of June 23 , 2020 , 54 percent of deaths du..."
31168,Mexico : Distribution of the gross domestic pr...,This statistic shows the distribution of the g...
15293,Value of U.S. orders for metal forming and fab...,This chart shows the total value of U.S. order...
5757,Bangladesh : Gross domestic product ( GDP ) in...,The gross domestic product ( GDP ) of Banglade...
...,...,...
13993,Leading ten analgesic drugs dispensed in Engla...,This statistic displays the leading ten analge...
3643,Monthly cost of a Rappi Prime membership in La...,"The Colombian unicorn company , Rappi , offers..."
29626,Breakdown of U.S. online advertising revenue f...,The statistic above shows a breakdown of U.S. ...
33208,Number of user data requests issued to Faceboo...,"In the first half of 2020 , Facebook received ..."


In [62]:
# Second split: divide the held-out rows in `test` into validation and test
# parts according to validation_ratio and test_ratio.
# A fixed random_state makes the split reproducible.
# NOTE: this cell rebinds `test` to a subset of itself, so running it a
# second time without re-running the first split shrinks the test set again.
val, test = train_test_split(test, test_size=test_ratio/(test_ratio + validation_ratio), random_state=42)

In [63]:
# Display the validation split for a visual check.
val

Unnamed: 0,Data,Summaries
12342,Female victims of homicide in England and Wale...,This statistic shows female victims ( 16 years...
24164,Change in market value of the United States ho...,The statistic shows the change in the U.S. hot...
25232,Number of hostages taken by terrorists in 2010...,The statistic shows the number of hostages tak...
32019,Gross value added of the creative industries i...,This statistic illustrates the gross value add...
15566,Venezuela : Life expectancy at birth from 2008...,This statistic shows the life expectancy at bi...
...,...,...
29912,Net sales of Tiffany and Co. worldwide from 20...,This timeline shows the net sales of Tiffany a...
8647,U.S. petroleum imports from Russia from 2000 t...,This statistic represents petroleum imports fr...
10061,Availability of debt for financing development...,This statistic shows the change in availabilit...
18252,Liechtenstein : Gross domestic product ( GDP )...,The statistic shows gross domestic product ( G...


In [64]:
# Display the test split for a visual check.
test

Unnamed: 0,Data,Summaries
22155,Most popular news brands in the United States ...,This statistic gives information on the most p...
860,New York Giants all-time rushing leaders from ...,The statistic shows New York Giants players wi...
22491,Australia : Employment from 2010 to 2020 ( in ...,This statistic shows the employment in Austral...
23491,"Number of employees at Beiersdorf AG in 2019 ,...",This statistic shows the number of people empl...
7139,Urban population in Cambodia from 2011 to 2020...,"In 2020 , the urban population in Cambodia sto..."
...,...,...
8645,Forecast impact of the novel coronavirus ( COV...,"As of November 27 , 2020 , the gross domestic ..."
33264,Fintech and big-data employment figures in the...,This statistic shows the estimated employment ...
5130,Countries most affected by banking trojans and...,This statistic shows a ranking of the countrie...
7406,Unemployment rateÂ in CuraÃ§ao from 2014 to 20...,This statistic displays the unemployment rate ...


In [65]:
# Write each split to disk. The files are tab-separated despite the .csv
# extension, and the row index is not written.
for split_frame, out_path in (
    (train, 'train_c2t_big.csv'),
    (val, 'val_c2t_big.csv'),
    (test, 'test_c2t_big.csv'),
):
    split_frame.to_csv(out_path, sep='\t', index=False)