In [1]:
import pandas as pd
import random
import os

In [2]:
# Load the paper-level table and the per-image annotation table.
paper_data = pd.read_csv('citeDataset.csv', encoding = 'utf-8')
img_data = pd.read_csv('onlineAnnoFormal.csv', encoding = 'utf-8')

# Palette from which one colour is drawn at random for every image row.
color = [
    '#a6cee3', '#1f78b4', '#b2df8a', '#33a02c', '#fb9a99',
    '#e31a1c', '#fdbf6f', '#ff7f00', '#cab2d6', '#6a3d9a',
]

# Group the image rows by paper DOI. The first image of a DOI stores its raw
# cell values; every further image of the same DOI is appended to the existing
# values as text, separated by '; '.
imgDic = {}
for row in range(len(img_data)):
    doi = img_data.loc[row, 'Paper DOI']
    thumb = img_data.loc[row, 'thumb_url']
    fname = img_data.loc[row, 'filename']
    picked = str(color[random.randint(0, 9)])

    entry = imgDic.get(doi)
    if entry is None:
        imgDic[doi] = {
            'thumb_url': thumb,
            'filename': fname,
            'dominant_color': picked,
        }
    else:
        entry['thumb_url'] = str(entry['thumb_url']) + '; ' + str(thumb)
        entry['filename'] = str(entry['filename']) + '; ' + str(fname)
        entry['dominant_color'] = str(entry['dominant_color']) + '; ' + picked

# Copy the grouped image fields onto each paper row whose DOI has images;
# rows without a match are left untouched.
for row in range(len(paper_data)):
    entry = imgDic.get(paper_data.loc[row, 'Paper DOI'])
    if entry is None:
        continue
    paper_data.loc[row, 'img_name'] = entry['filename']
    paper_data.loc[row, 'img_thumb_url'] = entry['thumb_url']
    paper_data.loc[row, 'img_dcolor'] = entry['dominant_color']

paper_data.to_csv('result.csv', index=False, encoding='utf-8')

###### Create the image dataset meta information

1. Traverse the directories that contain the images for each year, 1990-2019.
2. For each year, get the list of all images, sorted by index.
3. For each image, find its meta information in the txt file and the paper dataset.
4. Create the image dataset.

> structure of image dataset

The image dataset contains the following attributes:

* filename
* Conference
* Year
* Number of Figures
* Paper_Title
* Paper_DOI
* Author
* paper_url
* Paper type  //J,C,M,S
* Keywords Author
* thumb_url
* url
* image_size    //size of visualization
* vis_type      //14 basic visualization type
* image_proportion   //the proportion of a vis in a paper page
* image_dcolor    //dominant color of a visualization
* image_colordis  //color distribution of an image
* year_index      //index of image in each year

# Load the paper table from 'citeData.csv'. This rebinds `paper_data`, which the
# first cell loaded from 'citeDataset.csv'.
paper_data = pd.read_csv('citeData.csv', encoding = 'utf-8')

In [5]:
def parseImageMeta(filename, year):
    '''
    Parse the per-image annotations stored in one annotation text file.

    Only lines that (after stripping leading whitespace) start with 'Vis',
    'InfoVis', 'SciVis' or 'VAST' and contain the marker 'ImgIdx' are used.
    Such a line is split on 'ImgIdx': the first piece describes the paper and
    every following piece describes one image, e.g.

        'Vis, Vis-10.1109-VISUAL.1990.146419-p449-C_2, ImageSize,1701,2202,'
        ',381,firstPage,449,Size,524,417,0.058337,(LTRBType),920,193,1444,610,6,'

    Parameters
    ----------
    filename : path of the annotation text file to read.
    year : value stored in the 'Year' column; its str() form is also embedded
        in the generated 'thumb_url' and 'url' values.

    Returns
    -------
    pandas.DataFrame with one row per image and the columns 'year-index',
    'filename', 'sizeW', 'sizeH', 'image_proportion', 'vis_type',
    'Conference', 'Year', 'pageNum', 'Paper DOI', 'thumb_url', 'url'.
    All parsed fields are kept as strings. The frame is empty (no columns)
    when no line matches.

    Raises
    ------
    OSError : if the file cannot be opened (e.g. FileNotFoundError).
    IndexError : if a matching line has fewer fields than the layout above.
    '''
    # `with` closes the handle on every path. The earlier try/finally checked
    # `f` in its cleanup code, so a failed open() surfaced as an
    # UnboundLocalError instead of the real I/O error.
    with open(filename, 'r') as f:
        content = f.readlines()

    base_url = "http://web.cse.ohio-state.edu/~li.8950/data/image/vispubimg/"
    table_list = []  # one dict of meta information per image

    for item in content:
        row = item.lstrip()
        # keep only Vis-paper lines that actually list images
        if not row.startswith(('Vis', 'InfoVis', 'SciVis', 'VAST')):
            continue
        if 'ImgIdx' not in row:
            continue

        # imgInfo[0]: paper description, imgInfo[n]: fields of the nth image
        imgInfo = row.split('ImgIdx')

        # second ', '-separated token, e.g.
        # 'Vis-10.1109-VISUAL.1990.146419-p449-C_2', split on '-'
        paperInfo = imgInfo[0].split(', ')[1].split('-')
        conf = paperInfo[0]
        # text after the first two characters of the fifth token ('C_2' -> '2')
        page_number = paperInfo[4][2:]
        paperDoi = str(paperInfo[1]) + '/' + str(paperInfo[2])

        for img_fields in imgInfo[1:]:
            img = img_fields.split(',')
            firstPage = img[3]
            image_name = str(conf) + '.' + str(firstPage) + '.' + str(img[1]) + '.jpg'
            table_list.append({
                'year-index': img[1],
                'filename': image_name,
                'sizeW': img[5],
                'sizeH': img[6],
                'image_proportion': img[7],
                'vis_type': img[13],
                'Conference': conf,
                'Year': year,
                'pageNum': page_number,
                'Paper DOI': paperDoi,
                'thumb_url': base_url + str(year) + 'thumb/' + str(image_name),
                'url': base_url + str(year) + '/' + str(image_name),
            })

    return pd.DataFrame(table_list)

def createPaperDic(paperData):
    '''
    Index the paper table by DOI.

    Returns a dict that maps each value of the 'Paper DOI' column to a dict
    with the keys 'Author', 'paper_type', 'paper_title', 'keywords' and
    'paper_url'. Rows are addressed by the labels 0..len(paperData)-1, and a
    DOI that occurs more than once keeps the values of its last row.
    '''
    # output key -> source column in paperData
    source_columns = (
        ('Author', 'Author Names'),
        ('paper_type', 'Paper type: C=conference paper, J = journal paper, M=miscellaneous (capstone, keynote, VAST challenge, panel, poster, ...)'),
        ('paper_title', 'Paper Title'),
        ('keywords', 'Author Keywords'),
        ('paper_url', 'Link'),
    )

    paperDic = {}
    for row in range(len(paperData)):
        doi = paperData.loc[row, 'Paper DOI']
        paperDic[doi] = {
            key: paperData.loc[row, column] for key, column in source_columns
        }
    return paperDic
    
def concatPaperMeta(imgData, paperDic):
    '''
    Attach paper-level fields to every image row.

    For each row label 0..len(imgData)-1 the row's 'Paper DOI' is looked up in
    paperDic and the columns 'Paper type', 'Paper Title', 'Author',
    'Keywords Author' and 'paper_url' are written from that entry. imgData is
    modified in place and also returned. A DOI that is missing from paperDic
    raises KeyError.
    '''
    # target column in imgData -> key in the per-paper dict
    targets = (
        ('Paper type', 'paper_type'),
        ('Paper Title', 'paper_title'),
        ('Author', 'Author'),
        ('Keywords Author', 'keywords'),
        ('paper_url', 'paper_url'),
    )

    for row in range(len(imgData)):
        paper = paperDic[imgData.loc[row, 'Paper DOI']]
        for column, key in targets:
            imgData.loc[row, column] = paper[key]
    return imgData

# Parse the annotation file of every year that has one and stack the per-year
# tables. The frames are collected first and concatenated once: the result no
# longer depends on the 1990 file being present (previously `whole_df` was only
# initialised for 1990), and the table is not re-copied for every year.
year_frames = []
for year in range(1990, 2020):
    meta_path = 'images/' + str(year) + '/TextImagePagesAndAnno.' + str(year) + '.txt'
    if not os.path.exists(meta_path):
        continue
    df = parseImageMeta(meta_path, str(year))
    year_frames.append(df)
    print(year)  # progress: one line per year that was found and parsed

if not year_frames:
    # fail with a clear message instead of an error from the code further down
    raise FileNotFoundError('no images/<year>/TextImagePagesAndAnno.<year>.txt file found for 1990-2019')

whole_df = pd.concat(year_frames, ignore_index=True)

# Attach the paper-level fields; concatPaperMeta writes into whole_df and
# returns it.
paperDic = createPaperDic(paper_data)
concat_res = concatPaperMeta(whole_df, paperDic)

1990
1995
2000
2006
2010
2015
2019


In [6]:
# Save the merged image/paper table to 'newImagedata.csv' without the index column.
concat_res.to_csv('newImagedata.csv',index=False)

In [None]:
# Path of the 2015 annotation file, built with the same naming pattern as the
# per-year loop. NOTE(review): nothing visible after this line uses the value —
# looks like leftover scratch work; confirm before removing.
meta_path = 'images/' + str(2015) + '/TextImagePagesAndAnno.' + str(2015) + '.txt'