Building the dataset of numerical data

In [4]:
### Import modules
import pandas as pd
import numpy as np

### Import self-made functions
from CODE.features.length_title import length_title
from CODE.features.field_variety import field_variety2
#from CODE.features.field_variety import field_variety
from CODE.features.team_size import team_size
from CODE.features.topic_variety import topics_variety
from CODE.features.venue_frequency import venue_frequency

In [5]:
### Get the full train set:
data = pd.read_json('DATA/train-1.json')   # Numerical columns: 'year', 'references', 'citations'

### Push the identifier + numerical columns to X and the outcome to y.
# Every row is wanted, so select by column name instead of the label slice
# `0:end+1`, whose result depended on the index labels of `data`.
# The explicit .copy() makes num_X / num_y independent of `data`, so adding
# feature columns to num_X later can neither touch `data` nor be treated as
# an assignment on a slice of it.
end = len(data)  # number of rows; kept because later cells may still refer to it
num_X = data[['doi', 'year', 'references']].copy()
num_y = data['citations'].copy()

print("X type:", type(num_X), "X shape:", num_X.shape)
print("y type:", type(num_y), "y shape:", num_y.shape)

X type: <class 'pandas.core.frame.DataFrame'> X shape: (9658, 3)
y type: <class 'pandas.core.series.Series'> y shape: (9658,)


In [8]:
### Compute the additional numerical features from the full train set

# Title length per paper: a dict, looked up by 'doi' when joined to num_X.
title_len = length_title(data)

# Field variety per paper: a dict, looked up by 'doi' when joined to num_X.
field_var = field_variety2(data)

# Team size: a pandas Series (joined to num_X by index).
team_sz = team_size(data)

# Topic variety: a pandas Series (joined to num_X by index).
topic_var = topics_variety(data)

# Venue frequency: a dict; per the original note it maps venue -> count
# (TODO confirm; it is not joined to num_X in the cells shown here).
venue_freq = venue_frequency(data)

In [9]:
# Report the container type of every computed feature, in creation order,
# so it is clear which ones are joined by index and which by 'doi' lookup.
for feature in (title_len, field_var, team_sz, topic_var, venue_freq):
    print(type(feature))

<class 'dict'>
<class 'dict'>
<class 'pandas.core.series.Series'>
<class 'pandas.core.series.Series'>
<class 'dict'>


In [20]:
# Note (from StackExchange): never grow a DataFrame piece by piece. Each append
# creates a new DataFrame in memory instead of reusing the existing one, so it is
# cheaper in both memory and time to collect the data in a Python list and
# convert it to a DataFrame once at the end.

### Join the Series features to num_X (pandas aligns them on the row index)
for column, values in (('team_size', team_sz), ('topic_variety', topic_var)):
    num_X[column] = values

### Join the dictionary features to num_X by looking up each row's 'doi'
for column, lookup in (('title_length', title_len), ('field_variety', field_var)):
    num_X[column] = num_X['doi'].map(lookup)

num_X

Unnamed: 0,doi,year,references,team_size,topic_variety,title_len,title_length,field_variety
0,10.3115/v1/P15-1039,2015.0,39,6,1,10,10,1
1,10.18653/v1/2020.eval4nlp-1.12,2020.0,44,5,0,18,18,1
2,10.18653/v1/W17-3516,2017.0,30,3,5,8,8,1
3,10.18653/v1/S17-2160,2017.0,11,2,6,13,13,1
4,10.18653/v1/W15-2205,2015.0,26,2,23,5,5,1
...,...,...,...,...,...,...,...,...
9653,10.3115/v1/W14-0202,2014.0,25,4,11,10,10,1
9654,10.26615/978-954-452-058-8_001,2019.0,18,4,3,7,7,1
9655,10.18653/V1/2021.SMM4H-1.16,2021.0,12,2,0,16,16,1
9656,10.18653/v1/2021.case-1.22,2021.0,15,4,0,15,15,4


In [None]:
# Build a dictionary keyed by DOI whose value is the list of that paper's
# numerical features, in the order of `feature_dicts`:
#   num_vars_dict[doi] == [title_length, field_variety]
import copy  # no longer needed by this cell; retained in case later cells rely on it

# The earlier version looped over num_vars_dict itself (appending every entry to
# itself), called .append() on the stored values, and raised KeyError for any DOI
# that was missing from title_len. Building each list in one pass keeps the
# positions aligned; a DOI absent from one of the inputs gets None in that slot.
# NOTE(review): assumes the dict values are scalar counts, as the title_length /
# field_variety columns displayed above suggest - confirm against the feature code.
feature_dicts = (title_len, field_var)  # you can list as many input dicts as you want here
num_vars_dict = {
    doi: [feature.get(doi) for feature in feature_dicts]
    for doi in title_len
}

# Preview a handful of entries instead of dumping the whole dictionary
dict(list(num_vars_dict.items())[:5])
