Building the dataset of numerical data

In [1]:
# Show the complete text of every DataFrame cell instead of truncating long strings
import pandas as pd
pd.options.display.max_colwidth = None


In [None]:
### PUT MAIN HERE ###

In [2]:
"""
SETUP
"""
### Import models
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import scipy

### Import self-made functions
from CODE.data_preprocessing.split_val import split_val
from CODE.data_preprocessing.find_outliers_tukey import find_outliers_tukey
from CODE.features.length_title import length_title
from CODE.features.field_variety import field_variety2
#from CODE.features.field_variety import field_variety
from CODE.features.team_size import team_size
from CODE.features.topic_variety import topics_variety
from CODE.features.venue_frequency import venue_frequency
from CODE.features.age import age
#from CODE.features.author_database import author_database
#from CODE.features.author_name import author_name
from CODE.features.abst_words import abst_words

### Get the full train set:
data = pd.read_json('DATA/train-1.json')   # Numerical columns: 'year', 'references', 'citations'
test = pd.read_json('DATA/test.json')


# TODO (SELIN): DEAL with missing values in "data" and "test" here.
# Planned fill values:
#   doi --> ""
#   title --> ""
#   abstract --> ""
#   authors --> [""]
#   venue --> ""
#   year --> mean of venue based on "data" ELSE from "data"
#   references --> 0  --think about this!
#   topic --> [""]
#   is_open-access --> base on venue ELSE from "data"
#   fields_of_study --> [""]
#   citations --> assume not blank


### push the numerical columns to num_X
end = len(data)  # row count; kept as a name in case other cells use it, no longer needed for the selection
# Take an explicit copy: num_X gets new columns assigned below, and assigning to
# a slice of `data` can trigger pandas' SettingWithCopyWarning.
# NOTE(review): this selects every row, as `.loc[0:end+1]` did on a default 0..n-1 index.
num_X = data[['doi', 'citations', 'year', 'references']].copy()  # 'doi' is only a join key; dropped below


# FEATURE DATAFRAME: num_X
#
# ALL: After writing a function to create a feature, please incorporate your new
# feature as a column on the dataframe below.
# This is the dataframe we will use to train the models.

### use feature function to create a new variable
# --- DO NOT change the order in this section if at all possible ---
title_len = length_title(data)      # returns: dictionary of lists: [doi](count)
field_var = field_variety2(data)    # returns: dictionary of lists: [doi](count)
team_sz = team_size(data)           # returns a numbered series
topic_var = topics_variety(data)    # returns a numbered series
venue_db, venues_reformatted = venue_frequency(data)  # returns a dictionary: [venue](count) and a pandas.Series of the 'venues' column reformatted
paper_age = age(data)               # returns a numbered series
# dtype=int keeps the indicator as 0/1 integers on every pandas version
# (the default dtype became bool in pandas 2.0).
open_access = pd.get_dummies(data["is_open_access"], drop_first = True, dtype = int)  # returns pd.df (True = 1)

keywords = ["method", "review", "randomized", "random control"]
abst_keywords = abst_words(data, keywords)   #returns a numbered series: 1 if any of the words is present in the abstract, else 0
# --- END do not reorder ---


### join the variables (type = series) to num_X
num_X['team_size'] = team_sz
num_X['topic_variety'] = topic_var
num_X['age'] = paper_age
num_X['open_access'] = open_access
num_X['has_keyword'] = abst_keywords
num_X['venue'] = venues_reformatted  # Dataframe needs a venue to deal with missing values; assigned once (was set twice)

### join the variables (type = dictionary) to num_X
num_X['title_length'] = num_X['doi'].map(title_len)
num_X['field_variety'] = num_X['doi'].map(field_var)


# Check venue and add venue_frequency to each paper
# One vectorised lookup of every paper's venue in venue_db replaces the previous
# row-by-row loop that grew a Series one element at a time.
venue_freq = num_X['venue'].map(venue_db)
# .map yields NaN for a venue that is not in venue_db; fail loudly instead,
# as the direct venue_db[...] lookup used to.
unknown_venues = num_X.loc[venue_freq.isna(), 'venue'].unique()
if len(unknown_venues) > 0:
    raise KeyError(f"venue(s) missing from venue_db: {list(unknown_venues[:5])}")
num_X['venue_freq'] = venue_freq


### Drop columns containing just strings
num_X = num_X.drop(['venue', 'doi'], axis = 1)


## train/val split
X_train, X_val, y_train, y_val = split_val(num_X, target_variable = 'citations')


# TODO (ALBERT): INSERT outlier detection on X_train here


# TODO: IMPLEMENT regression models functions here
# - exponential



'\nIMPLEMENT regression models fuctions here\n- exponential\n'

In [None]:
"""
-----------------------------------------------------------------------------------------------------------
------------------------------ LETS EXPLORE!!! ------------------------------------------------------------
-----------------------------------------------------------------------------------------------------------
"""
"""
"""

In [None]:
# Sanity check: number of rows in `data`, then the type and the contents of the
# keyword-indicator feature, one item per line.
print(len(data), type(abst_keywords), abst_keywords, sep="\n")
#num_X

In [None]:
### FOR: exploring the scaffolding of the new dataframe for prediction as pulled from the full dataset

# print(type(data))
# print(list(data.columns))
# print("X type:", type(num_X), "X shape:", num_X.shape)
# data

In [None]:
### FOR: exploring the results of feature functions

# Print the Python type of each feature-function result, one per line.
print(
    *map(type, (title_len, field_var, team_sz, topic_var, venue_freq, paper_age)),
    sep="\n",
)
#title_len
#field_var
#team_sz
#topic_var
#venue_freq

In [3]:
### FOR: exploring the new dataframe with numerical columns

# Performance note (paraphrased from a StackExchange answer):
# Never grow a DataFrame incrementally. Appending to a DataFrame creates a new
# DataFrame in memory each time instead of reusing the existing one, so it is
# always cheaper, in both memory and performance, to append to a Python list
# and convert it to a DataFrame once at the end.

# --> NOTE: it would be more efficient to combine the new features first and
#     only expand the df once (per addition type)

# Display the feature dataframe.
num_X

Unnamed: 0,citations,year,references,team_size,topic_variety,age,open_access,has_keyword,title_length,field_variety,venue_freq
0,60,2015.0,39,6,1,6.0,1,1,10,1,2005
1,1,2020.0,44,5,0,1.0,1,0,18,1,8
2,5,2017.0,30,3,5,4.0,1,0,8,1,116
3,5,2017.0,11,2,6,4.0,1,0,13,1,68
4,10,2015.0,26,2,23,6.0,1,0,5,1,30
...,...,...,...,...,...,...,...,...,...,...,...
9653,8,2014.0,25,4,11,7.0,1,1,10,1,9
9654,1,2019.0,18,4,3,2.0,1,1,7,1,462
9655,1,2021.0,12,2,0,0.0,0,0,16,1,12
9656,3,2021.0,15,4,0,0.0,0,0,15,4,9


In [None]:
### FOR: explore data train/val split  (should be 6470 train rows and 3188 validation rows)
# names: X_train, X_val, y_train, y_val
#X_train
#X_val
#y_train
#y_val


In [None]:
"""
-----------------------------------------------------------------------------------------------------------
------------------------- LETS CODE!!! --------------------------------------------------------------------
-----------------------------------------------------------------------------------------------------------
"""
"""
"""

In [9]:
#X_train

In [10]:
#y_train

In [6]:
"""
Remove outliers
NOTE: can't rerun this code without restarting the kernal
"""
# NOTE: this cell cannot be rerun without restarting the kernel.
# NOTE(review): the output saved with this cell shows X_train / y_train shapes
# before and after a row removal, but nothing below prints shapes or drops rows.
# The cell looks like it was edited after it was last run -- TODO confirm.
#names: X_train, X_val, y_train, y_val
#print(list(X_train.columns))

# Per-column exploration of the Tukey outlier thresholds (kept for reference):
# print("citations:", find_outliers_tukey(x = y_train['citations'], top = 93, bottom = 0))

# print("year:", find_outliers_tukey(X_train['year'], top = 74, bottom = 25))  # seems unnecessary
# print("references:", find_outliers_tukey(X_train['references'], top = 90, bottom = 10))  # seems unnecessary
# print("team_size:", find_outliers_tukey(X_train['team_size'], top = 99, bottom = 0))  # Meh
# print("topic_variety:", find_outliers_tukey(X_train['topic_variety'], top = 75, bottom = 10))  # not much diff btw top and normal
# print("age:", find_outliers_tukey(X_train['age'], top = 90, bottom = 10))  # Meh
# print("open_access:", find_outliers_tukey(X_train['open_access'], top = 100, bottom = 0))  # Not necessary: boolean
# print("has_keyword:", find_outliers_tukey(X_train['has_keyword'], top = 100, bottom = 0))  # Not necessary: boolean
# print("title_length:", find_outliers_tukey(X_train['title_length'], top = 90, bottom = 10))  # Meh
# print("field_variety:", find_outliers_tukey(X_train['field_variety'], top = 90, bottom = 10))  # seems unnecessary
# print("venue_freq:", find_outliers_tukey(X_train['venue_freq'], top = 90, bottom = 10))  # seems unnecessary


# First element of each find_outliers_tukey result, for the target and for team_size
# (presumably the outlier row identifiers -- TODO confirm against the helper).
out_y = find_outliers_tukey(x=y_train['citations'], top=93, bottom=0)[0]
out_X = find_outliers_tukey(x=X_train['team_size'], top=99, bottom=0)[0]

# Combine both results with +, drop duplicates, and sort ascending.
out_rows = sorted(set(out_y + out_X))




X_train:
(6470, 10)
(6322, 10)

y_train:
(6470, 1)
(6322, 1)


In [None]:
# Demonstration (Python 3): removing duplicates from a list via set()

# sample list containing repeated values
test_list = [1, 5, 3, 6, 3, 5, 6, 1]
print(f"The original list is : {test_list}")

# A set keeps one copy of each value; converting back to a list does not
# promise to preserve the original ordering.
test_list = [*set(test_list)]

# show the de-duplicated list
print(f"The list after removing duplicates : {test_list}")

In [None]:
"""
Look at some correlations
"""
# names: X_train, X_val, y_train, y_val

# Pairwise Pearson correlations between the columns of num_X,
# drawn as an annotated square heatmap on a wide figure.
corr_mat = num_X.corr(method='pearson')
plt.figure(figsize=(20, 10))
sns.heatmap(
    data=corr_mat,
    annot=True,
    square=True,
    vmax=1,
    cmap='cubehelix',
)

In [None]:
# Basic regression model using any continuous variables
#     Establish data
#     Define model: regression model: sklearn.linear_model.LinearRegression
#     Fit model
#     Predict
#     Evaluate

from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
# r2_score and mean_absolute_error are used below but were never imported,
# so the evaluation step raised NameError.
from sklearn.metrics import mean_absolute_error, r2_score

model = LinearRegression()


# 1. z-score
# TODO: not implemented yet -- StandardScaler is imported, but X_train / X_val
# are passed to the model unscaled.
reg = model.fit(X = X_train, y = y_train)  # 2. fit model
print("Model weights:", reg.coef_)
print("Model intercept/bias:", reg.intercept_)
y_pred_val = model.predict(X_val)  # 3. predict
a = r2_score(y_val, y_pred_val)  # 4. evaluate: R^2 on the validation split
b = mean_absolute_error(y_val, y_pred_val)  # mean absolute error on the validation split
# Show the metrics; previously they were computed but never displayed.
print("Validation R^2:", a)
print("Validation MAE:", b)


In [None]:
# Create a mini version of the main 'data' dataframe

import pandas as pd
import numpy as np
# %pwd
# %cd C:\Users\r_noc\Desktop\Python\GIT\machinelearning

# Draw up to 100 distinct rows. random_state is fixed so the same rows come back
# on every run, and the size is capped at len(data) because sampling without
# replacement cannot return more rows than the frame holds.
play = data.sample(n = min(100, len(data)), replace = False, axis = 0, random_state = 42)


print(play.shape)
# print(play['abstract'])

print(list(play.columns))
# play['has_keyword'] = np.nan
# print(play.shape)
# play