Building the dataset of numerical data

In [None]:
# OPTIONAL cell - run it only when full, untruncated text is needed in the output.
# A max column width of None lifts pandas' limit on displayed text length.
import pandas as pd
pd.options.display.max_colwidth = None


In [None]:
### PUT MAIN HERE ###

In [1]:
# Machine Learning Challenge
# Course: Machine Learning (880083-M-6)
# Group 58
 
##########################################
#             Import packages            #
##########################################
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from scipy.stats import pearsonr
import yake  #NOTE: with Anaconda: conda install -c conda-forge yake

##########################################
#      Import self-made functions        #
##########################################
from CODE.data_preprocessing.split_val import split_val
from CODE.data_preprocessing.find_outliers_tukey import find_outliers_tukey

#feature based on the title of the paper
from CODE.features.length_title import length_title

# features based on 'field_of_study' column 
from CODE.features.field_variety import field_variety         
from CODE.features.field_popularity import field_popularity
from CODE.features.field_citations_avarage import field_citations_avarage 

# features based on the topics of the paper
from CODE.features.topic_citations_avarage import topic_citations_avarage
from CODE.features.topic_variety import topics_variety
from CODE.features.topic_popularity import topic_popularity
from CODE.features.topic_citations_avarage import topic_citations_avarage

# features based on the abstract of the paper
from CODE.features.keywords import best_keywords

# features based on the venue of the paper
from CODE.features.venue_popularity import venue_popularity
from CODE.features.venue_citations import venues_citations

from CODE.features.age import age
from CODE.features.abst_words import abst_words

# features based on the authors of the paper
from CODE.features.author_h_index import author_h_index
from CODE.features.paper_h_index import paper_h_index
from CODE.features.team_size import team_size
from CODE.features.author_database import author_database


##########################################
#              Load datasets             #
##########################################
# Main datasets: training set first, test set second
data, test = (pd.read_json(json_path) for json_path in ('DATA/train.json', 'DATA/test.json'))

# Author-centric datasets
#   Built once with our self-made function 'citations_per_author' (for the author_citation_dic).
#   Building them took a long time (ballpark ~10 minutes on a laptop in 'silent mode'), so the
#   results were saved and are reloaded here instead of being recomputed on every run.
#   SECURITY: unpickling can execute arbitrary code - only load pickle files you created yourself.
#   NOTE: author_db is assigned again further down by author_database(data).
import pickle

def _read_pickle(path):
    """Return the object stored in the pickle file at `path`."""
    with open(path, 'rb') as handle:
        return pickle.load(handle)

author_citation_dic = _read_pickle('my_dataset1.pickle')
author_db = _read_pickle('my_dataset2.pickle')


##########################################
#        Missing values handling         #
##########################################

# Replace missing entries with an empty string, one column at a time.
# The order matches the original hand-written sequence of assignments.
for column in ('fields_of_study', 'title', 'abstract', 'authors', 'venue', 'references', 'topics'):
    data.loc[data[column].isnull(), column] = ""

# Columns deliberately NOT filled here:
#   'year'           - TODO: impute, e.g. with the mean year of the venue
#                      (what to do when the venue is unknown is still undecided)
#   'is_open_access' - idea: most frequent value for the venue, with some fallback when the
#                      venue is unknown; dealt with later on the derived 'open_access' column
    
##########################################
#       Create basic numeric df          #
##########################################
# Basic numeric frame: one row per paper, built from four raw columns of `data`.
# Selecting with a column list and taking an explicit .copy() gives num_X its own data,
# so the feature columns added below are written to num_X itself (no SettingWithCopyWarning,
# no dependence on the row labels of `data`).
# This replaces a label slice `0:end+1` that only covered every row when the index is 0..n-1.
end = len(data)   # number of papers; kept because it is a notebook-level name
num_X = data[['doi', 'citations', 'year', 'references']].copy()  ##REMOVE DOI


##########################################
#            Feature creation            #
##########################################
"""
FEATURE DATAFRAME: num_X

ALL: After writing a funtion to create a feature, please incorporate your new feature as a column on the dataframe below.
This is the dataframe we will use to train the models.

DO NOT change the order in this section if at all possible
"""
num_X['title_length'] = length_title(data)      # returns a numbered series
num_X['field_variety'] = field_variety(data)    # returns a numbered series 
num_X['field_popularity'] = field_popularity(data) # returns a numbered series
#num_X['field_citations_avarage'] = field_citations_avarage(data) # returns a numbered series
num_X['team_sz'] = team_size(data)           # returns a numbered series
num_X['topic_var'] = topics_variety(data)    # returns a numbered series
num_X['topic_popularity'] = topic_popularity(data) # returns a numbered series
num_X['topic_citations_avarage'] = topic_citations_avarage(data) # returns a numbered series
num_X['venue_popularity'], num_X['venue'] = venue_popularity(data)  # returns a numbered series and a pandas.Series of the 'venues' column reformatted 

# open_access: same 0/1 encoding as before (first dummy level dropped, True = 1), but papers
# whose 'is_open_access' is missing now stay NaN. get_dummies alone encodes a missing value
# as 0, which silently disabled the venue-based imputation further down.
open_access = pd.get_dummies(data["is_open_access"], drop_first = True).iloc[:, 0].astype(float)
num_X['open_access'] = open_access.where(data["is_open_access"].notnull())

num_X['age'] = age(data)               # returns a numbered series. Needs to be called upon AFTER the venues have been reformed (from venue_frequency)
num_X['venPresL'] = venues_citations(data)   # returns a numbered series. Needs to be called upon AFTER the venues have been reformed (from venue_frequency)
keywords = best_keywords(data, 1, 0.954, 0.955)    # from [data set] get [integer] keywords from papers btw [lower bound] and [upper bound] quantiles; returns list
num_X['has_keyword'] = abst_words(data, keywords)#returns a numbered series: 1 if any of the words is present in the abstract, else 0

# Author H-index
author_db, reformatted_authors = author_database(data)
data['authors'] = reformatted_authors
num_X['h_index'] = paper_h_index(data, author_citation_dic) # Returns a numbered series. Must come after author names have been reformatted.

# field_cit: mean citation count of all papers sharing the same 'field_variety' value.
# NOTE(review): the mean is taken over every row, before the train/validation split below,
# so validation targets leak into this feature. The other citation-based features
# (topic_citations_avarage, venPresL) presumably share this issue - verify in their modules.
num_X['field_cit'] = num_X.groupby('field_variety')['citations'].transform('mean')


"""
END do not reorder
"""

##########################################
#    Deal with specific missing values   #
##########################################
# Open_access: fill each missing value with the most frequent value of the paper's venue and,
# when the venue has no known value, with the most frequent value of the entire dataset.
# Mode-per-group idea thanks to jreback (27th of July 2016) https://github.com/pandas-dev/pandas/issues/13809
# Overall mode thanks to BENY (2nd of February, 2018) https://stackoverflow.com/questions/48590268/pandas-get-the-most-frequent-values-of-a-column
missing_OpAc = num_X['open_access'].isnull()      # boolean mask of the rows to fill
overall_mode = num_X['open_access'].mode()        # empty when every value is missing
if missing_OpAc.any() and not overall_mode.empty:
    # Only rows with a known value take part, so every venue group has a non-empty mode;
    # on a tie, mode() is sorted and the smallest value is taken.
    OpAc_by_venue = (
        num_X.loc[~missing_OpAc]
             .groupby('venue')['open_access']
             .agg(lambda values: values.mode().iloc[0])
    )
    filled = num_X.loc[missing_OpAc, 'venue'].map(OpAc_by_venue).fillna(overall_mode.iloc[0])
    num_X.loc[missing_OpAc, 'open_access'] = filled   # single .loc write: no chained indexing

### Drop columns containing just strings
num_X = num_X.drop(['venue', 'doi', 'field_variety'], axis = 1)
num_X = num_X.dropna()


##########################################
#            Train/val split             #
##########################################

## train/val split
# split_val (project helper) hands back the train/validation features and targets;
# 'citations' is the target column.
X_train, X_val, y_train, y_val = split_val(num_X, target_variable = 'citations')


# INSERT outlier detection on X_train here - ALBERT

##########################################
#            Outlier detection           #
##########################################
### MODEL code for outlier detection
### names: X_train, X_val, y_train, y_val

# print(list(X_train.columns))

# The first element returned by find_outliers_tukey is used as the row labels to drop.
# Only the training partition is trimmed; X_val / y_val are left as they are.
out_y = find_outliers_tukey(x = y_train['citations'], top = 93, bottom = 0)[0]

# Disabled second criterion (outliers in team size):
# out_X = (find_outliers_tukey(x = X_train['team_sz'], top = 99, bottom = 0))[0]
# out_rows = out_y + out_X

out_rows = sorted(set(out_y))   # unique labels in ascending order

# Drop the same labels from features and target so the two stay aligned.
# (Uncomment the shape prints in a scratch cell to inspect the effect.)
X_train, y_train = (frame.drop(labels = out_rows) for frame in (X_train, y_train))
# print(y_train.shape)

# Potential features to get rid of: team_sz


##########################################
#         Model implementations          #
##########################################
# Placeholder section: no model is trained in this cell yet.
# NOTE: the bare string below is the last expression of the cell, so Jupyter echoes it
# as the cell's output value.
"""
IMPLEMENT regression models fuctions here
- exponential
"""

# Commented-out scaffold for fitting a search-style model and scoring it on the validation set.
#model.fit(X_train, y_train)
#print('Best score: ', model.best_score_)
#print('Best parameters: ', model.best_params_)
#y_pred = model.predict(X_val)

#from sklearn.metrics import r2_score
#print(r2_score(y_val,y_pred))


# Commented-out scaffold for dumping a dictionary to a JSON file.
# import json
#with open("sample.json", "w") as outfile:
    #json.dump(dictionary, outfile)

'\nIMPLEMENT regression models fuctions here\n- exponential\n'

In [None]:
"""
-----------------------------------------------------------------------------------------------------------
------------------------------ LETS EXPLORE!!! ------------------------------------------------------------
-----------------------------------------------------------------------------------------------------------
"""
"""
"""

In [2]:
### FOR: exploring the new dataframe with numerical columns
# --> NOTE: it would be more efficient to combine these first and only expand the df once (per addition type)

# A bare expression on the last line makes Jupyter render num_X as this cell's output.
num_X

Unnamed: 0,citations,year,references,title_length,field_popularity,team_sz,topic_var,topic_popularity,topic_citations_avarage,venue_popularity,open_access,age,venPresL,has_keyword,h_index,field_cit
0,60,2015.0,39,10.0,9394,6,1,75,45.186667,2005,1,6.0,70.793257,1,16.0,36.829665
1,1,2020.0,44,18.0,9394,5,0,10,4.134021,8,1,1.0,3.75,1,1.0,36.829665
2,5,2017.0,30,8.0,9394,3,5,344,43.519941,116,1,4.0,12.702479,1,6.0,36.829665
3,5,2017.0,11,13.0,9394,2,6,1019,57.829761,68,1,4.0,10.797101,1,5.0,36.829665
4,10,2015.0,26,5.0,9394,2,23,1131,60.718362,30,1,6.0,4.064516,1,3.0,36.829665
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9653,8,2014.0,25,10.0,9394,4,11,230,57.869379,9,1,7.0,3.444444,1,5.0,36.829665
9654,1,2019.0,18,7.0,9394,4,3,186,50.620347,462,1,2.0,9.042857,1,4.0,36.829665
9655,1,2021.0,12,16.0,9394,2,0,10,4.134021,12,0,0.0,1.166667,1,1.0,36.829665
9656,3,2021.0,15,15.0,586,4,0,10,4.134021,9,0,0.0,1.777778,1,2.0,2.764706


In [3]:
### FOR: explore data train/val split  (should be 6470 train rows and 3188 validation rows)
# NOTE(review): the recorded run of this cell reports 6321 training rows, not 6470.
# names: X_train, X_val, y_train, y_val
summary = [
    ("number of keywords:", len(keywords)),
    ("total train rows:", X_train.shape),
    ("numer w keyword:", sum(X_train['has_keyword'])),
]
for label, value in summary:
    print(label, value)
print()
print(keywords)

# Other objects that can be inspected here:
#X_val
#y_train
#y_val

# Notes from earlier runs (papers containing a keyword):
#6210 of 6313
#6136 (of 6313) for 1 keyword from the top 1% of papers
#4787 for 2 keywords from top .01% of papers (correlation: 0.036)
#2917 for 1 keyword from top .01% of papers (correlation: 0.049)

number of keywords: 272
total train rows: (6321, 15)
numer w keyword: 6223

{'algorithms', 'xnli', 'vertices', 'honorifics', 'eda', 'entity-clusters', 'style', 'tagging', 'cortana', 'generation', 'goal', 'texts', 'systems', 'grammar', 'detector', 'inference', 'web', 'representations', 'quinlan', 'speedometer', 'vectors', 'models', 'collins', 'knowledge', 'drying', 'question', 'tasks', 'embeddings', 'freebase', 'system', 'scientific', 'ebmt', 'patterns', 'wsj', 'paradise', 'lfg', 'social', 'relations', 'statistical', 'entities', 'phora', 'astd', 'berger', 'significance', 'siamese', 'transliteration', 'mrc', 'abstractive', 'structural', 'czech', 'tuples', 'knowledge-base', 'code-mixing', 'architecture', 'media', 'errors', 'stanford', 'propagation', 'sense', 'historically', 'search', 'coreference', 'document', 'self-attentional', 'polarity', 'nlg', 'dialogpt', 'languages', 'text-level', 'embodied', 'classifiers', 'beam', 'eua', 'parallel', 'asian', 'features', 'clinical', 'model', 'techni

In [None]:
"""
Look at some correlations - full num_X
"""
# names: X_train, X_val, y_train, y_val

# Annotated heatmap of the pairwise Pearson correlations between all columns of num_X.
# From: https://www.kaggle.com/ankitjha/comparing-regression-models
import seaborn as sns
corr_mat = num_X.corr(method='pearson')
fig, ax = plt.subplots(figsize=(20,10))
sns.heatmap(corr_mat, vmax=1, square=True, annot=True, cmap='cubehelix', ax=ax)


In [None]:
"""
Look at some correlations - X_train
"""
# names: X_train, X_val, y_train, y_val

# Idea (not implemented): stack y_train next to X_train so the target shows up in the matrix
#temp = y_train hstack X_train


# Annotated heatmap of the pairwise Pearson correlations between the X_train columns.
# Uses `sns`, which is imported in the previous correlation cell.
# From: https://www.kaggle.com/ankitjha/comparing-regression-models
corr_mat = X_train.corr(method='pearson')
fig, ax = plt.subplots(figsize=(20,10))
sns.heatmap(corr_mat, vmax=1, square=True, annot=True, cmap='cubehelix', ax=ax)

In [None]:
"""
-----------------------------------------------------------------------------------------------------------
------------------------- LETS CODE!!! --------------------------------------------------------------------
-----------------------------------------------------------------------------------------------------------
"""
"""
"""

In [None]:
# Show X_train as the cell output (this cell sits just before the outlier-removal cell).
X_train

In [None]:
"""
Remove outliers
NOTE: can't rerun this code without restarting the kernal
"""
#names: X_train, X_val, y_train, y_val
#print(list(X_train.columns))

# Exploration log: outlier candidates per column, with the verdict noted at the end of each line.
# NOTE(review): several column names below (team_size, topic_variety, venue_freq) do not match
# the columns created in the main cell (team_sz, topic_var, venue_popularity).
# print("citations:", find_outliers_tukey(x = y_train['citations'], top = 93, bottom = 0))

# print("year:", find_outliers_tukey(X_train['year'], top = 74, bottom = 25))  # seems unnecessary
# print("references:", find_outliers_tukey(X_train['references'], top = 90, bottom = 10))  # seems unnecessary
# print("team_size:", find_outliers_tukey(X_train['team_size'], top = 99, bottom = 0))  # Meh
# print("topic_variety:", find_outliers_tukey(X_train['topic_variety'], top = 75, bottom = 10))  # not much diff btw top and normal
# print("age:", find_outliers_tukey(X_train['age'], top = 90, bottom = 10))  # Meh
# print("open_access:", find_outliers_tukey(X_train['open_access'], top = 100, bottom = 0))  # Not necessary: boolean
# print("has_keyword:", find_outliers_tukey(X_train['has_keyword'], top = 100, bottom = 0))  # Not necessary: boolean
# print("title_length:", find_outliers_tukey(X_train['title_length'], top = 90, bottom = 10))  # Meh
# print("field_variety:", find_outliers_tukey(X_train['field_variety'], top = 90, bottom = 10))  # seems unnecessary
# print("venue_freq:", find_outliers_tukey(X_train['venue_freq'], top = 90, bottom = 10))  # seems unnecessary


# Rows flagged on the citation count; the first element of the helper's result is used as
# the row labels to drop.
out_y = find_outliers_tukey(x = y_train['citations'], top = 95, bottom = 0)[0]
# Disabled second criterion (team size):
#out_X = (find_outliers_tukey(x = X_train['team_size'], top = 99, bottom = 0))[0]
#out_rows = out_y + out_X
out_rows = sorted(set(out_y))   # unique labels in ascending order

# Report the shape before and after dropping, for features and target in turn.
# X_train / y_train are overwritten in place, which is why this cell cannot simply be re-run.
print("X_train:", X_train.shape, sep = "\n")
X_train = X_train.drop(labels = out_rows)
print(X_train.shape)
print()
print("y_train:", y_train.shape, sep = "\n")
y_train = y_train.drop(labels = out_rows)
print(y_train.shape)




In [None]:
# Show X_train again, this time after the outlier-removal cell above.
X_train

In [None]:
# Scratch data: a fixed 100-row sample of the main 'data' dataframe to experiment on
import pandas as pd
import numpy as np

# Rows are drawn without replacement; the fixed random_state makes the sample repeatable.
play = data.sample(n = 100, random_state = 123)

print(play.shape)
# print(play['abstract'])

print(play.columns.tolist())
# play['has_keyword'] = np.nan
# print(play.shape)
# play

In [None]:
from sklearn.linear_model import PoissonRegressor



















In [19]:
# Degree-2 polynomial regression on standardised features, scored on the validation set.
# Everything this cell uses from scikit-learn is imported here, so it also runs on a fresh
# kernel (LinearRegression and the metrics were previously only imported by a later cell).
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score, mean_absolute_error
from sklearn.preprocessing import PolynomialFeatures
from sklearn.preprocessing import StandardScaler

# Standardise with statistics learned from the training partition only
scaler = StandardScaler()
X_train_z = scaler.fit_transform(X_train)
X_val_z = scaler.transform(X_val)

# Expand to all degree-2 terms; the expansion is fitted on train and re-used for validation
polynomial_features = PolynomialFeatures(degree = 2)
x_train_poly = polynomial_features.fit_transform(X_train_z)
x_val_poly = polynomial_features.transform(X_val_z)

model = LinearRegression()
model.fit(x_train_poly, y_train)
y_poly_pred = model.predict(x_val_poly)

print(r2_score(y_val, y_poly_pred))   # -0.04350391168707901
print(mean_absolute_error(y_val, y_poly_pred))    # 32.65668266590838

-0.04350391168707901
32.65668266590838


In [12]:
# Second take on degree-2 polynomial regression (same pipeline as the previous cell).
# Everything used from scikit-learn is imported here so the cell runs on a fresh kernel.
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score, mean_absolute_error
from sklearn.preprocessing import PolynomialFeatures
from sklearn.preprocessing import StandardScaler

# Standardise with statistics learned from the training partition only
scaler = StandardScaler()
X_train_z = scaler.fit_transform(X_train)
X_val_z = scaler.transform(X_val)

# `model` is the feature expander, `model2` the regressor (names kept from the original cell).
# The expander is fitted exactly once, on the standardised training features; it is no longer
# re-fitted on its own output, and validation data is only transformed, never fitted on.
model = PolynomialFeatures(degree = 2)
X_poly = model.fit_transform(X_train_z)
model2 = LinearRegression()
model2.fit(X_poly, y_train)

y_pred_val = model2.predict(model.transform(X_val_z))

print(r2_score(y_val, y_pred_val))   # recorded run: -0.04350391168707901
print(mean_absolute_error(y_val, y_pred_val))   # recorded run: 32.65668266590838

-0.04350391168707901
32.65668266590838


In [7]:
#names: X_train, X_val, y_train, y_val

# Small manual grid search for SGDRegressor on standardised features.
# The metric functions are imported here so the cell runs on a fresh kernel
# (they were previously only imported by a later cell).
from itertools import product

from sklearn.linear_model import SGDRegressor
from sklearn.metrics import r2_score, mean_absolute_error
from sklearn.preprocessing import StandardScaler

# Standardise with statistics learned from the training partition only
scaler = StandardScaler()
X_train_z = scaler.fit_transform(X_train)
X_val_z = scaler.transform(X_val)
y_ravel = np.ravel(y_train)   # flatten the target to 1-D for fit()

lr = [ 1, .1, .01, .001, .0001]   # candidate initial learning rates (eta0)
settings = []                     # one (learning_rate, eta0, loss, mae, r2) tuple per fit

# product() walks the grid in the same order as the nested loops it replaces:
# learning_rate outermost, then loss, then eta0.
grid = product(['constant', 'optimal', 'invscaling'], ['squared_error', 'huber'], lr)
for learning_rate, loss, eta0 in grid:
    model = SGDRegressor(learning_rate=learning_rate, eta0=eta0, loss=loss, random_state=666, max_iter=5000)
    model.fit(X_train_z, y_ravel)
    y_pred = model.predict(X_val_z)

    mae = mean_absolute_error(y_val, y_pred)
    r2 = r2_score(y_val, y_pred)
    settings.append((learning_rate, eta0, loss, mae, r2))
    print(settings[-1])

# Best outcome: ('constant', 0.01, 'squared_error', 35.74249957361433, 0.04476790061780822)

('constant', 1, 'squared_error', 7986319302234.596, -1.1689781971414692e+22)
('constant', 0.1, 'squared_error', 597818678529.5054, -5.814303490731018e+19)
('constant', 0.01, 'squared_error', 35.74249957361433, 0.04476790061780822)
('constant', 0.001, 'squared_error', 33.96759613563317, 0.04177061028029283)
('constant', 0.0001, 'squared_error', 33.37140461494136, 0.03709359043989702)
('constant', 1, 'huber', 31.672696372050158, 0.0010754141419426766)
('constant', 0.1, 'huber', 31.573004639095345, -0.0015847707052509818)
('constant', 0.01, 'huber', 31.54910993429202, -0.005266820462498822)
('constant', 0.001, 'huber', 31.703266890422977, -0.0130983313436015)
('constant', 0.0001, 'huber', 32.89348462159435, -0.03229190540683824)
('optimal', 1, 'squared_error', 48.77480568326134, -0.15604683726395585)
('optimal', 0.1, 'squared_error', 48.77480568326134, -0.15604683726395585)
('optimal', 0.01, 'squared_error', 48.77480568326134, -0.15604683726395585)
('optimal', 0.001, 'squared_error', 48.7

In [4]:
#names: X_train, X_val, y_train, y_val

from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score, mean_absolute_error
from sklearn.preprocessing import StandardScaler

# Ordinary least-squares baseline on the unscaled features
model = LinearRegression()
model.fit(X_train, y_train)
reg = model   # fit() returns the estimator itself, so `reg` and `model` are the same object
y_pred_val = model.predict(X_val)

# Prints R^2 first, then the mean absolute error.
# Recorded run: 0.03717991239578533 and 33.37534194713595
for metric in (r2_score, mean_absolute_error):
    print(metric(y_val, y_pred_val))

0.03717991239578533
33.37534194713595


In [8]:
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler

# NOTE(review): LogisticRegression is a classifier, so every distinct citation count is
# treated as a separate class label; the regression metrics below are computed on the
# predicted labels. Confirm this is the intended experiment.

# Standardise with statistics learned from the training partition only
scaler = StandardScaler()
X_train_s = scaler.fit_transform(X_train)
X_val_s = scaler.transform(X_val)

y_ravel = np.ravel(y_train)   # flatten the target to 1-D for fit()

model = LogisticRegression(random_state = 123, max_iter = 2000)
model.fit(X_train_s, y_ravel)
reg = model   # fit() returns the estimator itself
y_pred_val = model.predict(X_val_s)

# Prints R^2 first, then the mean absolute error.
# Recorded run: 0.006095817169994966 and 34.11515531848133
for metric in (r2_score, mean_absolute_error):
    print(metric(y_val, y_pred_val))

0.006095817169994966
34.11515531848133
