In [1]:
import sklearn as sk
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn import datasets
from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso
from sklearn.metrics import mean_squared_error as mse
from sklearn.ensemble import RandomForestRegressor as RFR
from xgboost import XGBRegressor as XGBR
from scipy.stats import pearsonr
from lightgbm import LGBMRegressor as LGBM

  from pandas import MultiIndex, Int64Index


In [80]:
# Load the feature table and the target table from the working directory.
# NOTE(review): only the target file has its first column dropped; the input
# file keeps every column — confirm it carries no index/id column that would
# otherwise be treated as a feature.
input = pd.read_csv("sampled_cite_input.csv")
target_with_first_col = pd.read_csv("sampled_cite_target.csv")
target = target_with_first_col.iloc[:, 1:]

In [188]:
# Sanity check: print (rows, columns) of the features, then of the targets.
for frame in (input, target):
    print(frame.shape)

(14197, 4411)
(14197, 140)


In [22]:
# Run PCA
# Standardise every input column, then project onto the first `p` principal
# components; `X` is the reduced feature matrix used by the model cells.
p = 500  # number of principal components to keep

# NOTE(review): the scaler and the PCA are fitted on the full dataset, i.e.
# before the train/test split, so test rows influence the learned features.
# Consider fitting both on the training rows only.
input_sd = StandardScaler().fit_transform(input)
# Seed the decomposition: for data of this size the automatic solver choice is
# expected to be the randomized SVD, whose output otherwise varies run to run.
input_pca = PCA(n_components=p, random_state=0)
X = pd.DataFrame(input_pca.fit_transform(input_sd))
print(X.shape)

In [189]:
# Regression targets: every row and every column of `target`.
# Narrow the column slice here to experiment with a subset of targets.
y = target.iloc[:, :]
print(y.shape)

(14197, 140)


In [123]:
# train test split
# Hold out `test_size` of the rows for evaluation. The split is seeded so that
# every model below is scored on the same rows and the reported metrics can be
# reproduced after a kernel restart.
test_size = 0.25
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=test_size, random_state=0
)
print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)

(10647, 500) (3550, 500) (10647, 140) (3550, 140)


In [187]:
# Regularisation strength; the Lasso and Ridge cells pass it as `alpha`.
a = 0.1
# Scratch list of candidate models, intentionally left commented out: each
# model is fitted and scored in its own cell, and the hyperparameters used
# there are the ones that produced the reported numbers.
# model = Lasso(alpha=a)
# model = Ridge(alpha=a)
# model = RFR(n_estimators=10, max_depth=5)
# model = XGBR(max_depth=5, learning_rate=0.1, n_estimators=20)
# model = LGBM(objective='regression',num_leaves=31,learning_rate=0.05,n_estimators=20)
# model.fit(X_train, y_train)


In [149]:
# Lasso
# L1-penalised linear regression fitted jointly on all target columns.
model = Lasso(alpha=a).fit(X_train, y_train)
y_pred = model.predict(X_test)

# MSE is taken over the 2-D outputs; Pearson r is computed once over all
# (row, target) pairs by flattening truth and prediction the same way.
test_mse = mse(y_test, y_pred)
truth_flat = y_test.to_numpy().flatten()
pred_flat = y_pred.flatten()
test_pear = pearsonr(truth_flat, pred_flat)

print("MSE of the test set:", test_mse)
print("Pearson of the test set:", test_pear[0])

MSE of the test set: 2.8803016181762024
Pearson of the test set: 0.8676670120732592


In [150]:
# Ridge
# L2-penalised linear regression fitted jointly on all target columns.
model = Ridge(alpha=a)
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

test_mse = mse(y_test, y_pred)
# pearsonr needs 1-D inputs: unroll both matrices row by row so that each
# prediction stays aligned with its true value.
test_pear = pearsonr(np.ravel(y_test.to_numpy()), np.ravel(y_pred))

print("MSE of the test set:", test_mse)
print("Pearson of the test set:", test_pear[0])

MSE of the test set: 2.963454733984181
Pearson of the test set: 0.8636707792964627


In [151]:
# Random Forest Regressor
# A small, shallow forest fitted jointly on all target columns. The forest is
# seeded because its bootstrap sampling is random; without a seed the metrics
# below change on every run.
model = RFR(n_estimators=10, max_depth=5, random_state=0)
model.fit(X_train, y_train)

y_pred = model.predict(X_test)
test_mse = mse(y_test, y_pred)
# Single Pearson r over all (row, target) pairs, both sides flattened row-major.
test_pear = pearsonr(y_test.to_numpy().flatten(), y_pred.flatten())
print("MSE of the test set:", test_mse)
print("Pearson of the test set:", test_pear[0])

MSE of the test set: 3.2406228916576416
Pearson of the test set: 0.8496626252995269


In [186]:
# XGBoost Regressor
# The same estimator is refitted once per target column (one single-output
# model at a time) and its test predictions fill the matching column of a
# preallocated matrix, instead of re-copying a growing array on every pass.
model = XGBR(max_depth=3, learning_rate=0.1, n_estimators=10)
n_targets = y_train.shape[1]  # bound the loop by the frame that is indexed
pred_matrix = np.empty((X_test.shape[0], n_targets))  # float64, one column per target
for i in range(n_targets):
    model.fit(X_train, y_train.iloc[:, i])
    y_pred = model.predict(X_test)
    pred_matrix[:, i] = y_pred

# Column-major ("F") flattening lays the values out target by target, so the
# truth and the predictions line up element for element.
y_true_flat = y_test.to_numpy().flatten("F")
y_preds = pred_matrix.flatten("F")

test_mse = mse(y_true_flat, y_preds)
test_pear = pearsonr(y_true_flat, y_preds)
print("MSE of the test set:", test_mse)
print("Pearson of the test set:", test_pear[0])

MSE of the test set: 4.487843535841501
Pearson of the test set: 0.8537708949008104


In [184]:
# LightGBM
# The same estimator is refitted once per target column (one single-output
# model at a time) and its test predictions fill the matching column of a
# preallocated matrix, instead of re-copying a growing array on every pass.
model = LGBM(objective='regression',num_leaves=5,learning_rate=0.1,n_estimators=10)
n_targets = y_train.shape[1]  # bound the loop by the frame that is indexed
pred_matrix = np.empty((X_test.shape[0], n_targets))  # float64, one column per target
for i in range(n_targets):
    model.fit(X_train, y_train.iloc[:, i])
    y_pred = model.predict(X_test)
    pred_matrix[:, i] = y_pred

# Column-major ("F") flattening lays the values out target by target, so the
# truth and the predictions line up element for element.
y_true_flat = y_test.to_numpy().flatten("F")
y_preds = pred_matrix.flatten("F")

test_mse = mse(y_true_flat, y_preds)
test_pear = pearsonr(y_true_flat, y_preds)
print("MSE of the test set:", test_mse)
print("Pearson of the test set:", test_pear[0])

MSE of the test set: 3.4983781792748796
Pearson of the test set: 0.8375270304196588
