In [7]:
import numpy as np
import pandas as pd
data = pd.read_csv('predict.csv')


In [8]:
data

Unnamed: 0,recency,topic_rank,diversity,authors_mean_rank,authors_mean_hindex,authors_mean_gindex,authors_mean_sociality,authors_mean_pagerank,authors_mean_productivity,journal_pagerank,journal_rank,title_len,abstract_len,n_authors,c5,log_authors_mean_sociality
0,16,1.0,-0.425436,500.0,1.0,1.0,6.0,,,,101.0,66,1653,7,1.0,1.945910
1,8,7.0,-0.571967,134.0,1.0,1.0,6.0,,,,24.0,59,1527,7,32.0,1.945910
2,15,19.0,-0.859644,235.0,1.0,1.0,5.0,,,,49.0,169,1554,6,5.0,1.791759
3,6,4.0,-1.054999,1124.0,1.0,1.0,10.0,,,,185.0,130,2121,11,7.0,2.397895
4,14,2.0,-0.476697,346.0,1.0,1.0,2.0,,,,72.0,100,432,3,7.0,1.098612
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
474,25,6.0,-0.330948,528.0,0.0,0.0,6.0,,,,140.0,62,645,7,0.0,1.945910
475,6,12.0,-1.493480,2145.0,0.0,0.0,1.0,,,,416.0,45,714,2,0.0,0.693147
476,24,17.0,-1.466320,551.0,0.0,0.0,4.0,,,,146.0,84,0,5,0.0,1.609438
477,24,6.0,-1.556737,551.0,0.0,0.0,0.0,,,,146.0,56,0,1,0.0,0.000000


In [9]:
data.isnull().sum()

recency                         0
topic_rank                      0
diversity                       0
authors_mean_rank               6
authors_mean_hindex             6
authors_mean_gindex             6
authors_mean_sociality          6
authors_mean_pagerank         479
authors_mean_productivity     479
journal_pagerank              479
journal_rank                   39
title_len                       0
abstract_len                    0
n_authors                       0
c5                              0
log_authors_mean_sociality      6
dtype: int64

Видим, что в датасете у нас три пустых колонки, можно их удалить из датасета

In [10]:
data = data.drop(columns=['journal_pagerank','authors_mean_productivity','authors_mean_pagerank'])

Выведем список всех строк с NaN значением у признака/признаков

In [11]:
data[data.isna().any(axis = 1)]

Unnamed: 0,recency,topic_rank,diversity,authors_mean_rank,authors_mean_hindex,authors_mean_gindex,authors_mean_sociality,journal_rank,title_len,abstract_len,n_authors,c5,log_authors_mean_sociality
8,5,3.0,-1.464084,1282.0,1.0,1.0,3.0,,44,1191,4,8.0,1.386294
15,13,20.0,-0.17836,326.0,1.0,1.0,2.0,,169,1323,3,9.0,1.098612
19,14,19.0,-0.647923,1015.0,1.0,1.0,0.0,,32,275,1,2.0,0.0
22,8,2.0,-1.39048,94.0,1.0,1.0,8.0,,107,1987,9,46.0,2.197225
23,5,3.0,-1.22415,219.0,1.0,1.0,2.0,,101,1147,3,49.0,1.098612
56,7,6.0,-1.367137,1174.0,1.0,1.0,2.0,,165,1722,3,5.0,1.098612
81,6,14.0,-0.296585,1469.0,1.0,1.0,7.0,,108,2407,8,5.0,2.079442
85,21,4.0,-0.120934,106.0,1.0,1.0,10.0,,91,1350,11,7.0,2.397895
97,5,12.0,-0.974201,1935.0,1.0,1.0,5.0,,64,341,6,2.0,1.791759
107,6,7.0,-0.066097,1628.0,1.0,1.0,3.0,,161,2409,4,3.0,1.386294


Попробуем поработать с датасетом, где NaN значения заполнены нулями

In [12]:
data_zeros = data.fillna(0)

In [13]:
data_zeros.isnull().sum()

recency                       0
topic_rank                    0
diversity                     0
authors_mean_rank             0
authors_mean_hindex           0
authors_mean_gindex           0
authors_mean_sociality        0
journal_rank                  0
title_len                     0
abstract_len                  0
n_authors                     0
c5                            0
log_authors_mean_sociality    0
dtype: int64

Так как все признаки лежат в разных диапазонах значений, не лишним будет нормализовать их

In [14]:
from sklearn.preprocessing import StandardScaler

In [15]:
scaler = StandardScaler()
X = data_zeros.drop(columns='c5')
y = data_zeros['c5']
X = scaler.fit_transform(X)

Разобьем наш датасет на обучающую и тренировочную выборки

Чтобы зафиксировать разбиение, воспользуемся np.random.seed

In [16]:
np.random.seed(21)

In [17]:
from sklearn.model_selection import train_test_split

In [18]:
X_train, X_test, y_train, y_test = train_test_split(X,y)

#Baselines (LinearRegression, RandomForestRegressor, GradientBoosting)

In [19]:
from sklearn.metrics import r2_score

## LinearRegression

In [20]:
from sklearn.linear_model import LinearRegression

In [21]:
model = LinearRegression()
model.fit(X_train,y_train)

LinearRegression()

In [22]:
r2_score(y_test,model.predict(X_test))

0.4668344658807807

## RandomForestRegressor

In [23]:
from sklearn.ensemble import RandomForestRegressor

In [24]:
model = RandomForestRegressor()
model.fit(X_train,y_train)

RandomForestRegressor()

In [25]:
r2_score(y_test,model.predict(X_test))

0.47720313849666496

##GradientBoosting

In [26]:
from sklearn.ensemble import GradientBoostingRegressor

In [27]:
model = GradientBoostingRegressor()
model.fit(X_train,y_train)

GradientBoostingRegressor()

In [28]:
r2_score(y_test,model.predict(X_test))

0.670767946066359

Без настройки гиперпараметров из моделей лучше всего справился GradientBoostingRegressor, воспользуемся модулем sklearn GridSearchCV для нахождения лучших гиперпараметров для нашей модели

In [29]:
from sklearn.model_selection import GridSearchCV

In [30]:
parameters = {'loss': ('ls', 'lad', 'huber', 'quantile'), 'learning_rate': [0.3,0.2,0.1,0.05,0.02],'n_estimators':[200,400,600,800]}

In [31]:
clf = GridSearchCV(model,parameters,'r2')



In [32]:
clf.fit(X_train,y_train)

GridSearchCV(estimator=GradientBoostingRegressor(),
             param_grid={'learning_rate': [0.3, 0.2, 0.1, 0.05, 0.02],
                         'loss': ('ls', 'lad', 'huber', 'quantile'),
                         'n_estimators': [200, 400, 600, 800]},
             scoring='r2')

In [33]:
clf.best_params_

{'learning_rate': 0.2, 'loss': 'lad', 'n_estimators': 600}

In [34]:
r2_score(y_test,clf.predict(X_test))

0.7660467931142049

Осталось посмотреть на важнейшие признаки для нашей модели

In [40]:
clf.best_estimator_.feature_importances_

array([0.03635739, 0.00807231, 0.01422659, 0.24694074, 0.23308048,
       0.234658  , 0.00284248, 0.19195715, 0.01004497, 0.01819986,
       0.00189999, 0.00172004])

Видно, что наиболее важными признаками для модели являются: authors_mean_rank, authors_mean_hindex и authors_mean_gindex.
И это логически правильно, ведь h-index и g-index как раз служат метрикой продуктивности и цитируемости автора.


In [37]:
data_zeros

Unnamed: 0,recency,topic_rank,diversity,authors_mean_rank,authors_mean_hindex,authors_mean_gindex,authors_mean_sociality,journal_rank,title_len,abstract_len,n_authors,c5,log_authors_mean_sociality
0,16,1.0,-0.425436,500.0,1.0,1.0,6.0,101.0,66,1653,7,1.0,1.945910
1,8,7.0,-0.571967,134.0,1.0,1.0,6.0,24.0,59,1527,7,32.0,1.945910
2,15,19.0,-0.859644,235.0,1.0,1.0,5.0,49.0,169,1554,6,5.0,1.791759
3,6,4.0,-1.054999,1124.0,1.0,1.0,10.0,185.0,130,2121,11,7.0,2.397895
4,14,2.0,-0.476697,346.0,1.0,1.0,2.0,72.0,100,432,3,7.0,1.098612
...,...,...,...,...,...,...,...,...,...,...,...,...,...
474,25,6.0,-0.330948,528.0,0.0,0.0,6.0,140.0,62,645,7,0.0,1.945910
475,6,12.0,-1.493480,2145.0,0.0,0.0,1.0,416.0,45,714,2,0.0,0.693147
476,24,17.0,-1.466320,551.0,0.0,0.0,4.0,146.0,84,0,5,0.0,1.609438
477,24,6.0,-1.556737,551.0,0.0,0.0,0.0,146.0,56,0,1,0.0,0.000000
