In [47]:
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
from sklearn.ensemble import RandomForestClassifier
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression, LogisticRegressionCV
from sklearn.metrics import f1_score, precision_recall_curve, roc_curve, roc_auc_score
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.utils import shuffle

In [48]:
# Load the grant applications table; the file is comma-separated, which is the pandas default.
df = pd.read_csv('grant_data_imb.csv')
df

Unnamed: 0,Grant.Status,Sponsor.Code,Grant.Category.Code,Contract.Value.Band...see.note.A,RFCD.Code.1,RFCD.Percentage.1,RFCD.Code.2,RFCD.Percentage.2,RFCD.Code.3,RFCD.Percentage.3,...,Dept.No..1,Faculty.No..1,With.PHD.1,No..of.Years.in.Uni.at.Time.of.Grant.1,Number.of.Successful.Grant.1,Number.of.Unsuccessful.Grant.1,A..1,A.1,B.1,C.1
0,0,97A,30B,A,321024.0,50.0,321013.0,30.0,291502.0,20.0,...,2563.0,25.0,Yes,>10 to 15,2.0,6.0,3.0,5.0,15.0,3.0
1,0,36D,10A,G,300201.0,100.0,0.0,0.0,0.0,0.0,...,1038.0,1.0,,Less than 0,0.0,3.0,0.0,4.0,0.0,0.0
2,0,317A,30D,,321013.0,100.0,0.0,0.0,0.0,0.0,...,2763.0,25.0,Yes,>5 to 10,4.0,3.0,6.0,25.0,14.0,14.0
3,0,62B,10B,B,321103.0,30.0,321105.0,40.0,321204.0,30.0,...,2848.0,25.0,,Less than 0,1.0,2.0,1.0,0.0,0.0,0.0
4,0,1A,10A,,270603.0,60.0,321205.0,30.0,320603.0,10.0,...,2678.0,25.0,Yes,>5 to 10,5.0,14.0,0.0,9.0,7.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4108,0,103C,30B,,321022.0,100.0,0.0,0.0,0.0,0.0,...,2773.0,25.0,,>=0 to 5,0.0,1.0,0.0,0.0,0.0,0.0
4109,0,2B,10A,,340208.0,50.0,340499.0,50.0,0.0,0.0,...,1678.0,13.0,Yes,more than 15,0.0,1.0,1.0,5.0,8.0,0.0
4110,1,2B,10A,,320702.0,40.0,270102.0,30.0,320305.0,30.0,...,2653.0,25.0,,>=0 to 5,1.0,0.0,26.0,15.0,9.0,6.0
4111,0,40D,10B,C,,,,,,,...,3248.0,31.0,,>=0 to 5,0.0,1.0,1.0,0.0,0.0,0.0


In [49]:
# Summarise column names, non-null counts and dtypes of the loaded frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4113 entries, 0 to 4112
Data columns (total 39 columns):
 #   Column                                  Non-Null Count  Dtype  
---  ------                                  --------------  -----  
 0   Grant.Status                            4113 non-null   int64  
 1   Sponsor.Code                            3856 non-null   object 
 2   Grant.Category.Code                     3856 non-null   object 
 3   Contract.Value.Band...see.note.A        1953 non-null   object 
 4   RFCD.Code.1                             3853 non-null   float64
 5   RFCD.Percentage.1                       3853 non-null   float64
 6   RFCD.Code.2                             3853 non-null   float64
 7   RFCD.Percentage.2                       3853 non-null   float64
 8   RFCD.Code.3                             3853 non-null   float64
 9   RFCD.Percentage.3                       3853 non-null   float64
 10  RFCD.Code.4                             3853 non-null   floa

In [50]:
# The label is the 'Grant.Status' column; every other column is a candidate feature.
target = df['Grant.Status']
features = df.drop(columns=['Grant.Status'])

In [51]:
# Numeric (float64) features.
# Start from the raw frame instead of mutating `features` in place, so that
# re-running this cell does not pick up the already-imputed "_mean"/"_zero"
# columns and impute them a second time.
features_raw = df.drop(columns=['Grant.Status'])
numeric_features = features_raw.select_dtypes(include=['float64']).columns
imputer_mean = SimpleImputer(strategy='mean')
imputer_zero = SimpleImputer(strategy='constant', fill_value=0)

# Each numeric column is kept twice: once with gaps replaced by the column mean
# and once with gaps replaced by 0.
# NOTE(review): both imputers are fit on the full dataset, i.e. before the
# train/validation split performed later in the notebook, so the column means
# include validation rows.
numeric_mean = pd.DataFrame(
    imputer_mean.fit_transform(features_raw[numeric_features]),
    columns=numeric_features + '_mean',
    index=features_raw.index,
)
numeric_zero = pd.DataFrame(
    imputer_zero.fit_transform(features_raw[numeric_features]),
    columns=numeric_features + '_zero',
    index=features_raw.index,
)

# Replace the original numeric columns (which contain gaps) with the imputed
# copies; the remaining columns stay first, followed by "_mean" then "_zero".
features = pd.concat(
    [features_raw.drop(columns=numeric_features), numeric_mean, numeric_zero],
    axis=1,
)

In [52]:
# Count the missing values in every column.
missing_values = features.isnull().sum()

# Keep only the columns that actually contain gaps and print them.
columns_with_missing_values = missing_values.loc[missing_values.gt(0)]
print("Столбцы с пропущенными значениями:", columns_with_missing_values, sep="\n")

Столбцы с пропущенными значениями:
Sponsor.Code                               257
Grant.Category.Code                        257
Contract.Value.Band...see.note.A          2160
Role.1                                      46
Country.of.Birth.1                         666
Home.Language.1                           3719
With.PHD.1                                1781
No..of.Years.in.Uni.at.Time.of.Grant.1     759
dtype: int64


In [53]:
# Categorical (object-dtype) features: print the distinct values of each one.
categorical_features = features.select_dtypes(include='object').columns
for feature in categorical_features:
    print(f"Уникальные значения для {feature}: {features[feature].unique()}")

Уникальные значения для Sponsor.Code: ['97A' '36D' '317A' '62B' '1A' '4D' '2B' '60D' '21A' '149A' '252D' '65A'
 '40D' '24D' '34B' '166B' '6B' '29A' '5A' '141A' nan '32D' '89A' '52D'
 '18B' '33A' '91C' '12D' '20D' '66B' '145A' '179C' '86B' '42B' '183C'
 '112D' '234B' '49A' '437A' '77A' '83C' '93A' '229A' '84D' '196D' '281A'
 '69A' '161A' '51C' '226B' '269A' '188D' '94B' '173A' '221A' '9A' '126B'
 '133A' '101A' '215C' '172D' '138B' '219C' '59C' '315C' '205A' '247C'
 '194B' '15C' '3C' '55C' '204D' '90B' '163C' '307C' '206B' '95C' '7C'
 '180D' '214B' '197A' '349A' '232D' '325A' '266B' '164D' '63C' '228D'
 '187C' '208D' '241A' '48D' '309A' '73A' '28D' '362B' '39C' '67C' '212D'
 '200D' '148D' '154B' '245A' '311C' '105A' '47C' '222B' '137A' '159C'
 '100D' '143C' '193A' '324D' '38B' '75C' '427C' '103C' '415C' '139C'
 '107C' '136D' '11C' '170B' '135C' '203C' '235C' '331C' '259C' '87C'
 '169A' '284D' '198B' '308D' '242B' '174B' '68D' '120D' '347C' '294B'
 '128D' '56D' '202B' '435C' '80D' '130B' 

In [54]:
# Treat a missing PhD flag as 'No'.
# The result is assigned back to the column: calling fillna(..., inplace=True)
# on a column selection is chained assignment, which emits a FutureWarning on
# pandas 2.x and leaves `features` unchanged once Copy-on-Write is enabled.
features['With.PHD.1'] = features['With.PHD.1'].fillna('No')

In [55]:
# Fill the remaining gaps in the categorical columns with the placeholder 'Unknown'.
imputer_categorical = SimpleImputer(
    strategy='constant',
    fill_value='Unknown',
)
features[categorical_features] = imputer_categorical.fit_transform(
    features[categorical_features]
)

In [56]:
# Re-count the missing values per column after imputation.
missing_values = features.isna().sum(axis=0)

# Report whichever columns still contain gaps.
columns_with_missing_values = missing_values.loc[lambda counts: counts > 0]
print("Столбцы с пропущенными значениями:")
print(columns_with_missing_values)

Столбцы с пропущенными значениями:
Series([], dtype: int64)


In [57]:
# One-hot encode the categorical columns; drop_first=True omits the first
# level of every encoded column.
features = pd.get_dummies(
    features,
    columns=categorical_features,
    drop_first=True,
)

In [58]:
# Preview the first rows of the feature frame after one-hot encoding.
features.head()

Unnamed: 0,RFCD.Code.1_mean,RFCD.Percentage.1_mean,RFCD.Code.2_mean,RFCD.Percentage.2_mean,RFCD.Code.3_mean,RFCD.Percentage.3_mean,RFCD.Code.4_mean,RFCD.Percentage.4_mean,RFCD.Code.5_mean,RFCD.Percentage.5_mean,...,Country.of.Birth.1_Unknown,Country.of.Birth.1_Western Europe,Home.Language.1_Other,Home.Language.1_Unknown,With.PHD.1_Yes,No..of.Years.in.Uni.at.Time.of.Grant.1_>5 to 10,No..of.Years.in.Uni.at.Time.of.Grant.1_>=0 to 5,No..of.Years.in.Uni.at.Time.of.Grant.1_Less than 0,No..of.Years.in.Uni.at.Time.of.Grant.1_Unknown,No..of.Years.in.Uni.at.Time.of.Grant.1_more than 15
0,321024.0,50.0,321013.0,30.0,291502.0,20.0,0.0,0.0,0.0,0.0,...,0,0,0,1,1,0,0,0,0,0
1,300201.0,100.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,1,0,0,0,1,0,0
2,321013.0,100.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,1,1,1,0,0,0,0
3,321103.0,30.0,321105.0,40.0,321204.0,30.0,0.0,0.0,0.0,0.0,...,0,0,0,1,0,0,0,1,0,0
4,270603.0,60.0,321205.0,30.0,320603.0,10.0,0.0,0.0,0.0,0.0,...,0,0,0,1,1,1,0,0,0,0


In [59]:
# Collect every column whose name contains "_Unknown" (the dummy columns
# produced for the 'Unknown' placeholder).
columns_to_drop = list(filter(lambda col: '_Unknown' in col, features.columns))

# Remove those columns and display the resulting frame.
features = features.drop(columns=columns_to_drop)

features


Unnamed: 0,RFCD.Code.1_mean,RFCD.Percentage.1_mean,RFCD.Code.2_mean,RFCD.Percentage.2_mean,RFCD.Code.3_mean,RFCD.Percentage.3_mean,RFCD.Code.4_mean,RFCD.Percentage.4_mean,RFCD.Code.5_mean,RFCD.Percentage.5_mean,...,Country.of.Birth.1_North America,Country.of.Birth.1_South Africa,Country.of.Birth.1_The Americas,Country.of.Birth.1_Western Europe,Home.Language.1_Other,With.PHD.1_Yes,No..of.Years.in.Uni.at.Time.of.Grant.1_>5 to 10,No..of.Years.in.Uni.at.Time.of.Grant.1_>=0 to 5,No..of.Years.in.Uni.at.Time.of.Grant.1_Less than 0,No..of.Years.in.Uni.at.Time.of.Grant.1_more than 15
0,321024.000000,50.00000,321013.000000,30.000000,291502.000000,20.000000,0.000000,0.000000,0.000000,0.000000,...,0,0,0,0,0,1,0,0,0,0
1,300201.000000,100.00000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0,0,0,0,0,0,0,0,1,0
2,321013.000000,100.00000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0,0,0,0,0,1,1,0,0,0
3,321103.000000,30.00000,321105.000000,40.000000,321204.000000,30.000000,0.000000,0.000000,0.000000,0.000000,...,0,0,0,0,0,0,0,0,1,0
4,270603.000000,60.00000,321205.000000,30.000000,320603.000000,10.000000,0.000000,0.000000,0.000000,0.000000,...,1,0,0,0,0,1,1,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4108,321022.000000,100.00000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0,0,0,0,0,0,0,1,0,0
4109,340208.000000,50.00000,340499.000000,50.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0,0,0,0,0,1,0,0,0,1
4110,320702.000000,40.00000,270102.000000,30.000000,320305.000000,30.000000,0.000000,0.000000,0.000000,0.000000,...,0,0,0,0,0,0,0,1,0,0
4111,314904.682845,74.69686,161386.717104,17.642616,96437.197508,7.089541,6835.177005,0.442512,1767.989878,0.128471,...,0,0,0,0,0,0,0,1,0,0


In [60]:
# Hold out 25% of the rows for validation; the fixed seed keeps the split reproducible.
# NOTE(review): the split is not stratified although the classes are imbalanced
# (3259 vs 854) — consider passing stratify=target.
(features_train, features_valid,
 target_train, target_valid) = train_test_split(features, target, test_size=0.25, random_state=23)

In [61]:
# Fit the scaler on the training split only, then apply the same
# transformation to the training and validation splits.
scaler = StandardScaler().fit(features_train)
features_train_sc, features_valid_sc = (
    scaler.transform(split) for split in (features_train, features_valid)
)

In [62]:
# Preview the scaled training features with their column names restored
# (display only; the result is not stored).
pd.DataFrame(features_train_sc, columns=features_train.columns).head()

Unnamed: 0,RFCD.Code.1_mean,RFCD.Percentage.1_mean,RFCD.Code.2_mean,RFCD.Percentage.2_mean,RFCD.Code.3_mean,RFCD.Percentage.3_mean,RFCD.Code.4_mean,RFCD.Percentage.4_mean,RFCD.Code.5_mean,RFCD.Percentage.5_mean,...,Country.of.Birth.1_North America,Country.of.Birth.1_South Africa,Country.of.Birth.1_The Americas,Country.of.Birth.1_Western Europe,Home.Language.1_Other,With.PHD.1_Yes,No..of.Years.in.Uni.at.Time.of.Grant.1_>5 to 10,No..of.Years.in.Uni.at.Time.of.Grant.1_>=0 to 5,No..of.Years.in.Uni.at.Time.of.Grant.1_Less than 0,No..of.Years.in.Uni.at.Time.of.Grant.1_more than 15
0,1.385824,0.964852,-1.023501,-0.937759,-0.662551,-0.608113,-0.152343,-0.139354,-0.069868,-0.065297,...,-0.173379,-0.06753,-0.084763,-0.197696,-0.139657,-1.114858,-0.461238,-0.694248,2.695941,-0.315381
1,0.132931,-0.191662,0.711716,0.133354,1.662369,0.26535,-0.152343,-0.139354,-0.069868,-0.065297,...,-0.173379,-0.06753,-0.084763,-0.197696,-0.139657,0.896975,-0.461238,-0.694248,-0.370928,-0.315381
2,0.126379,-0.577166,1.03723,0.668911,1.240408,0.26535,-0.152343,-0.139354,-0.069868,-0.065297,...,-0.173379,-0.06753,-0.084763,-0.197696,-0.139657,0.896975,-0.461238,1.440408,-0.370928,-0.315381
3,1.385888,0.193843,1.41661,0.133354,-0.662551,-0.608113,-0.152343,-0.139354,-0.069868,-0.065297,...,-0.173379,-0.06753,-0.084763,-0.197696,-0.139657,0.896975,-0.461238,-0.694248,-0.370928,3.170767
4,0.137108,0.964852,-1.023501,-0.937759,-0.662551,-0.608113,-0.152343,-0.139354,-0.069868,-0.065297,...,-0.173379,-0.06753,-0.084763,-0.197696,-0.139657,0.896975,-0.461238,-0.694248,-0.370928,3.170767


In [63]:
# Count how many rows belong to each class of the target.
df['Grant.Status'].value_counts()

0    3259
1     854
Name: Grant.Status, dtype: int64

Классы несбалансированы: 3259 объектов класса 0 против 854 объектов класса 1 (примерно 79% против 21%).

In [64]:
def upsample(features, target, repeat=None):
    """Balance a binary dataset by duplicating the rows of the rarer class.

    Rows whose label is 0 and rows whose label is 1 are separated; the class
    with fewer rows is repeated ``repeat`` times and appended to the other
    class, after which features and labels are shuffled together.

    Parameters
    ----------
    features : pandas.DataFrame
        Feature rows; selected with boolean masks built from ``target``.
    target : pandas.Series
        Labels aligned with ``features``; only values 0 and 1 are kept.
    repeat : int, optional
        Number of copies of the rarer class to include. ``None`` (the
        default) derives it as ``round(majority_size / minority_size)``.
        Previously this argument was accepted but silently overwritten.

    Returns
    -------
    tuple
        ``(features_upsampled, target_upsampled)``, shuffled with
        ``random_state=23``.

    Raises
    ------
    ValueError
        If an explicit ``repeat`` is smaller than 1.
    """
    if repeat is not None and repeat < 1:
        raise ValueError(f"repeat must be a positive integer, got {repeat!r}")

    # Separate the rows of the two classes.
    zero_mask = target == 0
    one_mask = target == 1
    features_zeros, target_zeros = features[zero_mask], target[zero_mask]
    features_ones, target_ones = features[one_mask], target[one_mask]

    # Decide which class is the rarer one (ties treat class 1 as the rarer).
    if len(target_ones) > len(target_zeros):
        features_major, target_major = features_ones, target_ones
        features_minor, target_minor = features_zeros, target_zeros
    else:
        features_major, target_major = features_zeros, target_zeros
        features_minor, target_minor = features_ones, target_ones

    if len(target_minor) == 0:
        # One class is absent: there is nothing to duplicate, and the automatic
        # ratio would divide by zero, so only the present class is returned.
        features_upsampled, target_upsampled = features_major, target_major
    else:
        if repeat is None:
            repeat = round(len(target_major) / len(target_minor))
        # Majority rows once, followed by `repeat` copies of the minority rows.
        features_upsampled = pd.concat([features_major] + [features_minor] * repeat)
        target_upsampled = pd.concat([target_major] + [target_minor] * repeat)

    # Shuffle features and labels with the same permutation.
    features_upsampled, target_upsampled = shuffle(
        features_upsampled, target_upsampled, random_state=23)

    return features_upsampled, target_upsampled

In [65]:
# Wrap the scaled data back into DataFrames, carrying over the column names and
# row labels of the unscaled splits so they can be masked by the target Series.
features_train_sc = pd.DataFrame(
    features_train_sc,
    columns=features_train.columns,
    index=features_train.index,
)
features_valid_sc = pd.DataFrame(
    features_valid_sc,
    columns=features_valid.columns,
    index=features_valid.index,
)

In [66]:
# Balance the scaled training split only; the validation split is left untouched.
features_train_upsampled, target_train_upsampled = upsample(features_train_sc, target_train)

In [67]:
# Display the (unscaled) validation features.
features_valid

Unnamed: 0,RFCD.Code.1_mean,RFCD.Percentage.1_mean,RFCD.Code.2_mean,RFCD.Percentage.2_mean,RFCD.Code.3_mean,RFCD.Percentage.3_mean,RFCD.Code.4_mean,RFCD.Percentage.4_mean,RFCD.Code.5_mean,RFCD.Percentage.5_mean,...,Country.of.Birth.1_North America,Country.of.Birth.1_South Africa,Country.of.Birth.1_The Americas,Country.of.Birth.1_Western Europe,Home.Language.1_Other,With.PHD.1_Yes,No..of.Years.in.Uni.at.Time.of.Grant.1_>5 to 10,No..of.Years.in.Uni.at.Time.of.Grant.1_>=0 to 5,No..of.Years.in.Uni.at.Time.of.Grant.1_Less than 0,No..of.Years.in.Uni.at.Time.of.Grant.1_more than 15
1621,321007.0,60.0,321206.0,20.0,321204.0,20.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,1,0,0,0,1
2765,320603.0,50.0,320701.0,50.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
2715,320702.0,100.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,1,0,1,0,0
114,240402.0,60.0,240201.0,30.0,240301.0,10.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,1,0,1,0,0
1868,291804.0,50.0,291801.0,30.0,290699.0,20.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,1,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
974,321208.0,100.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,1,1,0,0,0
2918,300305.0,100.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,1,0,0,0,0
3379,400104.0,40.0,430199.0,30.0,420218.0,30.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
2428,320202.0,100.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,1,0,1,0,0


In [68]:
# Logistic regression with cross-validation (LogisticRegressionCV is already
# imported in the first cell, so it is not re-imported here).
# NOTE(review): the 10-fold CV runs on the upsampled frame, so copies of the
# same minority row can end up in both the fitting and the held-out part of a
# fold; the internal CV scores are therefore optimistic.
model = LogisticRegressionCV(solver='liblinear', random_state=12, class_weight='balanced', cv=10)
model.fit(features_train_upsampled, target_train_upsampled)

# Score the model on the untouched (not upsampled) validation split.
roc_auc = roc_auc_score(target_valid, model.predict_proba(features_valid_sc)[:, 1])
print(f'ROC-AUC на валидационном наборе: {roc_auc}')


# Model coefficients (weights), one per feature.
coefficients = model.coef_[0]
# Feature positions ordered by decreasing absolute coefficient.
indices = np.argsort(np.abs(coefficients))[::-1]

# Top-10 features; names come from the frame the model was fitted on, so the
# positions in `indices` are guaranteed to line up with the column names.
top_10_features = features_train_upsampled.columns[indices][:10]

# Print the top-10 features.
print('Топ-10 признаков по важности:')
for feature in top_10_features:
    print(feature)

ROC-AUC на валидационном наборе: 0.8178177903259473
Топ-10 признаков по важности:
Sponsor.Code_24D
Number.of.Unsuccessful.Grant.1_zero
Sponsor.Code_2B
Sponsor.Code_4D
Sponsor.Code_62B
Number.of.Successful.Grant.1_mean
Sponsor.Code_6B
Sponsor.Code_33A
Year.of.Birth.1_zero
Sponsor.Code_183C


In [69]:
# Random forest tuned with an exhaustive grid search.
# RandomForestClassifier and GridSearchCV are imported in the first cell
# together with the other dependencies.
param_grid = {
    'n_estimators': [50, 100],
    'max_depth': [10, 20],
    'min_samples_split': [5, 10],
    'min_samples_leaf': [2, 4]
}

# NOTE(review): the 5-fold CV runs on the upsampled frame, so copies of the
# same minority row can end up in both the fitting and the held-out part of a
# fold; the grid-search scores are therefore optimistic.
rf_model = RandomForestClassifier(random_state=12, class_weight='balanced')
grid_search = GridSearchCV(rf_model, param_grid, scoring='roc_auc', cv=5)
grid_search.fit(features_train_upsampled, target_train_upsampled)

# Best hyper-parameter combination found by the search.
best_params = grid_search.best_params_
print(f'Наилучшие параметры: {best_params}')

# Score the selected model on the untouched (not upsampled) validation split.
roc_auc = roc_auc_score(target_valid, grid_search.predict_proba(features_valid_sc)[:, 1])
print(f'ROC-AUC на валидационном наборе: {roc_auc}')

# Top-10 features by importance; names come from the frame the forest was
# fitted on, so the positions in `indices` line up with the column names.
feature_importances = grid_search.best_estimator_.feature_importances_
indices = np.argsort(feature_importances)[::-1]
top_10_features = features_train_upsampled.columns[indices][:10]

# Print the top-10 features.
print('Топ-10 признаков по важности:')
for feature in top_10_features:
    print(feature)

Наилучшие параметры: {'max_depth': 20, 'min_samples_leaf': 2, 'min_samples_split': 5, 'n_estimators': 100}
ROC-AUC на валидационном наборе: 0.8524558647434277
Топ-10 признаков по важности:
Number.of.Unsuccessful.Grant.1_mean
Number.of.Unsuccessful.Grant.1_zero
Number.of.Successful.Grant.1_zero
Number.of.Successful.Grant.1_mean
Dept.No..1_zero
Person.ID.1_zero
RFCD.Code.1_zero
RFCD.Code.1_mean
Person.ID.1_mean
SEO.Code.1_mean


Для поставленной задачи лучшей моделью оказался случайный лес (RandomForestClassifier): ROC-AUC на валидационном наборе 0.852 против 0.818 у логистической регрессии.
Признаки, вошедшие в топ-10 по важности у обеих моделей: Number.of.Unsuccessful.Grant.1_zero, Number.of.Successful.Grant.1_mean.