In [22]:
from itertools import combinations

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn import ensemble 
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
from sklearn.metrics import f1_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler


In [100]:
models = {
    #ensemble.AdaBoostClassifier: 1,
    #ensemble.AdaBoostRegressor: 2,
    ensemble.BaggingClassifier: 1,
    #ensemble.BaggingRegressor,
    #ensemble.ExtraTreesClassifier: (),
    #ensemble.ExtraTreesRegressor,
    ensemble.GradientBoostingClassifier: 2,
    #ensemble.RandomForestClassifier: (),
    #ensemble.RandomForestRegressor,
    #ensemble.RandomTreesEmbedding,
    #ensemble.HistGradientBoostingRegressor,
    #ensemble.HistGradientBoostingClassifier: (),
}

# Подготовка датасета для загрузки в сеть

In [101]:
base_df = pd.read_csv('data/clean_frame.csv', index_col='ind')
base_df.columns

Index(['ID', 'Код_группы', 'Год_Поступления', 'Год_Окончания_УЗ', 'Пособие',
       'Общежитие', 'Наличие_Матери', 'Наличие_Отца', 'Опекунство', 'Село',
       'Иностранец', 'КодФакультета', 'СрБаллАттестата', 'Статус', 'male',
       'female', 'sex_summ', 'birth_year_int', 'basis', 'language', 'country',
       'region', 'city', 'parents_country'],
      dtype='object')

### Проверка и удаление отрицательных значений

In [102]:
base_df.isna().any().all()

False

In [103]:
base_df = base_df.fillna(0)

In [104]:
columns = ['Код_группы','Пособие','Наличие_Матери','КодФакультета','basis','country','Статус']
#columns = base_df.columns.to_list()
priznaki = len(columns)
priznaki

5

In [105]:
frame_to_train = base_df.loc[:, columns]
frame_to_train.head(2)

Unnamed: 0_level_0,Код_группы,basis,parents_country,Опекунство,Статус
ind,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0,16019,2,10,0.0,3
1,14895,4,10,0.0,4


## Создание тренировочного и тестового датафремов

In [106]:
train, test = train_test_split(frame_to_train, test_size=0.20)

In [107]:
train_input = train.drop('Статус', axis=1)
train_output = train['Статус']


In [108]:
input_arr = train_input.to_numpy()
input_df_size = len(input_arr)

In [109]:
output = np.ones((1,input_df_size))
output[0] = train_output.to_numpy()
output_arr = output.reshape(input_df_size,1)
output_arr

array([[3.],
       [4.],
       [4.],
       ...,
       [3.],
       [4.],
       [3.]])

In [110]:
test_input = test.drop('Статус', axis=1)
test_output = test['Статус']

In [111]:
test_arr = test_input.to_numpy()
test_arr_size = len(test_arr)

In [112]:
test = np.ones((1,test_arr_size))
test[0] = test_output.to_numpy()
test_out = test.reshape(test_arr_size,1)
test_out

array([[ 4.],
       [ 4.],
       [ 4.],
       ...,
       [-1.],
       [ 3.],
       [ 4.]])

# SKlearn модель обучения

In [155]:
#clf = RandomForestClassifier(random_state=0)
clf = ensemble.GradientBoostingClassifier(n_estimators=100, learning_rate=0.5, max_depth=5, random_state=0)

In [75]:
train_input.head(2)

Unnamed: 0_level_0,Код_группы,basis,parents_country,Опекунство
ind,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
8485,15890,0,10,0.0
3497,20095,2,10,0.0


In [156]:
train_input.shape

(10867, 4)

In [157]:
clf.fit(train_input, train_output)

## Цикл классификаторов

In [116]:
accure_list = []
for model, config in models.items():
    print(model)
    if config == 1:
        clf_cycle = model(random_state=0)
    if config == 2:
        clf_cycle = model(n_estimators=200, learning_rate=0.1, max_depth=10, random_state=0)
    clf_cycle.fit(train_input, train_output)
    pred_cycle =  clf_cycle.predict(test_input)
    accure_cycle = f1_score(test_output, pred_cycle, average='macro', zero_division = 0)
    accure_list.append(accure_cycle)
accure_list

<class 'sklearn.ensemble._bagging.BaggingClassifier'>
<class 'sklearn.ensemble._gb.GradientBoostingClassifier'>


[0.7645617017163788, 0.7752209962491196]

## Оценка качества модели

In [158]:
pred = clf.predict(test_input)

In [120]:
test_output.head(3)

ind
8512    4
8624    4
5139    4
Name: Статус, dtype: int64

In [159]:
accure = f1_score(test_output, pred, average='macro', zero_division = 0)
accure

0.7665431335126375

## Формирование файла для загрузки на сайт

In [122]:
df_test_base = pd.read_csv("data/test_frame.csv")
df_submission = pd.read_csv("data/sample_submission.csv")

In [123]:
df_test_base.columns

Index(['ind', 'ID', 'Код_группы', 'Год_Поступления', 'Год_Окончания_УЗ',
       'Пособие', 'Общежитие', 'Наличие_Матери', 'Наличие_Отца', 'Опекунство',
       'Село', 'Иностранец', 'КодФакультета', 'СрБаллАттестата', 'male',
       'female', 'sex_summ', 'birth_year_int', 'Статус', 'basis', 'language',
       'country', 'parents_country'],
      dtype='object')

In [124]:
columns = ['Код_группы','Пособие','Наличие_Матери','КодФакультета','basis','country','Статус']

['Код_группы', 'basis', 'parents_country', 'Опекунство', 'Статус']

In [125]:
df_test = df_test_base.loc[:, columns]
#df_test = df_test_base.drop('ind', axis=1)
#df_test = df_test.drop('ID', axis=1)
df_test = df_test.drop('Статус', axis=1)
df_test

Unnamed: 0,Код_группы,basis,parents_country,Опекунство
0,20608,1,10,0.0
1,20613,1,10,0.0
2,21210,2,10,0.0
3,22254,2,10,0.0
4,15040,2,10,0.0
...,...,...,...,...
6630,20680,0,7,0.0
6631,16921,2,10,0.0
6632,19400,3,10,0.0
6633,18152,2,10,0.0


In [126]:
df_test = df_test.fillna(0)
df_test.shape

(6635, 4)

In [127]:
df_test.columns.to_list()

['Код_группы', 'basis', 'parents_country', 'Опекунство']

In [128]:
df_test_pred = clf.predict(df_test)

In [129]:
df_test_base["Статус"] = df_test_pred
df_test_base.head(4)

Unnamed: 0,ind,ID,Код_группы,Год_Поступления,Год_Окончания_УЗ,Пособие,Общежитие,Наличие_Матери,Наличие_Отца,Опекунство,...,СрБаллАттестата,male,female,sex_summ,birth_year_int,Статус,basis,language,country,parents_country
0,0,64996,20608,2014,2014.0,0.0,0.0,0,0.0,0.0,...,49.0,0,1,1,1995,4,1,1,8,10
1,1,71837,20613,2015,2014.0,0.0,0.0,0,0.0,0.0,...,77.0,0,1,1,1995,4,1,2,8,10
2,2,86587,21210,2018,1997.0,0.0,0.0,1,1.0,0.0,...,74.0,0,1,1,1973,3,2,1,8,10
3,3,73673,22254,2015,2006.0,0.0,0.0,0,0.0,0.0,...,57.0,0,1,1,1983,3,2,1,8,10


In [130]:
df_submission

Unnamed: 0,ID,Статус
0,64996,
1,71837,
2,86587,
3,73673,
4,54709,
...,...,...
6686,74342,
6687,54876,
6688,66879,
6689,64982,


In [131]:
make_csv = df_test_base.loc[:, ['ID', 'Статус']]
#make_csv['reg'] = np.arange(0, make_csv.shape[0])
make_csv

Unnamed: 0,ID,Статус
0,64996,4
1,71837,4
2,86587,3
3,73673,3
4,54709,4
...,...,...
6630,74342,4
6631,54876,4
6632,66879,3
6633,64982,4


In [132]:
id_hum = 64996
make_csv.loc[make_csv['ID'] == id_hum]

Unnamed: 0,ID,Статус
0,64996,4


In [133]:
for i in range(df_submission.shape[0]):
    id_hum = df_submission.loc[i, ['ID']]
    #print(id_hum)
    status = make_csv.loc[make_csv['ID'] == id_hum.values[0], 'Статус']
    try:
        df_submission.loc[i, ['Статус']] = status.values[0]
    except:
        df_submission.loc[i, ['Статус']] = 4
    print(df_submission.loc[i, ['Статус']])

Статус    4.0
Name: 0, dtype: float64
Статус    4.0
Name: 1, dtype: float64
Статус    3.0
Name: 2, dtype: float64
Статус    3.0
Name: 3, dtype: float64
Статус    4.0
Name: 4, dtype: float64
Статус    3.0
Name: 5, dtype: float64
Статус    3.0
Name: 6, dtype: float64
Статус   -1.0
Name: 7, dtype: float64
Статус    4.0
Name: 8, dtype: float64
Статус    4.0
Name: 9, dtype: float64
Статус    3.0
Name: 10, dtype: float64
Статус    4.0
Name: 11, dtype: float64
Статус    4.0
Name: 12, dtype: float64
Статус    3.0
Name: 13, dtype: float64
Статус    3.0
Name: 14, dtype: float64
Статус    3.0
Name: 15, dtype: float64
Статус    3.0
Name: 16, dtype: float64
Статус    4.0
Name: 17, dtype: float64
Статус    4.0
Name: 18, dtype: float64
Статус    4.0
Name: 19, dtype: float64
Статус    4.0
Name: 20, dtype: float64
Статус    3.0
Name: 21, dtype: float64
Статус    4.0
Name: 22, dtype: float64
Статус    4.0
Name: 23, dtype: float64
Статус    3.0
Name: 24, dtype: float64
Статус    3.0
Name: 25, dtype: floa

In [134]:
df_submission.to_csv(f'data/result/{accure:.3f}_sample_submission_sklearn_{"".join(columns)}.csv', index=False)

In [821]:
np.arange(0, 10)

array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9])

In [822]:
make_csv.isna().any().all()

False

In [823]:
df_submission.groupby('Статус').count()

Unnamed: 0_level_0,ID
Статус,Unnamed: 1_level_1
-1.0,274
3.0,2131
4.0,4286
