In [73]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import f1_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler


# Подготовка датасета для загрузки в сеть

In [74]:
base_df = pd.read_csv('data/clean_frame.csv', index_col='ind')
base_df.columns

Index([&#39;ID&#39;, &#39;Код_группы&#39;, &#39;Год_Поступления&#39;, &#39;Год_Окончания_УЗ&#39;, &#39;Пособие&#39;,
       &#39;Общежитие&#39;, &#39;Наличие_Матери&#39;, &#39;Наличие_Отца&#39;, &#39;Опекунство&#39;, &#39;Село&#39;,
       &#39;Иностранец&#39;, &#39;КодФакультета&#39;, &#39;СрБаллАттестата&#39;, &#39;Статус&#39;, &#39;male&#39;,
       &#39;female&#39;, &#39;sex_summ&#39;, &#39;birth_year_int&#39;, &#39;basis&#39;, &#39;language&#39;, &#39;country&#39;,
       &#39;region&#39;, &#39;city&#39;, &#39;parents_country&#39;],
      dtype=&#39;object&#39;)

### Проверка и удаление отрицательных значений

In [75]:
base_df.isna().any().all()

False

In [76]:
base_df = base_df.fillna(0)

In [77]:
columns = ['Код_группы', 'Год_Поступления', 'Год_Окончания_УЗ', 'Пособие',
       'Общежитие', 'Наличие_Матери', 'Наличие_Отца', 'Опекунство', 'Село',
       'Иностранец', 'КодФакультета', 'СрБаллАттестата', 'Статус', 'male',
        'birth_year_int', 'basis', 'language', 'country', 'parents_country']
#columns = base_df.columns.to_list()
priznaki = len(columns)
priznaki

19

In [78]:
frame_to_train = base_df.loc[:, columns]
frame_to_train.head(2)

Unnamed: 0_level_0,Код_группы,Год_Поступления,Год_Окончания_УЗ,Пособие,Общежитие,Наличие_Матери,Наличие_Отца,Опекунство,Село,Иностранец,КодФакультета,СрБаллАттестата,Статус,male,birth_year_int,basis,language,country,parents_country
ind,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1
0,16019,2015,2014.0,0.0,0.0,0,0.0,0.0,0.0,0.0,41.0,4.294,3,0,1998,2,1,8,10
1,14895,2010,0.0,0.0,0.0,1,1.0,0.0,0.0,0.0,28.0,67.0,4,1,1992,4,0,8,10


## Создание тренировочного и тестового датафремов

In [79]:
train, test = train_test_split(frame_to_train, test_size=0.20)

In [80]:
train_input = train.drop('Статус', axis=1)
train_output = train['Статус']


In [81]:
input_arr = train_input.to_numpy()
input_df_size = len(input_arr)

In [82]:
output = np.ones((1,input_df_size))
output[0] = train_output.to_numpy()
output_arr = output.reshape(input_df_size,1)
output_arr

array([[4.],
       [4.],
       [4.],
       ...,
       [4.],
       [4.],
       [3.]])

In [83]:
test_input = test.drop('Статус', axis=1)
test_output = test['Статус']

In [84]:
test_arr = test_input.to_numpy()
test_arr_size = len(test_arr)

In [85]:
test = np.ones((1,test_arr_size))
test[0] = test_output.to_numpy()
test_out = test.reshape(test_arr_size,1)
test_out

array([[4.],
       [4.],
       [4.],
       ...,
       [3.],
       [3.],
       [4.]])

# SKlearn модель обучения

In [86]:
clf = RandomForestClassifier(random_state=0)

In [87]:
train_input.head(2)

Unnamed: 0_level_0,Код_группы,Год_Поступления,Год_Окончания_УЗ,Пособие,Общежитие,Наличие_Матери,Наличие_Отца,Опекунство,Село,Иностранец,КодФакультета,СрБаллАттестата,male,birth_year_int,basis,language,country,parents_country
ind,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
3357,17674,2013,0.0,0.0,0.0,1,1.0,0.0,0.0,0.0,51.0,49.0,0,1995,4,1,8,10
9760,18191,2014,2014.0,0.0,1.0,0,0.0,0.0,1.0,0.0,40.0,73.0,0,1996,2,2,8,10


In [88]:
train_input.shape

(10867, 18)

In [89]:
clf.fit(train_input, train_output)

## Оценка качества модели

In [90]:
pred = clf.predict(test_input)

In [91]:
test_output.head(3)

ind
3703     4
4616     4
13254    4
Name: Статус, dtype: int64

In [92]:
f1_score(test_output, pred, average='macro', zero_division = 0)

0.7522114993726753

## Формирование файла для загрузки на сайт

In [93]:
df_test_base = pd.read_csv("data/test_frame.csv")
df_submission = pd.read_csv("data/sample_submission.csv")

In [94]:
df_test_base.columns

Index([&#39;ind&#39;, &#39;ID&#39;, &#39;Код_группы&#39;, &#39;Год_Поступления&#39;, &#39;Год_Окончания_УЗ&#39;,
       &#39;Пособие&#39;, &#39;Общежитие&#39;, &#39;Наличие_Матери&#39;, &#39;Наличие_Отца&#39;, &#39;Опекунство&#39;,
       &#39;Село&#39;, &#39;Иностранец&#39;, &#39;КодФакультета&#39;, &#39;СрБаллАттестата&#39;, &#39;male&#39;,
       &#39;female&#39;, &#39;sex_summ&#39;, &#39;birth_year_int&#39;, &#39;Статус&#39;, &#39;basis&#39;, &#39;language&#39;,
       &#39;country&#39;, &#39;parents_country&#39;],
      dtype=&#39;object&#39;)

In [95]:
columns

[&#39;Код_группы&#39;,
 &#39;Год_Поступления&#39;,
 &#39;Год_Окончания_УЗ&#39;,
 &#39;Пособие&#39;,
 &#39;Общежитие&#39;,
 &#39;Наличие_Матери&#39;,
 &#39;Наличие_Отца&#39;,
 &#39;Опекунство&#39;,
 &#39;Село&#39;,
 &#39;Иностранец&#39;,
 &#39;КодФакультета&#39;,
 &#39;СрБаллАттестата&#39;,
 &#39;Статус&#39;,
 &#39;male&#39;,
 &#39;birth_year_int&#39;,
 &#39;basis&#39;,
 &#39;language&#39;,
 &#39;country&#39;,
 &#39;parents_country&#39;]

In [96]:
df_test = df_test_base.loc[:, columns]
#df_test = df_test_base.drop('ind', axis=1)
#df_test = df_test.drop('ID', axis=1)
df_test = df_test.drop('Статус', axis=1)
df_test

Unnamed: 0,Код_группы,Год_Поступления,Год_Окончания_УЗ,Пособие,Общежитие,Наличие_Матери,Наличие_Отца,Опекунство,Село,Иностранец,КодФакультета,СрБаллАттестата,male,birth_year_int,basis,language,country,parents_country
0,20608,2014,2014.0,0.0,0.0,0,0.0,0.0,0.0,0.0,40.0,49.0,0,1995,1,1,8,10
1,20613,2015,2014.0,0.0,0.0,0,0.0,0.0,1.0,0.0,26.0,77.0,0,1995,1,2,8,10
2,21210,2018,1997.0,0.0,0.0,1,1.0,0.0,0.0,0.0,34.0,74.0,0,1973,2,1,8,10
3,22254,2015,2006.0,0.0,0.0,0,0.0,0.0,0.0,0.0,53.0,57.0,0,1983,2,1,8,10
4,15040,2012,,0.0,0.0,1,1.0,0.0,0.0,0.0,36.0,55.0,0,1994,2,0,8,10
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6630,20680,2016,2011.0,0.0,1.0,1,1.0,0.0,0.0,1.0,34.0,77.0,1,1992,0,1,6,7
6631,16921,2013,,0.0,0.0,1,1.0,0.0,0.0,0.0,36.0,62.0,0,1995,2,0,8,10
6632,19400,2014,2014.0,0.0,0.0,0,0.0,0.0,0.0,0.0,26.0,52.0,0,1996,3,1,8,10
6633,18152,2014,2014.0,0.0,1.0,0,0.0,0.0,1.0,0.0,27.0,69.0,1,1996,2,2,8,10


In [97]:
df_test = df_test.fillna(0)
df_test.shape

(6635, 18)

In [98]:
df_test.columns.to_list()

[&#39;Код_группы&#39;,
 &#39;Год_Поступления&#39;,
 &#39;Год_Окончания_УЗ&#39;,
 &#39;Пособие&#39;,
 &#39;Общежитие&#39;,
 &#39;Наличие_Матери&#39;,
 &#39;Наличие_Отца&#39;,
 &#39;Опекунство&#39;,
 &#39;Село&#39;,
 &#39;Иностранец&#39;,
 &#39;КодФакультета&#39;,
 &#39;СрБаллАттестата&#39;,
 &#39;male&#39;,
 &#39;birth_year_int&#39;,
 &#39;basis&#39;,
 &#39;language&#39;,
 &#39;country&#39;,
 &#39;parents_country&#39;]

In [99]:
df_test_pred = clf.predict(df_test)

In [100]:
df_test_base["Статус"] = df_test_pred
df_test_base.head(4)

Unnamed: 0,ind,ID,Код_группы,Год_Поступления,Год_Окончания_УЗ,Пособие,Общежитие,Наличие_Матери,Наличие_Отца,Опекунство,...,СрБаллАттестата,male,female,sex_summ,birth_year_int,Статус,basis,language,country,parents_country
0,0,64996,20608,2014,2014.0,0.0,0.0,0,0.0,0.0,...,49.0,0,1,1,1995,4,1,1,8,10
1,1,71837,20613,2015,2014.0,0.0,0.0,0,0.0,0.0,...,77.0,0,1,1,1995,4,1,2,8,10
2,2,86587,21210,2018,1997.0,0.0,0.0,1,1.0,0.0,...,74.0,0,1,1,1973,3,2,1,8,10
3,3,73673,22254,2015,2006.0,0.0,0.0,0,0.0,0.0,...,57.0,0,1,1,1983,3,2,1,8,10


In [101]:
df_submission

Unnamed: 0,ID,Статус
0,64996,
1,71837,
2,86587,
3,73673,
4,54709,
...,...,...
6686,74342,
6687,54876,
6688,66879,
6689,64982,


In [102]:
make_csv = df_test_base.loc[:, ['ID', 'Статус']]
#make_csv['reg'] = np.arange(0, make_csv.shape[0])
make_csv

Unnamed: 0,ID,Статус
0,64996,4
1,71837,4
2,86587,3
3,73673,3
4,54709,4
...,...,...
6630,74342,4
6631,54876,4
6632,66879,4
6633,64982,4


In [103]:
id_hum = 64996
make_csv.loc[make_csv['ID'] == id_hum]

Unnamed: 0,ID,Статус
0,64996,4


In [104]:
for i in range(df_submission.shape[0]):
    id_hum = df_submission.loc[i, ['ID']]
    #print(id_hum)
    status = make_csv.loc[make_csv['ID'] == id_hum.values[0], 'Статус']
    try:
        df_submission.loc[i, ['Статус']] = status.values[0]
    except:
        df_submission.loc[i, ['Статус']] = 4
    print(df_submission.loc[i, ['Статус']])

  4.0
Name: 6203, dtype: float64
Статус    4.0
Name: 6204, dtype: float64
Статус    4.0
Name: 6205, dtype: float64
Статус    3.0
Name: 6206, dtype: float64
Статус    4.0
Name: 6207, dtype: float64
Статус    4.0
Name: 6208, dtype: float64
Статус    4.0
Name: 6209, dtype: float64
Статус    3.0
Name: 6210, dtype: float64
Статус    4.0
Name: 6211, dtype: float64
Статус    4.0
Name: 6212, dtype: float64
Статус    4.0
Name: 6213, dtype: float64
Статус   -1.0
Name: 6214, dtype: float64
Статус    4.0
Name: 6215, dtype: float64
Статус    4.0
Name: 6216, dtype: float64
Статус    4.0
Name: 6217, dtype: float64
Статус    4.0
Name: 6218, dtype: float64
Статус    3.0
Name: 6219, dtype: float64
Статус    4.0
Name: 6220, dtype: float64
Статус    3.0
Name: 6221, dtype: float64
Статус    4.0
Name: 6222, dtype: float64
Статус    3.0
Name: 6223, dtype: float64
Статус    3.0
Name: 6224, dtype: float64
Статус    3.0
Name: 6225, dtype: float64
Статус    4.0
Name: 6226, dtype: float64
Статус    3.0
Name: 6227

In [109]:
df_submission.to_csv('data/result/sample_submission_sklearn_3.csv', index=False)

In [106]:
np.arange(0, 10)

array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9])

In [107]:
make_csv.isna().any().all()

False

In [112]:
df_submission.groupby('Статус').count()

Unnamed: 0_level_0,ID
Статус,Unnamed: 1_level_1
-1.0,183
3.0,2124
4.0,4384


In [122]:
all_columns_list = base_df.columns.to_list()
all_columns_list.remove('ID')
all_columns_list.remove('Статус')
all_columns_list

[&#39;Код_группы&#39;,
 &#39;Год_Поступления&#39;,
 &#39;Год_Окончания_УЗ&#39;,
 &#39;Пособие&#39;,
 &#39;Общежитие&#39;,
 &#39;Наличие_Матери&#39;,
 &#39;Наличие_Отца&#39;,
 &#39;Опекунство&#39;,
 &#39;Село&#39;,
 &#39;Иностранец&#39;,
 &#39;КодФакультета&#39;,
 &#39;СрБаллАттестата&#39;,
 &#39;male&#39;,
 &#39;female&#39;,
 &#39;sex_summ&#39;,
 &#39;birth_year_int&#39;,
 &#39;basis&#39;,
 &#39;language&#39;,
 &#39;country&#39;,
 &#39;region&#39;,
 &#39;city&#39;,
 &#39;parents_country&#39;]

In [123]:
elements_num = len(all_columns_list)
elements_num

22

In [133]:
all_combinations = []
list_sizes = 0
for i in range(2, elements_num+1):
    curent_list = list(combinations(all_columns_list, i))
    list_sizes += len(curent_list)
    all_combinations = all_combinations + curent_list
len(all_combinations) - list_sizes

0

In [134]:
for columns in all_combinations:
    print(columns)


39;, &#39;Пособие&#39;, &#39;Общежитие&#39;, &#39;Наличие_Матери&#39;, &#39;Наличие_Отца&#39;, &#39;Опекунство&#39;, &#39;Село&#39;, &#39;Иностранец&#39;, &#39;КодФакультета&#39;, &#39;СрБаллАттестата&#39;, &#39;male&#39;, &#39;female&#39;, &#39;sex_summ&#39;, &#39;birth_year_int&#39;, &#39;basis&#39;, &#39;language&#39;, &#39;country&#39;, &#39;region&#39;, &#39;city&#39;, &#39;parents_country&#39;)
(&#39;Год_Поступления&#39;, &#39;Год_Окончания_УЗ&#39;, &#39;Пособие&#39;, &#39;Общежитие&#39;, &#39;Наличие_Матери&#39;, &#39;Наличие_Отца&#39;, &#39;Опекунство&#39;, &#39;Село&#39;, &#39;Иностранец&#39;, &#39;КодФакультета&#39;, &#39;СрБаллАттестата&#39;, &#39;male&#39;, &#39;female&#39;, &#39;sex_summ&#39;, &#39;birth_year_int&#39;, &#39;basis&#39;, &#39;language&#39;, &#39;country&#39;, &#39;region&#39;, &#39;city&#39;)
(&#39;Год_Поступления&#39;, &#39;Год_Окончания_УЗ&#39;, &#39;Пособие&#39;, &#39;Общежитие&#39;, &#39;Наличие_Матери&#39;, &#39;Наличие_Отца&#39;, &#39;Опекунство&#39;, 