In [113]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline 
sns.set(style="ticks")

from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import MinMaxScaler

from sklearn.model_selection import train_test_split

from sklearn.neighbors import KNeighborsRegressor

from typing import Dict, Tuple

from sklearn.metrics import mean_absolute_error, mean_squared_error, median_absolute_error, r2_score

Будем использовать [датасет из первой лабораторной работы](https://www.kaggle.com/spscientist/students-performance-in-exams). Выполняя первую лабораторную работу, мы выяснили, что признаки **math score**, **reading score** и **writing score** коррелируют между собой, поэтому вместо них будем использовать агрегированное значение — среднее арифметическое этих оценок, **Average_score**.

# 0. Подготовка

In [114]:
# Load the students' exam results; fields in the CSV are comma-separated.
data = pd.read_csv(
    'data/StudentsPerformance.csv',
    sep=',',
)

In [115]:
# Preview the first rows of the raw table to check the column layout.
data.head()

Unnamed: 0,gender,race/ethnicity,parental level of education,lunch,test preparation course,math score,reading score,writing score
0,female,group B,bachelor's degree,standard,none,72,72,74
1,female,group C,some college,standard,completed,69,90,88
2,female,group B,master's degree,standard,none,90,95,93
3,male,group A,associate's degree,free/reduced,none,47,57,44
4,male,group C,some college,standard,none,76,78,75


In [116]:
# Boolean mask of rows that have an exact copy elsewhere in the table;
# keep=False flags every member of a duplicate group, not just the repeats.
duplicate_mask = data.duplicated(keep=False)
data[duplicate_mask]

Unnamed: 0,gender,race/ethnicity,parental level of education,lunch,test preparation course,math score,reading score,writing score


In [117]:
# Collapse the three exam scores into their row-wise mean and drop the
# originals, leaving Average_score as the single score column.
data = (
    data
    .assign(Average_score=data[['math score', 'reading score', 'writing score']].mean(axis=1))
    .drop(columns=['math score', 'reading score', 'writing score'])
)

In [118]:
# Confirm that Average_score has replaced the three individual score columns.
data.head()

Unnamed: 0,gender,race/ethnicity,parental level of education,lunch,test preparation course,Average_score
0,female,group B,bachelor's degree,standard,none,72.666667
1,female,group C,some college,standard,completed,82.333333
2,female,group B,master's degree,standard,none,92.666667
3,male,group A,associate's degree,free/reduced,none,49.333333
4,male,group C,some college,standard,none,76.333333


Выполняем шаги подготовки датасета к подаче на вход алгоритма, изученные нами во второй лабораторной работе.

In [119]:
# Inspect column dtypes and non-null counts before encoding.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 6 columns):
 #   Column                       Non-Null Count  Dtype  
---  ------                       --------------  -----  
 0   gender                       1000 non-null   object 
 1   race/ethnicity               1000 non-null   object 
 2   parental level of education  1000 non-null   object 
 3   lunch                        1000 non-null   object 
 4   test preparation course      1000 non-null   object 
 5   Average_score                1000 non-null   float64
dtypes: float64(1), object(5)
memory usage: 47.0+ KB


In [120]:
# Count missing values in each column.
data.isna().sum()

gender                         0
race/ethnicity                 0
parental level of education    0
lunch                          0
test preparation course        0
Average_score                  0
dtype: int64

Наш целевой признак - Average_score.

In [121]:
# Shared encoder instance; it is re-fitted for each column it is applied to.
le = LabelEncoder()

# Replace the lunch strings with integer codes (in the preview further down,
# 'free/reduced' appears as 0 and 'standard' as 1).
lunch_codes = le.fit_transform(data['lunch'])
data['lunch'] = lunch_codes

Для остальных нечисловых признаков выполним преобразования в числовые.

### Gender

In [122]:
# List the distinct values of the gender column.
data['gender'].unique()

array(['female', 'male'], dtype=object)

In [123]:
# Re-fit the shared encoder on gender and store the integer codes
# (in the preview further down, 'female' appears as 0 and 'male' as 1).
gender_codes = le.fit_transform(data['gender'])
data['gender'] = gender_codes

### Race/Ethnicity

In [124]:
# List the distinct values of the race/ethnicity column.
data['race/ethnicity'].unique()

array(['group B', 'group C', 'group A', 'group D', 'group E'],
      dtype=object)

In [125]:
# One-hot encode race/ethnicity: the column is replaced by one indicator
# column per group, named 'race/ethnicity_<group>'.
data = pd.get_dummies(
    data=data,
    columns=['race/ethnicity'],
)

### Parental level of education

In [126]:
# List the distinct values of the parental level of education column.
data['parental level of education'].unique()

array(["bachelor's degree", 'some college', "master's degree",
       "associate's degree", 'high school', 'some high school'],
      dtype=object)

In [127]:
# One-hot encode the parental education level: the column is replaced by one
# indicator column per level, named 'parental level of education_<level>'.
data = pd.get_dummies(
    data=data,
    columns=['parental level of education'],
)

### Test preparation course

In [128]:
# List the distinct values of the test preparation course column.
data['test preparation course'].unique()

array(['none', 'completed'], dtype=object)

In [129]:
# Re-fit the shared encoder on the preparation-course flag and store the codes
# (in the preview further down, 'completed' appears as 0 and 'none' as 1).
prep_codes = le.fit_transform(data['test preparation course'])
data['test preparation course'] = prep_codes

### Average score

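Признак Average_score можно отмасштабировать (например, с помощью MinMaxScaler), однако в данной работе масштабирование не применяется: соответствующий код ниже закомментирован, и целевой признак остаётся в исходной шкале.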

In [130]:
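# Target scaling is currently disabled: the two lines below are commented out,
# so Average_score keeps its original scale.
#sc = MinMaxScaler()
#data['Average_score'] = sc.fit_transform(data[['Average_score']])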

In [131]:
# Preview the table after encoding the categorical columns.
data.head()

Unnamed: 0,gender,lunch,test preparation course,Average_score,race/ethnicity_group A,race/ethnicity_group B,race/ethnicity_group C,race/ethnicity_group D,race/ethnicity_group E,parental level of education_associate's degree,parental level of education_bachelor's degree,parental level of education_high school,parental level of education_master's degree,parental level of education_some college,parental level of education_some high school
0,0,1,1,72.666667,0,1,0,0,0,0,1,0,0,0,0
1,0,1,0,82.333333,0,0,1,0,0,0,0,0,0,1,0
2,0,1,1,92.666667,0,1,0,0,0,0,0,0,1,0,0
3,1,0,1,49.333333,1,0,0,0,0,1,0,0,0,0,0
4,1,1,1,76.333333,0,0,1,0,0,0,0,0,0,1,0


In [132]:
# Check the dtypes and non-null counts of the columns after encoding.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 15 columns):
 #   Column                                          Non-Null Count  Dtype  
---  ------                                          --------------  -----  
 0   gender                                          1000 non-null   int32  
 1   lunch                                           1000 non-null   int32  
 2   test preparation course                         1000 non-null   int32  
 3   Average_score                                   1000 non-null   float64
 4   race/ethnicity_group A                          1000 non-null   uint8  
 5   race/ethnicity_group B                          1000 non-null   uint8  
 6   race/ethnicity_group C                          1000 non-null   uint8  
 7   race/ethnicity_group D                          1000 non-null   uint8  
 8   race/ethnicity_group E                          1000 non-null   uint8  
 9   parental level of education_associate's de

## Разделение выборки на обучающую и тестовую

In [133]:
# Target vector: the averaged exam score.
data_Y = data['Average_score']
# Feature matrix: every remaining column.
data_X = data.drop(columns=['Average_score'])

In [134]:
# Hold out 30% of the rows for testing; the fixed seed keeps the split
# reproducible between runs.
(
    data_X_train,
    data_X_test,
    data_Y_train,
    data_Y_test,
) = train_test_split(
    data_X,
    data_Y,
    test_size=0.3,
    random_state=0,
)

# 1. Обучение модели для произвольного K

Обучение для количества ближайших соседей = 35

In [135]:
# Fit a k-nearest-neighbours regressor with K = 35 on the training split
# (fit() returns the estimator itself, so construction and fitting are chained).
knc_1 = KNeighborsRegressor(n_neighbors=35).fit(data_X_train, data_Y_train)

# Predicted Average_score for the held-out rows.
target_1 = knc_1.predict(data_X_test)

# 2. Метрики

## 2.1. Метрика Mean absolute error

In [136]:
# Mean absolute error between the true and predicted test targets.
MAE_any = mean_absolute_error(
    y_true=data_Y_test,
    y_pred=target_1,
)
print(f'Средняя абсолютная ошибка: {MAE_any}')

Средняя абсолютная ошибка: 9.833174603174603


## 2.2. Метрика Mean squared error

In [137]:
# Mean squared error between the true and predicted test targets.
MSE_any = mean_squared_error(
    y_true=data_Y_test,
    y_pred=target_1,
)
print(f'Средняя квадратичная ошибка: {MSE_any}')

Средняя квадратичная ошибка: 150.74777354497354


## 2.3. Метрика Root mean squared error

In [138]:
# Root mean squared error, computed as the square root of the MSE.
# The former `squared=False` argument of mean_squared_error is deprecated since
# scikit-learn 1.4 and removed in 1.6, so it is avoided here; for a
# single-output target the square root of the MSE is the same quantity.
RMSE_any = np.sqrt(mean_squared_error(data_Y_test, target_1))
print(f'Корень из средней квадратичной ошибки: {RMSE_any}')

Корень из средней квадратичной ошибки: 12.277938489216076


## 2.4. Метрика Median Absolute Error

In [139]:
# Median of the absolute errors between the true and predicted test targets.
median_any = median_absolute_error(
    y_true=data_Y_test,
    y_pred=target_1,
)
print(f'Медиана от абсолютной ошибки: {median_any}')

Медиана от абсолютной ошибки: 8.452380952380953


## 2.5. Метрика R2

In [140]:
# Coefficient of determination (R^2) of the predictions on the test split.
R2_any = r2_score(
    y_true=data_Y_test,
    y_pred=target_1,
)
print(f'Коэффициент детерминации: {R2_any}')

Коэффициент детерминации: 0.173587940218905
