In [167]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline 
sns.set(style="ticks")

from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import MinMaxScaler

from sklearn.model_selection import train_test_split

from sklearn.neighbors import KNeighborsClassifier

from typing import Dict, Tuple

from sklearn.metrics import accuracy_score, balanced_accuracy_score
from sklearn.metrics import confusion_matrix

Будем использовать [датасет из первой лабораторной работы](https://www.kaggle.com/spscientist/students-performance-in-exams). Выполняя первую лабораторную работу, мы выяснили, что между признаками **math score**, **reading score** и **writing score** существует корреляция, поэтому вместо них мы будем использовать агрегированное значение - среднее арифметическое этих оценок, **Average_score**.

# 0. Подготовка

In [168]:
# Load the students-performance table from the local CSV (comma-separated).
data = pd.read_csv('data/StudentsPerformance.csv', sep=",")

In [169]:
# Preview the first rows of the raw data.
data.head()

Unnamed: 0,gender,race/ethnicity,parental level of education,lunch,test preparation course,math score,reading score,writing score
0,female,group B,bachelor's degree,standard,none,72,72,74
1,female,group C,some college,standard,completed,69,90,88
2,female,group B,master's degree,standard,none,90,95,93
3,male,group A,associate's degree,free/reduced,none,47,57,44
4,male,group C,some college,standard,none,76,78,75


In [170]:
# List every row that has an exact duplicate; keep=False flags all copies,
# not only the second and later occurrences.
data[data.duplicated(keep=False)]

Unnamed: 0,gender,race/ethnicity,parental level of education,lunch,test preparation course,math score,reading score,writing score


In [171]:
# Collapse the three exam scores into their row-wise mean and then remove
# the originals, so only the aggregate remains as a numeric feature.
score_columns = ['math score', 'reading score', 'writing score']
data['Average_score'] = data[score_columns].mean(axis=1)
data = data.drop(columns=score_columns)

In [172]:
# Check the frame after the score columns were replaced by Average_score.
data.head()

Unnamed: 0,gender,race/ethnicity,parental level of education,lunch,test preparation course,Average_score
0,female,group B,bachelor's degree,standard,none,72.666667
1,female,group C,some college,standard,completed,82.333333
2,female,group B,master's degree,standard,none,92.666667
3,male,group A,associate's degree,free/reduced,none,49.333333
4,male,group C,some college,standard,none,76.333333


Выполняем шаги для обеспечения готовности датасета к подаче его на вход алгоритма, которые были изучены нами во второй лабораторной работе

In [173]:
# Column dtypes and non-null counts before encoding.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 6 columns):
 #   Column                       Non-Null Count  Dtype  
---  ------                       --------------  -----  
 0   gender                       1000 non-null   object 
 1   race/ethnicity               1000 non-null   object 
 2   parental level of education  1000 non-null   object 
 3   lunch                        1000 non-null   object 
 4   test preparation course      1000 non-null   object 
 5   Average_score                1000 non-null   float64
dtypes: float64(1), object(5)
memory usage: 47.0+ KB


In [174]:
# Number of missing values in each column.
data.isna().sum()

gender                         0
race/ethnicity                 0
parental level of education    0
lunch                          0
test preparation course        0
Average_score                  0
dtype: int64

Наш целевой признак - lunch (тип обеда): именно его мы будем предсказывать по остальным признакам.

Для остальных нечисловых признаков выполним преобразования в числовые.

## lunch

In [2]:
# Distinct values of the lunch column.
# NOTE(review): the saved output of this cell is a NameError from running it
# on a kernel where `data` was not loaded yet; re-run the notebook top to bottom.
data['lunch'].unique()

NameError: name 'data' is not defined

In [None]:
# Encode the binary target `lunch` as integer labels in a single column.
# One-hot encoding would replace `lunch` with two indicator columns and break
# every later cell that reads data['lunch'] as the classification target.
# LabelEncoder assigns labels in sorted order of the class names.
data['lunch'] = LabelEncoder().fit_transform(data['lunch'])

### Gender

In [176]:
# Distinct values of the gender column.
data['gender'].unique()

array(['female', 'male'], dtype=object)

In [177]:
# One-hot encode gender: the column is replaced by one indicator column per value.
data = pd.get_dummies(data, columns = ['gender'])

### Race/Ethnicity

In [178]:
# Distinct values of the race/ethnicity column.
data['race/ethnicity'].unique()

array(['group B', 'group C', 'group A', 'group D', 'group E'],
      dtype=object)

In [179]:
# One-hot encode race/ethnicity: one indicator column per group.
data = pd.get_dummies(data, columns = ['race/ethnicity'])

### Parental level of education

In [180]:
# Distinct values of the parental level of education column.
data['parental level of education'].unique()

array(["bachelor's degree", 'some college', "master's degree",
       "associate's degree", 'high school', 'some high school'],
      dtype=object)

In [181]:
# One-hot encode parental level of education: one indicator column per level.
# NOTE(review): the levels look ordered, but this encoding treats them as unordered.
data = pd.get_dummies(data, columns = ['parental level of education'])

### Test preparation course

In [182]:
# Distinct values of the test preparation course column.
data['test preparation course'].unique()

array(['none', 'completed'], dtype=object)

In [183]:
# One-hot encode test preparation course: one indicator column per value.
data = pd.get_dummies(data, columns = ['test preparation course'])

### Average score

Признак Average score должен быть отмасштабирован

In [184]:
# Rescale Average_score to the [0, 1] range so it is comparable with the
# 0/1 indicator columns when the classifier computes distances.
# NOTE(review): the scaler is fitted on the whole dataset, i.e. before the
# train/test split made later in the notebook, so the test rows influence
# the scaling parameters.
sc = MinMaxScaler().fit(data[['Average_score']])
data['Average_score'] = sc.transform(data[['Average_score']])

In [185]:
# Preview the fully encoded and scaled frame.
data.head()

Unnamed: 0,lunch,Average_score,gender_female,gender_male,race/ethnicity_group A,race/ethnicity_group B,race/ethnicity_group C,race/ethnicity_group D,race/ethnicity_group E,parental level of education_associate's degree,parental level of education_bachelor's degree,parental level of education_high school,parental level of education_master's degree,parental level of education_some college,parental level of education_some high school,test preparation course_completed,test preparation course_none
0,1,0.699634,1,0,0,1,0,0,0,0,1,0,0,0,0,0,1
1,1,0.805861,1,0,0,0,1,0,0,0,0,0,0,1,0,1,0
2,1,0.919414,1,0,0,1,0,0,0,0,0,0,1,0,0,0,1
3,0,0.443223,0,1,1,0,0,0,0,1,0,0,0,0,0,0,1
4,1,0.739927,0,1,0,0,1,0,0,0,0,0,0,1,0,0,1


In [186]:
# Confirm that every column is numeric after encoding.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 17 columns):
 #   Column                                          Non-Null Count  Dtype  
---  ------                                          --------------  -----  
 0   lunch                                           1000 non-null   int32  
 1   Average_score                                   1000 non-null   float64
 2   gender_female                                   1000 non-null   uint8  
 3   gender_male                                     1000 non-null   uint8  
 4   race/ethnicity_group A                          1000 non-null   uint8  
 5   race/ethnicity_group B                          1000 non-null   uint8  
 6   race/ethnicity_group C                          1000 non-null   uint8  
 7   race/ethnicity_group D                          1000 non-null   uint8  
 8   race/ethnicity_group E                          1000 non-null   uint8  
 9   parental level of education_associate's de

## Разделение выборки на обучающую и тестовую

In [187]:
# Features: every column except the target.
data_X = data.drop(columns='lunch')
# Target: the encoded lunch label.
data_Y = data['lunch']

In [188]:
# Hold out 20% of the rows for testing; the fixed random_state keeps the split
# reproducible. The split is not stratified.
data_X_train, data_X_test, data_Y_train, data_Y_test = train_test_split(data_X, data_Y, test_size=0.2, random_state=1)

In [189]:
def class_proportions(array: np.ndarray) -> Dict[int, Tuple[int, float]]:
    """
    Compute how often each class label occurs.

    array - class labels; any array-like is accepted (NumPy array,
            pandas Series, plain list or tuple).

    Returns a dict keyed by class label (in sorted order) whose values are
    (count, share) tuples, where share is the label's fraction of all
    elements, so the shares add up to 1.0. An empty input gives an empty dict.
    """
    # Normalise the input so that list/tuple arguments expose .size as well.
    array = np.asarray(array)
    # Sorted distinct labels and the number of occurrences of each.
    labels, counts = np.unique(array, return_counts=True)
    # Turn absolute counts into fractions of the total number of labels.
    counts_perc = counts / array.size
    return {
        label: (count, perc)
        for label, count, perc in zip(labels, counts, counts_perc)
    }

def print_class_proportions(array: np.ndarray):
    """
    Print a table of class labels with their counts and percentage shares.

    array - class labels, passed straight to class_proportions.
    The header line is printed only when at least one label is present.
    """
    proportions = class_proportions(array)
    if proportions:
        print('Метка \t Количество \t Процент встречаемости')
    for label, (count, share) in proportions.items():
        # Shares are fractions; show them as percentages with two decimals.
        print('{} \t {} \t \t {}%'.format(label, count, round(share * 100, 2)))

In [190]:
# Class balance of the training labels.
print_class_proportions(data_Y_train)

Метка 	 Количество 	 Процент встречаемости
0 	 281 	 	 35.12%
1 	 519 	 	 64.88%


In [191]:
# Class balance of the test labels.
print_class_proportions(data_Y_test)

Метка 	 Количество 	 Процент встречаемости
0 	 74 	 	 37.0%
1 	 126 	 	 63.0%


In [192]:
# Class balance of the full dataset, for comparison with both splits.
print_class_proportions(data['lunch'])

Метка 	 Количество 	 Процент встречаемости
0 	 355 	 	 35.5%
1 	 645 	 	 64.5%


# 1. Обучение модели для произвольного K

Обучение для количества ближайших соседей = 3

In [193]:
# Fit a 3-nearest-neighbours classifier on the training split and
# predict labels for the held-out test rows.
knc_1 = KNeighborsClassifier(n_neighbors=3).fit(data_X_train, data_Y_train)
target_1 = knc_1.predict(data_X_test)

Обучение для количества ближайших соседей = 123

In [194]:
# Same model with a much larger neighbourhood (K=123) for comparison.
knc_2 = KNeighborsClassifier(n_neighbors=123).fit(data_X_train, data_Y_train)
target_2 = knc_2.predict(data_X_test)

Сэмплинг для выравнивания классов

In [195]:
from imblearn.under_sampling import RandomUnderSampler
from imblearn.over_sampling import RandomOverSampler

# Desired minority/majority ratio after resampling: 1 makes both classes
# the same size.
sampling_strategy = 1

# Both samplers choose rows at random, so random_state is pinned to keep the
# resampled training sets, and the metrics computed from them, reproducible
# on re-run. Only the training split is resampled; the test split is untouched.
rus = RandomUnderSampler(sampling_strategy=sampling_strategy, random_state=1)
data_X_train_under, data_Y_train_under = rus.fit_resample(data_X_train, data_Y_train)

ros = RandomOverSampler(sampling_strategy=sampling_strategy, random_state=1)
data_X_train_over, data_Y_train_over = ros.fit_resample(data_X_train, data_Y_train)

In [196]:
# K=3 classifier trained on the over-sampled training set.
knc_over = KNeighborsClassifier(n_neighbors=3).fit(data_X_train_over, data_Y_train_over)
target_over = knc_over.predict(data_X_test)

# K=3 classifier trained on the under-sampled training set.
knc_under = KNeighborsClassifier(n_neighbors=3).fit(data_X_train_under, data_Y_train_under)
target_under = knc_under.predict(data_X_test)

# 2. Метрики

## Метрика Balanced Accuracy

In [197]:
# Balanced accuracy (mean of the per-class recalls) of the K=3 model.
balanced_accuracy_score(data_Y_test, target_1)

0.5296010296010296

In [198]:
# Balanced accuracy of the K=123 model.
balanced_accuracy_score(data_Y_test, target_2)

0.5

In [199]:
# Balanced accuracy of the K=3 model trained on the over-sampled data.
balanced_accuracy_score(data_Y_test, target_over)

0.5313170313170313

In [200]:
# Balanced accuracy of the K=3 model trained on the under-sampled data.
balanced_accuracy_score(data_Y_test, target_under)

0.5119047619047619

## Метрика Confusion Matrix

In [201]:
# True class counts in the test set; each confusion-matrix row sums to one of these.
data_Y_test.value_counts()

1    126
0     74
Name: lunch, dtype: int64

In [202]:
# Confusion matrix of the K=3 model: rows are true classes, columns are predicted classes.
confusion_matrix(data_Y_test, target_1)

array([[22, 52],
       [30, 96]], dtype=int64)

In [203]:
# Confusion matrix of the K=123 model (rows: true class, columns: predicted class).
confusion_matrix(data_Y_test, target_2)

array([[  0,  74],
       [  0, 126]], dtype=int64)

In [204]:
# Confusion matrix of the model trained on the over-sampled data.
confusion_matrix(data_Y_test, target_over)

array([[34, 40],
       [50, 76]], dtype=int64)

In [205]:
# Confusion matrix of the model trained on the under-sampled data.
confusion_matrix(data_Y_test, target_under)

array([[37, 37],
       [60, 66]], dtype=int64)

# Тест: обучение только на признаке Average_score

In [258]:
# Repeat the experiment with a single feature: the scaled average score.
# Double brackets keep data_X two-dimensional (a one-column DataFrame).
data_X = data[['Average_score']]
# Target stays the same: the encoded lunch label.
data_Y = data['lunch']

In [259]:
# Same 80/20 split and random_state as before, now on the single-feature data.
# This overwrites the train/test variables created for the all-features run.
data_X_train, data_X_test, data_Y_train, data_Y_test = train_test_split(data_X, data_Y, test_size=0.2, random_state=1)

In [260]:
# Re-run the four models on the single-feature split. The variable names are
# the same as in the all-features run, so those earlier models and
# predictions are overwritten.

# K=3 on the original (imbalanced) training set.
knc_1 = KNeighborsClassifier(n_neighbors=3)
knc_1.fit(data_X_train, data_Y_train)
target_1 = knc_1.predict(data_X_test)

# K=123 on the original training set.
knc_2 = KNeighborsClassifier(n_neighbors=123)
knc_2.fit(data_X_train, data_Y_train)
target_2 = knc_2.predict(data_X_test)

from imblearn.under_sampling import RandomUnderSampler
from imblearn.over_sampling import RandomOverSampler

# Desired minority/majority ratio after resampling: 1 makes both classes
# the same size.
sampling_strategy = 1

# Both samplers choose rows at random, so random_state is pinned to keep the
# resampled training sets, and the metrics computed from them, reproducible
# on re-run.
rus = RandomUnderSampler(sampling_strategy=sampling_strategy, random_state=1)
data_X_train_under, data_Y_train_under = rus.fit_resample(data_X_train, data_Y_train)

ros = RandomOverSampler(sampling_strategy=sampling_strategy, random_state=1)
data_X_train_over, data_Y_train_over = ros.fit_resample(data_X_train, data_Y_train)

# K=3 on the over-sampled training set.
knc_over = KNeighborsClassifier(n_neighbors=3)
knc_over.fit(data_X_train_over, data_Y_train_over)
target_over = knc_over.predict(data_X_test)

# K=3 on the under-sampled training set.
knc_under = KNeighborsClassifier(n_neighbors=3)
knc_under.fit(data_X_train_under, data_Y_train_under)
target_under = knc_under.predict(data_X_test)

In [261]:
# Balanced accuracy of the K=3 model trained on Average_score only.
balanced_accuracy_score(data_Y_test, target_1)

0.5780780780780781

In [262]:
# Balanced accuracy of the K=123 model trained on Average_score only.
balanced_accuracy_score(data_Y_test, target_2)

0.5930930930930931

In [263]:
# Balanced accuracy of the single-feature model trained on over-sampled data.
balanced_accuracy_score(data_Y_test, target_over)

0.567031317031317

In [264]:
# Balanced accuracy of the single-feature model trained on under-sampled data.
balanced_accuracy_score(data_Y_test, target_under)

0.5766838266838267

In [265]:
# Confusion matrix of the single-feature K=3 model (rows: true class, columns: predicted class).
confusion_matrix(data_Y_test, target_1)

array([[28, 46],
       [28, 98]], dtype=int64)

In [266]:
# Confusion matrix of the single-feature K=123 model.
confusion_matrix(data_Y_test, target_2)

array([[ 22,  52],
       [ 14, 112]], dtype=int64)

In [267]:
# NOTE(review): identical to the previous cell (K=123 confusion matrix);
# this cell looks like an accidental duplicate and can be removed.
confusion_matrix(data_Y_test, target_2)

array([[ 22,  52],
       [ 14, 112]], dtype=int64)

In [268]:
# Confusion matrix of the single-feature model trained on over-sampled data.
confusion_matrix(data_Y_test, target_over)

array([[34, 40],
       [41, 85]], dtype=int64)

In [269]:
# Confusion matrix of the single-feature model trained on under-sampled data.
confusion_matrix(data_Y_test, target_under)

array([[46, 28],
       [59, 67]], dtype=int64)