In [1]:
# Несколько функций для отрисовки графиков
import numpy as np


def plot_confusion_matrix(cm,
                          target_names,
                          title='Confusion matrix',
                          cmap=None,
                          normalize=True):
   
    import matplotlib.pyplot as plt
    import itertools

    accuracy = np.trace(cm) / float(np.sum(cm))
    misclass = 1 - accuracy

    if cmap is None:
        cmap = plt.get_cmap('Blues')

    plt.figure(figsize=(8, 6))
    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title)
    plt.colorbar()

    if target_names is not None:
        tick_marks = np.arange(len(target_names))
        plt.xticks(tick_marks, target_names, rotation=45)
        plt.yticks(tick_marks, target_names)

    if normalize:
        cm = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]


    thresh = cm.max() / 1.5 if normalize else cm.max() / 2
    for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
        if normalize:
            plt.text(j, i, "{:0.4f}".format(cm[i, j]),
                     horizontalalignment="center",
                     color="white" if cm[i, j] > thresh else "black")
        else:
            plt.text(j, i, "{:,}".format(cm[i, j]),
                     horizontalalignment="center",
                     color="white" if cm[i, j] > thresh else "black")


    plt.tight_layout()
    plt.ylabel('True label')
    plt.xlabel('Predicted label\naccuracy={:0.4f}; misclass={:0.4f}'.format(accuracy, misclass))
    plt.show()
    import matplotlib.pyplot as plt
from sklearn import metrics

def plot_roc(test_label, preds_prob):

    # calculate the fpr and tpr for all thresholds of the classification
    fpr, tpr, threshold = metrics.roc_curve(test_label, preds_prob)
    roc_auc = metrics.auc(fpr, tpr)


    # plot auc
    plt.plot(fpr, tpr, 'b', label = 'AUC = %0.2f' % roc_auc)
    plt.plot([0, 1], [0, 1],'r--')

    plt.xlim([0, 1])
    plt.ylim([0, 1])

    plt.ylabel('True Positive Rate')
    plt.xlabel('False Positive Rate')

    plt.title('Receiver Operating Characteristic')

    plt.legend(loc = 'lower right')

    plt.show()


# <center>Майнор "Интеллектуальный анализ данных"</center>

# <center>Курс "Введение в анализ данных"</center>

# <center>Лабораторная работа №3. Supervised Learning by Kezikov B.</center> 

## Данные

В рамках данной лабораторной работы предлагается проанализировать набор данных о студентах двух школ в Португалии. В наборе данных `students_data.csv` представлена информация о студентах, посещающих два курса - математику (`Math`) и поргутальский язык (`Por`). Некоторые студенты представлены в обоих курсах, некоторые только в одном. Для каждого студента известны три оценки по курсу: оценка за первое полугодие (`G1`), оценка за второе полугодие (`G2`) и итоговая оценка за год (`G3`).

In [2]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from sklearn import tree
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
import matplotlib.pyplot as plt
import numpy as np
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import os.path as path
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.dummy import DummyClassifier
from matplotlib import style
style.use('seaborn')
%config InlineBackend.figure_format = 'svg'
import warnings
warnings.filterwarnings('ignore')

In [3]:
pd.set_option('display.max_columns', 40)
pd.set_option('display.max_colwidth', -1)

In [4]:
dataframe = pd.read_csv("students_data.csv")
dataframe.head(10)

Unnamed: 0,ID,Subject,school,sex,age,address,famsize,Pstatus,Medu,Fedu,Mjob,Fjob,reason,guardian,traveltime,studytime,failures,schoolsup,famsup,paid,activities,nursery,higher,internet,romantic,famrel,freetime,goout,Dalc,Walc,health,absences,cheating,G1,G2,G3
0,100097,Por,GP,F,16,U,GT3,A,3,4,services,other,course,father,1,1,0,no,no,no,no,yes,yes,yes,no,3.0,2,1,1.0,4.0,5,12,,15,13,14
1,101021,Por,GP,F,17,U,GT3,T,3,2,other,other,course,mother,1,2,0,no,no,no,yes,no,yes,yes,no,5.0,3,4,1.0,3.0,3,2,,17,18,17
2,102965,Por,GP,M,16,U,LE3,T,1,2,health,services,course,mother,2,1,2,no,no,no,no,no,yes,yes,no,4.0,4,5,3.0,5.0,5,0,yes,9,8,10
3,102989,Por,MS,M,17,U,GT3,T,2,3,other,services,home,father,2,2,0,no,no,no,yes,yes,yes,yes,no,4.0,4,3,1.0,1.0,3,4,no,14,15,16
4,103131,Por,GP,F,16,U,GT3,T,1,1,at_home,other,home,mother,2,1,0,no,yes,no,no,yes,yes,no,,4.0,3,2,1.0,4.0,5,2,yes,12,13,13
5,103144,Por,GP,M,18,U,LE3,T,3,1,services,services,course,mother,2,1,0,no,no,no,yes,yes,yes,yes,yes,3.0,3,4,4.0,5.0,4,2,,11,11,12
6,105257,Por,MS,F,18,U,GT3,T,1,1,other,other,course,mother,2,2,0,no,no,no,yes,yes,yes,no,no,1.0,1,1,1.0,1.0,5,6,,11,12,9
7,107639,Por,MS,F,15,R,GT3,T,4,4,teacher,other,course,mother,2,1,0,no,no,no,no,yes,yes,yes,yes,1.0,5,1,3.0,5.0,5,0,,13,14,14
8,107914,Math,GP,F,15,U,LE3,T,3,2,services,other,reputation,mother,1,2,0,no,yes,yes,no,yes,yes,yes,no,4.0,4,4,1.0,1.0,5,10,no,7,6,6
9,108089,Por,MS,M,16,R,GT3,T,3,4,other,health,other,mother,3,2,0,no,no,no,no,no,yes,no,no,3.0,4,5,1.0,2.0,5,4,,9,10,11


### Признаки

Данные представлены признаками различных типов: числовыми, категориальными, упорядоченными категориальными.

**Описание признаков:**

In [5]:
criteria = pd.read_csv('students_data_features.csv',
            delimiter=';',
            encoding='windows-1251')
list_criteria = criteria["Признак"]
criteria

Unnamed: 0,Признак,Описание
0,ID,Уникальный номер наблюдения
1,Subject,"Предмет: 'Math' - математика, 'Por' - португальский язык"
2,school,"Школа: 'GP' - Gabriel Pereira, 'MS' - Mousinho da Silveira"
3,sex,"Пол студента: 'F' - женский, 'M' - мужской"
4,age,Возраст студента
5,address,"Место проживания студента: 'U' - в городе, 'R' - за городом"
6,famsize,"Число членов семьи: 'LE3' - если <= 3, 'GT3' - если > 3"
7,Pstatus,"Отношения родителей: 'T' - живут вместе, 'A' - живут раздельно"
8,Medu,"Образование матери: 0 - без образования, 1 - начальная школа, 2 – от 5 до 9 классов, 3 – среднее образование, 4 – высшее образование"
9,Fedu,"Образование отца: 0 - без образования, 1 - начальная школа, 2 – от 5 до 9 классов, 3 – среднее образование, 4 – высшее образование"


## Предобработка данных
### Оцифровка  
* Для того, чтобы данные было наиболее удобно обрабатывать и прогонять через различные модели, проведем их оцифровку.

* Данную задачу выполним при помощи библиотеки sklearn 

In [6]:
from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()

quality = ['Subject', 'school', 'sex', 'address', 'famsize', 'Pstatus', 'Mjob', 'Fjob', 'reason', 'guardian', 'schoolsup', 'famsup', 'paid', 'activities', 'nursery', 'higher', 'internet', 'romantic']

for i in range(len(quality)):
    le.fit(dataframe[quality[i]])
    dataframe[quality[i]+"_le"] = le.transform(dataframe[quality[i]])
    dataframe.drop(quality[i], axis=1, inplace=True)

dataframe.drop(["G2"], axis = 1, inplace = True)
ID = dataframe.ID
dataframe.drop(["ID"], axis = 1, inplace = True)

dataframe["g3"] = dataframe.G3
dataframe.drop(["G3"], axis = 1, inplace = True)
dataframe.head(20)

TypeError: Encoders require their input to be uniformly strings or numbers. Got ['float', 'str']

### Feature engeneering
* Воспользуемся методом отбора признаков на основе их важности.
* Ансамблевые алгоритмы на основе деревьев решений, такие как случайный лес (random forest), позволяют оценить важность признаков.
* Обучим классификатор ExtraTreesClassifier, чтобы с его помощью определить важность признаков. 

In [None]:
dataframe.head(10)

In [None]:
X = dataframe.values[:,0:len(dataframe.columns) - 1]
Y = dataframe.values[:,len(dataframe.columns) - 1] 

In [None]:
sns.heatmap(dataframe.corr())

по графику можем заметить, что некоторые переменные следует убрать, из-за бесполезности

In [None]:
from sklearn.ensemble import ExtraTreesClassifier

model = ExtraTreesClassifier()
model.fit(X, Y)
cols = dataframe.columns[:len(dataframe.columns)-1]

df = pd.DataFrame()
df["features"] = cols
df["weights"] = model.feature_importances_
df = df.sort_values(by="weights",ascending = False)
df

оставим только те элементы, которые входят в 75% важности

In [None]:
i = 0
flag = 0
while flag < 0.75:
    flag = flag+df["weights"][i]
    i = i + 1

print("На ", i, "элементе, сумма становится больше 75%")
df = df.iloc[i:]
print("Список ненужных признаков:\n")
df

исключаем эти признаки из dataframe

In [None]:
dataframe.drop([x for x in df["features"]], axis=1, inplace=True)

In [None]:
dataframe.head(5)

## Регрессия

Одним из пунктов является проверка признака G1 на итоговый результат, но если взять данные выше, в которых G1 - самый полезный признак, уже сейчас можно сказать, что с ним будут более точные результаты, чем без него.

Разбиваем по признаку Subject на математиков и португальцев

In [None]:
subj = dataframe["Subject_le"] == 1
data_por, data_math = dataframe[subj], dataframe[~subj]
noG1_math= data_math.copy().drop(['G1'],axis=1)
noG1_por = data_por.copy().drop(['G1'],axis=1)

### Линейная регрессия

In [None]:
from sklearn import linear_model
from sklearn.model_selection import train_test_split
from sklearn import metrics


def linear_regression(dataset,name):
    (trainData, 
     testData, 
     trainLabels, 
     testLabels) = train_test_split(dataset.drop(['g3','Subject_le'], axis=1),
                                                dataset['g3'],
                                                test_size=0.33, 
                                                random_state=42)

    model = linear_model.LinearRegression()
    model.fit(trainData, trainLabels)
    prediction = model.predict(testData)

    plt.plot(prediction[1:100], 'r-', label='Predicted')
    plt.plot(testLabels[1:100].values, 'b-', label='Correct')
    plt.xlabel("Observations <Linear> ")
    plt.ylabel("Marks")
    plt.legend(loc='best')
    plt.show()
 
    MSE = metrics.mean_squared_error(y_pred=prediction, 
                                     y_true=testLabels) 

    RMSE = np.sqrt(MSE)

    MAE = metrics.mean_absolute_error(y_pred=prediction, 
                                      y_true=testLabels)

    MEDIAN = metrics.median_absolute_error(y_pred=prediction, 
                                          y_true=testLabels)

    R2 = metrics.r2_score(y_pred=prediction, 
                          y_true=testLabels) 
    print(name+" Metrics:")
    print("MSE:   {}\nRMSE:  {}\nR2:    {}\nMAE:   {}\nMedae: {}".format(MSE, RMSE, R2, MAE, MEDIAN))
    return prediction

In [None]:
math_lin = linear_regression(data_math.copy(), "Math")

In [None]:
por_lin = linear_regression(data_por.copy(),"Portugal")

In [None]:
noG1_math_lin = linear_regression(noG1_math.copy(),"Math noG1")

In [None]:
noG1_por_lin = linear_regression(noG1_por.copy(),"Portugal noG1")

И по графику и по параметрам R2 видно, что предсказывать значения, учитывая признак G1 более правильно, чем без него

### Гребневая регрессия

In [None]:
import warnings
warnings.filterwarnings('ignore')

In [None]:
def ridge_regression(dataset,name):
    from sklearn.metrics import mean_squared_error
    (trainData, 
     testData, 
     trainLabels, 
     testLabels) = train_test_split(dataset.drop(['g3','Subject_le'], axis=1),
                                                dataset['g3'],
                                                test_size=0.33, 
                                                random_state=42)

    ridgeModel = linear_model.RidgeCV(alphas=np.array(range(0, 50, 1))/10, 
                                  cv=5,
                                 scoring='neg_mean_squared_error')
    
    ridgeModel.fit(trainData, trainLabels)
    
    prediction = ridgeModel.predict(testData)

    plt.plot(prediction[1:100], 'r-', label='Predicted')
    plt.plot(testLabels[1:100].values, 'b-', label='Correct')
    plt.xlabel("Observations  <Ridge> ")
    plt.ylabel("Marks")
    plt.legend(loc='best')
    plt.show()
    # Рассчитаем основные метрики 
    MSE = metrics.mean_squared_error(y_pred=prediction, 
                                     y_true=testLabels) #чем ниже значение MSE, тем выше качество модели 

    RMSE = np.sqrt(MSE)

    MAE = metrics.mean_absolute_error(y_pred=prediction, 
                                      y_true=testLabels)

    MEDIAN = metrics.median_absolute_error(y_pred=prediction, 
                                          y_true=testLabels)

    R2 = metrics.r2_score(y_pred=prediction, 
                          y_true=testLabels) # Чем лучше данная метрика к 1, тем лучше качество модели
    print(name+" Metrics:")
    print("MSE:   {}\nRMSE:  {}\nR2:    {}\nMAE:   {}\nMedae: {}".format(MSE, RMSE, R2, MAE, MEDIAN))
    return prediction

In [None]:
portugal_ridge = ridge_regression(data_por.copy(), "Portugal")

In [None]:
math_ridge = ridge_regression(data_math.copy(),"Math ")

In [None]:
noG1_por_ridge = ridge_regression(noG1_por.copy(),"Portugal noG1")

In [None]:
noG1_math_ridge = ridge_regression(noG1_math.copy(),"Math noG1")

### Лассо-регрессия

In [None]:
def lasso_regression(dataset,name):
    (trainData, 
     testData, 
     trainLabels, 
     testLabels) = train_test_split(dataset.drop(['g3','Subject_le'], axis=1),
                                                dataset['g3'],
                                                test_size=0.33, 
                                                random_state=42)

    lassoModel = linear_model.LassoCV(alphas=np.array(range(1, 10000, 1))/10, 
                                  cv=5)

    lassoModel.fit(trainData, trainLabels)
    
    print(lassoModel.alpha_) 
    prediction = lassoModel.predict(testData)

    plt.plot(prediction[1:100], 'r-', label='Predicted')
    plt.plot(testLabels[1:100].values, 'b-', label='Correct')
    plt.xlabel("Test observations <Lasso> " + name)
    plt.ylabel("Marks")
    plt.legend(loc='best')
    plt.show()
     
    MSE = metrics.mean_squared_error(y_pred=prediction, 
                                     y_true=testLabels) 

    RMSE = np.sqrt(MSE)

    MAE = metrics.mean_absolute_error(y_pred=prediction, 
                                      y_true=testLabels)
    MEDIAN = metrics.median_absolute_error(y_pred=prediction, 
                                          y_true=testLabels)

    R2 = metrics.r2_score(y_pred=prediction, 
                          y_true=testLabels) 
    print(name+" Metrics:")
    print("MSE:   {}\nRMSE:  {}\nR2:    {}\nMAE:   {}\nMedae: {}".format(MSE, RMSE, R2, MAE, MEDIAN))
    return prediction

In [None]:
portugal_lasso = lasso_regression(data_por.copy(),"Portugal ")

In [None]:
math_lasso  = lasso_regression(data_math.copy(),"Math")

In [None]:
noG1_por_lasso  = lasso_regression(noG1_por.copy(), "Portugal noG1")

In [None]:
noG1_math_lasso = lasso_regression(noG1_math.copy(), "Math noG1")

### KNN

In [None]:
from sklearn import preprocessing

In [None]:
def KNN(dataset,name):
    from sklearn.neighbors import KNeighborsRegressor
    from sklearn.model_selection import KFold
    (trainData, 
     testData, 
     trainLabels, 
     testLabels) = train_test_split(dataset.drop(['g3','Subject_le'], axis=1),
                                                dataset['g3'],
                                                test_size=0.33, 
                                                random_state=42)

    kf = KFold(n_splits=5, shuffle=True)
    kf.get_n_splits(trainData)
    
    knrModel = KNeighborsRegressor(n_neighbors=5)
    knrModel.fit(trainData, trainLabels)
    prediction = knrModel.predict(testData)

    plt.plot(prediction[1:100], 'r-', label='Predicted')
    plt.plot(testLabels[1:100].values, 'b-', label='Correct')
    plt.xlabel("Test observations <KNN> " + name)
    plt.ylabel("Marks")
    plt.legend(loc='best')
    plt.show()

    MSE = metrics.mean_squared_error(y_pred=prediction, 
                                     y_true=testLabels) 

    RMSE = np.sqrt(MSE)

    MAE = metrics.mean_absolute_error(y_pred=prediction, 
                                      y_true=testLabels)

    MEDIAN = metrics.median_absolute_error(y_pred=prediction, 
                                          y_true=testLabels)

    R2 = metrics.r2_score(y_pred=prediction, 
                          y_true=testLabels) 
    print(name+" Metrics:")
    print("MSE:   {}\nRMSE:  {}\nR2:    {}\nMAE:   {}\nMedae: {}".format(MSE, RMSE, R2, MAE, MEDIAN))
    return prediction

In [None]:
portugal_knn = KNN(data_por.copy(),"Portugal ")

In [None]:
math_knn = KNN(data_math.copy(),"Math ")

In [None]:
noG1_por_KNN = KNN(noG1_por.copy(), "Por noG1")

In [None]:
noG1_math_KNN = KNN(noG1_math.copy(), "Math noG1")

### Radius NN

In [None]:
def RNN(dataset,name,radius):
    from sklearn.neighbors import RadiusNeighborsRegressor
    (trainData, 
     testData, 
     trainLabels, 
     testLabels) = train_test_split(dataset.drop(['g3','Subject_le'], axis=1),
                                                dataset['g3'],
                                                test_size=0.33, 
                                                random_state=42)

    rnrModel = RadiusNeighborsRegressor(radius)
    rnrModel.fit(trainData, trainLabels)
    prediction = rnrModel.predict(testData)
    print(len(prediction))

    MSE = metrics.mean_squared_error(y_pred=prediction, 
                                     y_true=testLabels) 

    RMSE = np.sqrt(MSE)

    MAE = metrics.mean_absolute_error(y_pred=prediction, 
                                      y_true=testLabels)

    MEDIAN = metrics.median_absolute_error(y_pred=prediction, 
                                          y_true=testLabels)

    R2 = metrics.r2_score(y_pred=prediction, 
                          y_true=testLabels)
    print(name+" Metrics:")
    print("MSE:   {}\nRMSE:  {}\nR2:    {}\nMAE:   {}\nMedae: {}".format(MSE, RMSE, R2, MAE, MEDIAN))
    return MSE,RMSE,R2,MAE,MEDIAN,

In [None]:
math_RNN = RNN(data_math.copy(), "Math G1",25)

In [None]:
por_RNN = RNN(data_por.copy(), "Por G1",8)

In [None]:
noG1_math_RNN = RNN(noG1_math.copy(), "Math noG1",25)

In [None]:
noG1_por_RNN = RNN(noG1_por.copy(), "Por noG1",50)

## Итог регрессия
*  Во всех метотах, не исключая параметр G1 получались более точные данные, причем намного, это видно как по данным, так и на графиках. Это подтверждает показатели выше, то что G1 основной признак в определении оценки.
*  Все методы показывают примерно равные результаты 69-75%, кроме метода RNN, где параметр получилось подобрать только у учеников, изучающий португальский, используя G1 с показателем 53%. Осталоные имеют совсем маленькие показатели

# Классификация
## Бинарная классификация
### KNN Classifier

In [None]:
from sklearn import svm, datasets

import numpy as np
import matplotlib.pyplot as plt

from sklearn import svm
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
from sklearn.utils.multiclass import unique_labels
from sklearn.neighbors import KNeighborsClassifier
from sklearn import metrics
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.model_selection import GridSearchCV
from sklearn.dummy import DummyClassifier

In [None]:
dataset_math_bin = noG1_math
dataset_port_bin = noG1_por
dataset_math_bin['binary_classification'] = list(map(lambda x: 1 if x >= 8 else 0, dataset_math_bin['g3']))
dataset_port_bin['binary_classification'] = list(map(lambda x: 1 if x >= 8 else 0, dataset_port_bin['g3']))
dataset_port_bin.drop(['g3','Subject_le'], axis=1, inplace=True)
dataset_math_bin.drop(['g3','Subject_le'], axis=1, inplace=True)

In [None]:
def knn_classifaer(dataset):
    (trainData, 
     testData, 
     trainLabels, 
     testLabels) = train_test_split(dataset.drop(['binary_classification'], axis=1),
                                                dataset['binary_classification'],
                                                test_size=0.33, 
                                                random_state=1337)

    kf = KFold(n_splits=5, shuffle=True)
    for train_index, test_index in kf.split(dataset):
        print('Train:', train_index[:10])
        print('Test:', test_index[:10])

    knn = KNeighborsClassifier(n_neighbors=5)
    dm = DummyClassifier(strategy='most_frequent')

    kf = KFold(n_splits=5, shuffle=True)

    scores_knn = []
    scores_dummy = []

    Data = dataset.drop(['binary_classification'], axis=1)
    label = dataset['binary_classification']

    for i in range(10):
        scores_knn.extend(cross_val_score(knn, Data, label, cv=kf, scoring='f1_micro'))
        scores_dummy.extend(cross_val_score(dm, Data, label, cv=kf, scoring='f1_micro'))

    scores_dtree = np.array(scores_knn)
    scores_dummy = np.array(scores_dummy)

    del Data, label

    print('Mean score for KNN:', np.round(np.mean(scores_knn), 5),
      '\nMean score for Dummy:', np.round(np.mean(scores_dummy), 5))

    style.use('seaborn')
    pd.Series(scores_dummy).plot()
    plt.ylabel('f1_micro')
    plt.xlabel('iteration')
    plt.show()

    style.use('seaborn')
    pd.Series(scores_knn).plot()
    plt.ylabel('f1_micro')
    plt.xlabel('iteration')
    plt.show()

    knn = KNeighborsClassifier()
    params = {'n_neighbors': np.arange(1, 20, 2),
             'weights': ['uniform', 'distance']}

    rskf = RepeatedStratifiedKFold(n_splits=5, n_repeats=10)

    clf = GridSearchCV(knn, params, cv=rskf, scoring='f1_micro')

    clf.fit(dataset.drop(['binary_classification'], axis=1),
            dataset['binary_classification'])
    
    knn_model_math = KNeighborsClassifier(n_neighbors=clf.best_params_.get("n_neighbors"))
    
    print(clf.best_params_.get("n_neighbors"))
    knn_model_math.fit(trainData, trainLabels)

    prediction = knn_model_math.predict(testData)
    print("Accuracy:", round(metrics.accuracy_score(testLabels, prediction), 5),
           '\nBalanced accuracy:', round(metrics.balanced_accuracy_score(testLabels, prediction), 5))
    print(metrics.classification_report(testLabels, prediction))
   
    style.use('classic')
    
    plot_confusion_matrix(metrics.confusion_matrix(testLabels, prediction),
                      target_names=['0', '1'],
                      normalize=False)
    return prediction

In [None]:
math_KNN = knn_classifaer(dataset_math_bin)

In [None]:
port_KNN = knn_classifaer(dataset_port_bin)

 ### Логистическая регрессия

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix

In [None]:
def log_regression(dataset):
    (trainData, 
     testData, 
     trainLabels, 
     testLabels) = train_test_split(dataset.drop(['binary_classification'], axis=1),
                                                dataset['binary_classification'],
                                                test_size=0.33, 
                                                random_state=42)
    lgr = LogisticRegression()
    lgr.fit(trainData, trainLabels)
    prediction = lgr.predict(testData)
    prediction

 
    style.use('classic')
    metr = metrics.confusion_matrix(testLabels, prediction)
    plot_confusion_matrix(metr, ['0', '1'], False)
    
    preds_prob = lgr.predict_proba(testData)[:,1]
    
    plot_roc(testLabels, preds_prob)
    
    sns.distplot(preds_prob[testLabels == 0], hist=False, rug=False, label='class 0')
    sns.distplot(preds_prob[testLabels == 1], hist=False, rug=False, label='class 1')
    plt.legend()
    plt.show()
    print("Accuracy:", round(metrics.accuracy_score(testLabels, prediction), 5),
      '\nBalanced accuracy:', round(metrics.balanced_accuracy_score(testLabels, prediction), 5))

    print()
    print(metrics.classification_report(testLabels, prediction))
    

In [None]:
math_log = log_regression(dataset_math_bin)

In [None]:
port_log = log_regression(dataset_port_bin)

### Дерево решений.

In [None]:
def decision_tree(dataset):
    (trainData, 
     testData, 
     trainLabels, 
     testLabels) = train_test_split(dataset.drop(['binary_classification'], axis=1),
                                                dataset['binary_classification'],
                                                test_size=0.33, 
                                                random_state=42)
    
    decision_tree = DecisionTreeClassifier()
    
    decision_tree.fit(trainData, trainLabels)

    prediction = decision_tree.predict(testData)
    print("Accuracy:",
      round(metrics.accuracy_score(testLabels, prediction), 5),
     '\nBalanced accuracy:',
     round(metrics.balanced_accuracy_score(testLabels, prediction), 5))
    print(metrics.classification_report(testLabels, prediction))
    style.use('classic')
    plot_confusion_matrix(cm=metrics.confusion_matrix(testLabels, prediction),
                      target_names=['0', '1'],
                      normalize=True)

In [None]:
decision_tree(dataset_math_bin)

In [None]:
decision_tree(dataset_port_bin)

Все три метода показали хорошие похожие результаты, но метод Decision Trees в обоих примерах имеет меньше accuracy чем KNN и Логистическая регрессия. Были использованы различные подходы для оценки качества моделей: confusion matrix и производные метрики, roc-кривая и roc auc

## Многоклассовая классификация

In [None]:
def set_mark(g3):
    if g3 >= 18:
        return 5
    elif g3 >= 14:
        return 4
    elif g3 >= 8:
        return 3
    else:
        return 2
data_math_ml =  data_math.copy() 
data_por_ml = data_por.copy()
data_math_ml["final"] = data_math.g3.apply(set_mark)
data_por_ml["final"] = data_por.g3.apply(set_mark)
data_math_ml.drop(["g3","G1"], axis=1)
data_por_ml.drop(["g3","G1"], axis=1);


### KNN

In [None]:
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.model_selection import GridSearchCV
from sklearn.dummy import DummyClassifier

In [None]:
def knn_classifaer_ml(dataset):
    (trainData, 
     testData, 
     trainLabels, 
     testLabels) = train_test_split(dataset.drop(['final'], axis=1),
                                                dataset['final'],
                                                test_size=0.33, 
                                                random_state=1337)

    kf = KFold(n_splits=5, shuffle=True)
    for train_index, test_index in kf.split(dataset):
        print('Train:', train_index[:10])
        print('Test:', test_index[:10])
        print('\n')

    knn = KNeighborsClassifier(n_neighbors=5)
    dm = DummyClassifier(strategy='most_frequent')

    kf = KFold(n_splits=5, shuffle=True)

    scores_knn = []
    scores_dummy = []

    Data = dataset.drop(['final'], axis=1)
    label = dataset['final']

    for i in range(10):
        scores_knn.extend(cross_val_score(knn, Data, label, cv=kf, scoring='f1_micro'))
        scores_dummy.extend(cross_val_score(dm, Data, label, cv=kf, scoring='f1_micro'))

    scores_dtree = np.array(scores_knn)
    scores_dummy = np.array(scores_dummy)

    del Data, label

    print('Mean score for KNN:', np.round(np.mean(scores_knn), 5),
      '\nMean score for Dummy:', np.round(np.mean(scores_dummy), 5))

    style.use('seaborn')
    pd.Series(scores_dummy).plot()
    plt.ylabel('f1_micro')
    plt.xlabel('iteration')
    plt.show()

    style.use('seaborn')
    pd.Series(scores_knn).plot()
    plt.ylabel('f1_micro')
    plt.xlabel('iteration')
    plt.show()

    knn = KNeighborsClassifier()
    params = {'n_neighbors': np.arange(1, 20, 2),
             'weights': ['uniform', 'distance']}

    rskf = RepeatedStratifiedKFold(n_splits=5, n_repeats=10)

    clf = GridSearchCV(knn, params, cv=rskf, scoring='f1_micro')


    clf.fit(dataset.drop(['final'], axis=1),
            dataset['final'])
    

    knn_model_math = KNeighborsClassifier(n_neighbors=clf.best_params_.get("n_neighbors"))
    
    print(clf.best_params_.get("n_neighbors"))
    knn_model_math.fit(trainData, trainLabels)

    prediction = knn_model_math.predict(testData)
    print("Accuracy:", round(metrics.accuracy_score(testLabels, prediction), 5),
           '\nBalanced accuracy:', round(metrics.balanced_accuracy_score(testLabels, prediction), 5))
    print(metrics.classification_report(testLabels, prediction))
   
    style.use('classic')
    
    plot_confusion_matrix(metrics.confusion_matrix(testLabels, prediction),
                      target_names=['2','3','4','5'],
                      normalize=False)

    return prediction

In [None]:
knn_classifaer_ml(data_math_ml.copy());

In [None]:
knn_classifaer_ml(data_por_ml.copy());

### Логистическая регрессия

In [None]:
def log_regression_ml(dataset):
    (trainData, 
     testData, 
     trainLabels, 
     testLabels) = train_test_split(dataset.drop(['final'], axis=1),
                                                dataset['final'],
                                                test_size=0.33, 
                                                random_state=42)
    lgr = LogisticRegression()
    lgr.fit(trainData, trainLabels)
    prediction = lgr.predict(testData)

    style.use('classic')
    metr = metrics.confusion_matrix(testLabels, prediction)
    plot_confusion_matrix(metr, ['2','3','4','5'], False)
    
    preds_prob = lgr.predict_proba(testData)[:,1]
        
    sns.distplot(preds_prob[testLabels == 2], hist=False, rug=False, label='mark 2')
    sns.distplot(preds_prob[testLabels == 3], hist=False, rug=False, label='mark 3')
    sns.distplot(preds_prob[testLabels == 4], hist=False, rug=False, label='mark 4')
    sns.distplot(preds_prob[testLabels == 5], hist=False, rug=False, label='mark 5')

    plt.legend()
    plt.show()
    print("Accuracy:", round(metrics.accuracy_score(testLabels, prediction), 5),
      '\nBalanced accuracy:', round(metrics.balanced_accuracy_score(testLabels, prediction), 5))

    print(metrics.classification_report(testLabels, prediction)) 

In [None]:
log_regression_ml(data_math_ml);

In [None]:
log_regression_ml(data_por_ml)

### Дерево решений

In [None]:
def decision_tree_ml(dataset):
    (trainData, 
     testData, 
     trainLabels, 
     testLabels) = train_test_split(dataset.drop(['final'], axis=1),
                                                dataset['final'],
                                                test_size=0.33, 
                                                random_state=42)
    
    decision_tree = DecisionTreeClassifier()
    
    decision_tree.fit(trainData, trainLabels)

    prediction = decision_tree.predict(testData)
    print("Accuracy:",
      round(metrics.accuracy_score(testLabels, prediction), 5),
     '\nBalanced accuracy:',
     round(metrics.balanced_accuracy_score(testLabels, prediction), 5))
    print(metrics.classification_report(testLabels, prediction))
    style.use('classic')
    plot_confusion_matrix(cm=metrics.confusion_matrix(testLabels, prediction),
                      target_names=['2', '3','4','5'],
                      normalize=True)

In [None]:
decision_tree_ml(data_math_ml)

In [None]:
decision_tree_ml(data_por_ml)

KNN дает более точные результаты, чем логистическая регрессия, но оба в принципе неплохи, в отличии от дерева который не дает никакого результата и не имеет смысла

# P.S. надеюсь, когда вы до сюда дойдете, оценки за остальные лабы и контрольные будут уже известны..... Спасибо за год!)