In [1]:
# импортируем нужные для работы библиотеки
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import pickle

In [2]:
mobile_filepath = "./train.csv"

In [3]:
mobile_data = pd.read_csv(mobile_filepath)

In [4]:
mobile_data.head()

Unnamed: 0,battery_power,blue,clock_speed,dual_sim,fc,four_g,int_memory,m_dep,mobile_wt,n_cores,...,px_height,px_width,ram,sc_h,sc_w,talk_time,three_g,touch_screen,wifi,price_range
0,842,0,2.2,0,1,0,7,0.6,188,2,...,20,756,2549,9,7,19,0,0,1,1
1,1021,1,0.5,1,0,1,53,0.7,136,3,...,905,1988,2631,17,3,7,1,1,0,2
2,563,1,0.5,1,2,1,41,0.9,145,5,...,1263,1716,2603,11,2,9,1,1,0,2
3,615,1,2.5,0,0,0,10,0.8,131,6,...,1216,1786,2769,16,8,11,1,0,0,2
4,1821,1,1.2,0,13,1,44,0.6,141,2,...,1208,1212,1411,8,2,15,1,1,0,1


## Подготовка данных

In [5]:
target = 'price_range'
categorical_features = ['blue', 'dual_sim', 'four_g', 'three_g', 'touch_screen', 'wifi']
numerical_features = [feature for feature in list(mobile_data.columns) if feature != target and feature not in categorical_features]

In [6]:
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler, MinMaxScaler
from sklearn.base import TransformerMixin

Подготовим пайплайн обработки данных.
Категориальные фичи преобразуем с помощью *ohe (OneHotEncoder)*. Если признак бинарный, то будем оставлять только один столбец для него. Количественные фичи нормализуем, а количественные фичи приведем к интервалу [0; 1]

In [7]:
data_preprocessing = ColumnTransformer([
    ('ohe', OneHotEncoder(drop='if_binary'), categorical_features),
    ('minmax', MinMaxScaler(), [feature for feature in numerical_features])
])

Как видно, датасет имеет 4 класса ценовой категории телефонов (0,1,2,3). Приведу датасет к 2 классам для решения задачи бинарной классификации. Новая ценовая категория 0 будет соответствовать старым классам 0 и 1, а категория 1 - соответственно 2 и 3.

In [8]:
mobile_data.loc[mobile_data.price_range == 1, 'price_range'] = 0
mobile_data.loc[mobile_data.price_range == 2, 'price_range'] = 1
mobile_data.loc[mobile_data.price_range == 3, 'price_range'] = 1

In [9]:
mobile_data.head()

Unnamed: 0,battery_power,blue,clock_speed,dual_sim,fc,four_g,int_memory,m_dep,mobile_wt,n_cores,...,px_height,px_width,ram,sc_h,sc_w,talk_time,three_g,touch_screen,wifi,price_range
0,842,0,2.2,0,1,0,7,0.6,188,2,...,20,756,2549,9,7,19,0,0,1,0
1,1021,1,0.5,1,0,1,53,0.7,136,3,...,905,1988,2631,17,3,7,1,1,0,1
2,563,1,0.5,1,2,1,41,0.9,145,5,...,1263,1716,2603,11,2,9,1,1,0,1
3,615,1,2.5,0,0,0,10,0.8,131,6,...,1216,1786,2769,16,8,11,1,0,0,1
4,1821,1,1.2,0,13,1,44,0.6,141,2,...,1208,1212,1411,8,2,15,1,1,0,0


In [10]:
mobile_data['price_range'].value_counts()

0    1000
1    1000
Name: price_range, dtype: int64

Дата-сплиттинг

In [11]:
from sklearn.model_selection import train_test_split

In [12]:
X = mobile_data.drop(target, axis=1)
y = mobile_data[target]
assert len(X) == len(y)

In [13]:
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.7, shuffle=True, random_state=66)

In [14]:
X_train.shape, X_test.shape

((1400, 20), (600, 20))

## Реализация линейных моделей

### Логистическая регрессия

Реализовал регрессию численным методом через градиентный спуск.

In [15]:
import numpy as np
from sklearn.base import BaseEstimator, ClassifierMixin
from sklearn.utils.validation import check_X_y, check_is_fitted, check_array
from sklearn.utils.multiclass import unique_labels
from sklearn.model_selection import GridSearchCV

In [16]:
class My_log_regression(BaseEstimator, ClassifierMixin):
    def __init__(self, epochs=10, lr=0.1, batch_size=256):
        self.w = None
        self.epochs = epochs
        self.lr = lr
        self.batch_size = batch_size
    
    def fit(self, X, y):
        X, y = check_X_y(X, y)
        n, k = X.shape
        
        if self.w is None:
            np.random.seed(0xDEAD)
            self.w = np.random.randn(k + 1)

        X = np.concatenate((np.ones((n, 1)), X), axis=1)
        for i in range(self.epochs):
            for j in range(0, len(X), self.batch_size):
                X_batch = X[j:j+self.batch_size]
                y_batch = y[j:j+self.batch_size]
                
                y_pred = self._predict_proba_internal(X_batch)
                self.w -= self.lr * self._get_gradient(X_batch, y_batch, y_pred)

        return self
    
    def _get_gradient(self, X_batch, y_batch, y_pred):
        gradient = X_batch.T @ (y_pred - y_batch)
        return gradient
        
    def predict_proba(self, X):
        X = check_array(X)
        
        n = X.shape[0]
        X = np.concatenate((np.ones((n, 1)), X), axis=1)
        return self._sigmoid(np.dot(X, self.w))

    def _predict_proba_internal(self, X):
        return self._sigmoid(np.dot(X, self.w))

    def predict(self, X, threshold=0.5):
        return self.predict_proba(X) > threshold
    
    def _sigmoid(self, a):
        return 1. / (1 + np.exp(-a))

In [17]:
my_logreg_pipe = Pipeline(
    steps=[
        ('preprocessing', data_preprocessing),
        ('logreg', My_log_regression())
    ]
)

### SVM
Основан на нахождении наиболее правильной линии или гиперплоскости, разделяющую данные на два класса.

In [18]:
from sklearn.model_selection import RandomizedSearchCV

In [19]:
class My_svm(ClassifierMixin, BaseEstimator):
    def __init__(self, epochs=10, lr=0.1, alpha=0.1):
        self.w = None
        self.epochs = epochs
        self.lr = lr
        self.alpha = alpha
    
    def fit(self, X, y):
        X, y = check_X_y(X, y)
        y = np.where(y == 1, 1, -1)
        n, k = X.shape
        
        if self.w is None:
            np.random.seed(66)
            self.w = np.random.randn(k + 1)

        X = np.concatenate((np.ones((n, 1)), X), axis=1)
        for i in range(self.epochs):
            for j, x in enumerate(X):
                margin = y[j] * np.dot(self.w, x)
                if margin >= 1:
                    self.w -= self.lr * self.alpha * self.w / self.epochs
                else:
                    self.w += self.lr * (y[j] * x - self.alpha * self.w / self.epochs)
        return self
    
    def predict(self, X):
        X = check_array(X)
        n, k = X.shape
        X = np.concatenate((np.ones((n, 1)), X), axis=1)
        y = np.ndarray((n))
        
        for i, elem in enumerate(X):
            prediction = np.dot(self.w, elem)
            if prediction > 0:
                y[i] = 1
            else:
                y[i] = 0
        return y
    
    
    def _hinge_loss(self, x, y):
        return max(0, 1 - y * np.dot(x, self.w))
    
    
    def _soft_margin_loss(self, x, y):
        return self._hinge_loss(x, y) + self.alpha * np.dot(self.w, self.w)

In [20]:
my_svm_pipe = Pipeline(
    steps=[
        ('preprocessing', data_preprocessing),
        ('svm', My_svm())
    ]
)

### KNN
Метод k ближайших соседей основан на подсчёте k объектов, до которых расстояние минимально, а затем классификации объекта, как наиболее встречающийся класс среди этих k объектов.

In [21]:
from sklearn.metrics import euclidean_distances

class My_knn(ClassifierMixin, BaseEstimator):
    def __init__(self, k = 1):
        self.k = k

    def fit(self, X, y):
        # Check that X and y have correct shape
        X, y = check_X_y(X, y)
        # Store the classes seen during fit
        self.classes_ = unique_labels(y)
        
        self.X_ = X
        self.y_ = y
        # Return the classifier
        return self

    def predict(self, X):
        # Check is fit had been called
        check_is_fitted(self, ['X_', 'y_'])
        
        # Input validation
        X = check_array(X)
        
        y = np.ndarray((X.shape[0],))
        for (i, elem) in enumerate(X):
            distances = euclidean_distances([elem], self.X_)[0]
            neighbors = np.argpartition(distances, kth = self. k - 1)
            k_neighbors = neighbors[:self.k]
            labels, cnts = np.unique(self.y_[k_neighbors], return_counts = True)
            y[i] = labels[cnts.argmax()]
        return y

In [22]:
my_knn_pipe = Pipeline(
    steps=[
        ('preprocessing', data_preprocessing),
        ('knn', My_knn())
    ]
)

### Naive Bayes

Наивный алгоритма Байеса заключается в предположении о том, что все признаки имеют нормальное распределение.

In [23]:
class My_naive_Bayes(BaseEstimator, ClassifierMixin):
    def __init__(self):
        pass
    
    def fit(self, X, y):
        X, y = check_X_y(X, y)
        labels, counts = np.unique(y, return_counts=True)
        self.labels = labels
        self.freq = np.array([cnt / y.shape[0] for cnt in counts])
        self.means = np.array([X[y == label].mean(axis = 0) for label in labels])
        self.stds = np.array([X[y == label].std(axis = 0) for label in labels])

        return self
    
    def predict_proba(self, X):
        X = check_array(X)
        y = np.zeros(X.shape[0])
        for i, x in enumerate(X):
            cur_freq = np.array(self.freq)
            for j in range(len(self.labels)):
                p = np.array([self._gaussian(self.means[j][k], self.stds[j][k], x[k]) for k in range(X.shape[1])])
                cur_freq[j] *= np.prod(p)
            y[i] = cur_freq[1]
        return y
    
    def predict(self, X, threshold=0.5):
        return self.predict_proba(X) > threshold
    
    def _gaussian(self, mu, sigma, x0):
        return np.exp(-(x0 - mu) ** 2 / (2 * sigma)) / np.sqrt(2.0 * np.pi * sigma)

In [24]:
my_nb_pipe = Pipeline(
    steps=[
        ('preprocessing', data_preprocessing),
        ('nb', My_naive_Bayes())
    ]
)

## Метрики. Сравнение с моделями из sklearn.

В качестве основных выберу следующие метрики:

    1. Accuracy
    2. Recall
    3. Precision
    4. ROC AUC
    5. Confusion matrix

In [25]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, roc_auc_score, confusion_matrix
from sklearn.metrics import precision_recall_curve

import matplotlib.pyplot as plt

In [26]:
def get_metrics(model, X, y_true, threshold=0.5, use_probas=True):
    
    if use_probas:
        y_pred_probas = model.predict_proba(X)
        if len(y_pred_probas.shape) == 2:
            y_pred_probas = y_pred_probas[:, 1] 
        y_pred = y_pred_probas > threshold
    else:
        y_pred = model.predict(X)
    
    print('Accuracy = ', accuracy_score(y_true, y_pred))
    print('Precision = ', precision_score(y_true, y_pred))
    print('Recall = ', recall_score(y_true, y_pred))
    if use_probas:
        print('ROC AUC = ', roc_auc_score(y_true, y_pred_probas))
    print('Confusion matrix:')
    print(confusion_matrix(y_true, y_pred))

**Моя логистическая регрессия**

In [27]:
my_logreg_grid_search = GridSearchCV(my_logreg_pipe, {'logreg__lr': [0.001, 0.01, 0.05, 0.1,], 'logreg__epochs': [10, 20, 30, 40]})
my_logreg_grid_search.fit(X_train, y_train);

In [28]:
my_logreg_grid_search.best_params_

{'logreg__epochs': 40, 'logreg__lr': 0.05}

In [29]:
get_metrics(my_logreg_grid_search, X_test, y_test, threshold=0.129)

Accuracy =  0.9433333333333334
Precision =  0.9039039039039038
Recall =  0.9933993399339934
ROC AUC =  0.9973775155293307
Confusion matrix:
[[265  32]
 [  2 301]]


In [30]:
f = open("logreg.pkl", "wb")
pickle.dump(my_logreg_grid_search, f, pickle.HIGHEST_PROTOCOL)
f.close()

**Логистическая регрессия из sklearn**

In [31]:
from sklearn.linear_model import LogisticRegression

sk_logreg_pipe = Pipeline(
    steps=[
        ('preprocessing', data_preprocessing),
        ('logreg', LogisticRegression(class_weight='balanced'))
    ]
)

In [32]:
sk_logreg_grid_search = GridSearchCV(sk_logreg_pipe, {'logreg__penalty': ['none', 'l2'], 'logreg__C': [0.01, 0.1, 1, 10]})
sk_logreg_grid_search.fit(X_train, y_train);



In [33]:
sk_logreg_grid_search.best_params_

{'logreg__C': 0.01, 'logreg__penalty': 'none'}

In [34]:
get_metrics(sk_logreg_grid_search, X_test, y_test, threshold=0.588)

Accuracy =  0.9833333333333333
Precision =  0.9834983498349835
Recall =  0.9834983498349835
ROC AUC =  0.9927659432609928
Confusion matrix:
[[292   5]
 [  5 298]]


In [35]:
f = open("logreg_sklearn.pkl", "wb")
pickle.dump(sk_logreg_grid_search, f, pickle.HIGHEST_PROTOCOL)
f.close()

**Мой SVM**

In [36]:
my_svm_grid_search = RandomizedSearchCV(my_svm_pipe, {'svm__lr': [0.001, 0.01, 0.05, 0.1,],
                                  'svm__epochs': [10, 20, 30, 40],
                                  'svm__alpha': [0.01, 0.1, 1, 10]})
my_svm_grid_search.fit(X_train, y_train);

In [37]:
my_svm_grid_search.best_params_

{'svm__lr': 0.05, 'svm__epochs': 30, 'svm__alpha': 0.01}

In [38]:
get_metrics(my_svm_grid_search, X_test, y_test, use_probas=False)

Accuracy =  0.9533333333333334
Precision =  1.0
Recall =  0.9075907590759076
Confusion matrix:
[[297   0]
 [ 28 275]]


In [39]:
f = open("svm.pkl", "wb")
pickle.dump(my_svm_grid_search, f, pickle.HIGHEST_PROTOCOL)
f.close()

**SVM из sklearn**

In [40]:
from sklearn.svm import LinearSVC

sk_svm_pipe = Pipeline(
    steps=[
        ('preprocessing', data_preprocessing),
        ('svm', LinearSVC(class_weight='balanced'))
    ]
)

In [41]:
sk_svm_grid_search = GridSearchCV(sk_svm_pipe, {'svm__loss': ['hinge', 'squared_hinge'], 'svm__C': [0.01, 0.1, 1, 10]})
sk_svm_grid_search.fit(X_train, y_train);



In [42]:
sk_svm_grid_search.best_params_

{'svm__C': 10, 'svm__loss': 'squared_hinge'}

In [43]:
get_metrics(sk_svm_grid_search, X_test, y_test, use_probas=False)

Accuracy =  0.9833333333333333
Precision =  0.9834983498349835
Recall =  0.9834983498349835
Confusion matrix:
[[292   5]
 [  5 298]]


In [44]:
f = open("svm_sklearn.pkl", "wb")
pickle.dump(sk_svm_grid_search, f, pickle.HIGHEST_PROTOCOL)
f.close()

**Мой KNN**

In [45]:
my_knn_grid_search = GridSearchCV(my_knn_pipe, {'knn__k': [1, 3, 5, 7]})
my_knn_grid_search.fit(X_train, y_train);

In [46]:
my_knn_grid_search.best_params_

{'knn__k': 7}

In [47]:
get_metrics(my_knn_grid_search, X_test, y_test, threshold=0.5, use_probas=False)

Accuracy =  0.7316666666666667
Precision =  0.7448275862068966
Recall =  0.7128712871287128
Confusion matrix:
[[223  74]
 [ 87 216]]


In [48]:
f = open("knn.pkl", "wb")
pickle.dump(my_knn_grid_search, f, pickle.HIGHEST_PROTOCOL)
f.close()

**KNN из sklearn**

In [49]:
from sklearn.neighbors import KNeighborsClassifier

sk_knn_pipe = Pipeline(
    steps=[
        ('preprocessing', data_preprocessing),
        ('knn', KNeighborsClassifier())
    ]
)

In [50]:
sk_knn_grid_search = GridSearchCV(sk_knn_pipe, {'knn__n_neighbors': [1, 3, 5, 7]})
sk_knn_grid_search.fit(X_train, y_train);

In [51]:
sk_knn_grid_search.best_params_

{'knn__n_neighbors': 7}

In [52]:
get_metrics(sk_knn_grid_search, X_test, y_test, threshold=0.5)

Accuracy =  0.7316666666666667
Precision =  0.7448275862068966
Recall =  0.7128712871287128
ROC AUC =  0.8100532275449768
Confusion matrix:
[[223  74]
 [ 87 216]]


In [53]:
f = open("sklearn_knn.pkl", "wb")
pickle.dump(sk_knn_grid_search, f, pickle.HIGHEST_PROTOCOL)
f.close()

**Мой наивный алгоритм Байеса**

In [54]:
my_nb_pipe.fit(X_train, y_train);

In [55]:
get_metrics(my_nb_pipe, X_test, y_test, threshold=7e-12)

Accuracy =  0.505
Precision =  0.505
Recall =  1.0
ROC AUC =  0.7696769676967696
Confusion matrix:
[[  0 297]
 [  0 303]]


In [56]:
f = open("nb.pkl", "wb")
pickle.dump(my_nb_pipe, f, pickle.HIGHEST_PROTOCOL)
f.close()

**Наивный алгоритм Байеса из sklearn**

In [57]:
from sklearn.naive_bayes import GaussianNB

sk_nb_pipe = Pipeline(
    steps=[
        ('preprocessing', data_preprocessing),
        ('nb', GaussianNB())
    ]
)

In [58]:
sk_nb_pipe.fit(X_train, y_train);

In [59]:
get_metrics(sk_nb_pipe, X_test, y_test, threshold=0.93)

Accuracy =  0.8383333333333334
Precision =  1.0
Recall =  0.6798679867986799
ROC AUC =  0.9816981698169817
Confusion matrix:
[[297   0]
 [ 97 206]]


In [60]:
f = open("nb_sklearn.pkl", "wb")
pickle.dump(sk_nb_pipe, f, pickle.HIGHEST_PROTOCOL)
f.close()