In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, RandomizedSearchCV
data = pd.read_csv("df_filtered.csv")
import pickle

In [2]:
from sklearn.metrics import confusion_matrix,classification_report,roc_auc_score

In [3]:
data.head()

Unnamed: 0,average_user_rating,average_user_rating_for_current_version,file_size_bytes,is_game_center_enabled,price,primary_genre_id,user_rating_count
0,4.67589,4.67589,36927488,0,0.0,6014,253
1,4.86087,4.86087,154926080,1,0.0,6014,253100
2,4.81924,4.81924,249300992,1,0.0,6014,2076657
3,4.52683,4.52683,243209216,0,0.0,6014,54892
4,4.7413,4.7413,323801088,0,0.0,6014,6150


In [4]:
X = data.drop('is_game_center_enabled', axis = 1)
Y = data.is_game_center_enabled

In [5]:
x_train, x_test, y_train, y_test = train_test_split(X,Y, test_size = 0.1, random_state = 0)

In [6]:
x_train.shape, x_test.shape, y_train.shape, y_test.shape

((1800, 6), (200, 6), (1800,), (200,))

# Для работы были выбраны следущие модели

## Nearest Neighbors Classification

In [7]:
from sklearn.neighbors import KNeighborsClassifier

In [8]:
knn = KNeighborsClassifier(n_neighbors=5)

In [9]:
knn.fit(x_train,y_train)

In [10]:
predict = knn.predict(x_test)

In [11]:
print(confusion_matrix(y_test,predict))
print('\n',classification_report(y_test,predict))
print('roc_auc_score:',roc_auc_score(y_test,predict))

[[120  20]
 [ 45  15]]

               precision    recall  f1-score   support

           0       0.73      0.86      0.79       140
           1       0.43      0.25      0.32        60

    accuracy                           0.68       200
   macro avg       0.58      0.55      0.55       200
weighted avg       0.64      0.68      0.65       200

roc_auc_score: 0.5535714285714287


In [22]:
neighbors = np.arange(3,150,2)
params = dict(n_neighbors = neighbors)
params

{'n_neighbors': array([  3,   5,   7,   9,  11,  13,  15,  17,  19,  21,  23,  25,  27,
         29,  31,  33,  35,  37,  39,  41,  43,  45,  47,  49,  51,  53,
         55,  57,  59,  61,  63,  65,  67,  69,  71,  73,  75,  77,  79,
         81,  83,  85,  87,  89,  91,  93,  95,  97,  99, 101, 103, 105,
        107, 109, 111, 113, 115, 117, 119, 121, 123, 125, 127, 129, 131,
        133, 135, 137, 139, 141, 143, 145, 147, 149])}

In [23]:
knn = KNeighborsClassifier()
randomized_search = RandomizedSearchCV(knn,params,scoring='accuracy',n_jobs = -1,refit = True)
randomized_search.fit(x_train,y_train)

In [24]:
randomized_search.best_params_

{'n_neighbors': 29}

In [25]:
predict = randomized_search.predict(x_test)

In [26]:
print(confusion_matrix(y_test,predict))
print('\n',classification_report(y_test,predict))
print('roc_auc_score:',roc_auc_score(y_test,predict))

[[130  10]
 [ 45  15]]

               precision    recall  f1-score   support

           0       0.74      0.93      0.83       140
           1       0.60      0.25      0.35        60

    accuracy                           0.72       200
   macro avg       0.67      0.59      0.59       200
weighted avg       0.70      0.72      0.68       200

roc_auc_score: 0.5892857142857143


In [28]:
knn = KNeighborsClassifier(n_neighbors=29)
knn.fit(x_train,y_train)
predict = knn.predict(x_test)

print(confusion_matrix(y_test,predict))
print('\n',classification_report(y_test,predict))
print('roc_auc_score:',roc_auc_score(y_test,predict))

[[130  10]
 [ 45  15]]

               precision    recall  f1-score   support

           0       0.74      0.93      0.83       140
           1       0.60      0.25      0.35        60

    accuracy                           0.72       200
   macro avg       0.67      0.59      0.59       200
weighted avg       0.70      0.72      0.68       200

roc_auc_score: 0.5892857142857143


In [33]:
with open("NearestNeighborsClassifier.pickle",'wb') as f:
    pickle.dump(knn,f)

## Polynomial regression: extending linear models with basis functions

In [34]:
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error,mean_squared_error,mean_absolute_percentage_error,r2_score
from math import sqrt

In [36]:
def metrics(y_test,y_pred):
    print(f'MAE: {mean_absolute_error(y_test,y_pred)}')
    print(f'MSE: {mean_squared_error(y_test,y_pred)}')
    print(f'RMAE: {sqrt(mean_absolute_error(y_test,y_pred))}')
    print(f'MAPE: {sqrt(mean_absolute_percentage_error(y_test,y_pred))}')
    print(f'R^2: {r2_score(y_test,y_pred)}')

In [37]:
p = PolynomialFeatures()
X_train_p = p.fit_transform(x_train)
X_test_p = p.fit_transform(x_test)
parameters = {'alpha': np.arange(0,2.1,0.1)}

In [39]:
lr = LinearRegression().fit(X_train_p,y_train)
y_pred = lr.predict(X_test_p)
metrics(y_test,y_pred)

MAE: 0.38651475592548723
MSE: 0.19735240635644347
RMAE: 0.6217031091489629
MAPE: 28827407.765157875
R^2: 0.06022663639788817


In [44]:
with open("PolynomialRegression.pickle",'wb') as f:
    pickle.dump(lr,f)