# Оценка точности модели с использованием тестовой и обучающей выборки

In [5]:
from pandas import read_csv
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression

In [6]:
# Load the Pima Indians diabetes CSV. Column labels are supplied explicitly
# through `names` instead of being read from the file.
names = [
    'preg', 'plas', 'pres', 'skin',
    'test', 'mass', 'pedi', 'age',
    'class',
]
filename = 'git/data/pima-indians-diabetes.data.csv'
dataframe = read_csv(filename, names=names)
# Preview the first ten rows.
dataframe.head(10)

Unnamed: 0,preg,plas,pres,skin,test,mass,pedi,age,class
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1
5,5,116,74,0,0,25.6,0.201,30,0
6,3,78,50,32,88,31.0,0.248,26,1
7,10,115,0,0,0,35.3,0.134,29,0
8,2,197,70,45,543,30.5,0.158,53,1
9,8,125,96,0,0,0.0,0.232,54,1


In [8]:
# Separate the eight feature columns from the target in the ninth column,
# then make a single hold-out split with a fixed seed for reproducibility.
array = dataframe.values
X, Y = array[:, :8], array[:, 8]
seed = 7
test_size = 0.33
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=test_size,
    random_state=seed,
)

In [9]:
# Unfitted logistic-regression classifier using the liblinear solver.
model = LogisticRegression(solver = 'liblinear')

In [10]:
# Train the classifier on the training part of the split only.
model.fit(X_train, Y_train)

LogisticRegression(solver='liblinear')

In [11]:
# Score the fitted model on the held-out test part (reported as accuracy below).
result = model.score(X_test, Y_test)

In [13]:
# Show the test score as a percentage with three decimals.
print('Accuracy: %.3f%%' %(result*100))

Accuracy: 75.591%


# Кросс-валидация (мало данных)

In [14]:
from sklearn.model_selection import KFold, cross_val_score

In [15]:
# 10-fold splitter; rows are shuffled with a fixed random_state so the folds are reproducible.
kfold = KFold(n_splits = 10, random_state = 7, shuffle = True)

In [16]:
# Fresh, unfitted classifier to be evaluated by cross-validation.
model = LogisticRegression(solver = 'liblinear')

In [17]:
# Evaluate the model on every fold of `kfold`; `results` holds one score per fold.
results = cross_val_score(model, X, Y, cv = kfold)

In [19]:
# Mean and standard deviation of the per-fold scores, both as percentages.
print('Accuracy: %.3f%% (%.3f%%)' %(results.mean()*100, results.std()*100))

Accuracy: 77.086% (5.091%)


# Кросс-валидация по отдельным объектам

In [20]:
from sklearn.model_selection import LeaveOneOut

In [21]:
# Leave-one-out splitter: each sample serves once as a one-element test set.
looCV = LeaveOneOut()

In [22]:
# Fresh, unfitted classifier for the leave-one-out evaluation.
model = LogisticRegression(solver = 'liblinear')

In [23]:
# One fit and one score per sample, so this is the slowest of the schemes used here.
res = cross_val_score(model, X, Y, cv = looCV)

In [24]:
# Each test set has a single sample, so every score is either 0 or 1; the large
# standard deviation reflects that 0/1 spread, not instability between folds.
print('Accuracy: %.3f%% (%.3f%%)' %(res.mean()*100, res.std()*100))

Accuracy: 76.823% (42.196%)


# Кросс-валидация с элементом случайности

In [25]:
from sklearn.model_selection import ShuffleSplit

In [1]:
# Settings for the repeated random train/test splits.
n_splits = 10
test_size = 0.33
# NOTE(review): `speed` is passed as the splitter's random_state, i.e. it is the
# random seed; the name looks like a typo for `seed`.
speed = 7

In [27]:
# Repeated random-split generator; note it is stored under the name `kfold`
# although it is a ShuffleSplit, not a KFold.
kfold = ShuffleSplit(n_splits = n_splits, test_size = test_size, random_state = speed)

In [28]:
# Fresh, unfitted classifier for the shuffle-split evaluation.
model = LogisticRegression(solver = 'liblinear')

In [29]:
# One score per random split produced by the ShuffleSplit generator.
resul = cross_val_score(model, X, Y, cv = kfold)

In [30]:
# Mean and standard deviation over the random splits, as percentages.
print('Accuracy: %.3f%% (%.3f%%)' %(resul.mean()*100, resul.std()*100))

Accuracy: 76.496% (1.698%)


# Метрики качества моделей

## Задача классификации

Оценка логистической функции потерь

In [31]:
# Shuffled 10-fold splitter with a fixed random_state, shared by the classification metrics below.
kFold = KFold(n_splits = 10, random_state = 7, shuffle = True)

In [32]:
# Unfitted classifier; cross_val_score fits it separately for each fold.
model = LogisticRegression(solver = 'liblinear')

In [33]:
# Scorer name for the negated logarithmic loss (scikit-learn scorers are "greater is better").
scoring = 'neg_log_loss'

In [34]:
# Per-fold values of the metric selected by `scoring`.
resultat = cross_val_score(model, X, Y, cv = kFold, scoring = scoring)

In [36]:
# The printed mean is negative because the scorer is 'neg_log_loss';
# the log loss itself is the absolute value of that mean.
print('Logloss: %.3f (%.3f)' %(resultat.mean(), resultat.std()))

Logloss: -0.494 (0.042)


Площадь под ROC-кривой

In [37]:
# Switch the metric to the area under the ROC curve.
scoring = 'roc_auc'

In [38]:
# Per-fold ROC AUC, reusing `model` and `kFold` from the earlier cells.
# NOTE(review): the name `re` would shadow the standard `re` module if it were imported here.
re = cross_val_score(model, X, Y, cv = kFold, scoring = scoring)

In [40]:
# Mean and standard deviation of the per-fold AUC values.
print('AUC: %.3f (%.3f)' %(re.mean(), re.std()))

AUC: 0.826 (0.050)


Матрица ошибок

In [41]:
from sklearn.metrics import confusion_matrix #матрица ошибок

In [42]:
# Single hold-out split with a fixed seed; the confusion matrix and the
# classification report are computed on its test part.
seed = 7
test_size = 0.33
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=test_size,
    random_state=seed,
)

In [43]:
# Fresh, unfitted classifier for the hold-out evaluation.
model = LogisticRegression(solver = 'liblinear')

In [44]:
# Fit on the training part only so the test part stays unseen.
model.fit(X_train, Y_train)

LogisticRegression(solver='liblinear')

In [45]:
# Predicted class labels for the test samples.
predicted = model.predict(X_test)

In [52]:
# Confusion matrix of true vs. predicted labels on the test part.
# Rows correspond to the true classes and columns to the predicted ones.
matrix = confusion_matrix(y_true=Y_test, y_pred=predicted)
print(matrix)

[[141  21]
 [ 41  51]]


Строки - известные (истинные) данные, столбцы - предсказание. Главная диагональ - данные, предсказанные верно, остальные элементы не совпали.
Первая строка/столбец - класс 0 (значения False), вторая строка/столбец - класс 1 (значения True).

## Отчет по классификации

In [47]:
from sklearn.metrics import classification_report

In [53]:
# Text summary of per-class precision, recall, F1 and support on the test part.
report = classification_report(y_true=Y_test, y_pred=predicted)
print(report)

              precision    recall  f1-score   support

         0.0       0.77      0.87      0.82       162
         1.0       0.71      0.55      0.62        92

    accuracy                           0.76       254
   macro avg       0.74      0.71      0.72       254
weighted avg       0.75      0.76      0.75       254



# Задача регрессии

## Средняя абсолютная ошибка MAE

In [54]:
from sklearn.linear_model import LinearRegression

In [56]:
# Load the housing dataset from a whitespace-separated file; column labels
# are supplied explicitly through `names`.
filename = 'git/data/housing.csv'
names = ['CRIM', 'ZN', 'INDUS', 'CHAS', 'NOX', 'RM', 'AGE', 'DIS', 'RAD', 'TAX', 'PTRATIO',
'B', 'LSTAT', 'MEDV']
# `delim_whitespace=True` is deprecated since pandas 2.2; `sep=r'\s+'` is the
# documented equivalent and works on older and newer pandas alike.
dataframe = read_csv(filename, sep=r'\s+', names=names)
# Preview the first ten rows.
dataframe.head(10)

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT,MEDV
0,0.00632,18.0,2.31,0,0.538,6.575,65.2,4.09,1,296.0,15.3,396.9,4.98,24.0
1,0.02731,0.0,7.07,0,0.469,6.421,78.9,4.9671,2,242.0,17.8,396.9,9.14,21.6
2,0.02729,0.0,7.07,0,0.469,7.185,61.1,4.9671,2,242.0,17.8,392.83,4.03,34.7
3,0.03237,0.0,2.18,0,0.458,6.998,45.8,6.0622,3,222.0,18.7,394.63,2.94,33.4
4,0.06905,0.0,2.18,0,0.458,7.147,54.2,6.0622,3,222.0,18.7,396.9,5.33,36.2
5,0.02985,0.0,2.18,0,0.458,6.43,58.7,6.0622,3,222.0,18.7,394.12,5.21,28.7
6,0.08829,12.5,7.87,0,0.524,6.012,66.6,5.5605,5,311.0,15.2,395.6,12.43,22.9
7,0.14455,12.5,7.87,0,0.524,6.172,96.1,5.9505,5,311.0,15.2,396.9,19.15,27.1
8,0.21124,12.5,7.87,0,0.524,5.631,100.0,6.0821,5,311.0,15.2,386.63,29.93,16.5
9,0.17004,12.5,7.87,0,0.524,6.004,85.9,6.5921,5,311.0,15.2,386.71,17.1,18.9


In [57]:
# First thirteen columns are the features; the fourteenth column is the
# regression target.
array = dataframe.values
X, Y = array[:, :13], array[:, 13]

In [58]:
# Cross-validated mean absolute error of a plain linear regression
# over ten shuffled folds.
scoring = 'neg_mean_absolute_error'
model = LinearRegression()
kfold = KFold(n_splits=10, shuffle=True, random_state=7)
results = cross_val_score(model, X, Y, scoring=scoring, cv=kfold)
# The scorer is the negated error, so the printed mean is negative;
# the MAE itself is its absolute value.
print("MAE: %.3f (%.3f)" % (results.mean(), results.std()))

MAE: -3.387 (0.667)


## Среднеквадратичная ошибка MSE

In [59]:
# Cross-validated mean squared error of a plain linear regression
# over ten shuffled folds.
scoring = 'neg_mean_squared_error'
model = LinearRegression()
kfold = KFold(n_splits=10, shuffle=True, random_state=7)
results = cross_val_score(model, X, Y, scoring=scoring, cv=kfold)
# The scorer is the negated error, so the printed mean is negative;
# the MSE itself is its absolute value.
print("MSE: %.3f (%.3f)" % (results.mean(), results.std()))

MSE: -23.747 (11.143)


## R^2

In [60]:
# Cross-validated coefficient of determination (R^2) of a plain linear
# regression over ten shuffled folds.
scoring = 'r2'
model = LinearRegression()
kfold = KFold(n_splits=10, shuffle=True, random_state=7)
results = cross_val_score(model, X, Y, scoring=scoring, cv=kfold)
print("R^2: %.3f (%.3f)" % (results.mean(), results.std()))

R^2: 0.718 (0.099)
