In [2]:
import pandas as pd
from catboost import CatBoostClassifier
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import mean_squared_error
import numpy as np

In [1]:
!pip install catboost -q

Collecting catboost
  Downloading catboost-1.2.8-cp312-cp312-manylinux2014_x86_64.whl.metadata (1.2 kB)
Downloading catboost-1.2.8-cp312-cp312-manylinux2014_x86_64.whl (99.2 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m99.2/99.2 MB[0m [31m7.9 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: catboost
Successfully installed catboost-1.2.8


In [3]:
df = pd.read_excel("/content/diabetes (1).xlsx")  # sep="\t" для табуляции
print(df.head())

   Беременность  Глюкоза  АД  Толщина КС  Инсулин   ИМТ  Наследственность  \
0             6      148  72        35.0      0.0  33.6             0.627   
1             1       85  66        29.0      0.0  26.6             0.351   
2             8      183  64         0.0      0.0  23.3             0.672   
3             1       89  66        23.0     94.0  28.1             0.167   
4             0      137  40        35.0    168.0  43.1             2.288   

   Возраст  Диагноз  
0       50        1  
1       31        0  
2       32        1  
3       21        0  
4       33        1  


In [47]:
df.shape

(768, 9)

In [None]:
df["Наследственность"]

Unnamed: 0,Наследственность
0,0.627
1,0.351
2,0.672
3,0.167
4,2.288
...,...
763,0.171
764,0.34
765,0.245
766,0.349


In [48]:
df.dtypes

Unnamed: 0,0
Беременность,int64
Глюкоза,int64
АД,int64
Толщина КС,float64
Инсулин,float64
ИМТ,float64
Наследственность,float64
Возраст,int64
Диагноз,int64


In [4]:
X = df.drop(columns=['Диагноз']).values.astype(float)
y = df['Диагноз'].values.astype(int)
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=70, random_state=42)

In [5]:
def sigmoid(z):
    return 1 / (1 + np.exp(-z))

def fit_gd(X, y, lr=0.1, epochs=10000, tol=1e-6):
    n, m = X.shape
    Xb = np.hstack([np.ones((n, 1)), X])
    w = np.zeros(m + 1)
    prev_loss = np.inf

    for _ in range(epochs):
        z = Xb.dot(w)
        p = sigmoid(z)
        eps = 1e-12
        loss = -np.mean(y * np.log(p + eps) + (1 - y) * np.log(1 - p + eps))
        grad = Xb.T.dot(p - y) / n
        w -= lr * grad
        if abs(prev_loss - loss) < tol:
            break
        prev_loss = loss
    return w

In [7]:
w = fit_gd(X, y, lr=0.5, epochs=20000)
print("Найденные коэффициенты (градиентный спуск):", w)

  return 1 / (1 + np.exp(-z))


Найденные коэффициенты (градиентный спуск): [-336.50357852  154.19821469   21.25701915  -37.11911031   -0.52750835
    2.94347277    9.74125593   78.31777378  -12.54150199]


In [8]:
model = LogisticRegression()
model.fit(X_train, y_train)  # просто инициализация

model.intercept_ = np.array([w[0]])
model.coef_ = np.array([w[1:]])

print("Интерсепт sklearn:", model.intercept_)
print("Коэффициенты sklearn:", model.coef_)


Интерсепт sklearn: [-336.50357852]
Коэффициенты sklearn: [[154.19821469  21.25701915 -37.11911031  -0.52750835   2.94347277
    9.74125593  78.31777378 -12.54150199]]


STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [9]:
y_prob = model.predict_proba(X_test)[:, 1]  # берем вероятность класса 1

mse = mean_squared_error(y_test, y_prob)
print("MSE на всей выборке:", mse)

MSE на всей выборке: 0.40301310001580226


In [10]:
feature_cols = [c for c in df.columns if c != "Диагноз"]
m = len(feature_cols)

corr_abs = df[feature_cols + ["Диагноз"]].corr()["Диагноз"].abs().drop("Диагноз")
corr_abs = corr_abs.sort_values(ascending=False)
k = m - 2
selected_features = list(corr_abs.index[:k])
print("Выбранные признаки:", selected_features)
X_sel = df[selected_features].values

Выбранные признаки: ['Глюкоза', 'ИМТ', 'Возраст', 'Беременность', 'Наследственность', 'Инсулин']


In [11]:
model = CatBoostClassifier(iterations=1000, learning_rate=0.05, depth=6, cat_features=[])
model.fit(X_train, y_train, eval_set=(X_test, y_test), verbose=200)

y_pred = model.predict(X_test)

y_prob = model.predict_proba(X_test)[:, 1]

mse = mean_squared_error(y_test, y_prob)
print(f"MSE на тестовой выборке для CatBoost: {mse:.4f}")

0:	learn: 0.6613750	test: 0.6769265	best: 0.6769265 (0)	total: 49.4ms	remaining: 49.4s
200:	learn: 0.0195069	test: 0.6759879	best: 0.5170158 (32)	total: 316ms	remaining: 1.25s
400:	learn: 0.0079949	test: 0.7970169	best: 0.5170158 (32)	total: 457ms	remaining: 683ms
600:	learn: 0.0051155	test: 0.8663639	best: 0.5170158 (32)	total: 599ms	remaining: 398ms
800:	learn: 0.0037525	test: 0.9129602	best: 0.5170158 (32)	total: 754ms	remaining: 187ms
999:	learn: 0.0029185	test: 0.9456608	best: 0.5170158 (32)	total: 896ms	remaining: 0us

bestTest = 0.5170158275
bestIteration = 32

Shrink model to first 33 iterations.
MSE на тестовой выборке для CatBoost: 0.1723
