#### Чтение csv-файла и вывод информации по нему

In [40]:
import numpy as np
import pandas as pd

In [41]:
df = pd.read_csv("diabetes.csv")

In [42]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 768 entries, 0 to 767
Data columns (total 9 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   Pregnancies               768 non-null    int64  
 1   Glucose                   768 non-null    int64  
 2   BloodPressure             768 non-null    int64  
 3   SkinThickness             768 non-null    int64  
 4   Insulin                   768 non-null    int64  
 5   BMI                       768 non-null    float64
 6   DiabetesPedigreeFunction  768 non-null    float64
 7   Age                       768 non-null    int64  
 8   Outcome                   768 non-null    int64  
dtypes: float64(2), int64(7)
memory usage: 54.1 KB


#### Предварительная обработка датасета

##### Вывод количества отсутствующих значений по каждой колонке датасета

In [43]:
df.isna().sum()

Pregnancies                 0
Glucose                     0
BloodPressure               0
SkinThickness               0
Insulin                     0
BMI                         0
DiabetesPedigreeFunction    0
Age                         0
Outcome                     0
dtype: int64

В датасете нет пустых значений, следовательно, обрабатывать нечего :)

##### Поиск категориальных признаков

In [44]:
df.dtypes

Pregnancies                   int64
Glucose                       int64
BloodPressure                 int64
SkinThickness                 int64
Insulin                       int64
BMI                         float64
DiabetesPedigreeFunction    float64
Age                           int64
Outcome                       int64
dtype: object

В датасете нет категориальных признаков, следовательно, кодировать нечего :))

##### Стандартизация

In [45]:
for column in df:
	if column == "Outcome": continue
	column_mean, column_std = df[column].mean(), df[column].std()
	df[column] = (df[column] - column_mean) / column_std
df

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,0.639530,0.847771,0.149543,0.906679,-0.692439,0.203880,0.468187,1.425067,1
1,-0.844335,-1.122665,-0.160441,0.530556,-0.692439,-0.683976,-0.364823,-0.190548,0
2,1.233077,1.942458,-0.263769,-1.287373,-0.692439,-1.102537,0.604004,-0.105515,1
3,-0.844335,-0.997558,-0.160441,0.154433,0.123221,-0.493721,-0.920163,-1.040871,0
4,-1.141108,0.503727,-1.503707,0.906679,0.765337,1.408828,5.481337,-0.020483,1
...,...,...,...,...,...,...,...,...,...
763,1.826623,-0.622237,0.356200,1.721613,0.869464,0.115094,-0.908090,2.530487,0
764,-0.547562,0.034575,0.046215,0.405181,-0.692439,0.609757,-0.398023,-0.530677,0
765,0.342757,0.003299,0.149543,0.154433,0.279412,-0.734711,-0.684747,-0.275580,0
766,-0.844335,0.159683,-0.470426,-1.287373,-0.692439,-0.240048,-0.370859,1.169970,1


##### Деление датасета на тренировочный и тестовый

In [46]:
from sklearn.model_selection import train_test_split

Y, X = df["Outcome"], df.drop(columns=["Outcome"])
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.3, random_state=0)

#### Регрессия

In [47]:
import warnings
from sklearn.metrics import accuracy_score

class CustomLogisticRegression():
	def __init__(self, learning_rate=0.01, n_iter=250, method="g"):
		self.weights = None
		self.bias = None
		self.train_accuracy = None
		self.loss = None
		self.learning_rate = learning_rate
		self.n_iter = n_iter
		self.method = method
	
	def fit(self, X, Y):
		self.weights = np.zeros(X.shape[1])
		self.bias = 0
		for _ in range(self.n_iter):
			weights = np.dot(self.weights, X.T) + self.bias
			pred = self._get_sigmoid(weights)
			match self.method:
				case "g": error_weights, error_bias = self._calculate_gradients(X, Y, pred)
				case "n": error_weights, error_bias = self._calculate_newton(X, Y, pred)
			self.weights = self.weights - self.learning_rate * error_weights
			self.bias = self.bias - self.learning_rate * error_bias

		pred_to_class = [1 if p > 0.5 else 0 for p in pred]
		self.train_accuracy = accuracy_score(Y, pred_to_class)
		self.loss = self._calculate_loss(Y, pred)
	
	def predict(self, x):
		x_dot_weights = np.dot(x, self.weights.T) + self.bias
		probabilities = self._get_sigmoid(x_dot_weights)
		return [1 if p > 0.5 else 0 for p in probabilities]
	
	def _calculate_loss(self, y_true, y_pred):
		y_zero_loss = y_true * np.log(y_pred + 1e-30)
		y_one_loss = (1-y_true) * np.log(1 - y_pred + 1e-30)
		return -np.mean(y_zero_loss + y_one_loss)
	
	def _get_sigmoid(self, x):
		warnings.simplefilter('ignore')
		return 1 / (1 + np.exp(-x))
	
	def _calculate_gradients(self, X, Y, pred):
		difference = pred - Y
		gradients_weights = np.dot(X.T, difference)
		gradient_bias = np.mean(difference)
		return gradients_weights, gradient_bias
	
	def _calculate_newton(self, X, Y, pred):
		difference = pred - Y
		gradient = np.dot(X.T, difference)

		W = np.array(pred * (1 - pred))
		hessian = X.T @ W @ X.T

		try:
			inv_hessian = np.linalg.inv(hessian)
			error_weights = np.dot(inv_hessian, gradient)
			error_bias = np.sum(difference)
		except np.linalg.LinAlgError:  # Вырожденный гессиан
			error_weights = gradient
			error_bias = np.sum(difference)

		return error_weights, error_bias

#### Вывод оценок модели

In [66]:
def get_confusion_matrix(Y_test, pred):
	confusion_matrix = np.zeros((2, 2), dtype=int)
	for index, value in enumerate(list(Y_test)):
		confusion_matrix[value][pred[index]] += 1
	return confusion_matrix

In [68]:
def get_scores(Y_test, pred):
	confusion_matrix = get_confusion_matrix(Y_test, pred)
	scores = dict()
	accuracy = sum([confusion_matrix[i][i] for i in range(len(confusion_matrix))]) / len(Y_test)
	for i in range(len(confusion_matrix)):
		precision = confusion_matrix[i][i] / sum(confusion_matrix[i])
		recall = confusion_matrix[i][i] / sum([confusion_matrix[j][i] for j in range(len(confusion_matrix))])
		f1_score = 2 * precision * recall / (precision + recall)
	scores['accuracy'], scores['precision'], scores['recall'], scores['f1_score'] = accuracy, precision, recall, f1_score
	return scores

In [69]:
methods = ('g', 'n')
n_iters = (10, 100, 1000)
learning_rates = (0.1, 0.01, 0.001)

for method in methods:
	print("Метод", "градиентного спуска" if method == "g" else "оптимизации Ньютона")
	df_dict = dict()
	for score in ('accuracy', 'precision', 'recall', 'f1_score'):
		df_dict[score] = np.zeros((3, 3))
	for i, n_iter in enumerate(n_iters):
		for j, learning_rate in enumerate(learning_rates):
			log = CustomLogisticRegression(n_iter=n_iter, learning_rate=learning_rate, method=method)
			log.fit(X_train, Y_train)
			pred = log.predict(X_test)
			scores = get_scores(Y_test, pred)
			for score, value in scores.items():
				df_dict[score][i][j] = value
	
	for key, df in df_dict.items():
		df = pd.DataFrame(df, index=[str(iter) for iter in n_iters], columns=[str(rate) for rate in learning_rates])
		print(f"{key.upper()}:")
		print(df)
		print()
	print()

Метод градиентного спуска
ACCURACY:
           0.1      0.01     0.001
10    0.722944  0.740260  0.740260
100   0.718615  0.744589  0.740260
1000  0.740260  0.783550  0.744589

PRECISION:
           0.1      0.01     0.001
10    0.729730  0.783784  0.770270
100   0.743243  0.729730  0.783784
1000  0.621622  0.581081  0.729730

RECALL:
           0.1      0.01     0.001
10    0.551020  0.568627  0.570000
100   0.544554  0.580645  0.568627
1000  0.589744  0.693548  0.580645

F1_SCORE:
           0.1      0.01     0.001
10    0.627907  0.659091  0.655172
100   0.628571  0.646707  0.659091
1000  0.605263  0.632353  0.646707


Метод оптимизации Ньютона
ACCURACY:
           0.1      0.01     0.001
10    0.718615  0.779221  0.744589
100   0.701299  0.779221  0.779221
1000  0.701299  0.779221  0.779221

PRECISION:
           0.1      0.01     0.001
10    0.648649  0.527027  0.513514
100   0.540541  0.527027  0.527027
1000  0.540541  0.527027  0.527027

RECALL:
           0.1      0.01     0.00