In [31]:
import numpy as np
import pandas as pd
import scipy.linalg as sla
import matplotlib.pyplot as plt

from sklearn.metrics import confusion_matrix,classification_report,precision_score, plot_roc_curve, plot_precision_recall_curve, balanced_accuracy_score
from sklearn.metrics import accuracy_score
from sklearn import metrics


In [2]:
def logit(x, w):
    return np.dot(x, w)

def sigmoid(h):
    return 1. / (1 + np.exp(-h))

class MyLogisticRegression(object):
    def __init__(self):
        self.w = None
    
    def fit(self, X, y, max_iter=100, lr=0.1):
        # Принимает на вход X, y и вычисляет веса по данной выборке.
        # Множество допустимых классов: {1, -1}
        # Не забудьте про фиктивный признак равный 1!
        
        n, k = X.shape
        
        if self.w is None:
            self.w = np.random.randn(k + 1)
        
        X_train = np.concatenate((np.ones((n, 1)), X), axis=1)
        
        losses = []
        
        for iter_num in range(max_iter):
            z = sigmoid(logit(X_train, self.w))
            grad = np.dot(X_train.T, (z - y)) / len(y)

            self.w -= grad * lr

            losses.append(self.__loss(y, z))
        
        return losses
        
    def predict_proba(self, X):
        # Принимает на вход X и возвращает ответы модели
        n, k = X.shape
        X_ = np.concatenate((np.ones((n, 1)), X), axis=1)
        return sigmoid(logit(X_, self.w))

    def predict(self, X, threshold=0.5):
        return self.predict_proba(X) >= threshold
    
    def get_weights(self):
        return self.w
      
    def __loss(self, y, p):
        p = np.clip(p, 1e-10, 1 - 1e-10)
        return np.mean(y * np.log(p) + (1 - y) * np.log(1 - p))

In [4]:
df = pd.read_csv('water_potability.csv')
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3276 entries, 0 to 3275
Data columns (total 10 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   ph               2785 non-null   float64
 1   Hardness         3276 non-null   float64
 2   Solids           3276 non-null   float64
 3   Chloramines      3276 non-null   float64
 4   Sulfate          2495 non-null   float64
 5   Conductivity     3276 non-null   float64
 6   Organic_carbon   3276 non-null   float64
 7   Trihalomethanes  3114 non-null   float64
 8   Turbidity        3276 non-null   float64
 9   Potability       3276 non-null   int64  
dtypes: float64(9), int64(1)
memory usage: 256.1 KB


In [5]:
df.sample(5)

Unnamed: 0,ph,Hardness,Solids,Chloramines,Sulfate,Conductivity,Organic_carbon,Trihalomethanes,Turbidity,Potability
1896,7.044375,229.603173,12575.279627,7.837778,390.467684,558.176592,15.754785,36.26753,4.276699,1
1549,8.051022,171.751754,28338.66404,6.392121,319.005071,496.906266,16.19287,91.123219,3.767443,1
995,8.31238,203.744548,8727.247349,7.456302,,543.392988,15.4704,81.508682,2.988093,0
456,6.256651,181.072307,16905.801524,11.586151,,481.307052,10.639743,74.557241,4.181051,0
808,,182.582228,19575.475363,10.048383,424.688994,383.490954,19.482945,76.634798,4.329204,1


In [7]:
df.isnull().sum()

ph                 491
Hardness             0
Solids               0
Chloramines          0
Sulfate            781
Conductivity         0
Organic_carbon       0
Trihalomethanes    162
Turbidity            0
Potability           0
dtype: int64

In [9]:
df[df['Sulfate'].isnull()]
df[df['ph'].isnull()]
df[df['Trihalomethanes'].isnull()]

Unnamed: 0,ph,Hardness,Solids,Chloramines,Sulfate,Conductivity,Organic_carbon,Trihalomethanes,Turbidity,Potability
62,,229.485694,35729.692709,8.810843,384.943779,296.397547,16.927092,,3.855602,0
81,5.519126,168.728583,12531.601921,7.730723,,443.570372,18.099078,,3.758996,0
110,9.286155,222.661551,12311.268366,7.289866,332.239359,353.740100,14.171763,,5.239982,0
118,7.397413,122.541040,8855.114121,6.888689,241.607532,489.851600,13.365906,,3.149158,0
119,7.812804,196.583886,42550.841816,7.334648,,442.545775,14.666917,,6.204846,0
...,...,...,...,...,...,...,...,...,...,...
3174,6.698154,198.286268,34675.862845,6.263602,360.232834,430.935009,12.176678,,3.758180,1
3185,6.110022,234.800957,16663.539074,5.984536,348.055211,437.892115,10.059523,,2.817780,1
3219,6.417716,209.702425,31974.481631,7.263425,321.382124,289.450118,11.369071,,4.210327,1
3259,9.271355,181.259617,16540.979048,7.022499,309.238865,487.692788,13.228441,,4.333953,1


In [10]:
df['ph']=df['ph'].fillna(df.groupby(['Potability'])['ph'].transform('mean'))
df['Sulfate']=df['Sulfate'].fillna(df.groupby(['Potability'])['Sulfate'].transform('mean'))
df['Trihalomethanes']=df['Trihalomethanes'].fillna(df.groupby(['Potability'])['Trihalomethanes'].transform('mean'))

In [11]:
df.isna().sum()

ph                 0
Hardness           0
Solids             0
Chloramines        0
Sulfate            0
Conductivity       0
Organic_carbon     0
Trihalomethanes    0
Turbidity          0
Potability         0
dtype: int64

In [12]:
df['Potability'].value_counts()


0    1998
1    1278
Name: Potability, dtype: int64

In [14]:
from sklearn.model_selection import train_test_split

In [15]:
x = df.drop("Potability", axis=1)
y = df.Potability
X_train , X_test , y_train , y_test = train_test_split(x , y, test_size=0.25, random_state=42)

In [16]:
model = MyLogisticRegression()

In [25]:
model.fit(X_train, y_train)
w = model.get_weights()

  """


In [26]:
w

array([-4.02350148e-01, -6.82924883e+00, -3.71343377e+02,  5.17555045e+02,
        9.89766745e+00, -7.06395507e+02, -5.37416344e+02, -5.21613905e+01,
        6.38761577e+01, -2.48398908e+00])

In [27]:
y_pred = model.predict(X_train)

  """


In [29]:
from sklearn.metrics import classification_report, confusion_matrix,ConfusionMatrixDisplay

In [33]:
print("Accuracy Score:", metrics.accuracy_score(y_train, y_pred))
print("F1 Score:", metrics.f1_score(y_train, y_pred))
print("Average Precision Score:", metrics.average_precision_score(y_train, y_pred))
print("Log Loss:", metrics.log_loss(y_train, y_pred))
print("Precision Score:", metrics.precision_score(y_train, y_pred))
print("Recall Score:", metrics.recall_score(y_train, y_pred))
print("ROC-AUC Score:", metrics.roc_auc_score(y_train, y_pred))

Accuracy Score: 0.393976393976394
F1 Score: 0.5652554744525548
Average Precision Score: 0.3941370622928897
Log Loss: 20.93179806796831
Precision Score: 0.3941368078175896
Recall Score: 0.9989680082559339
ROC-AUC Score: 0.49948400412796695
