In [None]:
import math
import tqdm
import random
import numpy as np
import pandas as pd

from sklearn.impute import KNNImputer
from sklearn.metrics import confusion_matrix
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import StandardScaler, LabelEncoder

from collections import defaultdict
from typing import List, Dict, Literal

from scratch.complex_typing import Vector
from scratch.metrics import precision, recall, f1_score

Now we have see how many tools works from scratch we will use many sklearn implementations.

In [2]:
df_raw = pd.read_csv('datasets/titanic_dataset.csv')
df_adjusted = df_raw.copy()

In [3]:
def summary(df):
    """
    Prints a detailed summary of each column in a pandas DataFrame.
    Shows type, missing values, and descriptive statistics for each column.
    """
    print(f"\nDataFrame Summary: {df.shape[0]} rows, {df.shape[1]} columns")
    print("=" * 60)

    for col in df.columns:
        print(f"\nColumn: {col}")
        print(f"Type: {df[col].dtype}")
        print(f"Missing values: {df[col].isna().sum()}")

        if pd.api.types.is_numeric_dtype(df[col]):
            print(f"Mean: {df[col].mean():.5f}")
            print(f"Median: {df[col].median():.5f}")
            print(f"Min: {df[col].min():.5f}")
            print(f"Max: {df[col].max():.5f}")
            print(f"Standard Deviation: {df[col].std():.5f}")
        elif pd.api.types.is_string_dtype(df[col]) or pd.api.types.is_categorical_dtype(df[col]):
            print("Value counts:")
            print(df[col].value_counts())
        elif pd.api.types.is_bool_dtype(df[col]):
            print("Value counts:")
            print(df[col].value_counts())
        elif pd.api.types.is_datetime64_any_dtype(df[col]):
            print(f"Min date: {df[col].min():.5f}")
            print(f"Max date: {df[col].max():.5f}")
        else:
            print("Unrecognized or complex data type.")

    print("\n" + "=" * 60 + "\nEnd of summary.\n")

In [4]:
summary(df_raw)


DataFrame Summary: 891 rows, 12 columns

Column: PassengerId
Type: int64
Missing values: 0
Mean: 446.00000
Median: 446.00000
Min: 1.00000
Max: 891.00000
Standard Deviation: 257.35384

Column: Survived
Type: int64
Missing values: 0
Mean: 0.38384
Median: 0.00000
Min: 0.00000
Max: 1.00000
Standard Deviation: 0.48659

Column: Pclass
Type: int64
Missing values: 0
Mean: 2.30864
Median: 3.00000
Min: 1.00000
Max: 3.00000
Standard Deviation: 0.83607

Column: Name
Type: object
Missing values: 0
Value counts:
Name
Braund, Mr. Owen Harris                     1
Boulos, Mr. Hanna                           1
Frolicher-Stehli, Mr. Maxmillian            1
Gilinski, Mr. Eliezer                       1
Murdlin, Mr. Joseph                         1
                                           ..
Kelly, Miss. Anna Katherine "Annie Kate"    1
McCoy, Mr. Bernard                          1
Johnson, Mr. William Cahoone Jr             1
Keane, Miss. Nora A                         1
Dooley, Mr. Patrick           

  elif pd.api.types.is_string_dtype(df[col]) or pd.api.types.is_categorical_dtype(df[col]):


In [5]:
df_adjusted.columns = [col_name.strip().lower().replace(" ","_") for col_name in df_adjusted.columns]
predictors_num = ['age', 'sibsp', 'parch','fare']
predictors_cat = ['pclass', 'sex', 'embarked']
target = 'survived'
df_adjusted = df_adjusted[predictors_num + predictors_cat + [target]]

train_data, test_data= train_test_split(df_adjusted, test_size=0.2, random_state=42)
train_data_original = train_data.copy()
test_data_original = test_data.copy()


After this point we already have our data frame splitted into train and test datasets, we also need 2 more transofromations but we will do that over each dataset in different times:
- Inputations
- Get dummies
- Normalization

All this is important because we need an inputer and a StandardScaler for train dataset and we need ti apply exactly the same over the test dataset


We need to inpute information for Age and Embarked we will do specific inputers for that

In [6]:
class GroupByImputer(BaseEstimator, TransformerMixin):
    def __init__(self, group_cols, target_col, agg='mean'):
        self.group_cols = group_cols
        self.target_col = target_col
        self.agg = agg

    def fit(self, X, y=None):
        self.fill_values_ = (
            X.groupby(self.group_cols)[self.target_col]
            .agg(self.agg)
            .to_dict()
        )
        return self

    def transform(self, X):
        X = X.copy()
        def get_fill_value(row):
            key = tuple(row[col] for col in self.group_cols)
            return self.fill_values_.get(key, X[self.target_col].mean())
        
        mask = X[self.target_col].isnull()
        X.loc[mask, self.target_col] = X[mask].apply(get_fill_value, axis=1)
        return X

**Imputation**

*Train dataset*

In [7]:
# Apply the imputer we code before for getting ages considering agreggations in "sex" and "pclass"
age_imputer = GroupByImputer(group_cols=['sex','pclass'], target_col='age', agg = 'mean')
age_imputer.fit(train_data)
train_data = age_imputer.transform(train_data)

# Apply a knn imputer for "embarked" this is already available in sklearn modules
# Input embarked
col_to_inpute = 'embarked'
le = LabelEncoder()
not_nan_mask = train_data[col_to_inpute].notna()
le.fit(train_data.loc[not_nan_mask, col_to_inpute])
train_data.loc[not_nan_mask, col_to_inpute] = le.transform(train_data.loc[not_nan_mask, col_to_inpute])

# Convert to float
train_data[col_to_inpute] = train_data[col_to_inpute].astype(float)

# Apply KNN imputer
knn_imputer = KNNImputer(n_neighbors=2)
columns = predictors_num + [col_to_inpute]
knn_imputer.fit(train_data[columns])
imputed = knn_imputer.transform(train_data[columns])

# Replace the encoded column
train_data[col_to_inpute] = imputed[:, columns.index(col_to_inpute)]

# Decode back to original string labels
train_data[col_to_inpute] = train_data[col_to_inpute].round().astype(int)
train_data[col_to_inpute] = le.inverse_transform(train_data[col_to_inpute])


*Test dataset*

In [8]:
# Apply the imputer we code before - We still using the one fitted in train dataset
test_data = age_imputer.transform(test_data)

# Apply a knn imputer for "embarked" this is already available in sklearn modules
# Input embarked
col_to_inpute = 'embarked'
not_nan_mask = test_data[col_to_inpute].notna()
test_data.loc[not_nan_mask, col_to_inpute] = le.transform(test_data.loc[not_nan_mask, col_to_inpute])

# Convert to float
test_data[col_to_inpute] = test_data[col_to_inpute].astype(float)

# Apply KNN imputer
columns = predictors_num + [col_to_inpute]
imputed = knn_imputer.transform(test_data[columns])

# Replace the encoded column
test_data[col_to_inpute] = imputed[:, columns.index(col_to_inpute)]

# Decode back to original string labels
test_data[col_to_inpute] = test_data[col_to_inpute].round().astype(int)
test_data[col_to_inpute] = le.inverse_transform(test_data[col_to_inpute])

**Dummies variables for categorical predictors**

*Train dataset*

In [9]:
train_data = pd.get_dummies(train_data, columns=predictors_cat)
train_data = train_data.replace({True: 1, False: 0})

# Reordenar las columnas: target al final
train_data = train_data[[c for c in train_data.columns if c != target] + [target]]

  train_data = train_data.replace({True: 1, False: 0})


*Test dataset*

In [10]:
# Get dummies
test_data = pd.get_dummies(test_data, columns=predictors_cat)
test_data = test_data.replace({True: 1, False: 0})

# Reordenar las columnas: target al final
test_data = test_data[[c for c in test_data.columns if c != target] + [target]]

  test_data = test_data.replace({True: 1, False: 0})


**Normalization for numerical variables**

*Train dataset*

In [11]:
# Columns to standardize
cols_to_scale = predictors_num

# Build transformer
standar_scaler = ColumnTransformer(
    transformers=[
        ('scaler', StandardScaler(), cols_to_scale)
    ],
    remainder='passthrough'  # keep other columns (e.g. 'gender') untouched
)

# Transform the data
standar_scaler.fit(train_data)
data_scaled = standar_scaler.transform(train_data)

# Get correct column order and names
new_columns = cols_to_scale + [col for col in train_data.columns if col not in cols_to_scale]
train_data = pd.DataFrame(data_scaled, columns=new_columns)

*Test dataset*

In [12]:

# Normalization - We still using the one fitted in train
data_scaled = standar_scaler.transform(test_data)

# Get correct column order and names
new_columns = cols_to_scale + [col for col in test_data.columns if col not in cols_to_scale]
test_data = pd.DataFrame(data_scaled, columns=new_columns)

**Model**

In [13]:
final_predictors = [col for col in train_data.columns if col!='survived']
final_target = 'survived'

# Training data
X_train = train_data[final_predictors]
y_train = train_data[final_target]

# Testing data
X_test = test_data[final_predictors]
y_test = test_data[final_target]

In [None]:
class LogisticRegressionScratch:
    """ 
    A model to represent a logistic regression
    """

    def __init__(self):
        """ 
        Initialize the attributes needed
        """
        self.betas = []
        self.learning_rate = None
        self.epochs = None
        
    def fit(self,
            X: List[Vector], 
            y: Vector, 
            learning_rate: float = 0.001, 
            epochs: int = 1000) -> None:
        """ 
        A method to train the model
        """
        self.learning_rate = learning_rate
        self.epochs = epochs
        self.betas = [random.random() for _ in range(len(X[0]) + 1)]
        pbar = tqdm.tqdm(range(self.epochs), "Training model")
        for epoch in pbar:
            gradient = np.mean([self._gradient([1] + xi, yi) for xi, yi in zip(X,y)], axis=0)
            new_betas = self._gradient_step(gradient)
            self.betas = new_betas
            prob = self._prob(X, y)
            pbar.set_postfix({"prob": f"{prob:.4f}"})
    
    def _prob(self, X: List[Vector], y: Vector) -> float:
        """ 
        Computes the function result we are maximizing
        """
        probabilities = []
        for x_i,y_i in zip(X, y):
            probability = (self._logistic([1] + x_i)**y_i)*((1 - self._logistic([1] + x_i))**(1-y_i))
            probabilities.append(probability)
        
        return np.mean(probabilities).item()
    
    def _gradient_step(self, gradient: Vector)-> Vector:
        """ 
        Make a step in the direction to maximize the function
        """
        return [self.betas[i] + (self.learning_rate*gradient[i]) for i in range(len(self.betas))]
    
    def predict(self, x: Vector, threshold: float = 0.5) -> float:
        """ 
        Method to predict
        """
        z = np.dot(self.betas, [1] + x)
        return 1 if 1/(1 + math.exp(-z)) >= threshold else 0
    
    def _logistic(self, x: Vector) -> float:
        """ 
        Return f(x) -> logistic function
        """
        z = np.dot(self.betas, x)
        return 1/(1 + math.exp(-z))
    
    def _logistic_derivative(self, x:Vector) -> float:
        """ 
        Returns the derivative of the logistic function
        """
        return self._logistic(x) * (1 - self._logistic(x))
    
    def _gradient(self, x:Vector, y:float) -> Vector:
        """ 
        Return the gradient of a point
        The objective function is the probability
        """
        gradient = [
             (y - self._logistic(x))*x[i] for i in range(len(self.betas))
        ]
        return gradient
    

In [None]:
def cm(self, test_dataset: List[LabeledPoint]) -> Dict[str, float]:
    """ Get the confusion matrix of each label
        Cols - Predicted
        Rows - Actual
    """
    real_labels = [lp.label for lp in test_dataset]
    predicted_labels = [self.predict(lp.point) for lp in test_dataset]
    labels = sorted(list(set(real_labels + predicted_labels)), reverse=True)
    cm = confusion_matrix(real_labels, predicted_labels, labels=labels)
    
    cm_detailed = defaultdict(dict)
    for label_index in range(len(labels)):
        tp = 0
        tn = 0
        fp = 0
        fn = 0
        for row in range(len(labels)):
            for col in range(len(labels)):
                if row==label_index and col==label_index:
                    tp += int(cm[row][col])
                elif row==label_index and col!=label_index:
                    fn += int(cm[row][col])
                elif row!=label_index and col==label_index:
                    fp += int(cm[row][col])
                elif row!=label_index and col!=label_index:
                    tn += int(cm[row][col])

        cm_detailed[labels[label_index]]['tp'] = tp
        cm_detailed[labels[label_index]]['tn'] = tn
        cm_detailed[labels[label_index]]['fp'] = fp
        cm_detailed[labels[label_index]]['fn'] = fn

    return labels, cm, cm_detailed

def metrics(self ,test_dataset: List[LabeledPoint], kind: Literal['micro','macro'] = 'micro') -> Dict[str, float]:
    labels, cm, cm_detailed = self._cm(test_dataset)
    # If just two labels we get the simpler confusion matrix and got metrics
    if len(labels) == 2:
        tp = cm_detailed[labels[0]]['tp']
        fp = cm_detailed[labels[0]]['fp']
        fn = cm_detailed[labels[0]]['fn']
        tn = cm_detailed[labels[0]]['tn']
        _accuracy = float(np.trace(cm)/np.sum(cm))
        _precision = precision(tp, fp)
        _recall = recall(tp, fn)
        _f1_score = f1_score(tp, tn, fp, fn)

        return {
            "labels": labels,
            "confusion_matrix": cm,
            "confusion_matrix_detailes": cm_detailed,
            "accuracy": _accuracy, 
            "precision": _precision, 
            "recall": _recall,
            "f1_score": _f1_score
        }
    
    # If multilabeled predictions we got micro or macro metrics
    if len(labels) > 2:
        _accuracy = float(np.trace(cm)/np.sum(cm))
        if kind == 'micro':
            tp = sum([cm_detailed[label]['tp'] for label in labels])
            fp = sum([cm_detailed[label]['fp'] for label in labels])
            fn = sum([cm_detailed[label]['fn'] for label in labels])
            tn = sum([cm_detailed[label]['tn'] for label in labels])
            _precision = precision(tp, fp)
            _recall = recall(tp, fn)
            _f1_score = f1_score(tp, tn, fp, fn)

        elif kind == 'macro':
            _precision = mean(
                [
                    precision(
                        cm_detailed[label]['tp'],
                        cm_detailed[label]['fp']
                    )
                    for label in labels
                ]
            
            )

            _recall = mean(
                [
                    recall(
                        cm_detailed[label]['tp'],
                        cm_detailed[label]['fn']
                    )
                    for label in labels
                ]
            
            )

            _f1_score = mean(
                [
                    f1_score(
                        cm_detailed[label]['tp'],
                        cm_detailed[label]['tn'],
                        cm_detailed[label]['fp'],
                        cm_detailed[label]['fn']
                    )
                    for label in labels
                ]
            
            )

        else:
            raise AssertionError('Not a valid kind of metrics, use kind = "micro" or kind = "macro"')
        
        return {
            "labels": labels,
            "confusion_matrix": cm,
            "confusion_matrix_detailes": cm_detailed,
            "accuracy": _accuracy, 
            "precision": _precision, 
            "recall": _recall,
            "f1_score": _f1_score
        }



In [15]:
from sklearn.metrics import classification_report
model = LogisticRegressionScratch()
model.fit(np.array(X_train).tolist(), np.array(y_train).tolist(), epochs=10000, learning_rate=0.01)

Training model: 100%|██████████| 10000/10000 [03:51<00:00, 43.20it/s, prob=0.7151]


Report in training data

In [16]:
y_pred = [model.predict(x) for x in np.array(X_train).tolist()]
print("Classification Report:\n", classification_report(np.array(y_train).tolist(), y_pred))

Classification Report:
               precision    recall  f1-score   support

         0.0       0.82      0.89      0.85       444
         1.0       0.79      0.68      0.73       268

    accuracy                           0.81       712
   macro avg       0.80      0.78      0.79       712
weighted avg       0.81      0.81      0.81       712



Report in testing data

In [24]:

y_pred = [model.predict(x) for x in np.array(X_test).tolist()]
print("Classification Report:\n", classification_report(np.array(y_test).tolist(), y_pred))

Classification Report:
               precision    recall  f1-score   support

         0.0       0.81      0.87      0.84       105
         1.0       0.79      0.72      0.75        74

    accuracy                           0.80       179
   macro avg       0.80      0.79      0.80       179
weighted avg       0.80      0.80      0.80       179



------------------------------

sklearn implementation

In [16]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report

model = LogisticRegression()
model.fit(X_train, y_train)
y_pred = model.predict(X_train)
print("Classification Report:\n", classification_report(y_train, y_pred))

Classification Report:
               precision    recall  f1-score   support

         0.0       0.82      0.89      0.85       444
         1.0       0.79      0.69      0.73       268

    accuracy                           0.81       712
   macro avg       0.81      0.79      0.79       712
weighted avg       0.81      0.81      0.81       712



In [17]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report

model = LogisticRegression()
model.fit(X_test, y_test)
y_pred = model.predict(X_test)
print("Classification Report:\n", classification_report(y_test, y_pred))

Classification Report:
               precision    recall  f1-score   support

         0.0       0.88      0.88      0.88       105
         1.0       0.82      0.82      0.82        74

    accuracy                           0.85       179
   macro avg       0.85      0.85      0.85       179
weighted avg       0.85      0.85      0.85       179

