# Code for My heart will go on{-}

In [None]:
import numpy as np
import numpy.typing as npt
import pandas as pd

In [None]:
# Training and test CSVs of the Kaggle course data, hosted on DataCamp's S3 bucket.
train_url = "http://s3.amazonaws.com/assets.datacamp.com/course/Kaggle/train.csv"
test_url = "http://s3.amazonaws.com/assets.datacamp.com/course/Kaggle/test.csv"
# Download both splits in the same order as before: training set, then test set.
train, test = (pd.read_csv(url) for url in (train_url, test_url))

In [None]:
# Summary statistics of the training frame's numeric columns (count, mean, quartiles, ...).
# Kept as the last expression of the cell so the notebook renders the table.
train.describe()

## T8{-}

In [None]:
# Median age of the training set; used further down to impute missing ages.
age_med = train['Age'].median()
print('median of age is', age_med)

In [None]:
# Fill missing ages in the training frame with the training-set median.
train['Age'] = train['Age'].fillna(age_med)

## T9{-}

In [None]:
# Most frequent embarkation port in the training set (first entry if there are ties).
embark_mode = train['Embarked'].mode()[0]
print('Embarked Mode is', embark_mode)

In [None]:
# Impute missing ports with the training-set mode, then encode S/C/Q as 0/1/2.
train['Embarked'] = train['Embarked'].fillna(embark_mode)
for _code, _port in enumerate(("S", "C", "Q")):
    train.loc[train["Embarked"] == _port, "Embarked"] = _code

In [None]:
# Most frequent value of the Sex column in the training set (first entry if there are ties).
sex_mode = train['Sex'].mode()[0]
print('Sex Mode is', sex_mode)

In [None]:
# Impute missing values with the training-set mode, then encode male/female as 0/1.
train['Sex'] = train['Sex'].fillna(sex_mode)
for _code, _label in enumerate(("male", "female")):
    train.loc[train["Sex"] == _label, "Sex"] = _code

## T10, T11{-}

In [None]:
class LogisticRegressionGradient:
    """Binary logistic regression fitted with full-batch gradient ascent.

    A column of ones is prepended to the features, so ``params[0]`` is the
    bias and ``params[1:]`` are the feature weights.

    Parameters
    ----------
    lr : float
        Step size applied to the summed (not averaged) log-likelihood gradient.
    random_state : int or None
        Seed for the random draw of the initial parameters.
    epochs : int
        Number of full-batch parameter updates.
    threshold : float
        Probability at or above which ``predict`` outputs class 1.
    """

    def __init__(self, lr=0.00001, random_state=42, epochs=10_000, threshold=0.5):
        self.lr = lr
        self.random_state = random_state
        self.epochs = epochs
        self.threshold = threshold

    @staticmethod
    def logist(X: npt.ArrayLike) -> np.ndarray:
        """Return the element-wise logistic sigmoid of ``X`` as a new float array.

        The input is converted to float64 first, so integer input is no
        longer truncated to 0/1, and the input itself is never modified.
        """
        z = np.asarray(X, dtype=np.float64)
        # exp is only ever evaluated on non-positive values, so it cannot
        # overflow and no clipping of the input is needed.
        decay = np.exp(-np.abs(z))
        return np.where(z >= 0, 1.0 / (1.0 + decay), decay / (1.0 + decay))

    @staticmethod
    def _with_bias(X: npt.ArrayLike) -> np.ndarray:
        """Convert ``X`` to a float matrix and prepend the bias column of ones."""
        X = np.asarray(X, dtype=np.float64)
        return np.hstack((np.ones((X.shape[0], 1)), X))

    def fit(self, X: npt.ArrayLike, y: npt.ArrayLike):
        """Fit the parameters on ``X`` (n_samples, n_features) and 0/1 labels ``y``.

        Returns ``self`` so calls can be chained.

        Raises
        ------
        ValueError
            If ``X`` and ``y`` do not have the same number of samples.
        """
        X = self._with_bias(X)
        y = np.asarray(y, dtype=np.float64)
        if y.shape[0] != X.shape[0]:
            raise ValueError("X and y must have the same number of samples")

        # A private generator gives the same initial draw as seeding the
        # global NumPy RNG, without resetting random state for other code.
        rng = np.random.RandomState(self.random_state)
        self.params = rng.randn(X.shape[1])

        for _ in range(self.epochs):
            residual = y - self.logist(X @ self.params)
            # X.T @ residual is the gradient of the log-likelihood.
            self.params += self.lr * (X.T @ residual)

        return self

    def predict(self, X: npt.ArrayLike):
        """Return 0/1 class labels for ``X`` using ``self.threshold``."""
        X = self._with_bias(X)
        return (self.logist(X @ self.params) >= self.threshold).astype(int)

In [None]:
# Feature matrix (column order: Pclass, Sex, Age, Embarked) and target vector, both as
# independent float64 arrays.
X = train[["Pclass","Sex","Age","Embarked"]].to_numpy(dtype=np.float64, copy=True)
y = train['Survived'].to_numpy(dtype=np.float64, copy=True)

In [None]:
# Train the logistic model with its default hyper-parameters (lr=1e-5, 10_000 epochs).
lr = LogisticRegressionGradient()
# fit returns the model itself; as the last expression it is also the cell's output.
lr.fit(X, y)

In [None]:
# Preprocess the test frame with the statistics learned on the training set
# (median age, most frequent port / sex) and the same integer encodings.
test['Age'] = test['Age'].fillna(age_med)

test['Embarked'] = test['Embarked'].fillna(embark_mode)
for _code, _port in enumerate(("S", "C", "Q")):
    test.loc[test["Embarked"] == _port, "Embarked"] = _code

test['Sex'] = test['Sex'].fillna(sex_mode)
for _code, _label in enumerate(("male", "female")):
    test.loc[test["Sex"] == _label, "Sex"] = _code


In [None]:

# Predict on the test features (same column order as in training) ...
y_pred = lr.predict(np.array(test[["Pclass","Sex","Age","Embarked"]], dtype=float))

# ... and write the two-column submission file (PassengerId, Survived).
test[['PassengerId']].assign(Survived=y_pred).to_csv('Submit.csv', index=False)

## T12{-}

In [None]:
def accuracy_score(y_test, y_pred):
    """Return the fraction of positions where ``y_pred`` equals ``y_test``.

    Both arguments may be any array-like (lists, tuples, arrays).

    Raises
    ------
    ValueError
        If the two inputs do not have exactly the same shape. Comparing only
        the first dimension would let e.g. (n,) vs (n, 1) broadcast to an
        (n, n) comparison and return a meaningless score.
    """
    y_test = np.asarray(y_test)
    y_pred = np.asarray(y_pred)
    if y_test.shape != y_pred.shape:
        raise ValueError("Shape are not equal")
    return np.mean(y_test == y_pred)

In [None]:
# Accuracy on the same data the model was fitted on (not a held-out estimate).
# NOTE: this rebinds y_pred, which previously held the test-set predictions.
y_pred = lr.predict(X)
print('Accuracy score of training set is', accuracy_score(y, y_pred))

### Add high-order features $(x_1, x_1^2, x_2, \ldots)${-}

In [None]:
# Polynomial Age terms (square and cube) for both frames.
train['Age_squared'] = train['Age'].pow(2)
train['Age_Cubic'] = train['Age'].pow(3)
test['Age_squared'] = test['Age'].pow(2)
test['Age_Cubic'] = test['Age'].pow(3)

# Feature matrices with the extra Age terms inserted right after Age.
X_ho_train = train[["Pclass","Sex","Age", "Age_squared", "Age_Cubic", "Embarked"]].to_numpy(dtype=np.float64, copy=True)
X_ho_test = test[["Pclass","Sex","Age", "Age_squared", "Age_Cubic", "Embarked"]].to_numpy(dtype=np.float64, copy=True)

# Same model and default hyper-parameters as the baseline, fitted on the wider matrix.
lr_ho = LogisticRegressionGradient()
lr_ho.fit(X_ho_train, y)
y_pred_ho_train = lr_ho.predict(X_ho_train)

print(lr_ho.params)

print('Accuracy score of training set with high order feature is', accuracy_score(y, y_pred_ho_train))

In [None]:
# Test-set predictions of the high-order model, written as a (PassengerId, Survived) CSV.
y_pred_ho_test = lr_ho.predict(X_ho_test)
test[['PassengerId']].assign(Survived=y_pred_ho_test).to_csv('Submit_highorder.csv', index=False)

## T13{-}

In [None]:
# Reduced model that uses only Sex and Age as predictors.
X_train = train[["Sex", "Age"]].to_numpy(dtype=np.float64, copy=True)
X_test = test[["Sex", "Age"]].to_numpy(dtype=np.float64, copy=True)

lr_sa = LogisticRegressionGradient()
lr_sa.fit(X_train, y)
y_pred_sa_train = lr_sa.predict(X_train)

print(lr_sa.params)
print('Accuracy score of training set with only Sex and Age is', accuracy_score(y, y_pred_sa_train))

In [None]:
# Test-set predictions of the Sex/Age model, written as a (PassengerId, Survived) CSV.
y_pred_sa = lr_sa.predict(X_test)
test[['PassengerId']].assign(Survived=y_pred_sa).to_csv('Submit_Sex_Age.csv', index=False)

## OT3{-}

In [None]:
# Show the feature matrix (columns: Pclass, Sex, Age, Embarked) before Age is rescaled.
print(X)

Normalize the Age column (min-max scaling to the range [0, 1]).

In [None]:
# Age is column 2 of X (Pclass, Sex, Age, Embarked); record its max and min for min-max scaling.
mx_age, mn_age = X[:, 2].max(), X[:, 2].min()

def normalize_age(x, mx_age, mn_age):
    """Min-max scale ``x`` so that ``mn_age`` maps to 0 and ``mx_age`` maps to 1.

    ``x`` may be a scalar or a NumPy array; arrays are scaled element-wise.
    When ``mx_age == mn_age`` the range is treated as 1, so every value equal
    to ``mn_age`` maps to 0 instead of triggering a division by zero.
    """
    span = mx_age - mn_age
    if span == 0:
        # Degenerate range (all ages identical): avoid 0/0.
        span = 1
    return (x - mn_age) / span

def normalize_age_vectorized(x):
    """Min-max scale an array of ages using the module-level ``mx_age`` / ``mn_age``.

    ``normalize_age`` is plain arithmetic, which NumPy already applies
    element-wise, so no ``np.vectorize`` (a Python-level loop that also
    rejects empty input) is needed.
    """
    return normalize_age(np.asarray(x, dtype=np.float64), mx_age, mn_age)


# Rescale the Age column of X in place, then show the result.
X[:, 2] = normalize_age_vectorized(X[:, 2])
print(X)

In [None]:
class LinearRegressionGradient:
    """Least-squares linear regression fitted with full-batch gradient descent.

    A column of ones is prepended to the features, so ``params[0]`` is the
    bias and ``params[1:]`` are the feature weights.

    Parameters
    ----------
    lr : float
        Step size applied to the gradient averaged over the samples.
    random_state : int or None
        Seed for the random draw of the initial parameters.
    epochs : int
        Number of full-batch parameter updates.
    verbose : bool
        When true (the default), print the epoch index every 10000 epochs.
    """

    def __init__(self, lr=0.001, random_state=42, epochs=200_000, verbose=True):
        self.lr = lr
        self.random_state = random_state
        self.epochs = epochs
        self.verbose = verbose
        self.params = None  # set by fit()

    @staticmethod
    def _with_bias(X: npt.ArrayLike) -> np.ndarray:
        """Convert ``X`` to a float matrix and prepend the bias column of ones."""
        X = np.asarray(X, dtype=np.float64)
        return np.hstack((np.ones((X.shape[0], 1)), X))

    def fit(self, X: npt.ArrayLike, y: npt.ArrayLike):
        """Fit the parameters on ``X`` (n_samples, n_features) and targets ``y``.

        Any array-like is accepted for both arguments. Returns ``self`` so
        calls can be chained.
        """
        X = self._with_bias(X)
        y = np.asarray(y, dtype=np.float64)
        n_samples = X.shape[0]

        # A private generator gives the same initial draw as seeding the
        # global NumPy RNG, without resetting random state for other code.
        rng = np.random.RandomState(self.random_state)
        self.params = rng.randn(X.shape[1])

        for epoch in range(self.epochs):
            residual = y - X @ self.params
            # X.T @ residual / n_samples is the descent direction of the
            # (halved) mean squared error.
            self.params += self.lr / n_samples * (X.T @ residual)
            if self.verbose and epoch % 10000 == 0:
                print("iterations :", epoch)

        return self

    def predict(self, X: npt.ArrayLike):
        """Return the predicted targets for ``X``."""
        return self._with_bias(X) @ self.params

In [None]:
# Fit the gradient-descent linear regression on (X, y) and keep the learned parameters
# (index 0 is the bias, because fit prepends a column of ones to X).
params_gradient = LinearRegressionGradient(random_state=0).fit(X, y).params
# Last expression of the cell: displays the parameter vector.
params_gradient

## OT4{-}

In [None]:
class LinearRegressionInversion:
    """Least-squares linear regression solved in closed form (normal equations).

    A column of ones is prepended to the features, so ``params[0]`` is the
    bias and ``params[1:]`` are the feature weights.
    """

    def __init__(self):
        self.params = None  # set by fit()

    @staticmethod
    def _with_bias(X: npt.ArrayLike) -> np.ndarray:
        """Convert ``X`` to a float matrix and prepend the bias column of ones."""
        X = np.asarray(X, dtype=np.float64)
        return np.hstack((np.ones((X.shape[0], 1)), X))

    def fit(self, X: npt.ArrayLike, y: npt.ArrayLike):
        """Solve ``(X^T X) params = X^T y`` for the parameters and return ``self``.

        Raises
        ------
        numpy.linalg.LinAlgError
            If ``X^T X`` is singular (e.g. linearly dependent feature columns).
        """
        X = self._with_bias(X)
        y = np.asarray(y, dtype=np.float64)

        # Solving the linear system directly is more accurate than forming
        # the explicit inverse and multiplying by it.
        self.params = np.linalg.solve(X.T @ X, X.T @ y)
        return self

    def predict(self, X: npt.ArrayLike):
        """Return the predicted targets for ``X``."""
        return self._with_bias(X) @ self.params

In [None]:
# Closed-form (normal-equation) fit on the same (X, y); index 0 of the result is the bias.
params_matrix_inversion =  LinearRegressionInversion().fit(X, y).params
# Last expression of the cell: displays the parameter vector.
params_matrix_inversion

Compute the mean squared error (MSE) between the two parameter vectors.

In [None]:
# Mean (not sum) of the squared differences between the gradient-descent and
# closed-form parameter vectors, i.e. the MSE that this cell is labelled with.
np.mean(np.square(params_gradient - params_matrix_inversion))