In [None]:
import numpy as np
import pandas as pd

import scipy.stats as stats

import matplotlib.pyplot as plt
import seaborn as sns   

from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import accuracy_score

from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier

from sklearn.preprocessing import FunctionTransformer
from sklearn.compose import ColumnTransformer

In [None]:
# Read only the two numeric features and the target label from the CSV.
df = pd.read_csv(
    'titanic.csv',
    usecols=['Age', 'Fare', 'Survived'],
)

In [None]:
# Peek at the first five rows of the loaded frame.
df.head(n=5)

In [None]:
# Count missing values per column.
# Indexing the frame with the boolean mask `df.isnull()` keeps every row
# (cells where the mask is False just become NaN), so `len(...)` of that result
# always equals the total row count rather than the number of nulls.
df.isnull().sum()

In [None]:
# Replace missing ages with the column mean.
# The result is assigned back to the column instead of using `inplace=True` on
# the intermediate `df['Age']` selection: pandas warns about that chained
# in-place form, and under Copy-on-Write it leaves `df` unchanged.
# NOTE: the mean is computed over all rows of `df`.
df['Age'] = df['Age'].fillna(df['Age'].mean())

In [None]:
# Show the first five rows again now that Age has been filled.
df.head(n=5)

In [None]:
# Feature matrix: the two numeric columns; target: the survival label.
X = df.loc[:, ['Age', 'Fare']]
y = df.loc[:, 'Survived']

In [None]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Age on the training split before any transform:
# left panel = histogram with a KDE overlay, right panel = normal Q-Q plot.
fig, (ax_hist, ax_qq) = plt.subplots(nrows=1, ncols=2, figsize=(12, 5))

sns.histplot(X_train['Age'], kde=True, ax=ax_hist)
ax_hist.set_title('Age - Before Transformation')

# probplot draws onto the supplied Axes and sets its own default title,
# so the title is overridden afterwards.
stats.probplot(X_train['Age'], dist="norm", plot=ax_qq)
ax_qq.set_title('Age - Q-Q Plot Before Transformation')

plt.show()


In [None]:
# Fare on the training split before any transform:
# left panel = histogram with a KDE overlay, right panel = normal Q-Q plot.
fig, (ax_hist, ax_qq) = plt.subplots(nrows=1, ncols=2, figsize=(12, 5))

sns.histplot(X_train['Fare'], kde=True, ax=ax_hist)
ax_hist.set_title('Fare - Before Transformation')

# probplot draws onto the supplied Axes and sets its own default title,
# so the title is overridden afterwards.
stats.probplot(X_train['Fare'], dist="norm", plot=ax_qq)
ax_qq.set_title('Fare - Q-Q Plot Before Transformation')

plt.show()


In [None]:
# Baseline models, fitted on the untransformed features.
clf = LogisticRegression()
# The tree permutes features at each split, so without a fixed seed repeated
# runs can pick different splits and give different accuracies.
clf2 = DecisionTreeClassifier(random_state=42)

In [None]:
# Train both baseline models on the raw (untransformed) training split.
clf.fit(X=X_train, y=y_train)
clf2.fit(X=X_train, y=y_train)

In [None]:
# Test-set predictions from the logistic regression and the decision tree, in that order.
y_pred, y_pred2 = (model.predict(X_test) for model in (clf, clf2))

In [None]:
# Report test accuracy for each baseline model.
for label, preds in (
    ("Logistic Regression Accuracy (Before Transformation): ", y_pred),
    ("Decision Tree Classifier Accuracy (Before Transformation): ", y_pred2),
):
    print(label, accuracy_score(y_test, preds))

In [None]:
# Element-wise log(1 + x) transformer; log1p maps 0 to 0, so zero values are safe.
trf = FunctionTransformer(np.log1p)

In [None]:
# Fit the transformer on the training split, then apply it to both splits.
X_train_trf = trf.fit(X_train).transform(X_train)
X_test_trf = trf.transform(X_test)

In [None]:
# Fresh, unfitted models for the log-transformed features.
clf = LogisticRegression()
# The tree permutes features at each split, so without a fixed seed repeated
# runs can pick different splits and give different accuracies.
clf2 = DecisionTreeClassifier(random_state=42)

In [None]:
# Train both models on the log-transformed training split.
clf.fit(X=X_train_trf, y=y_train)
clf2.fit(X=X_train_trf, y=y_train)

In [None]:
# Predictions on the log-transformed test split: logistic regression first, then the decision tree.
y_pred, y_pred1 = (model.predict(X_test_trf) for model in (clf, clf2))

In [None]:
# Report test accuracy for the models trained on log-transformed features.
print("Logistic Regression Accuracy (After Transformation): ", accuracy_score(y_test, y_pred))
# The post-transformation tree predictions are stored in `y_pred1`; `y_pred2`
# still holds the pre-transformation predictions, so scoring it here would just
# repeat the "before" number.
print("Decision Tree Classifier Accuracy (After Transformation): ", accuracy_score(y_test, y_pred1))

In [None]:
# Log-transform the full feature matrix for use with cross-validation.
X_trf = trf.fit(X).transform(X)

In [None]:
# Fresh, unfitted models to be cross-validated.
clf = LogisticRegression()
# The tree permutes features at each split, so without a fixed seed the
# cross-validation score can vary between runs.
clf2 = DecisionTreeClassifier(random_state=42)

In [None]:
# 10-fold cross-validated accuracy on the log-transformed features, averaged over folds.
lr_scores = cross_val_score(clf, X_trf, y, scoring='accuracy', cv=10)
print("Logistic Regression Cross-Validation Score: ", lr_scores.mean())

dt_scores = cross_val_score(clf2, X_trf, y, scoring='accuracy', cv=10)
print("Decision Tree Classifier Cross-Validation Score: ", dt_scores.mean())

In [None]:
# Side-by-side normal Q-Q plots of Age on the training split:
# left = raw values, right = log1p-transformed values.
plt.figure(figsize=(12,5))

plt.subplot(121)
stats.probplot(X_train['Age'], dist="norm", plot=plt)
plt.title('Age - Before Transformation')

plt.subplot(122)
stats.probplot(X_train_trf['Age'], dist="norm", plot=plt)
# This panel shows the transformed data, so it is labelled "After".
plt.title('Age - Q-Q Plot After Transformation')

plt.show()


In [None]:
# Side-by-side normal Q-Q plots of Fare on the training split:
# left = raw values, right = log1p-transformed values.
plt.figure(figsize=(12,5))

plt.subplot(121)
stats.probplot(X_train['Fare'], dist="norm", plot=plt)
plt.title('Fare - Before Transformation')

plt.subplot(122)
stats.probplot(X_train_trf['Fare'], dist="norm", plot=plt)
# This panel shows the transformed data, so it is labelled "After".
plt.title('Fare - Q-Q Plot After Transformation')

plt.show()
