In [None]:
import numpy  as np
import pandas as pd
import scipy.stats as stats
import matplotlib.pyplot as plt
import seaborn as sns


In [None]:
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.metrics import accuracy_score

from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier

from sklearn.preprocessing import FunctionTransformer
from sklearn.compose import ColumnTransformer


In [None]:
# Load the training data; train.csv is expected in the working directory.
df = pd.read_csv(filepath_or_buffer='train.csv')

In [None]:
# Peek at three randomly chosen rows of the loaded frame.
df.sample(n=3)

In [None]:
# Restrict the frame to the two numeric features and the target column.
usecols = [
    'Age',
    'Fare',
    'Survived',
]
df = df[usecols]

In [None]:
# Peek at three randomly chosen rows of the current frame.
df.sample(n=3)

In [None]:
# Count the missing values in every column.
df.isna().sum()

In [None]:
# Replace missing ages with the mean age.
# The filled column is assigned back instead of calling
# fillna(..., inplace=True) on df['Age']: that chained in-place form is
# deprecated in pandas 2.x and silently leaves df unchanged under Copy-on-Write.
# NOTE(review): the mean is computed on the full data set, before the
# train/test split performed later in the notebook.
df = df.assign(Age=df['Age'].fillna(df['Age'].mean()))

In [None]:
# Features are the first two columns of df; the target is the third column.
x, y = df.iloc[:, :2], df.iloc[:, 2]

In [None]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Age on the training split: density plot (left) and normal Q-Q plot (right).
fig, (ax_pdf, ax_qq) = plt.subplots(1, 2, figsize=(14, 4))

sns.distplot(X_train['Age'], ax=ax_pdf)
ax_pdf.set_title('Age PDF')

stats.probplot(X_train['Age'], dist='norm', plot=ax_qq)
ax_qq.set_title('Age QQ Plot')

plt.show()

In [None]:
# Fare on the training split: density plot (left) and normal Q-Q plot (right).
# The titles name Fare, the column actually plotted here.
plt.figure(figsize=(14,4))
plt.subplot(1,2,1)
sns.distplot(X_train['Fare'])
plt.title('Fare PDF')
plt.subplot(1,2,2)
stats.probplot(X_train['Fare'],dist='norm', plot=plt)
plt.title('Fare QQ Plot')
plt.show()

In [None]:
# Baseline models with default settings: one linear classifier, one decision tree.
clf, clf2 = LogisticRegression(), DecisionTreeClassifier()

In [None]:
# Fit both baselines on the untransformed features, then score them on the
# held-out split.
y_predict1 = clf.fit(X_train, y_train).predict(X_test)
y_predict2 = clf2.fit(X_train, y_train).predict(X_test)

logistic_accuracy = accuracy_score(y_test, y_predict1)
tree_accuracy = accuracy_score(y_test, y_predict2)

print('Logistic Regression Accuracy: ', logistic_accuracy)
print('Decision Tree Accuracy: ', tree_accuracy)


Using a function transformation

In [None]:
# Transformer that applies log(1 + x) to its input.
trf = FunctionTransformer(func=np.log1p)

In [None]:
# Apply the transformer to both splits (fit on the training split first).
X_train_trf = trf.fit(X_train).transform(X_train)
X_test_trf = trf.transform(X_test)

In [None]:
# Fresh estimators for the log-transformed features.
# clf1 is the model fitted and used for prediction here, so this cell no longer
# depends on the `clf` object left over from the baseline cell.
clf1 = LogisticRegression()
clf2 = DecisionTreeClassifier()

clf1.fit(X_train_trf, y_train)
clf2.fit(X_train_trf, y_train)

y_predict1 = clf1.predict(X_test_trf)
y_predict2 = clf2.predict(X_test_trf)

print('Logistic Regression Accuracy: ', accuracy_score(y_test, y_predict1))
print('Decision Tree Accuracy: ', accuracy_score(y_test, y_predict2))

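The results above show that the function transformation helps algorithms such as logistic regression but has little effect on tree-based models such as the decision tree. This is expected: a decision tree splits on thresholds, and a monotonic transformation such as the log preserves the ordering of the values, so the same splits remain available.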

In [None]:
# 10-fold cross-validated accuracy of both models on the log-transformed
# training split.
X_transformed = trf.fit_transform(X_train)

clf = LogisticRegression()
clf2 = DecisionTreeClassifier()

for estimator in (clf, clf2):
    fold_scores = cross_val_score(estimator, X_transformed, y_train, cv=10, scoring='accuracy')
    print(fold_scores.mean())

cv=10: Indicates 10-fold cross-validation will be used. The data is split into 10 subsets (or "folds"), and the model is trained and evaluated 10 times, each time using a different fold as the validation set and the rest as the training set.

In [None]:
# Normal Q-Q plots of Fare before (left) and after (right) the log1p transform.
fig, (ax_before, ax_after) = plt.subplots(1, 2, figsize=(14, 4))

stats.probplot(X_train['Fare'], dist="norm", plot=ax_before)
ax_before.set_title('Fare Before Log')

stats.probplot(X_train_trf['Fare'], dist="norm", plot=ax_after)
ax_after.set_title('Fare After Log')

plt.show()

In [None]:
# Normal Q-Q plots of Age before (left) and after (right) the log1p transform.
fig, (ax_before, ax_after) = plt.subplots(1, 2, figsize=(14, 4))

stats.probplot(X_train['Age'], dist="norm", plot=ax_before)
ax_before.set_title('Age Before Log')

stats.probplot(X_train_trf['Age'], dist="norm", plot=ax_after)
ax_after.set_title('Age After Log')

plt.show()

The log transformation above does not suit the Age column well, as the Q-Q plot above shows.

So we will transform only the Fare column and check the accuracy.

In [None]:
# Log-transform only Fare; every other column is passed through unchanged.
fare_log_step = ('log', FunctionTransformer(np.log1p), ['Fare'])
trf2 = ColumnTransformer(transformers=[fare_log_step], remainder='passthrough')

X_train_transformed2 = trf2.fit_transform(X_train)
X_test_transformed2 = trf2.transform(X_test)

In [None]:
# Fit both models on the features where only Fare was log-transformed.
clf1 = LogisticRegression()
clf2 = DecisionTreeClassifier()

clf1.fit(X_train_transformed2, y_train)
clf2.fit(X_train_transformed2, y_train)

y_predict1 = clf1.predict(X_test_transformed2)
y_predict2 = clf2.predict(X_test_transformed2)

# Hold-out accuracy for both models; the decision-tree predictions are now
# reported too, matching the earlier comparison cells.
print('Logistic Regression Accuracy: ', accuracy_score(y_test, y_predict1))
print('Decision Tree Accuracy: ', accuracy_score(y_test, y_predict2))

# Mean 10-fold cross-validated accuracy on the training split
# (logistic regression first, then decision tree).
print(cross_val_score(clf1, X_train_transformed2, y_train, cv=10, scoring='accuracy').mean())
print(cross_val_score(clf2, X_train_transformed2, y_train, cv=10, scoring='accuracy').mean())

In [None]:
def apply_transform(transform):
    """Cross-validate logistic regression with `transform` applied to Fare.

    The transform is applied to the 'Fare' column only; the remaining feature
    column is passed through unchanged. Normal Q-Q plots of Fare before and
    after the transform are drawn side by side.

    Parameters
    ----------
    transform : transformer
        An object usable as a ``ColumnTransformer`` step, e.g.
        ``FunctionTransformer(np.log1p)``.

    Returns
    -------
    float
        Mean accuracy of a default ``LogisticRegression`` over 10-fold
        cross-validation on the transformed features.

    Notes
    -----
    Reads the notebook-level ``df``: its first two columns are used as
    features and its third column as the target.
    """
    X = df.iloc[:, 0:2]
    y = df.iloc[:, 2]

    trf = ColumnTransformer([('log', transform, ['Fare'])], remainder='passthrough')
    X_transformed = trf.fit_transform(X)
    clf = LogisticRegression()
    accuracy = cross_val_score(clf, X_transformed, y, cv=10, scoring='accuracy').mean()

    plt.figure(figsize=(14, 4))

    # Titles are transform-agnostic because callers pass sqrt, square, cbrt
    # and custom functions, not only the log.
    plt.subplot(121)
    stats.probplot(X['Fare'], dist="norm", plot=plt)
    plt.title('Fare Before Transform')

    # Column 0 of the ColumnTransformer output is the transformed Fare: the
    # listed transformer's output precedes the passthrough columns.
    plt.subplot(122)
    stats.probplot(X_transformed[:, 0], dist="norm", plot=plt)
    plt.title('Fare After Transform')

    plt.show()

    return accuracy







In [None]:
# Log transform of Fare: log(1 + x).
apply_transform(FunctionTransformer(func=np.log1p))

In [None]:
# Square-root transform of Fare.
apply_transform(FunctionTransformer(func=np.sqrt))

In [None]:
# Square transform of Fare.
apply_transform(FunctionTransformer(func=np.square))

In [None]:
# Cube-root transform of Fare.
apply_transform(FunctionTransformer(func=np.cbrt))

Custom function

In [None]:
# Custom transform supplied as a lambda: multiply Fare by 12.
apply_transform(FunctionTransformer(func=lambda fare: fare * 12))