### 1. Import Libraries & Data

In [1]:
import pandas as pd
import numpy as np

In [2]:
df0 = pd.read_csv(r"C:/Users/monstr/Desktop/Machine_Learning_Algorithms/C_Datasets/titanic_data_train.csv")
df = df0.copy()
df.head()

Unnamed: 0,Sex,Pclass,SibSp,Parch,Alone,Embarked,Age,Ticket,Fare,Survived
0,1,3,1,0,1,2,22.0,21171,7.25,0
1,0,1,1,0,1,0,38.0,17599,71.2833,1
2,0,3,0,0,0,2,26.0,3101282,7.925,1
3,0,1,1,0,1,2,35.0,113803,53.1,1
4,1,3,0,0,0,2,35.0,373450,8.05,0


In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 10 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Sex       891 non-null    int64  
 1   Pclass    891 non-null    int64  
 2   SibSp     891 non-null    int64  
 3   Parch     891 non-null    int64  
 4   Alone     891 non-null    int64  
 5   Embarked  891 non-null    int64  
 6   Age       891 non-null    float64
 7   Ticket    891 non-null    int64  
 8   Fare      891 non-null    float64
 9   Survived  891 non-null    int64  
dtypes: float64(2), int64(8)
memory usage: 69.7 KB


In [4]:
df_clf = df.drop(columns=["SibSp", "Parch", "Embarked"])
df_clf.head()

Unnamed: 0,Sex,Pclass,Alone,Age,Ticket,Fare,Survived
0,1,3,1,22.0,21171,7.25,0
1,0,1,1,38.0,17599,71.2833,1
2,0,3,0,26.0,3101282,7.925,1
3,0,1,1,35.0,113803,53.1,1
4,1,3,0,35.0,373450,8.05,0


### 2. Standardization

In [5]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()

In [6]:
df_clf.loc[:, "Age"] = scaler.fit_transform(df_clf.iloc[:, 3:-1])[:, 0]
df_clf.loc[:, "Ticket"] = scaler.fit_transform(df_clf.iloc[:, 3:-1])[:, 1]
df_clf.loc[:, "Fare"] = scaler.fit_transform(df_clf.iloc[:, 3:-1])[:, 2]

In [7]:
df_clf.head()

Unnamed: 0,Sex,Pclass,Alone,Age,Ticket,Fare,Survived
0,1,3,1,-0.551366,-0.423103,-0.502445,0
1,0,1,1,0.65403,-0.428495,0.786845,1
2,0,3,0,-0.250017,4.226398,-0.488854,1
3,0,1,1,0.428018,-0.283273,0.42073,1
4,1,3,0,0.428018,0.10867,-0.486337,0


### 3. Split Data Frame to Train & Test Set

In [8]:
from sklearn.model_selection import train_test_split

X = df_clf.drop(columns=["Survived"]).values
y = df_clf.loc[:, "Survived"].values

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

### 4. Create Model

In [9]:
from sklearn.tree import DecisionTreeClassifier

tre_clf_model = DecisionTreeClassifier().fit(X_train, y_train)

In [10]:
y_pred = tre_clf_model.predict(X_test)

### 5. Evaluation

In [11]:
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score

accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)

In [12]:
pd.DataFrame({"Model": "Decision Tree Classifier",
              "Accuracy": accuracy,
              "Precision": precision,
              "Recall": recall,
              "F1-Score": f1},
            index = [0])

Unnamed: 0,Model,Accuracy,Precision,Recall,F1-Score
0,Logistic Regression Classifier,0.731844,0.656716,0.637681,0.647059


### 6. Optimization

In [14]:
from sklearn.model_selection import GridSearchCV

parameters = {'criterion': ['gini', 'entropy', 'log_loss'],
              'max_depth':[None, '2', '5', '10', '15', '20', '25', '30'],
              'ccp_alpha': [0.0, 0.001, 0.01, 0.1, 1, 10, 100],
              'min_samples_split': [2, 3, 4, 5, 6, 7, 8, 9, 10],
              'min_samples_leaf': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10],
              'max_leaf_nodes': [None, 1, 2, 3, 4, 5, 6, 7, 8]}

tree_clf_model_grid = GridSearchCV(DecisionTreeClassifier(), parameters)

In [15]:
tree_clf_model_grid.fit(X_train, y_train)

604800 fits failed out of a total of 680400.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
9450 fits failed with the following error:
Traceback (most recent call last):
  File "C:\Users\monstr\anaconda3\Lib\site-packages\sklearn\model_selection\_validation.py", line 732, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\monstr\anaconda3\Lib\site-packages\sklearn\base.py", line 1144, in wrapper
    estimator._validate_params()
  File "C:\Users\monstr\anaconda3\Lib\site-packages\sklearn\base.py", line 637, in _validate_params
    validate_parameter_constraints(
  File "C:\Users\monstr\anaconda3\Lib\site-packages\sklearn\utils\_param_validation.py", line 95, in validate_parameter_constraints
    raise In

In [16]:
tree_clf_model_grid.best_params_

{'ccp_alpha': 0.0,
 'criterion': 'gini',
 'max_depth': None,
 'max_leaf_nodes': 8,
 'min_samples_leaf': 3,
 'min_samples_split': 2}

In [18]:
tree_clf_model_grid = DecisionTreeClassifier(ccp_alpha=0.0, criterion='gini', max_depth=None,
                                       max_leaf_nodes=8, min_samples_leaf=3,
                                       min_samples_split=2).fit(X_train, y_train)
y_pred = tree_clf_model_grid.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)

pd.DataFrame({"Model": "Decision Tree Classifier",
              "Accuracy": accuracy,
              "Precision": precision,
              "Recall": recall,
              "F1-Score": f1},
            index = [0])

Unnamed: 0,Model,Accuracy,Precision,Recall,F1-Score
0,Decision Tree Classifier,0.77095,0.733333,0.637681,0.682171
