<a href="https://colab.research.google.com/github/ShivM99/Python/blob/main/Titanic_DTC.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [4]:
#Importing the training dataset
import pandas as pd
train = pd.read_csv (r"titanic_train.csv")

In [5]:
#Seggregating the features and the target for 'titanic_train'
x = train.iloc[:, [2, 4, 5, 6, 7, 9, 11]].values
x_df = pd.DataFrame (x, columns = ['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Embarked'])
print ("\nFeatures:\n", x_df)


Features:
     Pclass     Sex   Age SibSp Parch     Fare Embarked
0        3    male  22.0     1     0     7.25        S
1        1  female  38.0     1     0  71.2833        C
2        3  female  26.0     0     0    7.925        S
3        1  female  35.0     1     0     53.1        S
4        3    male  35.0     0     0     8.05        S
..     ...     ...   ...   ...   ...      ...      ...
886      2    male  27.0     0     0     13.0        S
887      1  female  19.0     0     0     30.0        S
888      3  female   NaN     1     2    23.45        S
889      1    male  26.0     0     0     30.0        C
890      3    male  32.0     0     0     7.75        Q

[891 rows x 7 columns]


In [6]:
y = train.iloc[:, 1].values
y_df = pd.DataFrame (y, columns = ['Survived'])
print ("\nTarget:\n", y_df)


Target:
      Survived
0           0
1           1
2           1
3           1
4           0
..        ...
886         0
887         1
888         0
889         1
890         0

[891 rows x 1 columns]


In [7]:
#Handling the missing values
#One hot encoding the categorical features
print ("\nMissing values in different features:\n", x_df.isnull().sum())
import numpy as np
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline


Missing values in different features:
 Pclass        0
Sex           0
Age         177
SibSp         0
Parch         0
Fare          0
Embarked      2
dtype: int64


In [8]:
#ColumnTransformer (transformers = [("name_of_object", object_creation, [columns]), ("name_of_object", object_creation, [columns]), ...], remainder = "passthrough")
ct = ColumnTransformer (transformers = [("si_mode", SimpleImputer (missing_values = np.nan, strategy = "most_frequent"), [2]), ("si_mean", SimpleImputer (missing_values = np.nan, strategy = "mean"), [-2]),  ("encoder", OneHotEncoder(), [0, 1])], remainder = "passthrough")
x = ct.fit_transform (x)

In [9]:
#Pipeline (steps = [("name_of_object", object_creation), ("name_of_object", object_creation), ...])
pipe = Pipeline (steps = [("si_mode", SimpleImputer (missing_values = np.nan, strategy = "most_frequent")), ("encoder", OneHotEncoder())])
embarked = ColumnTransformer (transformers = [("pipe", pipe, [-1])], remainder = "passthrough")
x = embarked.fit_transform (x)
x_df_new = pd.DataFrame (x)
print ("\nPreprocessed features:\n", x_df_new)


Preprocessed features:
       0    1    2     3        4    5    6    7    8    9  10 11
0    0.0  0.0  1.0  22.0     7.25  0.0  0.0  1.0  0.0  1.0  1  0
1    1.0  0.0  0.0  38.0  71.2833  1.0  0.0  0.0  1.0  0.0  1  0
2    0.0  0.0  1.0  26.0    7.925  0.0  0.0  1.0  1.0  0.0  0  0
3    0.0  0.0  1.0  35.0     53.1  1.0  0.0  0.0  1.0  0.0  1  0
4    0.0  0.0  1.0  35.0     8.05  0.0  0.0  1.0  0.0  1.0  0  0
..   ...  ...  ...   ...      ...  ...  ...  ...  ...  ... .. ..
886  0.0  0.0  1.0  27.0     13.0  0.0  1.0  0.0  0.0  1.0  0  0
887  0.0  0.0  1.0  19.0     30.0  1.0  0.0  0.0  1.0  0.0  0  0
888  0.0  0.0  1.0  24.0    23.45  0.0  0.0  1.0  1.0  0.0  1  2
889  1.0  0.0  0.0  26.0     30.0  1.0  0.0  0.0  0.0  1.0  0  0
890  0.0  1.0  0.0  32.0     7.75  0.0  0.0  1.0  0.0  1.0  0  0

[891 rows x 12 columns]


In [10]:
#Splitting the dataset into training & test datasets not done as the dataset is already splitted
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split (x, y, test_size = 0.2, random_state = 0)

In [11]:
#Building the DTC model
from sklearn.tree import DecisionTreeClassifier as DTC
dtc = DTC (criterion = "entropy", random_state = 0)
dtc.fit (x_train, y_train)
y_pred = dtc.predict (x_test)

In [12]:
#Checking the accuracy of DTC model
from sklearn.metrics import accuracy_score
print ("\nAccuracy of the DTC model:", accuracy_score (y_test, y_pred)*100)


Accuracy of the DTC model: 78.2122905027933


In [13]:
#Cross-validation
from sklearn.model_selection import cross_val_score
accuracies = cross_val_score (estimator = dtc, X = x_train, y = y_train, cv = 10)
print ("\nMean accuracy:", accuracies.mean()*100)


Mean accuracy: 76.40062597809077


In [14]:
#Grid search
from sklearn.model_selection import GridSearchCV
h_parameters = [{"criterion": ["gini", "entropy"], "splitter": ["best", "random"], "max_depth": [2, 4, 6, 8, 10, 12, 14, 16, 18, 20], "min_samples_split": [2, 4, 6, 8, 10, 12, 14, 16, 18, 20], "min_samples_leaf": [2, 4, 6, 8, 10, 12, 14, 16, 18, 20], "max_features": ["auto", "sqrt", "log2", "None"], "max_leaf_nodes": [5, 10, 15, 20, 25, 30, 35, 40, 45, 50]}]
grid = GridSearchCV (estimator = dtc, param_grid = h_parameters, scoring = "accuracy", n_jobs = -1, cv = 10)
grid.fit (x_train, y_train)
best_hparameters = grid.best_params_
best_accuracy = grid.best_score_
print ("\nBest hyper-parameters:\n", best_hparameters)
print ("\nBest accuracy:", best_accuracy*100)

400000 fits failed out of a total of 1600000.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
199702 fits failed with the following error:
Traceback (most recent call last):
  File "/usr/local/lib/python3.9/dist-packages/sklearn/model_selection/_validation.py", line 686, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/usr/local/lib/python3.9/dist-packages/sklearn/tree/_classes.py", line 889, in fit
    super().fit(
  File "/usr/local/lib/python3.9/dist-packages/sklearn/tree/_classes.py", line 177, in fit
    self._validate_params()
  File "/usr/local/lib/python3.9/dist-packages/sklearn/base.py", line 600, in _validate_params
    validate_parameter_constraints(
  File "/usr/local/lib/python3.9/dist-packages/sk


Best hyper-parameters:
 {'criterion': 'entropy', 'max_depth': 8, 'max_features': 'auto', 'max_leaf_nodes': 45, 'min_samples_leaf': 2, 'min_samples_split': 6, 'splitter': 'random'}

Best accuracy: 81.47104851330204


In [15]:
#Importing the testing dataset
test = pd.read_csv (r"titanic_test.csv")
x_new = test.iloc[:, [1, 3, 4, 5, 6, 8, 10]].values
x_df = pd.DataFrame (x_new, columns = ['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Embarked'])
print ("\nFeatures:\n", x_df)


Features:
     Pclass     Sex   Age SibSp Parch     Fare Embarked
0        3    male  34.5     0     0   7.8292        Q
1        3  female  47.0     1     0      7.0        S
2        2    male  62.0     0     0   9.6875        Q
3        3    male  27.0     0     0   8.6625        S
4        3  female  22.0     1     1  12.2875        S
..     ...     ...   ...   ...   ...      ...      ...
413      3    male   NaN     0     0     8.05        S
414      1  female  39.0     0     0    108.9        C
415      3    male  38.5     0     0     7.25        S
416      3    male   NaN     0     0     8.05        S
417      3    male   NaN     1     1  22.3583        C

[418 rows x 7 columns]


In [16]:
print ("\nMissing values in different features:\n", x_df.isnull().sum())


Missing values in different features:
 Pclass       0
Sex          0
Age         86
SibSp        0
Parch        0
Fare         1
Embarked     0
dtype: int64


In [17]:
#Handling the missing values
#One hot encoding the categorical features
x_new = ct.transform (x_new)

In [18]:
x_new = embarked.transform (x_new)
x_df_new = pd.DataFrame (x_new)
print ("\nPreprocessed features:\n", x_df_new)


Preprocessed features:
       0    1    2     3        4    5    6    7    8    9  10 11
0    0.0  1.0  0.0  34.5   7.8292  0.0  0.0  1.0  0.0  1.0  0  0
1    0.0  0.0  1.0  47.0      7.0  0.0  0.0  1.0  1.0  0.0  1  0
2    0.0  1.0  0.0  62.0   9.6875  0.0  1.0  0.0  0.0  1.0  0  0
3    0.0  0.0  1.0  27.0   8.6625  0.0  0.0  1.0  0.0  1.0  0  0
4    0.0  0.0  1.0  22.0  12.2875  0.0  0.0  1.0  1.0  0.0  1  1
..   ...  ...  ...   ...      ...  ...  ...  ...  ...  ... .. ..
413  0.0  0.0  1.0  24.0     8.05  0.0  0.0  1.0  0.0  1.0  0  0
414  1.0  0.0  0.0  39.0    108.9  1.0  0.0  0.0  1.0  0.0  0  0
415  0.0  0.0  1.0  38.5     7.25  0.0  0.0  1.0  0.0  1.0  0  0
416  0.0  0.0  1.0  24.0     8.05  0.0  0.0  1.0  0.0  1.0  0  0
417  1.0  0.0  0.0  24.0  22.3583  0.0  0.0  1.0  0.0  1.0  1  1

[418 rows x 12 columns]


In [19]:
lr = DTC (criterion = 'entropy', max_depth = 8, max_features = 'auto', max_leaf_nodes = 45, min_samples_leaf = 2, min_samples_split = 6, splitter = 'random', random_state = 0)
lr.fit (x, y)
y_pred = lr.predict (x_new)
y_pred



array([0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1,
       1, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1,
       1, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 1,
       1, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0,
       1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 1, 1, 1, 1, 0, 0, 1, 0, 1, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0,
       0, 0, 1, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1,
       1, 0, 1, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1,
       0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0,
       1, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1,
       0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1,
       0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0,
       0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0,