# Titanic - Machine Learning from Disaster
### Using CatBoost Classifier model

# Importing Required Libraries


In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import lightgbm as lgb
import warnings
warnings.filterwarnings("ignore")

from catboost import CatBoostClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score

# Training Data

In [None]:
# Load the Kaggle training split and preview its first rows
dataset = pd.read_csv(filepath_or_buffer='/kaggle/input/titanic/train.csv')
dataset.head(n=5)

In [None]:
# Report the frame's (rows, columns) shape, then its column labels, one per line
print(dataset.shape, dataset.columns, sep="\n")

In [None]:
# Count how many columns there are of each dtype (most frequent first)
dataset.dtypes.value_counts(sort=True, ascending=False)

In [None]:
# Map each dtype to the column labels that carry it
datatypes = (
    dataset.columns
    .to_series()
    .groupby(dataset.dtypes)
    .groups
)
datatypes

In [None]:
# Remove the PassengerId and Name columns from the training frame (in place)

dataset.drop(columns=['PassengerId', 'Name'], inplace=True)

In [None]:
# Converting Categorical Values
#
# Use ONE column list on both sides of the assignment. `df[[...]] = DataFrame`
# assigns by position, not by label, so listing 'Cabin'/'Ticket' in a different
# order on each side silently stores the Ticket codes in the Cabin column and
# vice versa.
categorical_cols = ['Survived', 'Pclass', 'Sex', 'Ticket', 'Cabin', 'Embarked']

# The encoder is re-fit for every column by DataFrame.apply, so each column
# gets its own integer codes.
dataset[categorical_cols] = dataset[categorical_cols].apply(LabelEncoder().fit_transform)

In [None]:
# Count the missing values remaining in each training column

dataset.isnull().sum()

In [None]:
# Filling NaN values with median
#
# Assign the filled Series back to the column instead of calling
# fillna(inplace=True) on `dataset["Age"]`: the in-place form operates on an
# intermediate object, is deprecated as chained assignment, and leaves the
# frame unchanged when pandas Copy-on-Write is active.
dataset["Age"] = dataset["Age"].fillna(dataset["Age"].median())

In [None]:
# Display the training frame after dropping, encoding and Age imputation
dataset

In [None]:
# Summary statistics, transposed so that each row describes one column
dataset.describe().transpose()

# Data Visualisation

In [None]:
# Annotated heatmap of the pairwise correlations between all columns
corr_matrix = dataset.corr()
fig, ax = plt.subplots(figsize=(15, 15))
ax = sns.heatmap(
    corr_matrix,
    annot=True,
    fmt=".2f",
    linewidths=0.5,
    cmap="magma",
    ax=ax,  # the axes created above (already the current axes)
)
# Widen the y-limits by half a cell on each side.
# NOTE(review): presumably a workaround for matplotlib releases that clipped
# the first/last heatmap rows - confirm it is still needed.
bottom, top = ax.get_ylim()
ax.set_ylim(bottom + 0.5, top - 0.5)

In [None]:
# Bar chart of each feature's correlation with the Survived column
(
    dataset.drop(columns='Survived')
    .corrwith(dataset['Survived'])
    .plot(kind='bar', color='Black', figsize=(14, 7), title="Correlation with Survived ")
)


In [None]:
# Pairwise scatter/histogram grid over every column of the training frame

sns.set_context(context='notebook')
sns.set_style(style='white')
sns.pairplot(data=dataset);

In [None]:
# Split the frame into the target (Survived) and the feature matrix (everything else)

y = dataset['Survived']
X = dataset.drop(columns=['Survived'])

In [None]:
# Display the feature matrix (the training frame without the Survived column)
X

In [None]:
# Hold out 30% of the labelled rows for evaluation; the seed is fixed for reproducibility
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)

# Test Data

In [None]:
# Load the Kaggle test split. An untouched copy is kept in `dataset_test1`
# because PassengerId is dropped from `dataset_test` later but is needed
# again when the submission is assembled.
dataset_test = pd.read_csv(filepath_or_buffer='/kaggle/input/titanic/test.csv')
dataset_test1 = dataset_test.copy(deep=True)

dataset_test.head(n=5)

In [None]:
# Remove the PassengerId and Name columns from the test frame (in place)

dataset_test.drop(columns=['PassengerId', 'Name'], inplace=True)

In [None]:
# Replace each categorical test column with integer codes.
# NOTE(review): the encoder is fit here on the test frame alone, so the
# integer assigned to a given category is not guaranteed to match the one
# assigned when the training frame was encoded - verify this is acceptable.

dataset_test[['Pclass','Sex', 'Cabin','Ticket','Embarked']] = (
    dataset_test[['Pclass','Sex', 'Cabin','Ticket','Embarked']]
    .apply(LabelEncoder().fit_transform)
)

In [None]:
# Count the missing values remaining in each test column

dataset_test.isnull().sum()

In [None]:
# Filling NaN values with median
#
# Assign the filled Series back to the column instead of calling
# fillna(inplace=True) on `dataset_test["Age"]`: the in-place form operates on
# an intermediate object, is deprecated as chained assignment, and leaves the
# frame unchanged when pandas Copy-on-Write is active.
dataset_test["Age"] = dataset_test["Age"].fillna(dataset_test["Age"].median())

# Machine Learning

## CatBoost Classifier model

In [None]:
# Gradient-boosted classifier: 100 trees of depth 8, log-loss objective,
# AUC tracked as an extra metric, fixed seed for reproducibility
cat = CatBoostClassifier(
    iterations=100,
    depth=8,
    learning_rate=0.1,
    loss_function='Logloss',
    custom_metric=['AUC'],
    random_seed=42,
)

In [None]:
# Fit on the training split only. Fitting on the full (X, y) would let the
# model see the X_test rows, so the hold-out scores computed afterwards would
# measure memorisation rather than generalisation.
cat.fit(X_train, y_train)

In [None]:
# Predicted class labels for the held-out rows
pred_t = cat.predict(data=X_test)

# Evaluation

### Accuracy

In [None]:
# CatBoostClassifier.score returns classification accuracy (the fraction of
# correctly predicted labels), not R^2, so label the output accordingly.
print("Accuracy on training data ", cat.score(X_train, y_train))
print("Accuracy on testing data ", cat.score(X_test, y_test))

### RMSE

In [None]:
# Root of the mean squared difference between true and predicted labels
mse = mean_squared_error(y_true=y_test, y_pred=pred_t)
rmse = np.sqrt(mse)
rmse


# Prediction

In [None]:
# Predicted class labels for the Kaggle test frame
pred_cat = cat.predict(data=dataset_test)

In [None]:
# Pair every PassengerId (taken from the untouched copy of the test frame)
# with its predicted label and write the result as the submission file
pred = pd.DataFrame(data=pred_cat, columns=['Survived'])
output = (
    pd.concat([dataset_test1['PassengerId'], pred], axis=1)
    .set_index(['PassengerId'])
)
output.to_csv(path_or_buf='submission.csv')

In [None]:
# Display the submission frame (Survived predictions indexed by PassengerId)
output