### Objective:
The objective of this assignment is to compare the performance of the LightGBM and XGBoost algorithms on the Titanic dataset.


In [1]:
### import required libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder,StandardScaler
import lightgbm
from xgboost import XGBClassifier
import warnings
import warnings
warnings.filterwarnings("ignore")
from sklearn.metrics import classification_report,confusion_matrix
from sklearn.model_selection import GridSearchCV

ModuleNotFoundError: No module named 'lightgbm'

### 1. Import and Read Data

In [None]:
# Read the training and test CSV files from the working directory.
train_df, test_df = (
    pd.read_csv("Titanic_train.csv"),
    pd.read_csv('Titanic_test.csv'),
)

In [None]:
# Preview the first rows of the training frame.
train_df.head()

In [None]:
# (rows, columns) of the training frame.
train_df.shape

In [None]:
# Preview the first rows of the test frame.
test_df.head()

In [None]:
# (rows, columns) of the test frame.
test_df.shape

### 2. EDA

##### Data understanding

In [None]:
# Column names, dtypes and non-null counts; a quick way to spot missing data.
train_df.info()

In [None]:
# Summary statistics for every column (include='all' adds the non-numeric ones).
train_df.describe(include='all')

##### Data Visualization

In [None]:
### Plot a heatmap of the pairwise correlations between numeric columns
numeric_corr = train_df.select_dtypes('number').corr()
sns.heatmap(numeric_corr, annot=True)

In [None]:
# Age distribution, split by survival outcome through `hue`.
# NOTE(review): `color` probably has no visible effect once `hue` is set — confirm.
sns.histplot(data = train_df,x = 'Age',color='red',hue='Survived')

In [None]:
# Fare by survival outcome, with one bar per sex inside each group.
# The frame is passed by keyword, matching the histplot call earlier in the notebook.
sns.barplot(data=train_df, x='Survived', y='Fare', hue='Sex')
plt.show()

### 3. Data Preprocessing

In [None]:
### Count missing values in each column before deciding how to impute
train_df.isna().sum()

In [None]:
### Drop the columns judged less informative for this model
# Reassigning (instead of inplace=True) together with errors='ignore' keeps the
# cell idempotent: re-running it no longer raises KeyError once the columns
# have already been removed.
train_df = train_df.drop(
    columns=['PassengerId','Name','SibSp','Parch','Ticket','Cabin'],
    errors='ignore',
)

In [None]:
# Confirm which columns remain after the drop.
train_df.head(2)

In [None]:
### Missing value imputation for the Age column (fill with the mean age)
mean_age = train_df['Age'].mean()
train_df['Age'] = train_df['Age'].fillna(mean_age)

In [None]:
# Most frequent embarkation port; used below to fill the missing Embarked values.
train_df['Embarked'].mode()

In [None]:
### Missing value imputation for the Embarked column
# Fill with the most frequent port computed from the data instead of a
# hard-coded 'S', so the fill value stays correct if the input file changes.
embarked_mode = train_df['Embarked'].mode().iloc[0]
train_df['Embarked'] = train_df['Embarked'].fillna(embarked_mode)

In [None]:
# Re-check the per-column missing counts after imputation;
# Age and Embarked should now report zero.
train_df.isna().sum()

In [None]:
# Column dtypes; shows which columns still need encoding before modelling.
train_df.dtypes

In [None]:
# Distinct categories in Sex (these become one-hot columns below).
train_df['Sex'].unique()

In [None]:
# Distinct categories in Embarked (these become one-hot columns below).
train_df['Embarked'].unique()

In [None]:
### Select the categorical columns that need encoding
cat_col = train_df.loc[:, ['Sex', 'Embarked']]

In [None]:
# Fit the one-hot encoder, then produce a dense array (sparse_output=False)
# for the categorical columns.
encoder = OneHotEncoder(sparse_output=False).fit(cat_col)
ohe = encoder.transform(cat_col)

In [None]:
# Wrap the encoded array in a DataFrame named after the encoder's output features.
# Reusing cat_col's index keeps each encoded row aligned with its source row
# when this frame is later joined to train_df on index, even if train_df no
# longer has a default RangeIndex.
df = pd.DataFrame(
    ohe,
    columns=encoder.get_feature_names_out(),
    index=cat_col.index,
)

In [None]:
# Preview the encoded columns; a few rows are enough to check the layout
# without dumping the whole frame into the notebook output.
df.head()

In [None]:
# Attach the one-hot columns to the training frame (join aligns on index) and
# drop the raw categorical columns they replace.
new_train_df = df.join(train_df).drop(columns=['Sex', 'Embarked'])

### 4. Model Building

In [None]:
### Separate the dependent variable (Survived) from the independent features
y = new_train_df['Survived']
x = new_train_df.drop(columns='Survived')

In [None]:
## Hold out 20% of the rows for testing; the fixed seed makes the split reproducible
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=12,
)

### LGBM

In [None]:
# LightGBM classifier with the library defaults (no hyperparameters are overridden here).
lbg_classifier = lightgbm.LGBMClassifier()

In [None]:
## Train the LightGBM classifier on the training split
# NOTE(review): `model` is presumably the same fitted object as `lbg_classifier`
# (scikit-learn-style fit() returns self) — confirm.
model = lbg_classifier.fit(x_train,y_train)

### Model Testing and Evaluation

In [None]:
# Predict labels for both splits so training and testing metrics can be compared.
y_train_pred, y_test_pred = (model.predict(split) for split in (x_train, x_test))

In [None]:
# Classification report on the data the model was fitted on.
train_report = classification_report(y_train, y_train_pred)
print("Classification report of Light Gradient Boosting for Training Data:\n\n", train_report)

In [None]:
# Confusion matrix on the data the model was fitted on.
train_cm = confusion_matrix(y_train, y_train_pred)
print("Confusion Matrix of Light Gradient Boosting for Training Data:\n\n", train_cm)

In [None]:
# Classification report on the held-out split.
test_report = classification_report(y_test, y_test_pred)
print("Classification report of Light Gradient Boosting for Testing Data:\n\n", test_report)

In [None]:
# Confusion matrix on the held-out split. The label now says "Testing Data":
# it previously read "Training Data" although the inputs are y_test / y_test_pred.
print("Confusion Matrix of Light Gradient Boosting for Testing Data:\n\n",confusion_matrix(y_test,y_test_pred))

#### Hyperparameter Tuning

In [None]:
## Hyperparameter grid for the LightGBM search (3 x 3 x 3 = 27 combinations)
param = dict(
    n_estimators=[50, 100, 200],
    learning_rate=[0.01, 0.1, 0.2],
    num_leaves=[31, 50, 100],
)

In [None]:
## Grid search over `param` with 5-fold cross-validation, scored by accuracy
gscv = GridSearchCV(
    estimator=model,
    param_grid=param,
    scoring='accuracy',
    cv=5,
    n_jobs=-1,
    verbose=1,
)
gscv.fit(x_train, y_train)

In [None]:
# Report the best hyperparameter combination found by the search and its score.
best_params, best_score = gscv.best_params_, gscv.best_score_
print("Best Parameters:", best_params)
print("Best Score:", best_score)

### XGBoost

In [None]:
# XGBoost classifier with the library defaults (no hyperparameters are overridden here).
xgb = XGBClassifier()

In [None]:
# Fit the baseline XGBoost model on the training split.
xgb.fit(x_train,y_train)

In [None]:
## Hyperparameter grid for the XGBoost search (3 x 3 x 3 = 27 combinations)
# 'num_leaves' was removed: it is a LightGBM parameter that XGBoost does not
# use, so keeping it only tripled the number of fits without changing any
# model. Tree size is controlled through 'max_depth' here.
param = {'n_estimators':[50, 100, 200],
         'learning_rate':[0.01, 0.1, 0.2],
         'max_depth':[4,5,6],
}

In [None]:
## Grid search over `param` for XGBoost with 5-fold cross-validation, scored by accuracy
grid_search = GridSearchCV(
    estimator=xgb,
    param_grid=param,
    scoring='accuracy',
    cv=5,
    n_jobs=-1,
    verbose=1,
)
grid_search.fit(x_train, y_train)