In [None]:
# pip install -U imbalanced-learn
# !pip install lightgbm
# pip install xgboost

In [None]:
# Import EDA Libraries

import pandas as pd
import numpy as np
import plotly.express as px
import plotly.figure_factory as ff
from plotly.subplots import make_subplots
import plotly.subplots as sp
import plotly.graph_objects as go
import seaborn as sns
import matplotlib.pyplot as plt
sns.set_style("whitegrid")

In [None]:
# Import Data Preprocessing and Machine learning libraries

import pandas as pd
import statsmodels.api as sm
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, StratifiedKFold, cross_validate, GridSearchCV
from sklearn.metrics import accuracy_score,precision_score,recall_score,f1_score
from sklearn.preprocessing import StandardScaler, OneHotEncoder, RobustScaler
from imblearn.over_sampling import SMOTE
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier
import lightgbm as lgb
import xgboost as xgb
from sklearn.naive_bayes import MultinomialNB

## 1. Dataset loading and Data Exploration

In [None]:
# Load the training dataset.
# The data folder defaults to the author's original location; set the
# TITANIC_DATA_DIR environment variable to run the notebook on another machine
# without editing this cell.
import os

titanic_data_dir = os.environ.get("TITANIC_DATA_DIR", "C:/Urvesh Koshti/1_Documents/Interview tasks")
train_dataset = f"{titanic_data_dir}/train.csv"
titanic_train_df = pd.read_csv(train_dataset)

In [None]:
titanic_train_df

In [None]:
titanic_train_df.shape  # Check the shape of the Training Dataset

In [None]:
# Load the testing dataset.
# The data folder defaults to the author's original location; set the
# TITANIC_DATA_DIR environment variable to run the notebook on another machine
# without editing this cell.
import os

titanic_data_dir = os.environ.get("TITANIC_DATA_DIR", "C:/Urvesh Koshti/1_Documents/Interview tasks")
test_dataset = f"{titanic_data_dir}/test.csv"
titanic_test_df = pd.read_csv(test_dataset)

In [None]:
titanic_test_df

In [None]:
titanic_test_df.shape  # Check the shape of the Testing Dataset

#### Training Dataset Analysis

In [None]:
titanic_train_df.info()

In [None]:
# PassengerId and Ticket are removed here because they are not used in the analysis below
titanic_train_df = titanic_train_df.drop(columns=['PassengerId', 'Ticket'])

In [None]:
titanic_train_df.describe()

In [None]:
# Find missing values from each column of the Training dataset
titanic_train_df.isnull().sum()

In [None]:
# Tally the target column once, then read off each outcome (1 -> survived, 0 -> not survived)
survival_counts = titanic_train_df['Survived'].value_counts()
survived_count = survival_counts[1]
not_survived_count = survival_counts[0]

# Slice labels, raw counts, and each count's share of the total as a formatted string
labels = ['Survived', 'Not Survived']
values = [survived_count, not_survived_count]
total_passengers = sum(values)
percentages = [f"{val:.1f}%" for val in ((v / total_passengers) * 100 for v in values)]

# Donut-style pie chart of the two outcomes
survival_pie = go.Pie(
    labels=labels,
    values=values,
    text=percentages,
    textinfo='percent',
    hole=.3,
    marker={'colors': ['lightblue', 'lightsalmon']},
)
fig = go.Figure(data=[survival_pie])
fig.update_layout(title='Distribution of Survivors on the Titanic')
fig.show()

The figure shows that only 38.4% of the passengers in the training dataset survived the sinking of the Titanic.

In [None]:
# Age distribution per sex in the training data: violin with an embedded box plot
# and every individual observation drawn next to it.
fig = px.violin(
    titanic_train_df,
    x='Sex',
    y='Age',
    box=True,
    points="all",
    title='Violin Plot of Age by Sex of the Training Dataset',
)

# Axis and legend captions
fig.update_layout(xaxis_title='Sex', yaxis_title='Age', legend_title='Sex')

fig.show()


From the above figure, it seems that there were more male passengers than female passengers. Overall, most passengers were between 20 and 35 years old.

In [None]:
# Age distribution per ticket class (Pclass) in the training data.
# The title now names Pclass, which is the variable actually plotted on the x-axis.
fig = px.box(titanic_train_df, x='Pclass', y='Age', title='Box Plot of Age by PClass of the Training Dataset')

# The plot has no colour grouping and therefore no legend, so only the axis titles are set
fig.update_layout(
    xaxis_title='Pclass',
    yaxis_title='Age'
)

# Show the plot
fig.show()

From the above figure, it seems that passengers older than about 30 years were more likely to have bought a first-class ticket, whereas younger passengers (around 20 years old) were more likely to have bought a third-class ticket, possibly because they did not want to spend much money on the ticket.

In [None]:
# Mean age per ticket class, used below to impute missing ages.
# The Age column is selected before aggregating: `mean("Age")` would pass "Age" as the
# `numeric_only` flag rather than choose a column.
mean_age_by_class_train = titanic_train_df.groupby("Pclass")["Age"].mean()
print(mean_age_by_class_train)

In [None]:
# Fill missing ages with the mean age of the passenger's ticket class.
# Mapping Pclass onto the per-class means covers every class present in the data in a
# single vectorised step (no hard-coded class numbers, no temporary frames).
titanic_train_df['Age'] = titanic_train_df['Age'].fillna(
    titanic_train_df['Pclass'].map(mean_age_by_class_train)
)

In [None]:
titanic_train_df

In [None]:
titanic_train_df.Cabin.unique()

In [None]:
# Missing cabin entries are replaced by the placeholder string "Undefined"
titanic_train_df["Cabin"] = titanic_train_df["Cabin"].fillna("Undefined")

In [None]:
# Missing embarkation ports are filled with the most common port (first mode)
most_frequent = titanic_train_df["Embarked"].mode().iloc[0]
titanic_train_df["Embarked"] = titanic_train_df["Embarked"].fillna(most_frequent)

In [None]:
titanic_train_df.isnull().sum()

#### Testing Dataset Analysis

In [None]:
# Find missing values from each column of the Test dataset
titanic_test_df.isnull().sum()

In [None]:
# Age distribution per sex in the testing data: violin with an embedded box plot
# and every individual observation drawn next to it.
fig = px.violin(
    titanic_test_df,
    x='Sex',
    y='Age',
    box=True,
    points="all",
    title='Violin Plot of Age by Sex of the Testing Dataset',
)

# Axis and legend captions
fig.update_layout(xaxis_title='Sex', yaxis_title='Age', legend_title='Sex')

fig.show()

In [None]:
# Age distribution per ticket class (Pclass) in the testing data
fig = px.box(
    titanic_test_df,
    x='Pclass',
    y='Age',
    title='Box Plot of Age by PClass of the Testing Dataset',
)

# Axis and legend captions
fig.update_layout(xaxis_title='Pclass', yaxis_title='Age', legend_title='Sex')

fig.show()

In [None]:
# Mean age per ticket class in the testing data, used below to impute missing ages.
# The Age column is selected before aggregating: `mean("Age")` would pass "Age" as the
# `numeric_only` flag rather than choose a column.
mean_age_by_class_test = titanic_test_df.groupby("Pclass")["Age"].mean()
print(mean_age_by_class_test)

In [None]:
# Fill missing ages in the testing data with the mean age of the passenger's ticket class.
# Mapping Pclass onto the per-class means covers every class present in the data in a
# single vectorised step (no hard-coded class numbers, no temporary frames).
# NOTE(review): the means come from the testing data itself; consider reusing the
# training-set means so train and test are imputed with the same statistics.
titanic_test_df['Age'] = titanic_test_df['Age'].fillna(
    titanic_test_df['Pclass'].map(mean_age_by_class_test)
)

In [None]:
titanic_test_df.isnull().sum()

In [None]:
titanic_test_df.Cabin.unique()

In [None]:
# Missing cabin entries are replaced by the placeholder string "Undefined"
titanic_test_df["Cabin"] = titanic_test_df["Cabin"].fillna("Undefined")

In [None]:
titanic_test_df

## Exploratory Data Analysis (EDA) on the Training Dataset (train.csv)

In [None]:
# Ages of the passengers who survived, as a 30-bin histogram with a rug of the
# individual observations drawn in the margin.
survivors_only = titanic_train_df[titanic_train_df['Survived'] == 1]
fig = px.histogram(
    survivors_only,
    x='Age',
    nbins=30,
    marginal='rug',
    labels={'Age': 'Age'},
    color_discrete_sequence=['green'],
    title='Age Distribution of Survived Individuals',
)

# Axis captions, legend settings and spacing between bars
fig.update_layout(
    xaxis_title='Age of Survived people',
    yaxis_title='Count of Survived people',
    legend_title='Survived',
    legend=dict(traceorder='normal'),
    bargap=0.1,
)

fig.show()


The figure above shows that relatively few survivors were older than 40 years, while most survivors were between 20 and 40 years old. One reason could be that there were more young passengers than old passengers on the Titanic in total.

In [None]:
# Passenger counts per sex, with side-by-side bars for each value of Survived
fig = px.histogram(
    titanic_train_df,
    x='Sex',
    color='Survived',
    barmode='group',
    labels={'Survived': 'Survival Status'},
    color_discrete_sequence=px.colors.qualitative.Pastel,
)

fig.update_layout(
    title='Titanic Survivors by Sex',
    xaxis_title='Sex',
    yaxis_title='Count',
    bargap=0.1,
)

fig.show()

The figure above shows that there were more female survivors than male survivors.

In [None]:
# Survived values (0/1) of the female and male passengers
df_women = titanic_train_df.loc[titanic_train_df['Sex'] == 'female', 'Survived']
df_men = titanic_train_df.loc[titanic_train_df['Sex'] == 'male', 'Survived']

# Share of survivors in each group, as a percentage
survival_rate_women = sum(df_women) / len(df_women) * 100
survival_rate_men = sum(df_men) / len(df_men) * 100

print(f"Percentage of females who survived: {survival_rate_women:.2f}%")
print(f"Percentage of males who survived: {survival_rate_men:.2f}%")

In [None]:
# Number of survivors per ticket class (only rows with Survived == 1 are plotted)
survivors_only = titanic_train_df[titanic_train_df['Survived'] == 1]
fig = px.histogram(
    survivors_only,
    x='Pclass',
    color='Survived',
    barmode='group',
    labels={'Survived': 'Survival Status according to Pclass'},
    color_discrete_sequence=px.colors.qualitative.Pastel,
)

fig.update_layout(
    title='Titanic Survivors by Pclass',
    xaxis_title='PClass',
    yaxis_title='Count',
    bargap=0.1,
)

fig.show()

The figure above shows that the largest number of survivors held 1st class tickets, while the fewest survivors held 2nd class tickets. Note that the chart counts survivors only, so on its own it does not show the survival rate within each class.

In [None]:
# Number of survivors per count of siblings/spouses aboard (only rows with Survived == 1)
survivors_only = titanic_train_df[titanic_train_df['Survived'] == 1]
fig = px.histogram(
    survivors_only,
    x='SibSp',
    color='Survived',
    barmode='group',
    labels={'Survived': 'Survival Status according to Siblings and Spouse'},
    color_discrete_sequence=px.colors.qualitative.Pastel,
)

fig.update_layout(
    title='Titanic Survivors by Siblings/Spouses',
    xaxis_title='Number of Siblings and Spouse',
    yaxis_title='Count of the Survivors',
    bargap=0.1,
)

fig.show()

The figure above shows the number of survivors by the number of siblings or spouses they had aboard. Most survivors travelled with 0 siblings or spouses, and the number of survivors falls as the number of siblings or spouses increases. Because the chart shows counts rather than rates, it partly reflects how many passengers fell into each group.

In [None]:
# Number of survivors per count of parents/children aboard (only rows with Survived == 1)
survivors_only = titanic_train_df[titanic_train_df['Survived'] == 1]
fig = px.histogram(
    survivors_only,
    x='Parch',
    color='Survived',
    barmode='group',
    labels={'Survived': 'Survival Status'},
    color_discrete_sequence=px.colors.qualitative.Pastel,
)

fig.update_layout(
    title='Titanic Survivors by Number of Parents or Children aboard on the Titanic',
    xaxis_title='Number of Parents and children',
    yaxis_title='Count of the Survivors',
    bargap=0.1,
)

fig.show()

The figure above shows the same pattern as for the number of siblings or spouses aboard. Most survivors had no parents or children with them, and the number of survivors falls as the number of parents or children increases. No passenger with a total of 4 parents and children aboard survived.

In [None]:
# Fare distribution for each survival outcome, one coloured box per value of Survived
fig = px.box(
    titanic_train_df,
    x='Survived',
    y='Fare',
    color='Survived',
    color_discrete_map={0: 'red', 1: 'green'},
    labels={'Survived': 'Survival'},
    title='Fare Distribution by Survival Status based on the Fare',
)

# Axis and legend captions
fig.update_layout(xaxis_title='Survival Status', yaxis_title='Fare', legend_title='Survived')

fig.show()


From the above figure, it seems that as the fare goes up, there is only a small increase in the chance of surviving.

In [None]:
# Number of survivors per port of embarkation (only rows with Survived == 1 are plotted).
# The legend label is 'Survival Status': the previous text mentioned Pclass, which is
# not part of this chart.
fig = px.histogram(titanic_train_df[titanic_train_df['Survived'] == 1], x='Embarked', barmode='group',
                   color='Survived', labels={'Survived': 'Survival Status'},
                  color_discrete_sequence = px.colors.qualitative.Pastel)

fig.update_layout(title='Titanic Survivors by Port of Embarkation',
                  xaxis_title='Embarked',
                  yaxis_title='Count of Survived passengers',
                 bargap=0.1)

fig.show()

C = Cherbourg, Q = Queenstown, S = Southampton


From the above figure, it can be seen that the largest number of survivors boarded the Titanic at Southampton, England, while fewer survivors boarded at Cherbourg, France, and Queenstown (Cobh), Ireland. Because the chart shows survivor counts rather than rates, it reflects how many passengers boarded at each port as well as their chances of survival.

In [None]:
def _title_from_name(full_name):
    """Return the part of the second comma-separated field that precedes its first period, stripped."""
    after_comma = full_name.split(',')[1]
    return after_comma.split('.')[0].strip()


# Derive a 'title_name' column (the name prefix) from 'Name' in both datasets
titanic_train_df['title_name'] = titanic_train_df['Name'].apply(_title_from_name)

titanic_test_df['title_name'] = titanic_test_df['Name'].apply(_title_from_name)

In [None]:
titanic_train_df

In [None]:
titanic_test_df

In [None]:
titanic_train_df['title_name'].unique()

In [None]:
titanic_test_df['title_name'].unique()

In [None]:
def categorize_titles(title):
    """Collapse a name prefix into one of five groups.

    'Mr', 'Mrs', 'Miss' and 'Master' are returned unchanged; any other
    value is mapped to 'Other titles'.
    """
    common_titles = ('Mr', 'Mrs', 'Miss', 'Master')
    return title if title in common_titles else 'Other titles'

# Replace each extracted prefix by its group, in the training and the testing data
titanic_train_df['title_name'] = titanic_train_df['title_name'].map(categorize_titles)

titanic_test_df['title_name'] = titanic_test_df['title_name'].map(categorize_titles)

In [None]:
# Full names are not used as a model feature, so the 'Name' column is dropped from both datasets
titanic_train_df = titanic_train_df.drop(columns='Name')

titanic_test_df = titanic_test_df.drop(columns='Name')

In [None]:
titanic_train_df

In [None]:
# Combine 'SibSp' and 'Parch' into a single 'family_size' column: siblings/spouses plus
# parents/children plus the passenger themselves (hence the + 1), so a passenger
# travelling alone has family_size == 1.
titanic_train_df['family_size'] = titanic_train_df['SibSp'] + titanic_train_df['Parch'] + 1

titanic_test_df['family_size'] = titanic_test_df['SibSp'] + titanic_test_df['Parch'] + 1

In [None]:
titanic_train_df

In [None]:
# Survivors per family size: Survived is summed per bin over rows with Survived == 1,
# and each bar is annotated with its value.
survivors_only = titanic_train_df[titanic_train_df['Survived'] == 1]
fig = px.histogram(
    survivors_only,
    x='family_size',
    y='Survived',
    text_auto=True,
    template='plotly_dark',
    color_discrete_sequence=['#10c2de'],
    labels={'Survived': 'Survival Rate'},
    title='Survival Rate by Family Size',
)

# The layout title set here replaces the one passed to px.histogram
fig.update_layout(
    title='Titanic Survivors by the size of the family (Children + Spouse + Parents + Person itself)',
    xaxis_title='Family Size',
    yaxis_title='Count of Survived passengers',
    bargap=0.1,
)

fig.show()

- Smaller families, consisting of three members or fewer, were more likely to survive, while those with four or more members had a lower probability of making it through the incident.

- The data suggests that, overall, smaller families were safer during the event. Moreover, individuals traveling alone had the highest chances of survival.

In [None]:
# Survivors per name prefix: Survived is summed per category over rows with Survived == 1,
# and each bar is annotated with its value.
survivors_only = titanic_train_df[titanic_train_df['Survived'] == 1]
fig = px.histogram(
    survivors_only,
    x='title_name',
    y='Survived',
    text_auto=True,
    template='plotly_dark',
    color_discrete_sequence=['#10c2de'],
    labels={'Survived': 'Survival Rate'},
    title='Survival Rate by a Prefix of the person name',
)

fig.update_layout(
    xaxis_title='Title (Prefix) of the passenger',
    yaxis_title='Count of Survived passengers',
    bargap=0.1,
)

fig.show()

The above figure shows that women with the titles "Miss" and "Mrs" account for the most survivors. This suggests that women were given priority over men for survival during the incident.

In [None]:
titanic_train_df

In [None]:
titanic_train_df.isnull().sum()

In [None]:
titanic_test_df.isnull().sum()

- The Training Dataset does not have any missing values now, since they were already handled earlier in the notebook.

- The Testing Dataset has one missing value in the 'Fare' column, which is handled below.

In [None]:
# Fill missing 'Fare' values with the mean of the available fares.
# The result is assigned back to the column: calling fillna(inplace=True) on a column
# selection is chained assignment, which pandas warns about and which leaves the frame
# unchanged when copy-on-write is enabled.
titanic_test_df['Fare'] = titanic_test_df['Fare'].fillna(titanic_test_df['Fare'].mean())

In [None]:
titanic_test_df.isnull().sum()

- In the Training Dataset and the Testing Dataset, there are three columns ('Sex', 'title_name', 'Embarked') that have categorical values. To use these columns in a Machine Learning model, the categorical values need to be represented in numerical form.

- Therefore, to convert the categorical values into Numerical values, a concept called 'One Hot Encoding' is applied as below.

In [None]:
# One-hot encode the three categorical columns of the training data
categorical_columns = ['Sex', 'title_name', 'Embarked']
titanic_train_df_encoded = pd.get_dummies(titanic_train_df, columns=categorical_columns)

# Preview the encoded frame
titanic_train_df_encoded.head()

In [None]:
# 'Cabin' is not used as a feature, so it is removed from the encoded training data
titanic_train_df_encoded = titanic_train_df_encoded.drop(columns=['Cabin'])

In [None]:
# One-hot encode the three categorical columns of the testing data
categorical_columns = ['Sex', 'title_name', 'Embarked']
titanic_test_df_encoded = pd.get_dummies(titanic_test_df, columns=categorical_columns)

# Preview the encoded frame
titanic_test_df_encoded.head()

In [None]:
# 'Cabin' is not used as a feature, so it is removed from the encoded testing data
titanic_test_df_encoded = titanic_test_df_encoded.drop(columns=['Cabin'])

In [None]:
# Separate the target column (y) from the feature columns (X)
y = titanic_train_df_encoded['Survived']
X = titanic_train_df_encoded.drop(columns='Survived')

In [None]:
# Hold out 20% of the rows for testing; stratifying on y keeps the class ratio in both parts
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.20, random_state=42, stratify=y
)

# Report how many rows ended up in each part
print("Training set has {} samples.".format(len(X_train)))
print("Testing set has {} samples.".format(len(X_test)))

In [None]:
X_train

- In the Training dataset, a Data Imbalance is seen. Data imbalance refers to a situation in a dataset where the distribution of instances across different classes is not equal or balanced. In a classification problem, where the goal is to categorize instances into two or more classes, data imbalance occurs when one class has significantly more or fewer instances than another class.

- In order to overcome the Data Imbalance, Resampling technique can be implemented. Resampling means either oversampling the minority class or undersampling the majority class to create a more balanced dataset. This can be done by Creating synthetic instances for the minority class using techniques like SMOTE (Synthetic Minority Over-sampling Technique) as below.

In [None]:
# Apply SMOTE to oversample the minority class in the training set
# NOTE(review): X_train_resampled / y_train_resampled are not referenced by the later
# cells visible in this notebook (cross-validation and grid search are fitted on
# X_train_mostrelated_features / y_train) — confirm whether the resampled data was
# meant to be used for training.
smote = SMOTE(random_state=42)
X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)

# Display the count of each class before and after resampling
print("Class distribution before Resampling:")
print(y_train.value_counts())

print("\nClass distribution after Resampling:")
print(pd.Series(y_train_resampled).value_counts())

Now, in the resampled training dataset, there are equal numbers of samples with Survived as '0' and Survived as '1'. The dataset is balanced, which helps to prevent a Machine Learning (ML) model trained on it from being biased towards the majority class.

In [None]:
# Scale the numeric features with RobustScaler so that they are on comparable scales.
# With its default settings RobustScaler centres each feature on its median and divides
# by its interquartile range, which makes it less sensitive to outliers than
# mean/variance standardisation.

numerical_features = ['Pclass', 'Age', 'SibSp', 'Parch', 'Fare', 'family_size']

# Work on explicit copies: X_train / X_test come out of train_test_split, and writing
# into them directly can trigger pandas' SettingWithCopyWarning.
X_train = X_train.copy()
X_test = X_test.copy()

# Creating a RobustScaler instance
scaler = RobustScaler()

# Fit on the training split only, then reuse the same statistics for the held-out split
X_train_cont_scaled = scaler.fit_transform(X_train[numerical_features])
X_test_cont_scaled = scaler.transform(X_test[numerical_features])

# Replacing the numeric columns with their scaled values
X_train[numerical_features] = X_train_cont_scaled
X_test[numerical_features] = X_test_cont_scaled

X_train

In [None]:
# Correlation of every feature with the target variable 'Survived'
correlation_with_target = titanic_train_df_encoded.corr()['Survived'].drop('Survived')

# Label each bar by the sign of its correlation and map the labels to colours
# explicitly, so red/green no longer depends on which sign happens to appear first.
correlation_sign = np.where(correlation_with_target.values > 0, 'Positive', 'Negative')

# Create a bar chart for visualization
fig = px.bar(x=correlation_with_target.index, y=correlation_with_target.values,
             labels={'x': 'Features', 'y': 'Correlation with Target', 'color': 'Correlation sign'},
             title='Correlation with Target for each Feature',
             color=correlation_sign,
             color_discrete_map={'Negative': 'red', 'Positive': 'green'},
             )

# Customize the layout
fig.update_layout(template='plotly_dark')

# Show the plot
fig.show()

From the above plot, it can be seen that the features 'Age', 'SibSp', 'family_size' and 'title_name_Other titles' do not contribute much and have very little correlation with the target variable. Hence, these features can be removed from the dataset.

In [None]:
# Remove the weakly correlated features from both the training and the held-out split
less_correlated_features = ['Age', 'SibSp', 'family_size', 'title_name_Other titles']
X_train_mostrelated_features = X_train.drop(columns=less_correlated_features)
X_test_mostrelated_features = X_test.drop(columns=less_correlated_features)

## Implementing a Machine Learning model

Before making prediction, it is important to check which ML model will make the better prediction. Therefore, below are different classification Models which can be evaluated on the Dataset. 

In [None]:
# Candidate models keyed by display name; insertion order is the evaluation order
models_by_name = {
    "Logistic Regression": LogisticRegression(random_state=42, max_iter= 1500, n_jobs=-1),
    "KNN": KNeighborsClassifier(n_neighbors=5, n_jobs=-1),
    "Gaussian Naive Bayes": GaussianNB(),
    "SVC": SVC(random_state=42, probability=True),
    "Decision Tree": DecisionTreeClassifier(random_state=42),
    "Random Forest": RandomForestClassifier(random_state=42, n_jobs =-1),
    "AdaBoost": AdaBoostClassifier(random_state=42),
    "Gradient Boosting": GradientBoostingClassifier(random_state=42),
    "LightGBM": lgb.LGBMClassifier(random_state=42, verbose=-1),
    "XGBoost": xgb.XGBClassifier(random_state=42, n_jobs =-1),
}

# List of (name, estimator) pairs to evaluate
classifiers = list(models_by_name.items())

In [None]:
# Accumulators: one summary row per model plus parallel lists of the individual metrics
results, classifier_names = [], []
mean_test_f1_scores, mean_test_accuracies, cross_val_errors = [], [], []

# Every model is scored with the same 5-fold stratified, shuffled splitter. The fixed
# random_state makes the folds identical for each model.
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

for model_name, model in classifiers:

    # Cross-validate on the training split, collecting F1 and accuracy for train and test folds
    cv_results = cross_validate(
        model,
        X_train_mostrelated_features,
        y_train,
        cv=cv,
        scoring=['f1', 'accuracy'],
        n_jobs=-1,
        return_train_score=True,
    )

    # Average each metric over the folds
    fold_mean_train_f1 = np.mean(cv_results['train_f1'])
    fold_mean_test_f1 = np.mean(cv_results['test_f1'])
    fold_mean_test_accuracy = np.mean(cv_results['test_accuracy'])

    # Cross-validation error is the complement of the mean test accuracy
    cross_val_error = 1 - fold_mean_test_accuracy

    results.append({
        "Model Name": model_name,
        "Mean Train F1 Score": fold_mean_train_f1,
        "Mean Test F1 Score": fold_mean_test_f1,
        "Mean Test Accuracy": fold_mean_test_accuracy,
        "Cross-Validation Error": cross_val_error
    })

    classifier_names.append(model_name)
    mean_test_f1_scores.append(fold_mean_test_f1)
    mean_test_accuracies.append(fold_mean_test_accuracy)
    cross_val_errors.append(cross_val_error)

# One row per model
results_df = pd.DataFrame(results)

display(results_df)

In [None]:
# Pull the columns to plot out of the results table
model_names = results_df['Model Name'].tolist()
mean_test_accuracy = results_df['Mean Test Accuracy'].tolist()
cross_validation_error = results_df['Cross-Validation Error'].tolist()

# One bar series for accuracy and one for error, stacked per model
fig = go.Figure()
fig.add_trace(go.Bar(
    x=model_names,
    y=mean_test_accuracy,
    name='Mean Test Accuracy',
    marker_color='royalblue',
))
fig.add_trace(go.Bar(
    x=model_names,
    y=cross_validation_error,
    name='Cross-Validation Error',
    marker_color='coral',
))

# Stacking mode, axis captions and chart title
fig.update_layout(
    barmode='stack',
    xaxis_title='Model Name',
    yaxis_title='Score',
    title='Mean Test Accuracy and Cross-Validation Error',
)

fig.show()

From the above figure, it can be seen that, among all classifiers, the LightGBM classifier exhibits the highest accuracy and the lowest cross-validation error. Therefore, LightGBM is chosen as the model for the further analysis.

#### Hyperparameter Tuning with the Grid Search

In [None]:
# Define the LightGBM classifier.
# subsample_freq=1 turns bagging on: LightGBM ignores `subsample` unless the bagging
# frequency is non-zero, so without it the three `subsample` values in the grid below
# would all train identical models. verbose=-1 matches the LightGBM entry in the model
# comparison above and silences per-fit log output.
lgb_classifier = lgb.LGBMClassifier(
    objective='binary',
    metric='binary_logloss',
    random_state=42,
    subsample_freq=1,
    verbose=-1,
)

# Define the hyperparameters to tune and their possible values
param_grid = {
    'num_leaves': [31, 50, 100],
    'learning_rate': [0.05, 0.1, 0.2],
    'n_estimators': [50, 100, 200],
    'subsample': [0.8, 0.9, 1.0],
    'colsample_bytree': [0.8, 0.9, 1.0]
}

# Exhaustive search over the grid, scored by accuracy with 3-fold cross-validation
grid_search = GridSearchCV(estimator=lgb_classifier, param_grid=param_grid, scoring='accuracy', cv=3)

# Fit the model to the training data
grid_search.fit(X_train_mostrelated_features, y_train)

# Print the best hyperparameters
print("Best Hyperparameters:", grid_search.best_params_)

# Get the best model (refitted on the full training split by GridSearchCV)
best_lgb_model = grid_search.best_estimator_

# Make predictions on the held-out split
y_pred = best_lgb_model.predict(X_test_mostrelated_features)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy on Test Set:", accuracy)

In [None]:
best_lgb_model

#### Prediction on the Testing Dataset (test.csv) using LightGBM as a Machine Learning Model 

In [None]:
titanic_test_df_encoded

In [None]:
# Frame that will hold the predictions: the passenger ids plus an empty 'Survived' placeholder column
result_dataframe = titanic_test_df_encoded[['PassengerId']].assign(Survived="")

In [None]:
# Bring the testing dataset into the same feature space the model was trained on.
# The model was fitted on features scaled with `scaler`, so the same fitted scaler
# must be applied here before predicting; otherwise the raw values (e.g. Fare) are
# compared against split thresholds learned on the scaled values.
test_features = titanic_test_df_encoded.drop(['PassengerId', 'Ticket'], axis=1)
test_features[numerical_features] = scaler.transform(test_features[numerical_features])

# Remove the columns that were also dropped from the training features
test_features = test_features.drop(
    ['Age', 'SibSp', 'family_size', 'title_name_Other titles'], axis=1
)

# Use exactly the training columns, in the training order (raises if one is missing)
titanic_test_df_encoded = test_features[X_train_mostrelated_features.columns]

In [None]:
titanic_test_df_encoded

In [None]:
# Predict survival for every row of the prepared testing dataset
predictions = best_lgb_model.predict(titanic_test_df_encoded)

In [None]:
# Replace the placeholder 'Survived' column with the predicted values
result_dataframe = result_dataframe.assign(Survived=predictions)

In [None]:
result_dataframe

In [None]:
# Write the predictions to a CSV file (without the index column) and confirm
output_csv = 'result_dataframe_test_dataset.csv'
result_dataframe.to_csv(output_csv, index=False)
print("Results are successfully saved in csv file!")