## Importing libraries

In [None]:
!pip install catboost
import numpy as np 
import seaborn as sns
import pandas as pd
import catboost as cb 
import lightgbm as lgbm

from sklearn.model_selection import GridSearchCV
from catboost import CatBoostClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import cross_val_score
from sklearn import preprocessing
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import VotingClassifier
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import train_test_split
from google.colab import drive
# Mount Google Drive at /content/gdrive; the dataset paths used below live under this mount point (Colab only)
drive.mount("/content/gdrive")


Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


## Import Datasets

In [None]:
# Locations of the Titanic CSV files on the mounted Google Drive
train_path = '/content/gdrive/My Drive/Titanic/data/train.csv'
test_path = '/content/gdrive/My Drive/Titanic/data/test.csv'

# Read both files with default parsing options, then preview the first training rows
train, test = (pd.read_csv(csv_path) for csv_path in (train_path, test_path))
train.head(5)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


## Data cleaning

In [None]:
def cleanData(data):
    """Clean a Titanic passenger frame in place and return it.

    Steps, in order:
      1. drop the 'Cabin', 'Name' and 'Ticket' columns;
      2. fill missing 'Age' and 'Fare' values with the median of the row's
         ('Pclass', 'Sex') group;
      3. drop rows whose 'Embarked' value is missing;
      4. encode 'Sex' (male=0, female=1) and 'Embarked' (S=0, C=1, Q=2).

    Parameters
    ----------
    data : pandas.DataFrame
        Frame containing at least the columns named above. It is modified
        in place; callers that need the raw frame must pass a copy.

    Returns
    -------
    pandas.DataFrame
        The same object that was passed in, after cleaning.
    """
    sex_codes = {'male': 0, 'female': 1}
    embarked_codes = {'S': 0, 'C': 1, 'Q': 2}

    # Delete unnecessary or sparsely filled columns
    data.drop(['Cabin', 'Name', 'Ticket'], axis=1, inplace=True)

    # Fill gaps in age and ticket price with the median of the passenger's class/sex group
    for column in ('Age', 'Fare'):
        data[column] = data.groupby(['Pclass', 'Sex'])[column].transform(
            lambda x: x.fillna(x.median())
        )

    # Remove rows without a port of embarkation
    data.dropna(axis=0, subset=['Embarked'], inplace=True)

    # Encode categorical columns by assigning the result back to the frame.
    # Calling replace(..., inplace=True) on data['col'] is a chained in-place
    # update that does not reach `data` when pandas Copy-on-Write is active.
    # Values outside the mapping are passed through unchanged.
    data['Sex'] = data['Sex'].map(lambda value: sex_codes.get(value, value))
    data['Embarked'] = data['Embarked'].map(lambda value: embarked_codes.get(value, value))

    return data

In [None]:
# cleanData modifies each frame in place and hands the same object back
clean_train, clean_test = (cleanData(frame) for frame in (train, test))


In [None]:
# Checking if there are no missing values
# Print the column summary of both cleaned frames to confirm no missing values remain
for cleaned in (clean_train, clean_test):
    cleaned.info()


<class 'pandas.core.frame.DataFrame'>
Int64Index: 889 entries, 0 to 890
Data columns (total 9 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  889 non-null    int64  
 1   Survived     889 non-null    int64  
 2   Pclass       889 non-null    int64  
 3   Sex          889 non-null    int64  
 4   Age          889 non-null    float64
 5   SibSp        889 non-null    int64  
 6   Parch        889 non-null    int64  
 7   Fare         889 non-null    float64
 8   Embarked     889 non-null    int64  
dtypes: float64(2), int64(7)
memory usage: 69.5 KB
<class 'pandas.core.frame.DataFrame'>
Int64Index: 418 entries, 0 to 417
Data columns (total 8 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  418 non-null    int64  
 1   Pclass       418 non-null    int64  
 2   Sex          418 non-null    int64  
 3   Age          418 non-null    float64
 4   SibSp        418 non-n

## Modeling, improving models with GridSearchCV

In [None]:
# Set X and y
# Build target and features from the cleaned frame explicitly, instead of
# relying on cleanData having modified `train` in place
y = clean_train['Survived']
X = pd.get_dummies(clean_train.drop('Survived', axis=1))
# NOTE(review): X still contains PassengerId, which is a row identifier rather
# than a passenger attribute - consider dropping it (and from the test features too)

# Hold out 20% of the rows for validation; fixed seed keeps the split reproducible
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

 ### RandomForestClassifier

In [None]:
# Hyper-parameter grid for the random forest (5 * 6 * 7 * 4 = 840 combinations)
parameters = {
    'n_estimators': [5, 10, 15, 20, 25],
    'max_depth': [3, 5, 7, 9, 11, 13],
    'min_samples_leaf': range(1, 8),
    'min_samples_split': range(2, 10, 2),
}

# A fixed random_state makes the forest, and therefore the selected parameters
# and the reported score, reproducible across runs. n_jobs=-1 evaluates the
# candidates in parallel without changing the result.
model1 = GridSearchCV(
    RandomForestClassifier(random_state=42),
    parameters,
    cv=5,
    scoring='accuracy',
    n_jobs=-1,
)
model1.fit(X_train, y_train)
print(f'Best parameters {model1.best_params_}')
print(f'Mean cross-validated accuracy score of the best_estimator: {model1.best_score_:.3f}')

Best parameters {'max_depth': 13, 'min_samples_leaf': 2, 'min_samples_split': 8, 'n_estimators': 25}
Mean cross-validated accuracy score of the best_estimator: 0.842


### GradientBoostingClassifier

In [None]:
# Hyper-parameter grid for scikit-learn's gradient boosting (3 * 3 * 3 * 3 = 81 combinations)
parameters = {
    'learning_rate': [0.01, 0.02, 0.03],
    'subsample': [0.9, 0.5, 0.2],
    'n_estimators': [100, 500, 1000],
    'max_depth': [4, 6, 8],
}

# Tune GradientBoostingClassifier itself: this section previously searched over
# a CatBoostClassifier, so the model named in the heading was never tuned.
# random_state is fixed because subsample < 1 makes the fit stochastic.
model2 = GridSearchCV(
    GradientBoostingClassifier(random_state=42),
    parameters,
    cv=5,
    scoring='accuracy',
)
model2.fit(X_train, y_train)
print(f'Best parameters {model2.best_params_}')
print(f'Mean cross-validated accuracy score of the best_estimator: {model2.best_score_:.3f}')

Best parameters {'learning_rate': 0.01, 'max_depth': 8, 'n_estimators': 1000, 'subsample': 0.5}
Mean cross-validated accuracy score of the best_estimator: 0.832


### CatBoostClassifier

In [None]:
# Candidate CatBoost settings: boosting rounds, step size and tree depth
parameters = dict(
    iterations=[5, 10, 15, 20, 25, 50, 100],
    learning_rate=[0.01, 0.05, 0.1],
    depth=[3, 5, 7, 9, 11, 13],
)

# 5-fold cross-validated search scored by accuracy; CatBoost logging is silenced
model3 = GridSearchCV(
    cb.CatBoostClassifier(verbose=False),
    parameters,
    cv=5,
    scoring='accuracy',
)
model3.fit(X_train, y_train)

print(f'Best parameters {model3.best_params_}')
print(f'Mean cross-validated accuracy score of the best_estimator: {model3.best_score_:.3f}')

Best parameters {'depth': 11, 'iterations': 100, 'learning_rate': 0.1}
Mean cross-validated accuracy score of the best_estimator: 0.829


### LGBMClassifier

In [None]:
# Candidate LightGBM settings: number of trees, step size and leaves per tree
parameters = dict(
    n_estimators=[5, 10, 15, 20, 25, 50, 100],
    learning_rate=[0.01, 0.05, 0.1],
    num_leaves=[7, 15, 31],
)

# 5-fold cross-validated search scored by accuracy
model4 = GridSearchCV(
    lgbm.LGBMClassifier(),
    parameters,
    cv=5,
    scoring='accuracy',
)
model4.fit(X_train, y_train)

print(f'Best parameters {model4.best_params_}')
print(f'Mean cross-validated accuracy score of the best_estimator: {model4.best_score_:.3f}')

Best parameters {'learning_rate': 0.1, 'n_estimators': 20, 'num_leaves': 7}
Mean cross-validated accuracy score of the best_estimator: 0.820


In [None]:
# Function for Fitting and Predicting Models
def fitAndPredict(model):
    """Fit `model` on the training split and return its validation accuracy.

    Uses the notebook-level X_train / y_train for fitting and X_val / y_val
    for scoring. The model passed in is fitted in place.
    """
    model.fit(X_train, y_train)
    return accuracy_score(y_val, model.predict(X_val))

In [None]:
# Final candidates. The random forest and LightGBM use the parameters found by
# the grid searches above; random_state is fixed on the scikit-learn models so
# the printed accuracy and the stored predictions are reproducible.
model1 = RandomForestClassifier(max_depth=13, min_samples_leaf=2,
                                min_samples_split=8, n_estimators=25,
                                random_state=42)
# NOTE(review): model2 and model3 run with library defaults rather than tuned
# parameters - confirm this is intentional.
model2 = GradientBoostingClassifier(random_state=42)
model3 = CatBoostClassifier(verbose=False)
model4 = lgbm.LGBMClassifier(learning_rate=0.1, n_estimators=20, num_leaves=7)

models = [model1, model2, model3, model4]

# Fit every model exactly once (inside fitAndPredict) and report validation accuracy
for i, model in enumerate(models, start=1):
    print("Model ", i, ":", model)
    print("ACC: ", fitAndPredict(model))

# Validation predictions taken from the same fitted models that were scored above
prediction1, prediction2, prediction3, prediction4 = (
    fitted.predict(X_val) for fitted in models
)

Model  1 : RandomForestClassifier(max_depth=13, min_samples_leaf=2, min_samples_split=8,
                       n_estimators=25)
ACC:  0.8163265306122449
Model  2 : GradientBoostingClassifier()
ACC:  0.8299319727891157
Model  3 : <catboost.core.CatBoostClassifier object at 0x7f6200cc3310>
ACC:  0.8129251700680272
Model  4 : LGBMClassifier(n_estimators=20, num_leaves=7)
ACC:  0.826530612244898


## Ensemble of Models

In [None]:
# Сreating a list and adding models
# Named base models for the ensemble, configured like the individual models above;
# random_state is fixed on the scikit-learn models for reproducibility
estimator = [
    ('RFC', RandomForestClassifier(max_depth=13, min_samples_leaf=2,
                                   min_samples_split=8, n_estimators=25,
                                   random_state=42)),
    ('GBC', GradientBoostingClassifier(random_state=42)),
    ('CBC', CatBoostClassifier(verbose=False)),
    ('LGBM', lgbm.LGBMClassifier(learning_rate=0.1, n_estimators=20, num_leaves=7)),
]

# Voting classifier with soft voting (voting='soft'), scored on the validation split
soft_voting = VotingClassifier(estimators=estimator, voting='soft')
soft_voting.fit(X_train, y_train)
prediction = soft_voting.predict(X_val)
score = accuracy_score(y_val, prediction)
print(score)



0.826530612244898
