## Kaggle Titanic compitation

In [3]:
#importing important libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn import metrics
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn import tree
from sklearn.model_selection import cross_val_score
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, VotingClassifier

### Data Cleaning

In [4]:
test_data = pd.read_csv('test.csv')
train_data = pd.read_csv('train.csv')

In [5]:
train_data.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [6]:
train_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB


In [7]:
train_data.isnull().sum()

Unnamed: 0,0
PassengerId,0
Survived,0
Pclass,0
Name,0
Sex,0
Age,177
SibSp,0
Parch,0
Ticket,0
Fare,0


In [8]:
full_data = [train_data, test_data]

In [9]:
# Feature Engineering
# Extract titles from names
for dataset in full_data:
    dataset['Title'] = dataset['Name'].str.extract(' ([A-Za-z]+)\.', expand=False)

# Replace titles with more common groups
for dataset in full_data:
    dataset['Title'] = dataset['Title'].replace(['Lady', 'Countess', 'Capt', 'Col', 'Don', 'Dr', 'Major', 'Rev', 'Sir', 'Jonkheer', 'Dona'], 'Rare')
    dataset['Title'] = dataset['Title'].replace('Mlle', 'Miss')
    dataset['Title'] = dataset['Title'].replace('Ms', 'Miss')
    dataset['Title'] = dataset['Title'].replace('Mme', 'Mrs')

# Map titles to numeric values
title_mapping = {'Mr': 1, 'Miss': 2, 'Mrs': 3, 'Master': 4, 'Rare': 5}
for dataset in full_data:
    dataset['Title'] = dataset['Title'].map(title_mapping)
    dataset['Title'] = dataset['Title'].fillna(0)

In [10]:
# Filling Null Values
for dataset in full_data:
  dataset['Embarked'] = dataset['Embarked'].fillna(method = 'ffill')
test_data['Fare'].fillna(test_data['Fare'].dropna().median(), inplace=True)

# Fill missing Age data using median grouped by Title and Pclass
train_data['Age'].fillna(train_data.groupby('Title')['Age'].transform('median'), inplace=True)
test_data['Age'].fillna(test_data.groupby('Title')['Age'].transform('median'), inplace=True)


  dataset['Embarked'] = dataset['Embarked'].fillna(method = 'ffill')


In [11]:
train_data.isnull().sum()

Unnamed: 0,0
PassengerId,0
Survived,0
Pclass,0
Name,0
Sex,0
Age,0
SibSp,0
Parch,0
Ticket,0
Fare,0


In [12]:
# Adding new Features(columns)
for dataset in full_data:
  dataset['FamilySize'] = dataset['SibSp'] + dataset['Parch'] + 1

In [13]:
# Creating Age bands
for dataset in full_data:
    dataset['AgeBin'] = pd.cut(dataset['Age'], bins=[0, 12, 18, 30, 50, 80], labels=[0, 1, 2, 3, 4])

# Creating Fare bands
for dataset in full_data:
    dataset['FareBin'] = pd.qcut(dataset['Fare'], 4, labels=[0, 1, 2, 3])

embarked_mapping = {'S': 0, 'C': 1, 'Q': 2}
for dataset in full_data:
    dataset['Embarked'] = dataset['Embarked'].map(embarked_mapping)

# Encoding our Categorical data
label_encoder = LabelEncoder()
for dataset in full_data:
    dataset['Sex'] = label_encoder.fit_transform(dataset['Sex'])

In [14]:
# Droping unnecessary columns
drop_columns = ['PassengerId', 'Name', 'Ticket', 'Cabin', 'SibSp', 'Parch', 'Age', 'Fare']
train_data = train_data.drop(drop_columns, axis=1)
test_data = test_data.drop(drop_columns, axis=1)

In [15]:
X_train = train_data.drop('Survived', axis=1)
y_train = train_data['Survived']
X_test = test_data.copy()

In [16]:
# Standard Scaler
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

In [17]:
train_data.corrwith(train_data['Survived'])

Unnamed: 0,0
Survived,1.0
Pclass,-0.338481
Sex,-0.543351
Embarked,0.113083
Title,0.407753
FamilySize,0.016639
AgeBin,-0.046557
FareBin,0.299357


### Creating Model

In [18]:
# Decision Tree
decision_tree = DecisionTreeClassifier()
decision_tree.fit(X_train, y_train)
y_predict = decision_tree.predict(X_test)

In [19]:
# Evaluating using cross-validation
scores = cross_val_score(decision_tree, X_train, y_train, cv=5, scoring='accuracy')
print(f'Cross-validation accuracy: {scores.mean()}')

Cross-validation accuracy: 0.8181909484652564


In [20]:
#RandomForestClassifier
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
rf_model.fit(X_train, y_train)
y_pred = rf_model.predict(X_test)

In [21]:
scores = cross_val_score(rf_model, X_train, y_train, cv=5, scoring='accuracy')
print(f'Cross-validation accuracy: {scores.mean()}')

Cross-validation accuracy: 0.8216056744711568


In [22]:
# Model Building - Random Forest and Gradient Boosting
rf = RandomForestClassifier(n_estimators=100, random_state=42)
gb = GradientBoostingClassifier(n_estimators=100, random_state=42)
# Voting Classifier for Ensembling
voting_clf = VotingClassifier(estimators=[('rf', rf), ('gb', gb)], voting='soft')

# Train the model
voting_clf.fit(X_train, y_train)

# Evaluate using cross-validation
scores = cross_val_score(voting_clf, X_train, y_train, cv=5, scoring='accuracy')
print(f'Cross-validation accuracy: {scores.mean()}')

# Make predictions
predictions = voting_clf.predict(X_test)

# Prepare submission
submission = pd.read_csv('gender_submission.csv')
submission['Survived'] = predictions
submission.to_csv('submission.csv', index=False)

Cross-validation accuracy: 0.8238277572029377


In [25]:
from sklearn.model_selection import RandomizedSearchCV
param_dist = {'n_estimators': [100, 200], 'max_depth': [None, 10, 20]}
randomized_search = RandomizedSearchCV(estimator=RandomForestClassifier(), param_distributions=param_dist, n_iter=10, cv=5)
randomized_search.fit(X_train, y_train)
best_model = randomized_search.best_estimator_





In [26]:
scores = cross_val_score(randomized_search, X_train, y_train, cv=5, scoring='accuracy')
print(f'Cross-validation accuracy: {scores.mean()}')



Cross-validation accuracy: 0.8283033080158182


In [36]:
# Random Forests
from sklearn.ensemble import RandomForestClassifier
random_forest = RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features=None, max_leaf_nodes=None, # Changed max_features to None (default)
            min_impurity_decrease=0.0,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=400, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)
random_forest.fit(X_train, y_train)
Y_pred_rf = random_forest.predict(X_test)
random_forest.score(X_train, y_train)
acc_random_forest = round(random_forest.score(X_train, y_train) * 100, 2)

print("Important features")
print('__'*30)
print(acc_random_forest)

Important features
____________________________________________________________
89.34


In [38]:
# Prepare submission
submission = pd.read_csv('gender_submission.csv')
submission['Survived'] = Y_pred_rf
submission.to_csv('submission1.csv', index=False)

In [39]:
df = pd.read_csv('submissionx.csv')
df['Survived'].value_counts()

Unnamed: 0_level_0,count
Survived,Unnamed: 1_level_1
0,256
1,162
