In [1]:
import pandas as pd
import numpy as np
test_path = "/Users/ataberkcinetci/Desktop/titanic/test.csv"
train_path = "/Users/ataberkcinetci/Desktop/titanic/train.csv"

In [2]:
import numpy as np
test = pd.read_csv(test_path)
train = pd.read_csv(train_path)

In [3]:
full = pd.concat([train, test], sort=False, ignore_index=True)
full.isna().sum()

PassengerId       0
Survived        418
Pclass            0
Name              0
Sex               0
Age             263
SibSp             0
Parch             0
Ticket            0
Fare              1
Cabin          1014
Embarked          2
dtype: int64

In [4]:
full["Name"]

0                                 Braund, Mr. Owen Harris
1       Cumings, Mrs. John Bradley (Florence Briggs Th...
2                                  Heikkinen, Miss. Laina
3            Futrelle, Mrs. Jacques Heath (Lily May Peel)
4                                Allen, Mr. William Henry
                              ...                        
1304                                   Spector, Mr. Woolf
1305                         Oliva y Ocana, Dona. Fermina
1306                         Saether, Mr. Simon Sivertsen
1307                                  Ware, Mr. Frederick
1308                             Peter, Master. Michael J
Name: Name, Length: 1309, dtype: object

In [5]:
full['Title'] = full['Name'].str.extract(r' ([A-Za-z]+)\.', expand=False)
full['Title'].unique()

array(['Mr', 'Mrs', 'Miss', 'Master', 'Don', 'Rev', 'Dr', 'Mme', 'Ms',
       'Major', 'Lady', 'Sir', 'Mlle', 'Col', 'Capt', 'Countess',
       'Jonkheer', 'Dona'], dtype=object)

In [6]:
full['Title'] = full['Title'].replace(['Lady', 'Countess','Capt', 'Col',\
                                       'Don', 'Dr', 'Major', 'Rev', 'Sir',\
                                       'Jonkheer', 'Dona'], 'Rare')
full['Title'].unique()

array(['Mr', 'Mrs', 'Miss', 'Master', 'Rare', 'Mme', 'Ms', 'Mlle'],
      dtype=object)

In [7]:
title_mapping = {'Mr':1, 'Miss':2, 'Mrs':3, 'Master':4, 'Rare':5}
full['Title'] = full['Title'].map(title_mapping)
full['Title'] = full['Title'].fillna(0)

In [8]:
full['Title'].isna().sum()

0

In [9]:
full['Sex'] = full['Sex'].map({'male': 0, 'female': 1})

In [10]:
full['Embarked'] = full['Embarked'].fillna(full['Embarked'].mode()[0])
full['Embarked'] = full['Embarked'].map({'S': 0, 'C': 1, 'Q': 2})

In [11]:
full['Fare'] = full['Fare'].fillna(full['Fare'].median())

In [12]:
full['Age'] = full['Age'].fillna(full['Age'].median())

In [13]:
full['FamilySize'] = full['SibSp'] + full['Parch'] + 1
full['IsAlone'] = 0
full.loc[full['FamilySize'] == 1, 'IsAlone'] = 1

In [14]:
full['AgeBin'] = pd.cut(full['Age'], bins=[0, 12, 20, 40, 60, 80], labels=[0, 1, 2, 3, 4])
full['AgeBin'] = full['AgeBin'].astype(float)


In [15]:
full['FareBin'] = pd.qcut(full['Fare'], 4, labels=[0, 1, 2, 3])
full['FareBin'] = full['FareBin'].astype(float)

In [16]:
features = ['Pclass', 'Sex', 'Age', 'Fare', 'Embarked',
            'Title', 'FamilySize', 'IsAlone', 'AgeBin',]

In [17]:
y_train = train['Survived']
X_train = full.loc[:len(train)-1, features]
X_test = full.loc[len(train):, features]

In [18]:
from sklearn.ensemble import RandomForestClassifier
model = RandomForestClassifier(n_estimators=100, random_state=42)
model.fit(X_train, y_train)

In [19]:
predictions = model.predict(X_test)

In [20]:
output = pd.DataFrame({'PassengerId': test.PassengerId, 'Survived': predictions})
output.to_csv('submission_updated.csv', index=False)

print("Done! Your submission.csv is ready.")

Done! Your submission.csv is ready.


## with random forest classifier we get 0.75 accuracy nothing significantly changes now we use hyperparameter tunning


In [22]:
from sklearn.model_selection import GridSearchCV
param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [3, 4, 5, 6, None],
    'min_samples_split': [2, 4, 8],
    'min_samples_leaf': [1, 2, 4],
    'max_features': ['sqrt', 'log2']
}

In [23]:
rf = RandomForestClassifier(random_state=42)
grid_search = GridSearchCV(
    estimator=rf,
    param_grid=param_grid,
    cv=5,            # 5-fold cross-validation
    n_jobs=-1,       # Use all CPUs
    scoring='accuracy'
)
grid_search.fit(X_train, y_train)

In [24]:
print("Best parameters:", grid_search.best_params_)
print("Best cross-validation score:", grid_search.best_score_)


Best parameters: {'max_depth': None, 'max_features': 'sqrt', 'min_samples_leaf': 2, 'min_samples_split': 2, 'n_estimators': 100}
Best cross-validation score: 0.8395267089322704


In [25]:
best_rf = grid_search.best_estimator_
predictions = best_rf.predict(X_test)

output = pd.DataFrame({'PassengerId': test['PassengerId'], 'Survived': predictions})
output.to_csv('submission.csv', index=False)


## now it increased to 0.77272

## we try different models starting with Logistic Regression

In [42]:
from sklearn.linear_model import LogisticRegression

lr = LogisticRegression(max_iter=1000, random_state=42)
lr.fit(X_train, y_train)
predictions = lr.predict(X_test)

output = pd.DataFrame({'PassengerId': test['PassengerId'], 'Survived': predictions})
output.to_csv('submission.csv', index=False)

##accuracy is 0.79425 after using logistic regression

In [29]:
import xgboost as xgb

xgb_model = xgb.XGBClassifier(n_estimators=100, max_depth=4, random_state=42, eval_metric='logloss')
xgb_model.fit(X_train, y_train)
predictions = xgb_model.predict(X_test)

output = pd.DataFrame({'PassengerId': test['PassengerId'], 'Survived': predictions})
output.to_csv('submission.csv', index=False)

##accuracy downed to 0.7606 after using XGBoost

In [36]:
import lightgbm as lgb

lgb_model = lgb.LGBMClassifier(n_estimators=100, max_depth=4, random_state=42)
lgb_model.fit(X_train, y_train)
predictions = lgb_model.predict(X_test)

output = pd.DataFrame({'PassengerId': test['PassengerId'], 'Survived': predictions})
output.to_csv('submission.csv', index=False)

##accuracy is 0.7694 after using LightGBM


[LightGBM] [Info] Number of positive: 342, number of negative: 549
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001029 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 230
[LightGBM] [Info] Number of data points in the train set: 891, number of used features: 9
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.383838 -> initscore=-0.473288
[LightGBM] [Info] Start training from score -0.473288


In [40]:
from sklearn.svm import SVC

svc = SVC(kernel='rbf', probability=True, random_state=42)
svc.fit(X_train, y_train)
predictions = svc.predict(X_test)

output = pd.DataFrame({'PassengerId': test['PassengerId'], 'Survived': predictions})
output.to_csv('submission.csv', index=False)


##accuracy is downed to 0.66