In [1]:
import pandas as pd
import numpy as np
import sklearn 
import tensorflow as tf
import matplotlib as mlt


In [2]:
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score


In [3]:
import xgboost 

In [4]:
df = pd.read_csv(r"train.csv")
df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


Definitions

| Variable  | Definition                                           | Key                             |
|-----------|------------------------------------------------------|---------------------------------|
| survival  | Survival                                             | 0 = No, 1 = Yes                 |
| pclass    | Ticket class                                         | 1 = 1st, 2 = 2nd, 3 = 3rd       |
| sex       | Sex                                                  |                                 |
| Age       | Age in years                                         |                                 |
| sibsp     | # of siblings / spouses aboard the Titanic           |                                 |
| parch     | # of parents / children aboard the Titanic           |                                 |
| ticket    | Ticket number                                        |                                 |
| fare      | Passenger fare                                       |                                 |
| cabin     | Cabin number                                         |                                 |
| embarked  | Port of Embarkation                                  | C = Cherbourg, Q = Queenstown, S = Southampton |

Survival is the target variable


In [5]:
df.dtypes

PassengerId      int64
Survived         int64
Pclass           int64
Name            object
Sex             object
Age            float64
SibSp            int64
Parch            int64
Ticket          object
Fare           float64
Cabin           object
Embarked        object
dtype: object

In [6]:
df1 = df.copy() 

#Giving numerical values to categorical data
df1['Sex'] = df1['Sex'].map({'male': 1, 'female': 0})
df1['Embarked'] = df1['Embarked'].map({'C':0,'S':1,'Q':2}).astype('Int64')

# Names of passengers doesnt seem relevant to their survival, ticket name also seems irrelevant but Ill check with other submissions later
df1.drop(columns=['Name', 'Ticket'], inplace=True)


In [7]:
df1.head(2)

Unnamed: 0,PassengerId,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Cabin,Embarked
0,1,0,3,1,22.0,1,0,7.25,,1
1,2,1,1,0,38.0,1,0,71.2833,C85,0


In [8]:
# Null values now

df.isna().sum()

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64

In [9]:
# Since 70+ % of values in cabin column is null, Ill drop it and ill fill ages with mean values of men and women respectively

avg_age_men = df1[df1['Sex']==1]['Age'].mean() 
avg_age_women = df1[df1['Sex']==0]['Age'].mean() 
print("Average age of men: ", avg_age_men)
print("Average age of women: ", avg_age_women)

df1.loc[df1['Sex'] == 1, 'Age'] = df1.loc[df1['Sex'] == 1, 'Age'].fillna(avg_age_men)
df1.loc[df1['Sex'] == 0, 'Age'] = df1.loc[df1['Sex'] == 0, 'Age'].fillna(avg_age_women)
df1.drop(columns=['Cabin'], inplace=True)


Average age of men:  30.72664459161148
Average age of women:  27.915708812260537


In [10]:
df1.dropna(inplace=True) #deleting the 2 rows with missing embarked values
df1.isna().sum()

PassengerId    0
Survived       0
Pclass         0
Sex            0
Age            0
SibSp          0
Parch          0
Fare           0
Embarked       0
dtype: int64

In [11]:
df1.drop(columns=['PassengerId'], inplace=True) #passenger id also doesnt seem relevant here so deleting it


In [12]:
df1.dtypes

Survived      int64
Pclass        int64
Sex           int64
Age         float64
SibSp         int64
Parch         int64
Fare        float64
Embarked      Int64
dtype: object

In [13]:
#ill be evaluating performance of decision tree,random forest, XGboost, logistic regression and a simple neural network with 3 layers 

x = df1[['Pclass','Sex','Age','SibSp','Parch','Fare','Embarked']]
y = df1['Survived']

x_train , x_test , y_train, y_test = train_test_split(x,y,test_size=0.2,random_state=42)


In [14]:
models = ['Decision Tree','Random Forest','XGBoost','Logistic Regression','Custom Neural Network']


In [15]:
# Decsion Tree
model1 = DecisionTreeClassifier(random_state=42)
model1.fit(x_train,y_train)


In [16]:

preds1 = model1.predict(x_test)
ac1 = accuracy_score(y_test,preds1)
print(ac1)



0.7696629213483146


In [17]:
#Random Forest
model2 = RandomForestClassifier(n_estimators=1000,random_state=42)
model2.fit(x_train,y_train)
preds2=model2.predict(x_test)
ac2 = accuracy_score(y_test,preds2)
print(ac2)


0.7528089887640449


In [18]:
# XGBoost
model3 = xgboost.XGBClassifier(random_state=42,
                               objective='binary:logistic',
                               learning_rate=0.01,
                               n_estimators=1000,)
model3.fit(x_train,y_train)
preds3 = model3.predict(x_test)
ac3 = accuracy_score(y_test,preds3)
print(ac3)

0.8089887640449438


In [19]:
#Logistic Regression
model4 = LogisticRegression(random_state=42)
model4.fit(x_train,y_train)
preds4 = model4.predict(x_test)
ac4 = accuracy_score(y_test,preds4)
print(ac4)

0.7865168539325843


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [20]:
#Neural Network: initially ill make a network with 3 layers and 64 units in the first 2 layers, depending on performance ill optimize it

model5 = tf.keras.Sequential([
        tf.keras.layers.Dense(64,activation='relu' ),
        tf.keras.layers.Dense(64,activation='relu' ),
        tf.keras.layers.Dense(1, activation='sigmoid')
])

model5.compile(optimizer='adam',
              loss='binary_crossentropy',
              metrics=['accuracy'])


In [21]:

model5.fit(x_train, y_train, epochs=10, batch_size=32, validation_data=(x_test, y_test))

Epoch 1/10
[1m23/23[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 7ms/step - accuracy: 0.5536 - loss: 1.9159 - val_accuracy: 0.6742 - val_loss: 0.7916
Epoch 2/10
[1m23/23[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.6829 - loss: 0.7596 - val_accuracy: 0.6685 - val_loss: 0.6427
Epoch 3/10
[1m23/23[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.6593 - loss: 0.6092 - val_accuracy: 0.6292 - val_loss: 0.6205
Epoch 4/10
[1m23/23[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.6818 - loss: 0.5929 - val_accuracy: 0.6742 - val_loss: 0.5967
Epoch 5/10
[1m23/23[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.6955 - loss: 0.5848 - val_accuracy: 0.6854 - val_loss: 0.5908
Epoch 6/10
[1m23/23[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.7151 - loss: 0.5709 - val_accuracy: 0.6854 - val_loss: 0.5919
Epoch 7/10
[1m23/23[0m [32m━━━━━━━━━━

<keras.src.callbacks.history.History at 0x1c0ef552e90>

In [22]:
test_loss, test_accuracy = model5.evaluate(x_test, y_test)
print(f'Test Loss: {test_loss:.4f}')
print(f'Test Accuracy: {test_accuracy:.4f}')

[1m6/6[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1000us/step - accuracy: 0.7173 - loss: 0.6184
Test Loss: 0.6266
Test Accuracy: 0.7191


In [23]:
# New neural network with one more layer just to see
model6 = tf.keras.Sequential([
        tf.keras.layers.Dense(64,activation='relu' ),
        tf.keras.layers.Dense(128,activation='relu' ),
        tf.keras.layers.Dense(128,activation='relu' ),
        
        tf.keras.layers.Dense(1, activation='sigmoid')
])

model6.compile(optimizer='adam',
              loss='binary_crossentropy',
              metrics=['accuracy'])


In [24]:
model6.fit(x_train, y_train, epochs=10, batch_size=32, validation_data=(x_test, y_test))

Epoch 1/10
[1m23/23[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 7ms/step - accuracy: 0.6281 - loss: 0.7329 - val_accuracy: 0.6854 - val_loss: 0.5873
Epoch 2/10
[1m23/23[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.6951 - loss: 0.6020 - val_accuracy: 0.6798 - val_loss: 0.5786
Epoch 3/10
[1m23/23[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.6637 - loss: 0.6222 - val_accuracy: 0.6798 - val_loss: 0.5874
Epoch 4/10
[1m23/23[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.6948 - loss: 0.5957 - val_accuracy: 0.7303 - val_loss: 0.5659
Epoch 5/10
[1m23/23[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.7117 - loss: 0.5965 - val_accuracy: 0.7079 - val_loss: 0.5765
Epoch 6/10
[1m23/23[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.7236 - loss: 0.5843 - val_accuracy: 0.7360 - val_loss: 0.5775
Epoch 7/10
[1m23/23[0m [32m━━━━━━━━━━

<keras.src.callbacks.history.History at 0x1c0f8f803d0>

In [25]:
test_loss, test_accuracy = model6.evaluate(x_test, y_test)
print(f'Test Loss: {test_loss:.4f}')
print(f'Test Accuracy: {test_accuracy:.4f}')

[1m6/6[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - accuracy: 0.7632 - loss: 0.5994 
Test Loss: 0.6103
Test Accuracy: 0.7528


For now XGBoost has the highest accuracy, ill try tweaking the neural network for better results in future. But for now im going to use xgboost to predict test.csv


In [33]:
test_data = pd.read_csv('test.csv')
test_data.count()

PassengerId    418
Pclass         418
Name           418
Sex            418
Age            332
SibSp          418
Parch          418
Ticket         418
Fare           417
Cabin           91
Embarked       418
dtype: int64

In [28]:
#formatting the data according to model parameters
df_test = test_data.copy()
df_test['Sex'] = df_test['Sex'].map({'male': 1, 'female': 0})
df_test['Embarked'] = df_test['Embarked'].map({'C':0,'S':1,'Q':2}).astype('Int64')

In [29]:
df_test.isna().sum()

PassengerId      0
Pclass           0
Name             0
Sex              0
Age             86
SibSp            0
Parch            0
Ticket           0
Fare             1
Cabin          327
Embarked         0
dtype: int64

In [30]:


df_test.loc[df_test['Sex'] == 1, 'Age'] = df_test.loc[df_test['Sex'] == 1, 'Age'].fillna(avg_age_men)
df_test.loc[df_test['Sex'] == 0, 'Age'] = df_test.loc[df_test['Sex'] == 0, 'Age'].fillna(avg_age_women)
df_test.drop(columns=['Cabin'], inplace=True)

In [31]:
df_test.isna().sum()

PassengerId    0
Pclass         0
Name           0
Sex            0
Age            0
SibSp          0
Parch          0
Ticket         0
Fare           1
Embarked       0
dtype: int64

In [32]:
df_test.dropna()

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Embarked
0,892,3,"Kelly, Mr. James",1,34.500000,0,0,330911,7.8292,2
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",0,47.000000,1,0,363272,7.0000,1
2,894,2,"Myles, Mr. Thomas Francis",1,62.000000,0,0,240276,9.6875,2
3,895,3,"Wirz, Mr. Albert",1,27.000000,0,0,315154,8.6625,1
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",0,22.000000,1,1,3101298,12.2875,1
...,...,...,...,...,...,...,...,...,...,...
413,1305,3,"Spector, Mr. Woolf",1,30.726645,0,0,A.5. 3236,8.0500,1
414,1306,1,"Oliva y Ocana, Dona. Fermina",0,39.000000,0,0,PC 17758,108.9000,0
415,1307,3,"Saether, Mr. Simon Sivertsen",1,38.500000,0,0,SOTON/O.Q. 3101262,7.2500,1
416,1308,3,"Ware, Mr. Frederick",1,30.726645,0,0,359309,8.0500,1


In [34]:
x2 = df_test[['Pclass','Sex','Age','SibSp','Parch','Fare','Embarked']]
ids = df_test["PassengerId"]

predictions =model3.predict(x2)


In [35]:

results = pd.DataFrame(ids,columns=["PassengerId"])
results['Survived'] = predictions

In [36]:
results.to_csv('submission1.csv',index=False)