In [23]:
import pandas as pd

In [24]:
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')

In [25]:
train['Title'] = train['Name'].str.extract(r' ([A-Za-z]+)\.', expand=False)
test['Title'] = test['Name'].str.extract(r' ([A-Za-z]+)\.', expand=False)
train['Title'] = train['Title'].replace({'Mlle': 'Miss',
                                   'Ms': 'Miss',
                                   'Mme': 'Mrs'})
test['Title'] = test['Title'].replace({'Mlle': 'Miss',
                                   'Ms': 'Miss',
                                   'Mme': 'Mrs'})
rare_titles = ['Lady', 'Countess', 'Capt', 'Don', 'Sir', 'Jonkheer']

train['Title'] = train['Title'].replace(rare_titles, 'Rare')
test['Title'] = test['Title'].replace(rare_titles, 'Rare')

In [26]:
train['TicketPrefix'] = train['Ticket'].str.extract(r'([A-Za-z\.\/]+)')
test['TicketPrefix'] = test['Ticket'].str.extract(r'([A-Za-z\.\/]+)')
train['TicketPrefix'] = train['TicketPrefix'].fillna('NoPrefix')
test['TicketPrefix'] = test['TicketPrefix'].fillna('NoPrefix')
train['TicketPrefix'] = train['TicketPrefix'].apply(lambda x:x if train['TicketPrefix'].value_counts()[x] > 1 else 'Rare')
test['TicketPrefix'] = test['TicketPrefix'].apply(lambda x:x if test['TicketPrefix'].value_counts()[x] > 1 else 'Rare')

In [27]:
ticket_counts = train['Ticket'].value_counts()
ticket_counts1 = test['Ticket'].value_counts()
test['TicketGroupSize'] = test['Ticket'].map(ticket_counts1)
train['TicketGroupSize'] = train['Ticket'].map(ticket_counts)

In [28]:
train['Deck'] = train['Cabin'].str[0]
test['Deck'] = test['Cabin'].str[0]

In [29]:
p_id = test['PassengerId']
test.drop(['PassengerId','Ticket','Cabin','Name'],axis=1,inplace=True)
train.drop(['PassengerId','Ticket','Cabin','Name'],axis=1,inplace=True)

In [30]:
train = pd.get_dummies(train,columns=['Sex','Title','TicketPrefix'],drop_first=True,dtype=int)
test = pd.get_dummies(test,columns=['Sex','Title','TicketPrefix'],drop_first=True,dtype=int)

In [31]:
from sklearn.preprocessing import LabelEncoder
from sklearn.impute import KNNImputer
import numpy as np

le = LabelEncoder()
l1 = LabelEncoder()

non_null_deck = train['Deck'][train['Deck'].notnull()]
non_null_em = train['Embarked'][train['Embarked'].notnull()]

le.fit(non_null_deck)
l1.fit(non_null_em)

train['Deck'] = train['Deck'].apply(lambda x: le.transform([x])[0] if pd.notnull(x) else np.nan)
test['Deck'] = test['Deck'].apply(lambda x: le.transform([x])[0] if pd.notnull(x) else np.nan)
train['Embarked'] = train['Embarked'].apply(lambda x: l1.transform([x])[0] if pd.notnull(x) else np.nan)
test['Embarked'] = test['Embarked'].apply(lambda x: l1.transform([x])[0] if pd.notnull(x) else np.nan)
feature = train.columns.tolist()
feature.remove('Survived')
feature1 = test.columns.tolist()

imputer = KNNImputer(n_neighbors=13)

train[feature] = imputer.fit_transform(train[feature])
test[feature1] = imputer.fit_transform(test[feature1])
train['Deck'] = train['Deck'].round().astype(int)
test['Deck'] = test['Deck'].round().astype(int)
train['Embarked'] = train['Embarked'].round().astype(int)
test['Embarked'] = test['Embarked'].round().astype(int)

In [32]:
train['Deck'] = train['Deck'].apply(lambda x: le.classes_[x])
test['Deck'] = test['Deck'].apply(lambda x: le.classes_[x])
train['Embarked'] = train['Embarked'].apply(lambda x: l1.classes_[x])
test['Embarked'] = test['Embarked'].apply(lambda x: l1.classes_[x])

In [33]:
train = pd.get_dummies(train,columns=['Deck','Embarked'],drop_first=True,dtype=float)
test = pd.get_dummies(test,columns=['Deck','Embarked'],drop_first=True,dtype=float)

In [34]:
train['FamilySize'] = train['SibSp'] + train['Parch'] + 1
test['FamilySize'] = test['SibSp'] + test['Parch'] + 1
train['IsAlone'] = 0
test['IsAlone'] = 0
train.loc[train['FamilySize'] == 1, 'IsAlone'] = 1
test.loc[test['FamilySize'] == 1, 'IsAlone'] = 1

In [35]:
X_train = train.drop('Survived', axis=1)
y_train = train['Survived']

In [36]:
test = test.reindex(columns=X_train.columns, fill_value=0)

In [37]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
X_train[['Age','Fare']] = scaler.fit_transform(X_train[['Age','Fare']])
test[['Age','Fare']] = scaler.transform(test[['Age','Fare']])

In [38]:
from sklearn.ensemble import RandomForestClassifier

In [39]:
# Random Forest
forest_clf = RandomForestClassifier(random_state=42)
forest_clf.fit(X_train, y_train)


In [40]:
predictions = forest_clf.predict(test).astype(int)

In [41]:
submission = pd.DataFrame({
    "PassengerId": p_id,
    "Survived": predictions
})

submission.to_csv("titanic_predictions.csv", index=False)


In [42]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

X_tr, X_val, y_tr, y_val = train_test_split(X_train, y_train, test_size=0.2, random_state=42)

forest_clf.fit(X_tr, y_tr)
val_pred = forest_clf.predict(X_val)
print("Validation Accuracy:", accuracy_score(y_val, val_pred))


Validation Accuracy: 0.8324022346368715
