In [30]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split, cross_val_score,GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, roc_auc_score, roc_curve
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.svm import SVC
from sklearn.ensemble import AdaBoostClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import VotingClassifier


In [31]:
data = pd.read_csv('train.csv')
result = data['Survived']
data.drop('PassengerId', axis=1, inplace=True)
data.drop('Survived', axis=1, inplace=True)
data.drop('Ticket', axis=1, inplace=True)

data['Survived'] = result
X = data.iloc[:, :-1]
y_train = data.iloc[:, -1]
test_data = pd.read_csv('test.csv')
test_data.drop('PassengerId', axis=1, inplace=True)
X_test = test_data.iloc[:, :]

# Process the 'Name' feature
X['Name'] = X['Name'].apply(lambda x: x.split(", ")[1][:4])

# Handle missing values in 'Cabin'
X['Cabin'] = X['Cabin'].fillna('Missing')
X['Cabin'] = X['Cabin'].apply(lambda x: x[0] if x != 'Missing' else 'M')

# Handle missing values in 'Ticket' (if needed)
# X['Ticket'] = X['Ticket'].fillna(np.nan)

# Check the updated dataframe


In [32]:
categorical_features = ['Name','Sex','Embarked','Cabin']
imputer = SimpleImputer(strategy='most_frequent')
X.loc[:,categorical_features] = imputer.fit_transform(X[categorical_features])
X_test.loc[:,categorical_features] = imputer.transform(X_test[categorical_features])
imputer = SimpleImputer(strategy='median')
X.loc[:,'Age'] = imputer.fit_transform(X[['Age']])
X_test.loc[:,'Age'] = imputer.transform(X_test[['Age']])
X.loc[:,'Fare'] = imputer.fit_transform(X[['Fare']])
X_test.loc[:,'Fare'] = imputer.transform(X_test[['Fare']])
X = X.copy()
X_test = X_test.copy()
X.loc[:,'FamilySize'] = X.loc[:,'SibSp'] + X.loc[:,'Parch'] + 1
X.loc[:,'IsAlone'] = (X.loc[:,'FamilySize']==1).astype(float)
X_test.loc[:,'FamilySize'] = X_test.loc[:,'SibSp'] + X_test.loc[:,'Parch'] + 1
X_test.loc[:,'IsAlone'] = (X_test.loc[:,'FamilySize']==1).astype(float)
X = pd.get_dummies(X,columns=['Sex','Embarked','Name','Cabin'])
X_test = pd.get_dummies(X_test,columns=['Sex','Embarked','Name','Cabin'])
X_test = X_test.reindex(columns=X.columns,fill_value=0)




In [33]:
scalar = StandardScaler()
numerical_features = ['Age', 'Fare', 'SibSp', 'Parch','FamilySize']
X[numerical_features] = X[numerical_features].astype(float)
X_test[numerical_features] = X_test[numerical_features].astype(float)
X.loc[:,numerical_features] = scalar.fit_transform(X[numerical_features])
X_test.loc[:,numerical_features] = scalar.transform(X_test[numerical_features])

In [34]:
poly = PolynomialFeatures()
X_poly = poly.fit_transform(X)
X_test_poly = poly.transform(X_test)

In [35]:
X_train, X_val, y_train, y_val = train_test_split(X_poly,y_train,test_size=0.2)

In [36]:
reg = LogisticRegression(C=1, max_iter=1000, random_state=1)
reg.fit(X_train, y_train)
y_pred_val = reg.predict(X_val)
accuracy = accuracy_score(y_val, y_pred_val)
cm = confusion_matrix(y_val, y_pred_val)
print("Logistic Regression:")
print(f" - Accuracy: {accuracy:.4f}")
print(cm)



rf = RandomForestClassifier(n_estimators=100, random_state=1)
rf.fit(X_train, y_train)
y_pred_val = rf.predict(X_val)
accuracy = accuracy_score(y_val, y_pred_val)
print("Random Forest:")
print(f" - Accuracy: {accuracy:.4f}")
cm = confusion_matrix(y_val, y_pred_val)
print(cm)


xgb = XGBClassifier(n_estimators=100, random_state=1)
xgb.fit(X_train, y_train)
y_pred_val = xgb.predict(X_val)
accuracy = accuracy_score(y_val, y_pred_val)
print("XGBoost:")
print(f" - Accuracy: {accuracy:.4f}")
cm = confusion_matrix(y_val, y_pred_val)
print(cm)

svm = SVC(probability=True, random_state=1)
svm.fit(X_train, y_train)
y_pred_val = svm.predict(X_val)
accuracy = accuracy_score(y_val, y_pred_val)
print(f"SVM Accuracy: {accuracy:.4f}")
cm = confusion_matrix(y_val, y_pred_val)
print(cm)


ada = AdaBoostClassifier(n_estimators=100, random_state=1)
ada.fit(X_train, y_train)
y_pred_val = ada.predict(X_val)
accuracy = accuracy_score(y_val, y_pred_val)
print(f"AdaBoost Accuracy: {accuracy:.4f}")
cm = confusion_matrix(y_val, y_pred_val)
print(cm)




knn = KNeighborsClassifier(n_neighbors=5)
knn.fit(X_train, y_train)
y_pred_val = knn.predict(X_val)
accuracy = accuracy_score(y_val, y_pred_val)
print(f"KNN Accuracy: {accuracy:.4f}")
cm = confusion_matrix(y_val, y_pred_val)
print(cm)


mlp = MLPClassifier(hidden_layer_sizes=(100,), max_iter=500, random_state=1)
mlp.fit(X_train, y_train)
y_pred_val = mlp.predict(X_val)
accuracy = accuracy_score(y_val, y_pred_val)
print(f"MLP Accuracy: {accuracy:.4f}")
cm = confusion_matrix(y_val, y_pred_val)
print(cm)




voting_clf = VotingClassifier(estimators=[
    ('lr', LogisticRegression(C=1, max_iter=1000, random_state=1)),
    ('rf', RandomForestClassifier(n_estimators=100, random_state=1)),
    ('xgb', XGBClassifier(n_estimators=100, random_state=1))
], voting='soft')

voting_clf.fit(X_train, y_train)
y_pred_val = voting_clf.predict(X_val)
accuracy = accuracy_score(y_val, y_pred_val)
print(f"Voting Classifier Accuracy: {accuracy:.4f}")
cm = confusion_matrix(y_val, y_pred_val)
print(cm)

Logistic Regression:
 - Accuracy: 0.8492
[[101  11]
 [ 16  51]]
Random Forest:
 - Accuracy: 0.8603
[[100  12]
 [ 13  54]]
XGBoost:
 - Accuracy: 0.8324
[[94 18]
 [12 55]]
SVM Accuracy: 0.8715
[[99 13]
 [10 57]]




AdaBoost Accuracy: 0.8324
[[100  12]
 [ 18  49]]
KNN Accuracy: 0.8212
[[96 16]
 [16 51]]
MLP Accuracy: 0.8212
[[95 17]
 [15 52]]
Voting Classifier Accuracy: 0.8659
[[102  10]
 [ 14  53]]


In [37]:
y_pred = svm.predict(X_test_poly)

In [38]:
result = pd.DataFrame(
    {
        "PassengerId":list(range(892,892+len(y_pred))),
        "Survived" : y_pred
    }
    )
result.to_csv("submission.csv",index=False)