In [736]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, roc_auc_score, roc_curve
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier

In [737]:
# Load the training and test CSVs and select the model inputs by column position.
# NOTE(review): the two position lists must pick the same seven columns from both
# files; later cells address 'Sex', 'Age', 'SibSp', 'Parch', 'Fare' and 'Embarked'
# by name -- confirm the positions against the CSV headers.
data = pd.read_csv('train.csv')
# .copy() detaches the selection from `data`, so the .loc assignments in later
# cells write to an independent frame instead of triggering pandas'
# SettingWithCopyWarning.
X = data.iloc[:,[2,4,5,6,7,9,11]].copy()
# Target labels: column at position 1 of train.csv.
y_train = data.iloc[:,1]
test_data = pd.read_csv('test.csv')
X_test = test_data.iloc[:,[1,3,4,5,6,8,10]].copy()



In [738]:
# Clean and encode the features: impute missing values, derive FamilySize/IsAlone,
# one-hot encode 'Sex' and 'Embarked', and align the test columns with the
# training columns.
# NOTE(review): the imputers are fitted on every row of X, including rows that a
# later cell moves into the validation split.

# Take independent copies BEFORE the first assignment so every write below lands
# on a frame this cell owns; copying after the .loc writes is too late to avoid
# SettingWithCopyWarning.
X = X.copy()
X_test = X_test.copy()

# Categorical columns: replace missing entries with the literal label 'Missing'.
categorical_features = X.select_dtypes(include=['object','category']).columns
imputer = SimpleImputer(strategy='constant',fill_value='Missing')
X.loc[:,categorical_features] = imputer.fit_transform(X[categorical_features])
X_test.loc[:,categorical_features] = imputer.transform(X_test[categorical_features])

# Numeric columns: fill gaps with the mean of that column in X; X_test reuses the
# mean learned from X. The imputer returns an (n, 1) array, so it is flattened
# before being assigned to a single column. The imputer is refitted per column,
# so after the loop it holds the 'Fare' statistics.
imputer = SimpleImputer(strategy='mean')
for column in ('Age', 'Fare'):
    X.loc[:,column] = imputer.fit_transform(X[[column]]).ravel()
    X_test.loc[:,column] = imputer.transform(X_test[[column]]).ravel()

# Derived features: FamilySize = SibSp + Parch + 1; IsAlone is 1.0 exactly when
# FamilySize equals 1, otherwise 0.0.
for frame in (X, X_test):
    frame['FamilySize'] = frame['SibSp'] + frame['Parch'] + 1
    frame['IsAlone'] = (frame['FamilySize'] == 1).astype(float)

# One-hot encode the two categorical columns and cast everything to float.
X = pd.get_dummies(X,columns=['Sex','Embarked']).astype(float)
X_test = pd.get_dummies(X_test,columns=['Sex','Embarked']).astype(float)
# Give the test frame exactly the training columns in the same order; columns
# absent from the test frame are added and filled with 0.
X_test = X_test.reindex(columns=X.columns,fill_value=0)




In [739]:
# Standardise the numeric columns: the scaler is fitted on X only, and that same
# fitted transform is then applied to both X and X_test.
# NOTE(review): X still contains the rows that a later cell holds out for
# validation, so the scaler statistics include them.
numerical_features = ['Age', 'Fare', 'SibSp', 'Parch', 'FamilySize']
scalar = StandardScaler().fit(X[numerical_features])
X.loc[:,numerical_features] = scalar.transform(X[numerical_features])
X_test.loc[:,numerical_features] = scalar.transform(X_test[numerical_features])

In [740]:
# Expand both frames with PolynomialFeatures at its default settings; the
# expansion is fitted on X and reused unchanged for X_test.
poly = PolynomialFeatures().fit(X)
X_poly, X_test_poly = (poly.transform(frame) for frame in (X, X_test))

In [741]:
# Hold out 20% of the rows for validation, with a fixed seed for reproducibility.
# The labels are read from `data` rather than from `y_train`: this statement
# rebinds `y_train` to the 80% training portion, so feeding `y_train` back in
# would fail with a sample-count mismatch the second time the cell is executed.
X_train, X_val, y_train, y_val = train_test_split(X_poly,data.iloc[:,1],test_size=0.2,random_state=1)

In [742]:
def evaluate_classifier(label, model, X_fit, y_fit, X_eval, y_eval):
    """Fit `model`, score it on the evaluation split, and print a report.

    Parameters
    ----------
    label : str
        Heading printed above the metrics.
    model : estimator
        Classifier exposing ``fit``, ``predict`` and ``predict_proba``.
    X_fit, y_fit
        Features and labels used for fitting.
    X_eval, y_eval
        Features and labels used for scoring.

    Returns
    -------
    tuple
        ``(predictions, accuracy, roc_auc)`` computed on the evaluation split.
    """
    model.fit(X_fit, y_fit)
    predictions = model.predict(X_eval)
    accuracy = accuracy_score(y_eval, predictions)
    # ROC AUC is computed from column 1 of predict_proba, not from hard labels.
    roc_auc = roc_auc_score(y_eval, model.predict_proba(X_eval)[:, 1])
    print(f"{label}:")
    print(f" - Accuracy: {accuracy:.4f}")
    print(f" - ROC AUC: {roc_auc:.4f}")
    print(f" - Confusion Matrix:\n{confusion_matrix(y_eval, predictions)}")
    print(f" - Classification Report:\n{classification_report(y_eval, predictions)}")
    return predictions, accuracy, roc_auc


# The three candidate models, all seeded identically. The names `reg`, `rf` and
# `xgb` stay bound at notebook level so other cells can use the fitted models.
reg = LogisticRegression(C=1, max_iter=1000, random_state=1)
rf = RandomForestClassifier(n_estimators=100, random_state=1)
xgb = XGBClassifier(n_estimators=100, random_state=1)

candidates = [
    ("Logistic Regression", reg),
    ("Random Forest", rf),
    ("XGBoost", xgb),
]

for position, (label, model) in enumerate(candidates):
    if position:
        # Blank separator between consecutive reports (none after the last one).
        print("\n")
    # After the loop these three names hold the results of the last model.
    y_pred_val, accuracy, roc_auc = evaluate_classifier(
        label, model, X_train, y_train, X_val, y_val
    )


Logistic Regression:
 - Accuracy: 0.7765
 - ROC AUC: 0.8242
 - Confusion Matrix:
[[98  8]
 [32 41]]
 - Classification Report:
              precision    recall  f1-score   support

           0       0.75      0.92      0.83       106
           1       0.84      0.56      0.67        73

    accuracy                           0.78       179
   macro avg       0.80      0.74      0.75       179
weighted avg       0.79      0.78      0.77       179



Random Forest:
 - Accuracy: 0.7821
 - ROC AUC: 0.8346
 - Confusion Matrix:
[[93 13]
 [26 47]]
 - Classification Report:
              precision    recall  f1-score   support

           0       0.78      0.88      0.83       106
           1       0.78      0.64      0.71        73

    accuracy                           0.78       179
   macro avg       0.78      0.76      0.77       179
weighted avg       0.78      0.78      0.78       179



XGBoost:
 - Accuracy: 0.7765
 - ROC AUC: 0.8367
 - Confusion Matrix:
[[94 12]
 [28 45]]
 - Class

In [743]:
# Disabled: predict labels for the polynomial test features with the fitted
# random forest.
# y_pred = rf.predict(X_test_poly)

In [744]:
# Disabled: build the submission frame from `y_pred` and write it to
# submission.csv. `y_pred` is only defined by the commented-out prediction cell,
# so both must be re-enabled together.
# NOTE(review): PassengerId is synthesised from a hard-coded start of 892 rather
# than read from test_data -- confirm it matches test.csv before re-enabling.
# result = pd.DataFrame(
#     {
#         "PassengerId":list(range(892,892+len(y_pred))),
#         "Survived" : y_pred
#     }
#     )
# result.to_csv("submission.csv",index=False)