In [1]:
import pandas as pd
from sklearn import preprocessing
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.metrics import accuracy_score, precision_score, recall_score, roc_auc_score


In [2]:
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')
test_ids = test["PassengerId"]

In [None]:

# Check available columns
print("Train columns:", train.columns)
print("Test columns:", test.columns)

# Drop columns that are present
cols_to_drop = ["Ticket", "Cabin", "Name", "PassengerId"]

# Only drop columns that exist in the DataFrame
train = train.drop(columns=[col for col in cols_to_drop if col in train.columns], axis=1)
test = test.drop(columns=[col for col in cols_to_drop if col in test.columns], axis=1)


y=train["Survived"]
X=train.drop("Survived",axis=1)




In [None]:
# Encode categorical variables
le = preprocessing.LabelEncoder()
cols = ["Sex", "Embarked"]
for col in cols:
    train[col] = le.fit_transform(train[col])
    test[col] = le.transform(test[col])
    print(f"Classes for {col}: {le.classes_}")

In [91]:

# Preprocessing Pipelines
numeric_features = ["SibSp", "Parch", "Age", "Fare"]
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())
])

categorical_features = ["Sex", "Embarked"]
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value='U')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])
# Combine transformers
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)
    ])


In [92]:
print(train.head())
print(test.head())
train.isnull.sum()

   Survived  Pclass  Sex   Age  SibSp  Parch     Fare  Embarked
0         0       3    1  22.0      1      0   7.2500         2
1         1       1    0  38.0      1      0  71.2833         0
2         1       3    0  26.0      0      0   7.9250         2
3         1       1    0  35.0      1      0  53.1000         2
4         0       3    1  35.0      0      0   8.0500         2
   Pclass  Sex   Age  SibSp  Parch     Fare  Embarked
0       3    1  34.5      0      0   7.8292         1
1       3    0  47.0      1      0   7.0000         2
2       2    1  62.0      0      0   9.6875         1
3       3    1  27.0      0      0   8.6625         2
4       3    0  22.0      1      1  12.2875         2


In [93]:
# Define the model
model = LogisticRegression(max_iter=100)

# Create a pipeline that combines preprocessing and modeling
clf = Pipeline(steps=[('preprocessor', preprocessor),
                      ('classifier', model)])

# Use cross-validation to evaluate the model
cv_scores = cross_val_score(clf, X, y, cv=5, scoring='accuracy')
print(f'Cross-validation accuracy: {cv_scores.mean():.4f}')

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
clf.fit(X_train, y_train)

# Predict on the test set
y_pred = clf.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
roc_auc = roc_auc_score(y_test, y_pred)

print(f'Accuracy: {accuracy:.4f}')
print(f'Precision: {precision:.4f}')
print(f'Recall: {recall:.4f}')
print(f'ROC-AUC: {roc_auc:.4f}')

# Hyperparameter tuning with GridSearchCV
param_grid = {
    'classifier__C': [0.1, 1, 10, 100],
    'classifier__solver': ['liblinear', 'lbfgs']
}

grid_search = GridSearchCV(clf, param_grid, cv=5, scoring='accuracy')
grid_search.fit(X_train, y_train)

print(f'Best params: {grid_search.best_params_}')
print(f'Best cross-validation accuracy: {grid_search.best_score_:.4f}')

# Use the best model to predict the test set
best_model = grid_search.best_estimator_
y_pred_best = best_model.predict(X_test)

# Final evaluation
final_accuracy = accuracy_score(y_test, y_pred_best)
print(f'Final Accuracy with best model: {final_accuracy:.4f}')

Cross-validation accuracy: 0.7890
Accuracy: 0.7877
Precision: 0.7727
Recall: 0.6892
ROC-AUC: 0.7732
Best params: {'classifier__C': 1, 'classifier__solver': 'liblinear'}
Best cross-validation accuracy: 0.7893
Final Accuracy with best model: 0.7877


In [74]:
predictions = clf.predict(test)

In [75]:
df = pd.DataFrame({
    "PassengerId":test_ids.values,
    "Survived":predictions
})

In [76]:
df.to_csv("titanicsubmission2.csv",index=False)

In [34]:
import pickle

filename='titanic2.pkl'
pickle.dump(clf,open(filename,'wb'))