In [2]:
# STEP 1: Import Libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
import xgboost as xgb
from sklearn.metrics import accuracy_score

# STEP 2: Load the datasets
train = pd.read_csv("train.csv")
test = pd.read_csv("test.csv")

# Store test PassengerIds for submission
test_passenger_ids = test["PassengerId"]

# Separate target variable before combining
y = train["Transported"].astype(int)
X = train.drop(["Transported", "PassengerId"], axis=1)
X_test = test.drop("PassengerId", axis=1)

# STEP 3: Combine train and test for uniform preprocessing (excluding target from train)
combined = pd.concat([X, X_test], ignore_index=True)


# STEP 4: Handle missing values + Label Encoding
for col in combined.columns:
    if combined[col].dtype == 'object' or pd.api.types.is_string_dtype(combined[col]):
        combined[col] = combined[col].fillna("Missing")
        le = LabelEncoder()
        combined[col] = le.fit_transform(combined[col].astype(str))
    else:
        combined[col] = combined[col].fillna(combined[col].median())

# STEP 5: Separate train and test again
X_clean = combined.iloc[:len(X)].copy()
X_test_clean = combined.iloc[len(X):].copy()


# STEP 6: Scale features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X_clean)
X_test_scaled = scaler.transform(X_test_clean)

# STEP 7: Train-test split
X_train, X_val, y_train, y_val = train_test_split(X_scaled, y, test_size=0.2, random_state=42, stratify=y)

# STEP 8: Train models
lr = LogisticRegression()
lr.fit(X_train, y_train)
lr_acc = accuracy_score(y_val, lr.predict(X_val))

dt = DecisionTreeClassifier()
dt.fit(X_train, y_train) # Corrected: use y_train
dt_acc = accuracy_score(y_val, dt.predict(X_val))

rf = RandomForestClassifier()
rf.fit(X_train, y_train)
rf_acc = accuracy_score(y_val, rf.predict(X_val))

xg = xgb.XGBClassifier(use_label_encoder=False, eval_metric="logloss")
xg.fit(X_train, y_train)
xg_acc = accuracy_score(y_val, xg.predict(X_val))

# STEP 9: Print Accuracy
print("Logistic Regression Accuracy:", lr_acc)
print("Decision Tree Accuracy:", dt_acc)
print("Random Forest Accuracy:", rf_acc)
print("XGBoost Accuracy:", xg_acc)

# STEP 10: Final prediction
final_model = xg  # Choose best
test_preds = final_model.predict(X_test_scaled)

# STEP 11: Submission CSV
submission = pd.DataFrame({
    "PassengerId": test_passenger_ids,
    "Transported": test_preds.astype(bool)
})
submission.to_csv("submission.csv", index=False)
print("✅ submission.csv ready for upload!")

Logistic Regression Accuracy: 0.7768832662449684
Decision Tree Accuracy: 0.7377803335250144
Random Forest Accuracy: 0.7975848188614146
XGBoost Accuracy: 0.7952846463484762
✅ submission.csv ready for upload!


Parameters: { "use_label_encoder" } are not used.



In [4]:
from google.colab import files
files.download('submission.csv')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>