In [None]:
# Titanic Survival Prediction - End-to-End ML Project

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import requests

from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

# ---------------- Download Datasets ----------------
def download_file(url, filename, timeout=30):
    """Download ``url`` and write the response body to ``filename``.

    Args:
        url: Address fetched with an HTTP GET request.
        filename: Local path opened in binary write mode; any existing
            content is overwritten with the response body.
        timeout: Seconds to wait for the server before giving up. Passed
            straight through to ``requests.get``.

    Raises:
        requests.HTTPError: If the server answers with an error status.
        requests.Timeout: If the server does not respond within ``timeout``.
    """
    # Without an explicit timeout, requests waits indefinitely, which would
    # hang the notebook cell on a stalled connection.
    response = requests.get(url, timeout=timeout)
    response.raise_for_status()  # fail before the local file is opened
    with open(filename, 'wb') as f:
        f.write(response.content)
    print(f"Downloaded {filename}")

# Both URLs point at the same public GitHub copy of the Titanic data.
# NOTE: test_url is a stand-in used only to demonstrate the file layout;
# it is NOT the actual competition test set.
train_url = "https://raw.githubusercontent.com/datasciencedojo/datasets/master/titanic.csv"
test_url = "https://raw.githubusercontent.com/datasciencedojo/datasets/master/titanic.csv"

# Fetch each source into its local CSV: train first, then test.
for _url, _target in ((train_url, "train.csv"), (test_url, "test.csv")):
    download_file(_url, _target)


# ---------------- Load Dataset ----------------
train = pd.read_csv("train.csv")
test = pd.read_csv("test.csv")

# ---------------- Preprocessing ----------------
# Name, Ticket and Cabin are not used as features; remove them from both frames.
train = train.drop(columns=["Name", "Ticket", "Cabin"])
test = test.drop(columns=["Name", "Ticket", "Cabin"])

# "Survived" is the target; every other remaining column stays in X.
y = train["Survived"]
X = train.drop(columns="Survived")

# Column groups routed to the two transformers below. Columns listed in
# neither group are not handed to any transformer.
categorical = ["Sex", "Embarked", "Pclass"]
numerical = ["Age", "SibSp", "Parch", "Fare"]

# Numeric columns: fill gaps with the median, then standardise.
numerical_transformer = Pipeline([
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler", StandardScaler()),
])

# Categorical columns: fill gaps with the most frequent value, then one-hot
# encode; categories unseen during fit are ignored instead of raising.
categorical_transformer = Pipeline([
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("onehot", OneHotEncoder(handle_unknown="ignore")),
])

# Combine both branches into a single preprocessing step.
preprocessor = ColumnTransformer([
    ("num", numerical_transformer, numerical),
    ("cat", categorical_transformer, categorical),
])

# ---------------- Logistic Regression Model ----------------
# Hold out 20% of the rows for validation; the fixed seed keeps the split repeatable.
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

logreg_pipeline = Pipeline([
    ("preprocessor", preprocessor),
    ("classifier", LogisticRegression(max_iter=1000)),
])

# Fit on the training split and score the held-out rows in one chain.
y_pred = logreg_pipeline.fit(X_train, y_train).predict(X_val)

print("Logistic Regression Accuracy:", accuracy_score(y_val, y_pred))
print(confusion_matrix(y_val, y_pred))
print(classification_report(y_val, y_pred))

# ---------------- Random Forest Model ----------------
# NOTE(review): `preprocessor` is the same object handed to logreg_pipeline;
# fitting this pipeline presumably refits it for both. Consider
# sklearn.base.clone if the two pipelines must stay independent.
rf_pipeline = Pipeline([
    ("preprocessor", preprocessor),
    ("classifier", RandomForestClassifier(n_estimators=100, random_state=42)),
])

# Fit on the training split and score the held-out rows in one chain.
y_pred_rf = rf_pipeline.fit(X_train, y_train).predict(X_val)

print("Random Forest Accuracy:", accuracy_score(y_val, y_pred_rf))
print(confusion_matrix(y_val, y_pred_rf))
print(classification_report(y_val, y_pred_rf))

# ---------------- Cross Validation ----------------
# 5-fold cross-validation of the random forest pipeline over all of X / y.
cv_scores = cross_val_score(rf_pipeline, X, y, cv=5)
print("Random Forest CV Accuracy:", cv_scores.mean())

# ---------------- Predict on Test Set for Kaggle ----------------
# Caveat: the downloaded 'test.csv' is a copy of 'train.csv', so the file
# written below is not a valid competition submission. A real submission
# needs the actual test dataset.
# Refit on every labelled row, then predict for the rows in `test`.
test_pred = rf_pipeline.fit(X, y).predict(test)

# One row per passenger: the id column plus the predicted label.
submission = test[["PassengerId"]].assign(Survived=test_pred)

submission.to_csv("submission.csv", index=False)
print("Submission file created: submission.csv")

Downloaded train.csv
Downloaded test.csv
Logistic Regression Accuracy: 0.7988826815642458
[[90 15]
 [21 53]]
              precision    recall  f1-score   support

           0       0.81      0.86      0.83       105
           1       0.78      0.72      0.75        74

    accuracy                           0.80       179
   macro avg       0.80      0.79      0.79       179
weighted avg       0.80      0.80      0.80       179

Random Forest Accuracy: 0.8212290502793296
[[90 15]
 [17 57]]
              precision    recall  f1-score   support

           0       0.84      0.86      0.85       105
           1       0.79      0.77      0.78        74

    accuracy                           0.82       179
   macro avg       0.82      0.81      0.81       179
weighted avg       0.82      0.82      0.82       179

Random Forest CV Accuracy: 0.8148452702278576
Submission file created: submission.csv


In [None]:
# Titanic Survival Prediction - End-to-End ML Project

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import requests

from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

# ---------------- Download Datasets ----------------
def download_file(url, filename, timeout=30):
    """Download ``url`` and write the response body to ``filename``.

    Args:
        url: Address fetched with an HTTP GET request.
        filename: Local path opened in binary write mode; any existing
            content is overwritten with the response body.
        timeout: Seconds to wait for the server before giving up. Passed
            straight through to ``requests.get``.

    Raises:
        requests.HTTPError: If the server answers with an error status.
        requests.Timeout: If the server does not respond within ``timeout``.
    """
    # Without an explicit timeout, requests waits indefinitely, which would
    # hang the notebook cell on a stalled connection.
    response = requests.get(url, timeout=timeout)
    response.raise_for_status()  # fail before the local file is opened
    with open(filename, 'wb') as f:
        f.write(response.content)
    print(f"Downloaded {filename}")

# Both URLs point at the same public GitHub copy of the Titanic data.
# NOTE: test_url is a stand-in used only to demonstrate the file layout;
# it is NOT the actual competition test set.
train_url = "https://raw.githubusercontent.com/datasciencedojo/datasets/master/titanic.csv"
test_url = "https://raw.githubusercontent.com/datasciencedojo/datasets/master/titanic.csv"

# Fetch each source into its local CSV: train first, then test.
for _url, _target in ((train_url, "train.csv"), (test_url, "test.csv")):
    download_file(_url, _target)


# ---------------- Load Dataset ----------------
train = pd.read_csv("train.csv")
test = pd.read_csv("test.csv")

# ---------------- Preprocessing ----------------
# Name, Ticket and Cabin are not used as features; remove them from both frames.
train = train.drop(columns=["Name", "Ticket", "Cabin"])
test = test.drop(columns=["Name", "Ticket", "Cabin"])

# "Survived" is the target; every other remaining column stays in X.
y = train["Survived"]
X = train.drop(columns="Survived")

# Column groups routed to the two transformers below. Columns listed in
# neither group are not handed to any transformer.
categorical = ["Sex", "Embarked", "Pclass"]
numerical = ["Age", "SibSp", "Parch", "Fare"]

# Numeric columns: fill gaps with the median, then standardise.
numerical_transformer = Pipeline([
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler", StandardScaler()),
])

# Categorical columns: fill gaps with the most frequent value, then one-hot
# encode; categories unseen during fit are ignored instead of raising.
categorical_transformer = Pipeline([
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("onehot", OneHotEncoder(handle_unknown="ignore")),
])

# Combine both branches into a single preprocessing step.
preprocessor = ColumnTransformer([
    ("num", numerical_transformer, numerical),
    ("cat", categorical_transformer, categorical),
])

# ---------------- Logistic Regression Model ----------------
# Hold out 20% of the rows for validation; the fixed seed keeps the split repeatable.
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

logreg_pipeline = Pipeline([
    ("preprocessor", preprocessor),
    ("classifier", LogisticRegression(max_iter=1000)),
])

# Fit on the training split and score the held-out rows in one chain.
y_pred = logreg_pipeline.fit(X_train, y_train).predict(X_val)

print("Logistic Regression Accuracy:", accuracy_score(y_val, y_pred))
print(confusion_matrix(y_val, y_pred))
print(classification_report(y_val, y_pred))

# ---------------- Random Forest Model ----------------
# NOTE(review): `preprocessor` is the same object handed to logreg_pipeline;
# fitting this pipeline presumably refits it for both. Consider
# sklearn.base.clone if the two pipelines must stay independent.
rf_pipeline = Pipeline([
    ("preprocessor", preprocessor),
    ("classifier", RandomForestClassifier(n_estimators=100, random_state=42)),
])

# Fit on the training split and score the held-out rows in one chain.
y_pred_rf = rf_pipeline.fit(X_train, y_train).predict(X_val)

print("Random Forest Accuracy:", accuracy_score(y_val, y_pred_rf))
print(confusion_matrix(y_val, y_pred_rf))
print(classification_report(y_val, y_pred_rf))

# ---------------- Cross Validation ----------------
# 5-fold cross-validation of the random forest pipeline over all of X / y.
cv_scores = cross_val_score(rf_pipeline, X, y, cv=5)
print("Random Forest CV Accuracy:", cv_scores.mean())

# ---------------- Predict on Test Set for Kaggle ----------------
# Caveat: the downloaded 'test.csv' is a copy of 'train.csv', so the file
# written below is not a valid competition submission. A real submission
# needs the actual test dataset.
# Refit on every labelled row, then predict for the rows in `test`.
test_pred = rf_pipeline.fit(X, y).predict(test)

# One row per passenger: the id column plus the predicted label.
submission = test[["PassengerId"]].assign(Survived=test_pred)

submission.to_csv("submission.csv", index=False)
print("Submission file created: submission.csv")

Downloaded train.csv
Downloaded test.csv
Logistic Regression Accuracy: 0.7988826815642458
[[90 15]
 [21 53]]
              precision    recall  f1-score   support

           0       0.81      0.86      0.83       105
           1       0.78      0.72      0.75        74

    accuracy                           0.80       179
   macro avg       0.80      0.79      0.79       179
weighted avg       0.80      0.80      0.80       179

Random Forest Accuracy: 0.8212290502793296
[[90 15]
 [17 57]]
              precision    recall  f1-score   support

           0       0.84      0.86      0.85       105
           1       0.79      0.77      0.78        74

    accuracy                           0.82       179
   macro avg       0.82      0.81      0.81       179
weighted avg       0.82      0.82      0.82       179

Random Forest CV Accuracy: 0.8148452702278576
Submission file created: submission.csv


In [None]:
# Titanic Survival Prediction - End-to-End ML Project

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import requests

from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

# ---------------- Download Datasets ----------------
def download_file(url, filename, timeout=30):
    """Download ``url`` and write the response body to ``filename``.

    Args:
        url: Address fetched with an HTTP GET request.
        filename: Local path opened in binary write mode; any existing
            content is overwritten with the response body.
        timeout: Seconds to wait for the server before giving up. Passed
            straight through to ``requests.get``.

    Raises:
        requests.HTTPError: If the server answers with an error status.
        requests.Timeout: If the server does not respond within ``timeout``.
    """
    # Without an explicit timeout, requests waits indefinitely, which would
    # hang the notebook cell on a stalled connection.
    response = requests.get(url, timeout=timeout)
    response.raise_for_status()  # fail before the local file is opened
    with open(filename, 'wb') as f:
        f.write(response.content)
    print(f"Downloaded {filename}")

# Both URLs point at the same public GitHub copy of the Titanic data.
# NOTE: test_url is a stand-in used only to demonstrate the file layout;
# it is NOT the actual competition test set.
train_url = "https://raw.githubusercontent.com/datasciencedojo/datasets/master/titanic.csv"
test_url = "https://raw.githubusercontent.com/datasciencedojo/datasets/master/titanic.csv"

# Fetch each source into its local CSV: train first, then test.
for _url, _target in ((train_url, "train.csv"), (test_url, "test.csv")):
    download_file(_url, _target)


# ---------------- Load Dataset ----------------
train = pd.read_csv("train.csv")
test = pd.read_csv("test.csv")

# ---------------- Preprocessing ----------------
# Name, Ticket and Cabin are not used as features; remove them from both frames.
train = train.drop(columns=["Name", "Ticket", "Cabin"])
test = test.drop(columns=["Name", "Ticket", "Cabin"])

# "Survived" is the target; every other remaining column stays in X.
y = train["Survived"]
X = train.drop(columns="Survived")

# Column groups routed to the two transformers below. Columns listed in
# neither group are not handed to any transformer.
categorical = ["Sex", "Embarked", "Pclass"]
numerical = ["Age", "SibSp", "Parch", "Fare"]

# Numeric columns: fill gaps with the median, then standardise.
numerical_transformer = Pipeline([
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler", StandardScaler()),
])

# Categorical columns: fill gaps with the most frequent value, then one-hot
# encode; categories unseen during fit are ignored instead of raising.
categorical_transformer = Pipeline([
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("onehot", OneHotEncoder(handle_unknown="ignore")),
])

# Combine both branches into a single preprocessing step.
preprocessor = ColumnTransformer([
    ("num", numerical_transformer, numerical),
    ("cat", categorical_transformer, categorical),
])

# ---------------- Logistic Regression Model ----------------
# Hold out 20% of the rows for validation; the fixed seed keeps the split repeatable.
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

logreg_pipeline = Pipeline([
    ("preprocessor", preprocessor),
    ("classifier", LogisticRegression(max_iter=1000)),
])

# Fit on the training split and score the held-out rows in one chain.
y_pred = logreg_pipeline.fit(X_train, y_train).predict(X_val)

print("Logistic Regression Accuracy:", accuracy_score(y_val, y_pred))
print(confusion_matrix(y_val, y_pred))
print(classification_report(y_val, y_pred))

# ---------------- Random Forest Model ----------------
# NOTE(review): `preprocessor` is the same object handed to logreg_pipeline;
# fitting this pipeline presumably refits it for both. Consider
# sklearn.base.clone if the two pipelines must stay independent.
rf_pipeline = Pipeline([
    ("preprocessor", preprocessor),
    ("classifier", RandomForestClassifier(n_estimators=100, random_state=42)),
])

# Fit on the training split and score the held-out rows in one chain.
y_pred_rf = rf_pipeline.fit(X_train, y_train).predict(X_val)

print("Random Forest Accuracy:", accuracy_score(y_val, y_pred_rf))
print(confusion_matrix(y_val, y_pred_rf))
print(classification_report(y_val, y_pred_rf))

# ---------------- Cross Validation ----------------
# 5-fold cross-validation of the random forest pipeline over all of X / y.
cv_scores = cross_val_score(rf_pipeline, X, y, cv=5)
print("Random Forest CV Accuracy:", cv_scores.mean())

# ---------------- Predict on Test Set for Kaggle ----------------
# Caveat: the downloaded 'test.csv' is a copy of 'train.csv', so the file
# written below is not a valid competition submission. A real submission
# needs the actual test dataset.
# Refit on every labelled row, then predict for the rows in `test`.
test_pred = rf_pipeline.fit(X, y).predict(test)

# One row per passenger: the id column plus the predicted label.
submission = test[["PassengerId"]].assign(Survived=test_pred)

submission.to_csv("submission.csv", index=False)
print("Submission file created: submission.csv")

Downloaded train.csv
Downloaded test.csv
Logistic Regression Accuracy: 0.7988826815642458
[[90 15]
 [21 53]]
              precision    recall  f1-score   support

           0       0.81      0.86      0.83       105
           1       0.78      0.72      0.75        74

    accuracy                           0.80       179
   macro avg       0.80      0.79      0.79       179
weighted avg       0.80      0.80      0.80       179

Random Forest Accuracy: 0.8212290502793296
[[90 15]
 [17 57]]
              precision    recall  f1-score   support

           0       0.84      0.86      0.85       105
           1       0.79      0.77      0.78        74

    accuracy                           0.82       179
   macro avg       0.82      0.81      0.81       179
weighted avg       0.82      0.82      0.82       179

Random Forest CV Accuracy: 0.8148452702278576
Submission file created: submission.csv
