In [17]:
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder

# Load Titanic Dataset
url = "https://raw.githubusercontent.com/datasciencedojo/datasets/master/titanic.csv"
titanic_raw = pd.read_csv(url)

# Select Relevant Features
# Columns in the raw file, as recorded from an earlier `.keys()` call:
#   PassengerId, Survived, Pclass, Name, Sex, Age, SibSp, Parch,
#   Ticket, Fare, Cabin, Embarked
# `.copy()` makes `df` an independent frame rather than a selection derived
# from the raw one, so later edits to `df` (including in-place ones) cannot
# trigger pandas' SettingWithCopyWarning and never touch `titanic_raw`.
df = titanic_raw[["Pclass", "Age", "Sex", "Fare", "Embarked", "Survived"]].copy()
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 6 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Pclass    891 non-null    int64  
 1   Age       714 non-null    float64
 2   Sex       891 non-null    object 
 3   Fare      891 non-null    float64
 4   Embarked  889 non-null    object 
 5   Survived  891 non-null    int64  
dtypes: float64(2), int64(2), object(2)
memory usage: 41.9+ KB


In [18]:
# Handling Missing Values
# Age      -> median of the observed ages
# Embarked -> most frequent value (first entry returned by `.mode()`)
# Both fill values are computed from the current frame first and then applied
# in one non-mutating `fillna`, so the frame shown by earlier cells is not
# modified behind the reader's back.
# NOTE(review): these statistics are taken over all rows of `df`, i.e. before
# any train/validation split happens.
fill_values = {
    "Age": df["Age"].median(),
    "Embarked": df["Embarked"].mode()[0],
}
df = df.fillna(fill_values)

# Post-condition: the two imputed columns must be complete.
assert not df[["Age", "Embarked"]].isna().any().any(), "imputation left missing values"
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 6 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Pclass    891 non-null    int64  
 1   Age       891 non-null    float64
 2   Sex       891 non-null    object 
 3   Fare      891 non-null    float64
 4   Embarked  891 non-null    object 
 5   Survived  891 non-null    int64  
dtypes: float64(2), int64(2), object(2)
memory usage: 41.9+ KB


In [19]:
# Selecting Features and Target
# `y` is the label column; `X` is `df` with that column removed.
target_column = "Survived"
y = df[target_column]
X = df.drop(columns=[target_column])

In [20]:
# Apply Feature Scaling and Encoding
# Numeric columns are standardized; categorical columns are one-hot encoded.
numeric_features = ["Age", "Fare"]
categorical_features = ["Pclass", "Sex", "Embarked"]

column_steps = [
    ("num", StandardScaler(), numeric_features),
    ("cat", OneHotEncoder(), categorical_features),
]
preprocessor = ColumnTransformer(transformers=column_steps)

# The transformer is fitted on every row of X and the transformed matrix is
# kept for the cells that follow.
X_preprocessed = preprocessor.fit_transform(X)

In [21]:
# Each model is chained behind `preprocessor` and cross-validated on the raw
# feature frame `X`. Inside every CV split the scaler/encoder are then fitted
# on the training folds only, so no statistics from the validation fold leak
# into the features (scoring on a matrix that was transformed with all rows
# beforehand would allow that). `cross_val_score` works on clones of the
# pipeline, so the `preprocessor` object itself is not refitted here.
# NOTE(review): the Age/Embarked imputation done earlier in the notebook is
# still computed over all rows.

# Train and Evaluate Logistic Regression
log_model = LogisticRegression()
log_pipeline = make_pipeline(preprocessor, log_model)
log_scores = cross_val_score(log_pipeline, X, y, cv=5, scoring='accuracy')
print(f"Logistic Regression Accuracy: {log_scores.mean():.2f}")

# Train and Evaluate Random Forest
rf_model = RandomForestClassifier(random_state=42)
rf_pipeline = make_pipeline(preprocessor, rf_model)
rf_scores = cross_val_score(rf_pipeline, X, y, scoring='accuracy', cv=5)
print(f"Random Forest Accuracy: {rf_scores.mean():.2f}")

Logistic Regression Accuracy: 0.79
Random Forest Accuracy: 0.81


In [22]:
# Define Hyperparameter Grid
# 3 x 3 x 3 = 27 candidate settings for the random forest.
param_grid = dict(
    n_estimators=[50, 100, 200],
    max_depth=[None, 10, 20],
    min_samples_split=[2, 5, 10],
)

# Perform Grid Search CV
# Every candidate is scored by 5-fold cross-validated accuracy.
rf_estimator = RandomForestClassifier(random_state=42)
grid_search = GridSearchCV(
    estimator=rf_estimator,
    param_grid=param_grid,
    scoring="accuracy",
    cv=5,
    n_jobs=-1,
)

# Fit the grid search on the preprocessed feature matrix
grid_search.fit(X_preprocessed, y)

# Display the best hyperparameters and the corresponding mean CV score
best_params = grid_search.best_params_
best_score = grid_search.best_score_
print(f"Best Hyperparameter: {best_params}")
print(f"Best Accuracy or Score: {best_score:.2f}")

Best Hyperparameter: {'max_depth': 10, 'min_samples_split': 5, 'n_estimators': 100}
Best Accuracy or Score: 0.83
