# Task 3: Introduction to Machine Learning

## Section 1: Setup & Dataset

### **Task 1**: Load the Dataset

*Instruction*: Load the preprocessed Titanic dataset (from the previous module or load again if needed). Separate it into features (`X`) and target (`y`, where target = `Survived`).

In [None]:
import pandas as pd

# Read the preprocessed Titanic data; 'titanic.csv' is resolved relative to
# the current working directory, so the file must be present there.
data = pd.read_csv('titanic.csv')

# The target (y) is the 'Survived' column; the features (X) are every other column.
y = data['Survived']
X = data.drop(columns='Survived')














FileNotFoundError: [Errno 2] No such file or directory: 'titanic.csv'

## Section 2: Splitting the Data

### **Task 2**: Train/Test Split

*Instruction*:

Split the dataset into training and testing sets (80/20 split).


In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows of X / y for testing and keep the rest for training.
# The fixed random_state makes the shuffle, and therefore the split, repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# X_train / y_train: training features and labels
# X_test / y_test: testing features and labels


## Section 3: Train Your First Model

### **Task 3**: Logistic Regression

*Instruction*: Train a Logistic Regression model on the Titanic dataset. Display accuracy on both train and test sets.



In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

# Load the training and testing data
train_data = pd.read_csv("train.csv")
test_data = pd.read_csv("test.csv")

# Preprocess the data
# Fill missing ages by assignment. Calling fillna(..., inplace=True) on a
# selected column is chained assignment: pandas deprecates it and, with
# copy-on-write enabled, it leaves the frame unchanged. The training mean is
# reused for the test frame so no statistic is estimated from test rows.
age_mean = train_data["Age"].mean()
train_data["Age"] = train_data["Age"].fillna(age_mean)
test_data["Age"] = test_data["Age"].fillna(age_mean)

# One-hot encode categorical features
train_data = pd.get_dummies(train_data, columns=["Sex", "Embarked"])
test_data = pd.get_dummies(test_data, columns=["Sex", "Embarked"])

# Separate features and target.
# 'PassengerId' is removed from the training features as well, so that train
# and test use the same feature set. Only numeric/boolean columns are kept
# because LogisticRegression cannot fit on text values.
# NOTE(review): presumably the CSVs contain free-text columns (e.g. names)
# that are dropped here -- confirm against the actual files.
y_train = train_data["Survived"]
X_train = train_data.drop(
    columns=["Survived", "PassengerId"], errors="ignore"
).select_dtypes(include=["number", "bool"])

# Give the test features exactly the training columns, in the same order.
# A dummy column that never occurs in the test file is created as all zeros;
# any remaining gaps are filled with the matching training-column mean so that
# predict() does not receive NaN.
X_test = test_data.reindex(columns=X_train.columns, fill_value=0)
X_test = X_test.fillna(X_train.mean())

# Split the training data into training and validation sets
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, random_state=42)

# Train the Logistic Regression model
model = LogisticRegression(solver='liblinear')  # liblinear is a good choice for small datasets
model.fit(X_train, y_train)

# Predict on the rows the model was fitted on, on the held-out validation
# rows, and on the separate test file.
y_pred_train = model.predict(X_train)
y_pred_val = model.predict(X_val)
# No 'Survived' column is read from test_data in this cell, so these
# predictions are produced but cannot be scored here.
y_pred_test = model.predict(X_test)

# Calculate the accuracy.
# train_accuracy is measured on the fitted rows; test_accuracy is measured on
# the held-out validation split, which stands in for a labelled test set.
train_accuracy = accuracy_score(y_train, y_pred_train)
test_accuracy = accuracy_score(y_val, y_pred_val)

# Print the accuracy scores
print(f"Training Accuracy: {train_accuracy}")
print(f"Test Accuracy: {test_accuracy}")

FileNotFoundError: [Errno 2] No such file or directory: 'train.csv'

## Section 4: Model Evaluation

### **Task 4**: Confusion Matrix & Classification Report

*Instruction*: Evaluate the model using confusion matrix and classification report.

In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.preprocessing import LabelEncoder

# Candidate text columns to turn into integer codes before fitting.
categorical_cols = ['Name', 'Sex', 'Embarked', 'Cabin']

# Encoders are stored in label_encoders, keyed by column name.
label_encoders = {}
for col in categorical_cols:
    if col not in X_train.columns:
        continue  # column is not part of the training features; nothing to encode

    encoder = label_encoders[col] = LabelEncoder()
    # Fit on the train and test values together so transform() below never
    # meets a label it has not seen.
    encoder.fit(pd.concat([X_train[col], X_test[col]]).astype(str))
    X_train[col] = encoder.transform(X_train[col].astype(str))
    X_test[col] = encoder.transform(X_test[col].astype(str))


# Fit a logistic regression with scikit-learn's default settings.
model = LogisticRegression()
model.fit(X_train, y_train)

# Score the model on the rows it was fitted on ...
y_pred_train = model.predict(X_train)
accuracy_train = accuracy_score(y_train, y_pred_train)

# ... and on the held-out test rows.
y_pred_test = model.predict(X_test)
accuracy_test = accuracy_score(y_test, y_pred_test)

print(f"Accuracy on Train Set: {accuracy_train}")
print(f"Accuracy on Test Set: {accuracy_test}")

# Confusion matrix and per-class precision / recall / F1 on the test set
print("\nConfusion Matrix:")
test_confusion = confusion_matrix(y_test, y_pred_test)
print(test_confusion)

print("\nClassification Report:")
test_report = classification_report(y_test, y_pred_test)
print(test_report)

NameError: name 'X_train' is not defined

## Section 5: Try Another Model

### **Task 5**:  Random Forest Classifier

*Instruction*: Train a `RandomForestClassifier` and compare its performance with Logistic Regression.


In [3]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
import warnings
warnings.filterwarnings('ignore')

# Titanic passenger data, downloaded over the network
url = "https://raw.githubusercontent.com/datasciencedojo/datasets/master/titanic.csv"
titanic_data = pd.read_csv(url)

# Drop rows with missing 'Survived' values
titanic_data = titanic_data.dropna(subset=['Survived'])

# Features and target variable.
# .copy() gives X its own data, so the column assignments below modify X
# itself rather than a slice of titanic_data (chained assignment).
X = titanic_data[['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare']].copy()
y = titanic_data['Survived']

# Encode 'Sex' column as integers (female -> 0, male -> 1)
X['Sex'] = X['Sex'].map({'female': 0, 'male': 1})

# Fill missing 'Age' values with the median.
# The result is assigned back: fillna(..., inplace=True) on a selected column
# acts on a temporary object and, under copy-on-write, would leave X unchanged.
X['Age'] = X['Age'].fillna(X['Age'].median())

# Split data into training and testing sets (80/20, fixed seed for repeatability)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize RandomForestClassifier
rf_classifier = RandomForestClassifier(n_estimators=100, random_state=42)

# Fit the classifier to the training data
rf_classifier.fit(X_train, y_train)

# Make predictions
y_pred = rf_classifier.predict(X_test)

# Calculate accuracy and classification report
accuracy = accuracy_score(y_test, y_pred)
classification_rep = classification_report(y_test, y_pred)

# Print the results
print(f"Accuracy: {accuracy:.2f}")
print("\nClassification Report:\n", classification_rep)

# Sample prediction
sample = X_test.iloc[0:1]  # Keep as DataFrame to match model input format
prediction = rf_classifier.predict(sample)

# Retrieve and display the sample.
# orient='records' converts each column separately, so integer columns are
# shown as integers; taking the row as a Series first would cast every value
# to float now that all feature columns are numeric.
sample_dict = sample.to_dict(orient='records')[0]
print(f"\nSample Passenger: {sample_dict}")
print(f"Predicted Survival: {'Survived' if prediction[0] == 1 else 'Did Not Survive'}")



Accuracy: 0.80

Classification Report:
               precision    recall  f1-score   support

           0       0.82      0.85      0.83       105
           1       0.77      0.73      0.75        74

    accuracy                           0.80       179
   macro avg       0.79      0.79      0.79       179
weighted avg       0.80      0.80      0.80       179


Sample Passenger: {'Pclass': 3, 'Sex': 1, 'Age': 28.0, 'SibSp': 1, 'Parch': 1, 'Fare': 15.2458}
Predicted Survival: Did Not Survive


## Section 6: Model Tuning

### **Task 6**: Hyperparameter Tuning (GridSearch)

*Instruction*: Use `GridSearchCV` to tune `n_estimators` and `max_depth` of the Random Forest model.

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.preprocessing import LabelEncoder

# Candidate text columns to turn into integer codes before tuning.
categorical_cols = ['Name', 'Sex', 'Embarked', 'Cabin']

# Encoders are stored in label_encoders, keyed by column name.
label_encoders = {}
for col in categorical_cols:
    if col not in X_train.columns:
        continue  # column is not part of the training features; nothing to encode

    encoder = label_encoders[col] = LabelEncoder()
    # Fit on the train and test values together so transform() below never
    # meets a label it has not seen.
    encoder.fit(pd.concat([X_train[col], X_test[col]]).astype(str))
    X_train[col] = encoder.transform(X_train[col].astype(str))
    X_test[col] = encoder.transform(X_test[col].astype(str))

# Search space: every combination of tree count and maximum tree depth
param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [5, 10, 15],
}

# Base estimator; the fixed seed keeps each forest reproducible
rf_model = RandomForestClassifier(random_state=42)

# 5-fold cross-validated grid search, ranking candidates by accuracy
grid_search = GridSearchCV(rf_model, param_grid, scoring='accuracy', cv=5)

# Run the search on the training data only
grid_search.fit(X_train, y_train)

# Report the winning parameter combination
best_params = grid_search.best_params_
print(f"Best Hyperparameters: {best_params}")

# Estimator configured with the best parameters found by the search
best_rf_model = grid_search.best_estimator_

# Evaluate that estimator on the held-out test set
y_pred_rf = best_rf_model.predict(X_test)
accuracy_rf = accuracy_score(y_test, y_pred_rf)
print(f"Accuracy of the best Random Forest model: {accuracy_rf}")
print(classification_report(y_test, y_pred_rf))


Best Hyperparameters: {'max_depth': 10, 'n_estimators': 200}
Accuracy of the best Random Forest model: 0.7808988764044944
              precision    recall  f1-score   support

           0       0.78      0.91      0.84       111
           1       0.79      0.57      0.66        67

    accuracy                           0.78       178
   macro avg       0.78      0.74      0.75       178
weighted avg       0.78      0.78      0.77       178

