# Task 3: Introduction to Machine Learning

## Section 1: Setup & Dataset

### **Task 1**: Load the Dataset

*Instruction*: Load the preprocessed Titanic dataset (from the previous module or load again if needed). Separate it into features (`X`) and target (`y`, where target = `Survived`).

In [None]:
import pandas as pd

df = pd.read_csv('titanic.csv')
X = df.drop('Survived', axis=1)
y = df['Survived']
print()

FileNotFoundError: [Errno 2] No such file or directory: 'titanic.csv'

## Section 2: Splitting the Data

### **Task 2**: Train/Test Split

*Instruction*:

Split the dataset into training and testing sets (80/20 split).


In [None]:
from sklearn.model_selection import train_test_split

# Assuming X is your feature set and y is your labels/target variable
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


NameError: name 'X' is not defined

## Section 3: Train Your First Model

### **Task 3**: Logistic Regression

*Instruction*: Train a Logistic Regression model on the Titanic dataset. Display accuracy on both train and test sets.



In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import LabelEncoder

# Load Titanic dataset
url = "https://raw.githubusercontent.com/datasciencedojo/datasets/master/titanic.csv"
df = pd.read_csv(url)

# Select relevant features and target
features = ['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Embarked']
target = 'Survived'

# Drop rows with missing values in selected features
df = df[features + [target]].dropna()

# Encode categorical variables
df['Sex'] = LabelEncoder().fit_transform(df['Sex'])
df['Embarked'] = LabelEncoder().fit_transform(df['Embarked'])

# Split data into features and target
X = df[features]
y = df[target]

# Split into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train Logistic Regression model
model = LogisticRegression(max_iter=1000)
model.fit(X_train, y_train)

# Predict and evaluate accuracy
train_accuracy = accuracy_score(y_train, model.predict(X_train))
test_accuracy = accuracy_score(y_test, model.predict(X_test))

print("Training Accuracy:", train_accuracy)
print("Test Accuracy:", test_accuracy)


Training Accuracy: 0.81195079086116
Test Accuracy: 0.7972027972027972


## Section 4: Model Evaluation

### **Task 4**: Confusion Matrix & Classification Report

*Instruction*: Evaluate the model using confusion matrix and classification report.

In [None]:
from sklearn.metrics import confusion_matrix, classification_report
import numpy as np

# Sample data (replace with your actual data)
y_true = np.array([0, 1, 0, 1, 0, 1, 0, 1, 0, 1])  # Actual labels
y_pred = np.array([0, 1, 1, 1, 0, 0, 0, 1, 1, 1])  # Predicted labels

# 1. Confusion Matrix
cm = confusion_matrix(y_true, y_pred)
print("Confusion Matrix:")
print(cm)

# 2. Classification Report
report = classification_report(y_true, y_pred)
print("\nClassification Report:")
print(report)

Confusion Matrix:
[[3 2]
 [1 4]]

Classification Report:
              precision    recall  f1-score   support

           0       0.75      0.60      0.67         5
           1       0.67      0.80      0.73         5

    accuracy                           0.70        10
   macro avg       0.71      0.70      0.70        10
weighted avg       0.71      0.70      0.70        10



## Section 5: Try Another Model

### **Task 5**:  Random Forest Classifier

*Instruction*: Train a `RandomForestClassifier` and compare its performance with Logistic Regression.


In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report

# Sample dataset (replace with your actual data)
data = {'feature1': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10],
        'feature2': [11, 12, 13, 14, 15, 16, 17, 18, 19, 20],
        'target': [0, 1, 0, 1, 0, 1, 0, 1, 0, 1]}
df = pd.DataFrame(data)
X = df[['feature1', 'feature2']]
y = df['target']

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# 1. Train and evaluate RandomForestClassifier
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
rf_model.fit(X_train, y_train)
rf_predictions = rf_model.predict(X_test)

# 2. Train and evaluate Logistic Regression
lr_model = LogisticRegression(random_state=42)
lr_model.fit(X_train, y_train)
lr_predictions = lr_model.predict(X_test)

# 3. Compare the models
rf_accuracy = accuracy_score(y_test, rf_predictions)
lr_accuracy = accuracy_score(y_test, lr_predictions)

print("Random Forest Accuracy:", rf_accuracy)
print("Logistic Regression Accuracy:", lr_accuracy)

# Print classification reports for both models
print("\nRandom Forest Classification Report:")
print(classification_report(y_test, rf_predictions))

print("\nLogistic Regression Classification Report:")
print(classification_report(y_test, lr_predictions))


Random Forest Accuracy: 0.0
Logistic Regression Accuracy: 0.0

Random Forest Classification Report:
              precision    recall  f1-score   support

           0       0.00      0.00      0.00       1.0
           1       0.00      0.00      0.00       1.0

    accuracy                           0.00       2.0
   macro avg       0.00      0.00      0.00       2.0
weighted avg       0.00      0.00      0.00       2.0


Logistic Regression Classification Report:
              precision    recall  f1-score   support

           0       0.00      0.00      0.00       1.0
           1       0.00      0.00      0.00       1.0

    accuracy                           0.00       2.0
   macro avg       0.00      0.00      0.00       2.0
weighted avg       0.00      0.00      0.00       2.0



## Section 6: Model Tuning

### **Task 6**: Hyperparameter Tuning (GridSearch)

*Instruction*:Use `GridSearchCV` to tune `n_estimators` and `max_depth` of the Random Forest model.

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.preprocessing import LabelEncoder


# Load Titanic dataset
url = "https://raw.githubusercontent.com/datasciencedojo/datasets/master/titanic.csv"
df = pd.read_csv(url)

# Select relevant features and target
features = ['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Embarked']
target = 'Survived'

# Drop rows with missing values in selected features
df = df[features + [target]].dropna()

# Encode categorical variables
df['Sex'] = LabelEncoder().fit_transform(df['Sex'])
df['Embarked'] = LabelEncoder().fit_transform(df['Embarked'])

# Split data into features and target
X = df[features]
y = df[target]

# Split into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

categorical_cols = ['Name', 'Sex', 'Embarked', 'Cabin']

label_encoders = {}
for col in categorical_cols:
    if col in X_train.columns:
        label_encoders[col] = LabelEncoder()
        label_encoders[col].fit(pd.concat([X_train[col], X_test[col]]).astype(str))
        X_train[col] = label_encoders[col].transform(X_train[col].astype(str))
        X_test[col] = label_encoders[col].transform(X_test[col].astype(str))

# Define the parameter grid
param_grid = {
    'n_estimators': [100, 200, 300],  # Try different numbers of trees
    'max_depth': [5, 10, 15]  # Try different maximum depths of trees
}

# Create a RandomForestClassifier
rf_model = RandomForestClassifier(random_state=42)

# Create GridSearchCV object
grid_search = GridSearchCV(estimator=rf_model, param_grid=param_grid, cv=5, scoring='accuracy')

# Fit the GridSearchCV to the training data
grid_search.fit(X_train, y_train)

# Get the best hyperparameters
best_params = grid_search.best_params_
print(f"Best Hyperparameters: {best_params}")

# Get the best model
best_rf_model = grid_search.best_estimator_

# Make predictions using the best model
y_pred_rf = best_rf_model.predict(X_test)

# Evaluate the best model
accuracy_rf = accuracy_score(y_test, y_pred_rf)
print(f"Accuracy of the best Random Forest model: {accuracy_rf}")
print(classification_report(y_test, y_pred_rf))

Best Hyperparameters: {'max_depth': 5, 'n_estimators': 200}
Accuracy of the best Random Forest model: 0.7552447552447552
              precision    recall  f1-score   support

           0       0.73      0.89      0.80        80
           1       0.80      0.59      0.68        63

    accuracy                           0.76       143
   macro avg       0.77      0.74      0.74       143
weighted avg       0.76      0.76      0.75       143

