Title: Model Selection

Task 1: Linear Regression on House Prices<br>
Use Linear Regression and evaluate its performance on the validation set.

In [7]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score

# Example: create synthetic dataset (replace with your real dataset)
# Features: size (sqft), number of bedrooms
data = {
    'size_sqft': [1500, 1800, 2400, 3000, 3500],
    'bedrooms': [3, 4, 3, 5, 4],
    'price': [400000, 500000, 600000, 650000, 700000]
}
df = pd.DataFrame(data)

# Split into features and target
X = df[['size_sqft', 'bedrooms']]
y = df['price']

# Split into training and validation sets (80% train, 20% validation)
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize and train Linear Regression model
model = LinearRegression()
model.fit(X_train, y_train)

# Predict on validation set
y_pred = model.predict(X_val)

# Evaluate performance
mse = mean_squared_error(y_val, y_pred)
r2 = r2_score(y_val, y_pred)

print(f"Validation Mean Squared Error: {mse:.2f}")
print(f"Validation R^2 Score: {r2:.2f}")

# Optional: show predictions vs actual
results = pd.DataFrame({'Actual': y_val, 'Predicted': y_pred})
print(results)


Validation Mean Squared Error: 1638172120.38
Validation R^2 Score: nan
   Actual      Predicted
1  500000  459525.660964




Task 2: Decision Tree Classifier on Iris Dataset<br>
Train a Decision Tree model and evaluate its performance on validation data.

In [None]:
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, classification_report

# Load Iris dataset
iris = load_iris()
X = iris.data
y = iris.target

# Split into train and validation sets (80% train, 20% val)
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize and train Decision Tree Classifier
clf = DecisionTreeClassifier(random_state=42)
clf.fit(X_train, y_train)

# Predict on validation set
y_pred = clf.predict(X_val)

# Evaluate performance
accuracy = accuracy_score(y_val, y_pred)
print(f"Validation Accuracy: {accuracy:.2f}")

print("\nClassification Report:")
print(classification_report(y_val, y_pred, target_names=iris.target_names))


Validation Accuracy: 1.00

Classification Report:
              precision    recall  f1-score   support

      setosa       1.00      1.00      1.00        10
  versicolor       1.00      1.00      1.00         9
   virginica       1.00      1.00      1.00        11

    accuracy                           1.00        30
   macro avg       1.00      1.00      1.00        30
weighted avg       1.00      1.00      1.00        30



Task 3:  Random Forest on Customer Churn<br>
Apply Random Forest and assess its accuracy on the validation set.

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report

# Example synthetic customer churn dataset
data = {
    'age': [25, 45, 35, 50, 23, 40, 60, 48, 33, 36],
    'monthly_charges': [70, 90, 80, 75, 60, 85, 95, 77, 66, 73],
    'tenure_months': [12, 24, 18, 36, 8, 15, 40, 22, 10, 13],
    'churn': [0, 1, 0, 0, 1, 0, 1, 0, 0, 1]  # 0 = no churn, 1 = churn
}

df = pd.DataFrame(data)

# Features and target
X = df.drop('churn', axis=1)
y = df['churn']

# Split into train and validation sets
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.3, random_state=42)

# Initialize and train Random Forest Classifier
rf = RandomForestClassifier(n_estimators=100, random_state=42)
rf.fit(X_train, y_train)

# Predict on validation set
y_pred = rf.predict(X_val)

# Evaluate accuracy
accuracy = accuracy_score(y_val, y_pred)
print(f"Validation Accuracy: {accuracy:.2f}")

print("\nClassification Report:")
print(classification_report(y_val, y_pred))


Validation Accuracy: 0.33

Classification Report:
              precision    recall  f1-score   support

           0       0.50      0.50      0.50         2
           1       0.00      0.00      0.00         1

    accuracy                           0.33         3
   macro avg       0.25      0.25      0.25         3
weighted avg       0.33      0.33      0.33         3

