## Q1: SMS Spam Collection Dataset — AdaBoost

In [None]:
# Part A: Data Preprocessing & Exploration
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, confusion_matrix
# Load the SMS collection and encode the target column: spam -> 1, ham -> 0.
label_codes = {'ham': 0, 'spam': 1}
df = pd.read_csv('spam.csv')  # Update with actual path
df['label'] = df['label'].map(label_codes)
# Text preprocessing
import re
import nltk
# Fetch the NLTK stop-word corpus so that stopwords.words() can be read below.
nltk.download('stopwords')
from nltk.corpus import stopwords
# Keep the English stop words in a set; clean_text does membership tests on it.
stop_words = {*stopwords.words('english')}
def clean_text(text, stopword_set=None):
    """Normalise one message for vectorisation.

    The text is lower-cased, every character that is neither a-z nor
    whitespace is removed, and stop words are dropped. Tokens are re-joined
    with single spaces.

    Args:
        text: The raw message. Non-string values (e.g. NaN/None coming from
            an empty CSV cell) are treated as an empty message.
        stopword_set: Optional collection of words to remove. Defaults to the
            module-level ``stop_words`` set.

    Returns:
        The cleaned message as a string (possibly empty).
    """
    if not isinstance(text, str):
        # Missing cells arrive as float NaN / None; .lower() would raise on them.
        return ''
    if stopword_set is None:
        stopword_set = stop_words
    # Keep *all* whitespace (not just ' ') so tabs/newlines still separate
    # words; otherwise "free\tprize" would collapse into "freeprize".
    letters_only = re.sub(r'[^a-z\s]', '', text.lower())
    return ' '.join(w for w in letters_only.split() if w not in stopword_set)
# Normalise every message with clean_text before vectorising.
df['text'] = df['text'].apply(clean_text)
y = df['label']
# Split the cleaned text BEFORE fitting TF-IDF: the vocabulary and IDF weights
# must be learned from training messages only, otherwise test-set statistics
# leak into the features and the reported test accuracy is optimistic.
# stratify=y keeps the spam/ham ratio the same in both splits.
text_train, text_test, y_train, y_test = train_test_split(
    df['text'], y, test_size=0.2, random_state=42, stratify=y
)
vectorizer = TfidfVectorizer()
X_train = vectorizer.fit_transform(text_train)
X_test = vectorizer.transform(text_test)
# Whole corpus in the train-fitted feature space, kept for code that uses X.
X = vectorizer.transform(df['text'])
y.value_counts()  # Show class distribution

In [None]:
#Part B: Weak Learner Baseline (Decision Stump)
# A depth-1 tree (single split) is the weak learner used as the baseline.
stump = DecisionTreeClassifier(max_depth=1).fit(X_train, y_train)
# Predict on both splits so train and test accuracy can be compared.
y_pred_train, y_pred_test = (stump.predict(features) for features in (X_train, X_test))
acc_train = accuracy_score(y_train, y_pred_train)
acc_test = accuracy_score(y_test, y_pred_test)
# Confusion matrix is computed on the held-out test split only.
conf = confusion_matrix(y_test, y_pred_test)
acc_train, acc_test, conf

## Part C: Manual AdaBoost (T = 15)
# TODO: implement AdaBoost by hand with decision stumps for T = 15 rounds, tracking sample weights, weighted error and alpha.
# For each iteration: print the iteration number, the misclassified sample indices, the sample weights and alpha, then update the weights.
# Plot: iteration vs. weighted error, and iteration vs. alpha.
# Report: train/test accuracy, confusion matrix, and an interpretation of the results.

In [None]:
# Part D: Sklearn AdaBoost
from sklearn.ensemble import AdaBoostClassifier
# The weak learner is passed positionally on purpose: its keyword was
# `base_estimator` up to scikit-learn 1.1 and is `estimator` from 1.2 on
# (`base_estimator` was removed in 1.4), so the positional form is the only
# spelling that runs on both old and new releases.
ada = AdaBoostClassifier(
    DecisionTreeClassifier(max_depth=1),
    n_estimators=100,
    learning_rate=0.6,
)
ada.fit(X_train, y_train)
# Evaluate the boosted ensemble on the held-out test split.
y_pred_ada = ada.predict(X_test)
acc_ada = accuracy_score(y_test, y_pred_ada)
conf_ada = confusion_matrix(y_test, y_pred_ada)
acc_ada, conf_ada

## Q2: UCI Heart Disease Dataset — AdaBoost

In [None]:
# Part A: Baseline Model (Decision Stump)
# scikit-learn has no `load_heart_disease` loader (the import raises
# ImportError), so the data is read from a local copy of the UCI file instead.
# NOTE(review): assumes the header-less 14-column "processed.cleveland.data"
# layout, where missing values are written as '?' and `num` is 0 (no disease)
# to 4 — adjust the path/columns if your copy differs.
HEART_COLUMNS = ['age', 'sex', 'cp', 'trestbps', 'chol', 'fbs', 'restecg',
                 'thalach', 'exang', 'oldpeak', 'slope', 'ca', 'thal', 'num']
heart_df = pd.read_csv(
    'processed.cleveland.data',  # Update with actual path
    names=HEART_COLUMNS,
    na_values='?',
).dropna()  # rows with missing values are dropped; the tree cannot be fit on NaN
X = heart_df.drop(columns='num')
# Binary target: 1 if any heart disease is present (num > 0), else 0.
y = (heart_df['num'] > 0).astype(int)
# Preprocess categorical features, scaling if needed
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
stump = DecisionTreeClassifier(max_depth=1)
stump.fit(X_train, y_train)
y_pred = stump.predict(X_test)
accuracy_score(y_test, y_pred), confusion_matrix(y_test, y_pred)

# Part B: Train AdaBoost with different n_estimators and learning_rate, plot accuracy
# Part C: Track sample weights and errors for best model, plot error and weight distribution
# Part D: Visualize feature importance and discuss medically relevant features.

## Q3: WISDM Smartphone & Watch Motion Sensor Dataset — AdaBoost

In [None]:
# Data preparation: load, extract accelerometer features, create binary label, handle missing values, train-test split
# Baseline: Decision stump
# Manual AdaBoost: 20 rounds, print iteration, misclassified indices, weights
# Sklearn AdaBoost: n_estimators=100, learning_rate=1.0, compare results