Question 1

part a

In [None]:
import pandas as pd
import numpy as np
import string
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, confusion_matrix

# Load the SMS corpus, keeping only the class column (v1) and the message
# column (v2), and give them descriptive names.
df = pd.read_csv("spam.csv", encoding="latin-1")[["v1", "v2"]].rename(
    columns={"v1": "label", "v2": "text"}
)

# Encode the two classes numerically: ham -> 0, spam -> 1.
label_codes = {"ham": 0, "spam": 1}
df["label"] = df["label"].map(label_codes)

import nltk
from nltk.corpus import stopwords

# Make sure the stop-word corpus is available locally before reading it.
nltk.download("stopwords")

# A set gives O(1) membership tests when filtering tokens later on.
english_stopwords = stopwords.words("english")
stop_words = set(english_stopwords)

def preprocess(text):
    """Return *text* lower-cased, without ASCII punctuation or stop words.

    The message is lower-cased, every character listed in
    ``string.punctuation`` is deleted, the result is split on whitespace,
    tokens found in the module-level ``stop_words`` collection are dropped,
    and the survivors are re-joined with single spaces.
    """
    # NOTE(review): punctuation is removed *before* the stop-word filter, so a
    # stop word that contains an apostrophe can no longer match its entry in
    # `stop_words` - confirm this ordering is intended.
    strip_punctuation = str.maketrans("", "", string.punctuation)
    cleaned = text.lower().translate(strip_punctuation)
    kept_words = [word for word in cleaned.split() if word not in stop_words]
    return " ".join(kept_words)

# Clean every message once; the vectoriser below works on the cleaned text.
df["clean_text"] = df["text"].apply(preprocess)
y = df["label"].values

# Split the documents BEFORE fitting TF-IDF. Fitting the vectoriser on the
# full corpus would let test messages influence the vocabulary and the IDF
# weights (train/test leakage) and make the test scores optimistic.
text_train, text_test, y_train, y_test = train_test_split(
    df["clean_text"], y, test_size=0.20, random_state=42
)

tfidf = TfidfVectorizer()
X_train = tfidf.fit_transform(text_train)  # vocabulary + IDF from train only
X_test = tfidf.transform(text_test)        # reuse the train vocabulary

# Kept so the name `X` still exists for any later cell: the whole corpus
# expressed in the train-fitted feature space.
X = tfidf.transform(df["clean_text"])

# Class balance of the full data set (0 = ham, 1 = spam).
print(df["label"].value_counts())


part b

In [None]:
# Baseline weak learner: a single depth-1 decision tree fitted on the
# training split with uniform sample weights.
stump = DecisionTreeClassifier(max_depth=1).fit(X_train, y_train)

# Predictions on both splits, reused by the metrics below.
y_pred_train = stump.predict(X_train)
y_pred_test = stump.predict(X_test)

train_accuracy = accuracy_score(y_train, y_pred_train)
test_accuracy = accuracy_score(y_test, y_pred_test)
test_confusion = confusion_matrix(y_test, y_pred_test)

print("Train Accuracy:", train_accuracy)
print("Test Accuracy:", test_accuracy)
print("Confusion Matrix:\n", test_confusion)


part c

In [None]:
from sklearn.tree import DecisionTreeClassifier
import numpy as np

# Manual discrete AdaBoost with decision stumps on the 0/1-labelled data.
T = 15                      # number of boosting rounds
n = X_train.shape[0]
weights = np.ones(n) / n    # start from a uniform sample distribution

alphas = []                 # vote weight of each round's stump
errors = []                 # weighted training error of each round's stump
stumps = []                 # the fitted stumps themselves (needed to predict)
misclassified_history = []  # per-round indices of misclassified train rows

for t in range(T):

    stump = DecisionTreeClassifier(max_depth=1)
    stump.fit(X_train, y_train, sample_weight=weights)
    # Keep the learner that was actually trained in this round; the final
    # ensemble must vote with these stumps, not with freshly fitted ones.
    stumps.append(stump)
    y_pred = stump.predict(X_train)
    missed = y_pred != y_train

    err = np.sum(weights * missed) / np.sum(weights)
    errors.append(err)

    # Clip on both sides so neither err == 0 nor err == 1 produces an
    # infinite alpha.
    err_safe = np.clip(err, 1e-10, 1 - 1e-10)
    alpha = 0.5 * np.log((1 - err_safe) / err_safe)
    alphas.append(alpha)

    misclassified_idx = np.where(missed)[0]
    misclassified_history.append(misclassified_idx)

    print(f"\nIteration {t+1}")
    print("Misclassified samples:", misclassified_idx[:10], "...")
    print("Weights of misclassified:", weights[misclassified_idx][:10])
    print("Alpha:", alpha)

    # With alpha = 0.5 * ln((1 - err) / err) the matching update is
    # w * exp(+alpha) for misses and w * exp(-alpha) for hits; boosting only
    # the misses would under-correct the distribution.
    weights = weights * np.exp(np.where(missed, alpha, -alpha))
    weights = weights / np.sum(weights)


def ada_predict(X):
    """Predict 0/1 labels for *X* with the boosted stump ensemble.

    Each stump stored in the module-level ``stumps`` list votes -1 (its
    prediction is 0) or +1 (its prediction is 1), scaled by the matching
    entry of ``alphas``. Rows whose weighted vote total is >= 0 are
    labelled 1, all others 0.

    Parameters
    ----------
    X : feature matrix accepted by ``DecisionTreeClassifier.predict``.

    Returns
    -------
    numpy.ndarray of 0/1 labels, one per row of *X*.
    """
    final = np.zeros(X.shape[0])
    for alpha, stump in zip(alphas, stumps):
        # Map {0, 1} predictions to {-1, +1} so a "ham" vote counts against
        # the "spam" total instead of contributing nothing.
        votes = np.where(stump.predict(X) == 1, 1, -1)
        final += alpha * votes
    return np.where(final >= 0, 1, 0)


y_train_pred = ada_predict(X_train)
y_test_pred = ada_predict(X_test)

print("\nFinal Train Accuracy:", accuracy_score(y_train, y_train_pred))
print("Final Test Accuracy:", accuracy_score(y_test, y_test_pred))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_test_pred))


part d

In [None]:
from sklearn.ensemble import AdaBoostClassifier

# Library AdaBoost for comparison with the manual version: 100 depth-1
# stumps, each contribution shrunk by a learning rate of 0.6.
weak_learner = DecisionTreeClassifier(max_depth=1)
ada = AdaBoostClassifier(
    estimator=weak_learner,
    n_estimators=100,
    learning_rate=0.6,
)
ada.fit(X_train, y_train)

# Test-set predictions are reused for accuracy and the confusion matrix.
y_pred = ada.predict(X_test)
train_predictions = ada.predict(X_train)

print("Train Accuracy:", accuracy_score(y_train, train_predictions))
print("Test Accuracy:", accuracy_score(y_test, y_pred))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))


Question 2

part a

In [None]:
from sklearn.datasets import fetch_openml
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

# NOTE(review): `sklearn.datasets` provides no `load_heart_disease` loader,
# so the original import failed with ImportError. The Statlog heart-disease
# table is fetched from OpenML instead; the dataset name/version are an
# assumption - confirm this is the data set the exercise intends.
# `as_frame=False` returns NumPy arrays plus `feature_names`, mirroring the
# Bunch layout of the built-in `load_*` helpers.
data = fetch_openml(name="heart-statlog", version=1, as_frame=False)

X = data.data
y = data.target

# Split first, then fit the scaler on the training rows only so that test
# statistics never leak into the preprocessing step.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Kept so the name `X_scaled` still exists for any later cell: the full data
# set expressed with the train-fitted scaling.
X_scaled = scaler.transform(X)

# Baseline weak learner: one depth-1 decision tree.
stump = DecisionTreeClassifier(max_depth=1)
stump.fit(X_train, y_train)

# Predict once and reuse the result for both reports.
stump_test_pred = stump.predict(X_test)

print("Train Acc:", stump.score(X_train, y_train))
print("Test Acc:", stump.score(X_test, y_test))
print("Confusion Matrix:\n", confusion_matrix(y_test, stump_test_pred))
print(classification_report(y_test, stump_test_pred))


part b

In [None]:
from sklearn.ensemble import AdaBoostClassifier

# Grid of ensemble sizes and learning rates to compare.
estimators = [5, 10, 25, 50, 100]
rates = [0.1, 0.5, 1.0]

# One list of test accuracies per learning rate, ordered like `estimators`.
results = {rate: [] for rate in rates}

for lr, scores in results.items():
    for n in estimators:
        # `model` is rebound on every pass, so after the loops it refers to
        # the last configuration that was fitted.
        model = AdaBoostClassifier(
            estimator=DecisionTreeClassifier(max_depth=1),
            n_estimators=n,
            learning_rate=lr,
        ).fit(X_train, y_train)
        acc = model.score(X_test, y_test)
        scores.append(acc)
        print(lr, n, acc)


part c

In [None]:
# Per-estimator diagnostics of whichever AdaBoost `model` was fitted last.
errors = model.estimator_errors_
weights = model.estimator_weights_

# Line plot of each weak learner's error, in fitting order.
error_axes = plt.gca()
error_axes.plot(errors)
error_axes.set_title("Weak Learner Error vs Iteration")

# Separate figure: histogram of the estimator weights.
weight_axes = plt.figure().gca()
weight_axes.hist(weights)
weight_axes.set_title("Final Weight Distribution")
plt.show()


part d

In [None]:
# Rank the features of the fitted `model` from most to least important.
importances = model.feature_importances_
idx = np.flip(np.argsort(importances))

# Report the five highest-ranked features with their importance scores.
top5 = idx[:5]
for feature_pos, score in zip(top5, importances[top5]):
    print(data.feature_names[feature_pos], score)


Question 3

part a

In [None]:
import pandas as pd

# NOTE(review): the raw WISDM v1.1 export is understood to terminate every
# record with ';' and to contain a few malformed lines - confirm against the
# actual file. Bad lines are skipped with a warning instead of aborting the
# whole load.
df = pd.read_csv(
    "WISDM_ar_v1.1_raw.txt",
    header=None,
    sep=",",
    on_bad_lines="warn",
)
df.columns = ["user", "activity", "timestamp", "x", "y", "z"]

# The last field may carry the ';' record terminator, which would make a
# plain float conversion fail; strip it, then coerce all three axes so any
# unparsable value becomes NaN and is removed by the dropna() below.
axes = ["x", "y", "z"]
df["z"] = df["z"].astype(str).str.rstrip(";")
df[axes] = df[axes].apply(pd.to_numeric, errors="coerce")
df = df.dropna()

# Binary target: 1 for the jogging / upstairs activities, 0 for the rest
# (case-insensitive match on the activity name).
positive_activities = ["jogging", "upstairs"]
df["label"] = df["activity"].str.lower().isin(positive_activities).astype(int)

X = df[axes].astype(float)
y = df["label"].values

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42
)


part b

In [None]:
# Baseline weak learner on the accelerometer features: one depth-1 tree.
stump = DecisionTreeClassifier(max_depth=1).fit(X_train, y_train)

train_score = stump.score(X_train, y_train)
test_score = stump.score(X_test, y_test)
test_predictions = stump.predict(X_test)

print("Train:", train_score)
print("Test:", test_score)
print("Confusion Matrix:\n", confusion_matrix(y_test, test_predictions))


part c

In [None]:
import numpy as np
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, confusion_matrix
import matplotlib.pyplot as plt
# Manual discrete AdaBoost with decision stumps on the 0/1 activity labels.
T = 20                      # number of boosting rounds

n = X_train.shape[0]
weights = np.ones(n) / n    # start from a uniform sample distribution

alphas = []                 # vote weight of each round's stump
errors = []                 # weighted training error of each round's stump
stumps = []                 # fitted stumps, in round order

print("=== MANUAL ADABOOST (T = 20) ===")

for t in range(T):

    stump = DecisionTreeClassifier(max_depth=1)
    stump.fit(X_train, y_train, sample_weight=weights)
    stumps.append(stump)
    y_pred = stump.predict(X_train)
    missed = y_pred != y_train

    # `weights` sums to 1 (normalised at the end of every round), so this sum
    # is already the weighted error rate.
    err = np.sum(weights * missed)
    errors.append(err)

    # Clip on both sides so neither err == 0 nor err == 1 produces an
    # infinite alpha.
    err_safe = np.clip(err, 1e-10, 1 - 1e-10)
    alpha = 0.5 * np.log((1 - err_safe) / err_safe)
    alphas.append(alpha)
    misclassified = np.where(missed)[0]

    print(f"\nIteration {t+1}")
    print("Misclassified indices (first 15):", misclassified[:15])
    print("Weights of misclassified (first 15):", weights[misclassified][:15])
    print("Alpha:", alpha)

    # With alpha = 0.5 * ln((1 - err) / err) the matching update is
    # w * exp(+alpha) for misses and w * exp(-alpha) for hits; boosting only
    # the misses would under-correct the distribution.
    weights = weights * np.exp(np.where(missed, alpha, -alpha))
    weights = weights / np.sum(weights)

def ada_predict(X, ensemble_alphas=None, ensemble_stumps=None):
    """Predict 0/1 labels for *X* by an alpha-weighted vote of stumps.

    Each stump votes -1 (its prediction is not 1) or +1 (its prediction is
    1), scaled by its alpha. Rows whose vote total is >= 0 are labelled 1,
    all others 0.

    Parameters
    ----------
    X : array-like with a ``shape`` attribute, accepted by each stump's
        ``predict`` method.
    ensemble_alphas : optional sequence of vote weights. When omitted, the
        module-level ``alphas`` list is used.
    ensemble_stumps : optional sequence of fitted learners exposing
        ``predict(X)``. When omitted, the module-level ``stumps`` list is
        used.

    Returns
    -------
    numpy.ndarray of 0/1 labels, one per row of *X*.
    """
    # Passing the ensemble explicitly avoids depending on whichever
    # `alphas` / `stumps` globals happen to be defined when this is called.
    if ensemble_alphas is None:
        ensemble_alphas = alphas
    if ensemble_stumps is None:
        ensemble_stumps = stumps

    final = np.zeros(X.shape[0])
    for alpha, stump in zip(ensemble_alphas, ensemble_stumps):
        # Map predictions to {-1, +1} so a 0 vote counts against class 1.
        votes = np.where(stump.predict(X) == 1, 1, -1)
        final += alpha * votes
    return np.where(final >= 0, 1, 0)
# Score the manual ensemble on both splits.
train_pred, test_pred = ada_predict(X_train), ada_predict(X_test)

print("\n=== FINAL RESULTS ===")
print("Train Accuracy:", accuracy_score(y_train, train_pred))
print("Test Accuracy:", accuracy_score(y_test, test_pred))
print("Confusion Matrix:\n", confusion_matrix(y_test, test_pred))

# One figure per per-round series: weighted error first, then alpha.
round_series = (
    (errors, "Boosting Round vs Weighted Error", "Weighted Error"),
    (alphas, "Boosting Round vs Alpha", "Alpha"),
)
for series, plot_title, y_label in round_series:
    plt.figure(figsize=(6, 4))
    plt.plot(series, marker="o")
    plt.title(plot_title)
    plt.xlabel("Iteration")
    plt.ylabel(y_label)
    plt.grid(True)
    plt.show()


part d

In [None]:
# Library AdaBoost on the same split: 100 depth-1 stumps, learning rate 1.0.
weak_learner = DecisionTreeClassifier(max_depth=1)
ada = AdaBoostClassifier(
    estimator=weak_learner,
    n_estimators=100,
    learning_rate=1.0,
).fit(X_train, y_train)

# Test-set predictions feed the confusion matrix printed last.
y_pred = ada.predict(X_test)

train_acc = ada.score(X_train, y_train)
test_acc = ada.score(X_test, y_test)

print("Train Acc:", train_acc)
print("Test Acc:", test_acc)
print(confusion_matrix(y_test, y_pred))
