In [None]:
#Q1

import numpy as np, pandas as pd
import re
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.ensemble import AdaBoostClassifier
from sklearn.utils import resample
from sklearn.feature_extraction import text as sk_text

# Read the SMS spam CSV using latin-1 decoding.
path_to_csv = "spam.csv"
df = pd.read_csv(path_to_csv, encoding='latin-1')

# Reduce the frame to exactly two columns named 'label' and 'text'.
if {'v1', 'v2'}.issubset(df.columns):
    # v1/v2 header: rename first, then keep only the two renamed columns.
    df = df.rename(columns={'v1':'label','v2':'text'})
    df = df[['label','text']]
elif {'label', 'text'}.issubset(df.columns):
    df = df[['label','text']]
else:
    # Unrecognised header: take the first two columns positionally.
    cols = df.columns.tolist()
    df = df[[cols[0], cols[1]]]
    df.columns = ['label','text']

# Encode the target as integers: 'spam' -> 1, 'ham' -> 0.
df['label'] = df['label'].map({'spam':1,'ham':0})

# Stop-word collection consulted by clean_text when filtering tokens.
stopwords = sk_text.ENGLISH_STOP_WORDS
def clean_text(s):
    """Normalise one message for vectorisation.

    The input is coerced to ``str`` and lower-cased, every character that is
    not a lowercase ASCII letter, a digit or whitespace is replaced by a
    space, and tokens found in the module-level ``stopwords`` collection are
    dropped. The surviving tokens are returned joined by single spaces.
    """
    lowered = str(s).lower()
    alnum_only = re.sub(r'[^a-z0-9\s]', ' ', lowered)
    kept = filter(lambda tok: tok not in stopwords, alnum_only.split())
    return " ".join(kept)

# Clean every message once; the cleaned text is what gets vectorised.
df['text_clean'] = df['text'].apply(clean_text)
y = df['label'].values

# Split the *text* before vectorising so that the TF-IDF vocabulary and IDF
# weights are learned from training messages only. Fitting the vectorizer on
# the full corpus (as was done previously) leaks test-set statistics into the
# features and makes the reported test scores optimistic.
text_train, text_test, y_train, y_test = train_test_split(
    df['text_clean'], y, test_size=0.2, random_state=42, stratify=y)

tfv = TfidfVectorizer(max_features=5000)
X_train = tfv.fit_transform(text_train)   # fit on the training split only
X_test = tfv.transform(text_test)         # reuse the training vocabulary / IDF

# Full feature matrix kept under its original name, built with the
# train-fitted vectorizer.
X = tfv.transform(df['text_clean'])

# Class counts per split (index 0 = ham, index 1 = spam).
print("Train class dist:", np.bincount(y_train))
print("Test class dist :", np.bincount(y_test))

# Baseline: a single depth-1 decision tree fitted on the training split.
stump = DecisionTreeClassifier(max_depth=1, random_state=42)
stump.fit(X_train, y_train)

# Predictions on both splits, reused by the metrics below.
y_pred_train = stump.predict(X_train)
y_pred_test = stump.predict(X_test)

print("Stump Train Acc:", accuracy_score(y_train, y_pred_train))
print("Stump Test Acc :", accuracy_score(y_test, y_pred_test))
print("Confusion matrix (test):\n", confusion_matrix(y_test, y_pred_test))
print("Class report (test):\n", classification_report(y_test, y_pred_test))

# ---- Manual AdaBoost with decision stumps ----
T = 15                              # number of boosting rounds
n_train = X_train.shape[0]
w = np.ones(n_train) / n_train      # start from uniform sample weights
models = []                         # fitted stump for each round
alphas = []                         # vote weight of each stump
weighted_errors = []                # weighted training error of each stump

# Labels in {-1, +1} form for the exponential weight update; they do not
# change between rounds, so compute them once.
y_signed = np.where(y_train==1, 1, -1)

for t in range(1, T+1):
    # Fit a stump against the current sample weights.
    stump_t = DecisionTreeClassifier(max_depth=1, random_state=42)
    stump_t.fit(X_train, y_train, sample_weight=w)
    pred = stump_t.predict(X_train)

    # 1 where the stump is wrong, 0 where it is right.
    miss = (pred != y_train).astype(int)

    # Weighted error rate of this stump.
    eps = np.sum(w * miss) / np.sum(w)

    # Keep eps strictly inside (0, 1) so the log below stays finite.
    eps = np.clip(eps, 1e-12, 1-1e-12)
    alpha = 0.5 * np.log((1 - eps) / eps)

    # Up-weight misclassified samples, down-weight correct ones, renormalise.
    pred_signed = np.where(pred==1, 1, -1)
    w = w * np.exp(-alpha * y_signed * pred_signed)
    w = w / np.sum(w)
    models.append(stump_t); alphas.append(alpha); weighted_errors.append(eps)

    mis_idx = np.where(miss==1)[0]
    # The separator is an em dash written as an escape so the log line cannot
    # be corrupted again by a wrong-encoding round trip.
    print(f"Iter {t} \u2014 eps={eps:.5f}, alpha={alpha:.5f}, #mis={len(mis_idx)}")
    print("First 10 misclassified train indices:", mis_idx[:10])
    # Weights shown here are the values *after* this round's update.
    print("Weights of first 10 misclassified examples:", w[mis_idx[:10]])
    print("----")

def ada_predict(models, alphas, X):
    """Predict 0/1 labels with a weighted vote of fitted weak learners.

    Parameters
    ----------
    models : iterable of fitted estimators exposing ``predict(X)`` that
        returns labels where 1 is the positive class.
    alphas : iterable of vote weights, paired with ``models`` by position.
    X : feature matrix; only ``X.shape[0]`` is read here, the rest is
        forwarded to each model's ``predict``.

    Returns
    -------
    numpy.ndarray of int, one entry per row of ``X``: 1 where the weighted
    sum of signed votes is strictly positive, otherwise 0. An empty ensemble
    therefore predicts 0 for every row instead of raising.
    """
    # Start from a zero score per sample so the function is well-defined even
    # when no models are supplied.
    agg = np.zeros(X.shape[0], dtype=float)
    for m, a in zip(models, alphas):
        # Map predictions to {-1, +1}: label 1 -> +1, anything else -> -1.
        agg += a * np.where(m.predict(X) == 1, 1, -1)
    # Ties (score exactly 0) fall to class 0.
    return (agg > 0).astype(int)

# Score the hand-built ensemble on both splits.
y_train_pred = ada_predict(models, alphas, X_train)
y_test_pred = ada_predict(models, alphas, X_test)
print("Manual Ada: Train Acc:", accuracy_score(y_train, y_train_pred))
print("Manual Ada: Test  Acc:", accuracy_score(y_test, y_test_pred))
print("Confusion matrix (test):\n", confusion_matrix(y_test, y_test_pred))

# Two side-by-side panels: weighted error and vote weight for each round.
rounds = range(1, T+1)
fig, (ax_err, ax_alpha) = plt.subplots(1, 2, figsize=(10,4))
ax_err.plot(rounds, weighted_errors, marker='o')
ax_err.set_title("Iteration vs Weighted error")
ax_alpha.plot(rounds, alphas, marker='o')
ax_alpha.set_title("Iteration vs Alpha")
fig.tight_layout()
plt.show()


# Library AdaBoost on the same features for comparison with the manual loop.
# The weak learner is passed positionally: its keyword was renamed from
# `base_estimator` to `estimator` in scikit-learn 1.2 and the old name was
# later removed, so either keyword fails on some versions, while the first
# positional argument is accepted by both.
adb = AdaBoostClassifier(DecisionTreeClassifier(max_depth=1), n_estimators=100, learning_rate=0.6, random_state=42)
adb.fit(X_train, y_train)
y_train_adb = adb.predict(X_train); y_test_adb = adb.predict(X_test)
print("Sklearn AdaBoost Train Acc:", accuracy_score(y_train, y_train_adb))
print("Sklearn AdaBoost Test  Acc:", accuracy_score(y_test, y_test_adb))
print("Confusion matrix (test):\n", confusion_matrix(y_test, y_test_adb))


In [None]:
#Q2

import numpy as np, pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.ensemble import AdaBoostClassifier
from sklearn.datasets import fetch_openml

# Fetch the 'heart' dataset (version 1) from OpenML as pandas objects.
try:
    heart = fetch_openml(name='heart', version=1, as_frame=True)
    X = heart.data
    y = heart.target.astype(int)
    print("Loaded from OpenML:", heart.DESCR[:200])
except Exception as e:
    print("OpenML load failed:", e)
    # Stop here: without this, the rest of the cell would either hit a
    # NameError or silently run on an `X`/`y` left over from an earlier cell.
    raise

# Work on a copy so the fetched frame is not modified.
X = X.copy()
for col in X.columns:
    if X[col].dtype == object:
        # Convert object columns to numeric when every value parses; otherwise
        # leave the column unchanged. This is the explicit form of the
        # deprecated `pd.to_numeric(..., errors='ignore')`.
        try:
            X[col] = pd.to_numeric(X[col])
        except (ValueError, TypeError):
            pass


# A column is treated as categorical when it has at most 6 distinct values and
# an int64 / int32 / object dtype; every other column is treated as numeric.
# NOTE(review): columns with pandas `category` dtype or low-cardinality floats
# do not match this test and land in numeric_cols - confirm that is intended
# for this dataset.
categorical_cols = [c for c in X.columns if X[c].nunique() <= 6 and X[c].dtype in [np.int64, np.int32, np.object_]]
numeric_cols = [c for c in X.columns if c not in categorical_cols]

print("Numeric cols:", numeric_cols)
print("Categorical cols:", categorical_cols)


# Column-wise preprocessing: standardise numeric columns, one-hot encode
# categorical ones (categories unseen at fit time are ignored at transform).
numeric_step = ('num', StandardScaler(), numeric_cols)
categorical_step = ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_cols)
preprocessor = ColumnTransformer(transformers=[numeric_step, categorical_step])

# Stratified 70/30 split.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42, stratify=y
)

# Baseline: preprocessing followed by a single depth-1 decision tree.
stump_pipe = Pipeline([
    ('pre', preprocessor),
    ('clf', DecisionTreeClassifier(max_depth=1, random_state=42)),
])
stump_pipe.fit(X_train, y_train)

y_tr_pred = stump_pipe.predict(X_train)
y_te_pred = stump_pipe.predict(X_test)
print("Stump Train Acc:", accuracy_score(y_train, y_tr_pred))
print("Stump Test  Acc:", accuracy_score(y_test, y_te_pred))
print("Confusion matrix (test):\n", confusion_matrix(y_test, y_te_pred))
print("Classification report (test):\n", classification_report(y_test, y_te_pred))

# Sweep AdaBoost over ensemble size and learning rate, recording test accuracy
# for every combination and drawing one curve per learning rate.
n_estimators_list = [5,10,25,50,100]
learning_rates = [0.1, 0.5, 1.0]
results = []
for lr in learning_rates:
    accs = []
    for n in n_estimators_list:
        # The weak learner is passed positionally: its keyword was renamed
        # from `base_estimator` to `estimator` in scikit-learn 1.2 and the old
        # name was later removed; the positional form works on both.
        adb_pipe = Pipeline([('pre', preprocessor),
                             ('clf', AdaBoostClassifier(DecisionTreeClassifier(max_depth=1),
                                                        n_estimators=n, learning_rate=lr, random_state=42))])
        adb_pipe.fit(X_train, y_train)
        y_pred = adb_pipe.predict(X_test)
        acc = accuracy_score(y_test, y_pred)
        accs.append(acc)
        results.append({'lr': lr, 'n': n, 'acc': acc})
    plt.plot(n_estimators_list, accs, marker='o', label=f"lr={lr}")

plt.xlabel("n_estimators"); plt.ylabel("Test accuracy"); plt.legend(); plt.title("AdaBoost: n_estimators vs accuracy"); plt.grid(True); plt.show()

# Pick the configuration with the highest test accuracy (first one on ties).
# NOTE(review): the choice is made on the test split, so the winning accuracy
# is an optimistic estimate; a validation split or cross-validation would
# avoid that.
res_df = pd.DataFrame(results)
best_row = res_df.loc[res_df['acc'].idxmax()]
print("Best config:", best_row.to_dict())

# Fit the shared preprocessor on the training split and materialise both
# splits as transformed matrices for the manual boosting code below.
preproc = preprocessor
X_train_proc = preproc.fit_transform(X_train)
X_test_proc = preproc.transform(X_test)

def manual_adaboost_track(X, y, T=50):
    """Run T rounds of hand-written AdaBoost with depth-1 trees.

    Parameters
    ----------
    X : feature matrix passed to each tree's ``fit`` / ``predict``.
    y : labels; entries equal to 1 are the positive class, everything else
        is treated as the negative class in the weight update.
    T : number of boosting rounds.

    Returns
    -------
    (models, alphas, errors, w): the fitted trees, their vote weights, the
    clipped weighted error of each round, and the sample weights after the
    last round.
    """
    n_samples = X.shape[0]
    weights = np.ones(n_samples)/n_samples
    learners, votes, round_errors = [], [], []

    # Targets in {-1, +1}; identical in every round.
    targets_pm = np.where(y==1, 1, -1)

    for _ in range(T):
        weak = DecisionTreeClassifier(max_depth=1, random_state=42)
        weak.fit(X, y, sample_weight=weights)
        guesses = weak.predict(X)

        # Weighted error, clipped away from 0 and 1 so the log stays finite.
        wrong = (guesses != y).astype(int)
        err = np.clip(np.sum(weights * wrong), 1e-12, 1-1e-12)
        vote = 0.5 * np.log((1 - err)/err)

        # Exponential re-weighting followed by renormalisation.
        guesses_pm = np.where(guesses==1, 1, -1)
        weights = weights * np.exp(-vote * targets_pm * guesses_pm)
        weights = weights / np.sum(weights)

        learners.append(weak)
        votes.append(vote)
        round_errors.append(err)
    return learners, votes, round_errors, weights

# Re-run the manual booster for as many rounds as the best sweep setting used.
T_best = int(best_row['n'])
models_b, alphas_b, errors_b, final_weights = manual_adaboost_track(
    X_train_proc, y_train.values, T=T_best
)

# Weak-learner error per round.
plt.figure()
plt.plot(range(1,T_best+1), errors_b, marker='o')
plt.xlabel('Iteration')
plt.ylabel('Weak learner error')
plt.title('Weak learner error vs iteration')
plt.grid(True)

# Distribution of the sample weights after the last round.
plt.figure()
plt.hist(final_weights, bins=30)
plt.title('Final sample weight distribution')
plt.show()

# Positions (within the training split) of the ten largest final weights,
# largest first.
top_idx = np.argsort(final_weights)[-10:][::-1]
print("Top weighted training samples (indices):", top_idx)
print("Their labels:", y_train.values[top_idx])

# Refit AdaBoost with the best sweep setting on the preprocessed training
# matrix to read its feature importances.
# The weak learner is passed positionally: its keyword was renamed from
# `base_estimator` to `estimator` in scikit-learn 1.2 and the old name was
# later removed; the positional form works on both.
adb_best = AdaBoostClassifier(DecisionTreeClassifier(max_depth=1), n_estimators=T_best, learning_rate=best_row['lr'], random_state=42)
adb_best.fit(X_train_proc, y_train)
feat_importances = adb_best.feature_importances_

# Rebuild column names in the order the ColumnTransformer emits them:
# numeric columns first, then the one-hot expanded categorical columns.
ohe = preproc.named_transformers_['cat']
ohe_cat_names = []
if categorical_cols:  # with no categorical columns there is nothing to name
    if hasattr(ohe, 'get_feature_names_out'):
        ohe_cat_names = list(ohe.get_feature_names_out(categorical_cols))
    elif hasattr(ohe, 'get_feature_names'):
        # Older scikit-learn releases only expose get_feature_names.
        ohe_cat_names = list(ohe.get_feature_names(categorical_cols))
    else:
        # Last resort: positional placeholder names, one per encoded category.
        # (This branch previously referenced an undefined name and could never
        # succeed.)
        ohe_cat_names = ["cat_{}".format(i) for i in range(sum(len(cats) for cats in ohe.categories_))]
num_names = numeric_cols
feature_names = list(num_names) + ohe_cat_names
# Top 5 features
top5_idx = np.argsort(feat_importances)[-5:][::-1]
print("Top 5 important features and importances:")
for i in top5_idx:
    print(feature_names[i], feat_importances[i])


In [None]:
#Q3

import pandas as pd, numpy as np
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

# Parse the WISDM raw accelerometer file into one row per reading.
# Records have the form `user,activity,timestamp,x,y,z;`. The terminating ';'
# must be removed before the numeric conversion (otherwise `float("0.5;")`
# fails and the reading is silently dropped), and a physical line can hold
# more than one ';'-terminated record, so each line is split on ';' first.
path = "WISDM_ar_v1.1_raw.txt"
rows = []
with open(path, 'r') as f:
    for line in f:
        for record in line.split(';'):
            record = record.strip()
            if not record:
                continue

            # Fields are separated by commas and/or whitespace.
            parts = re.split(r'[,\s]+', record)

            if len(parts) < 6:
                continue
            user, activity, timestamp, x, y, z = parts[:6]
            try:
                rows.append([int(user), activity, int(timestamp), float(x), float(y), float(z)])
            except ValueError:
                # Malformed or incomplete record (e.g. an empty field): skip it.
                continue

df = pd.DataFrame(rows, columns=['user','activity','timestamp','x','y','z'])

# Activity names regarded as vigorous (not referenced by the code below,
# which matches on substrings of the normalised name instead).
vigorous = {'Jogging','Upstairs','Jogging,','Upstairs,'}

# Normalised activity name: surrounding whitespace removed, lower-cased.
df['activity_norm'] = df['activity'].str.strip().str.lower()


def _vigorous_flag(name):
    """Return 1 when the normalised name contains 'jog' or 'up', else 0."""
    if 'jog' in name:
        return 1
    if 'up' in name:
        return 1
    return 0


df['label'] = df['activity_norm'].apply(_vigorous_flag)

# Drop rows with a missing accelerometer component.
df = df.dropna(subset=['x','y','z'])

# Features are the three raw axes; target is the binary vigorous flag.
X = df[['x','y','z']].values
y = df['label'].values

# Stratified 70/30 split.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42, stratify=y
)

# Baseline: a single depth-1 decision tree on the raw x/y/z readings.
stump = DecisionTreeClassifier(max_depth=1, random_state=42)
stump.fit(X_train, y_train)

stump_train_pred = stump.predict(X_train)
stump_test_pred = stump.predict(X_test)
print("Stump train acc", accuracy_score(y_train, stump_train_pred))
print("Stump test  acc", accuracy_score(y_test, stump_test_pred))
print("Confusion (test):\n", confusion_matrix(y_test, stump_test_pred))

def ada_manual(X_tr, y_tr, X_te, T=20):
    """Train a hand-written AdaBoost ensemble of depth-1 trees.

    Parameters
    ----------
    X_tr, y_tr : training features and labels; labels equal to 1 are the
        positive class, everything else is treated as negative in the
        weight update.
    X_te : accepted but not used inside this function.
    T : number of boosting rounds.

    Returns
    -------
    (models, alphas, predict_combo): the fitted trees, their vote weights,
    and a function mapping a feature matrix to 0/1 predictions using the
    weighted vote. One progress line is printed per round.
    """
    n_samples = X_tr.shape[0]
    weights = np.ones(n_samples)/n_samples
    learners, votes = [], []

    # Targets in {-1, +1}; identical in every round.
    targets_pm = np.where(y_tr==1, 1, -1)

    for round_no in range(1, T+1):
        weak = DecisionTreeClassifier(max_depth=1, random_state=42)
        weak.fit(X_tr, y_tr, sample_weight=weights)
        guesses = weak.predict(X_tr)

        # Weighted error, clipped away from 0 and 1 so the log stays finite.
        wrong = (guesses != y_tr).astype(int)
        err = np.clip(np.sum(weights * wrong), 1e-12, 1-1e-12)
        vote = 0.5*np.log((1-err)/err)

        # Exponential re-weighting followed by renormalisation.
        guesses_pm = np.where(guesses==1, 1, -1)
        weights = weights * np.exp(-vote*targets_pm*guesses_pm)
        weights = weights/np.sum(weights)

        learners.append(weak)
        votes.append(vote)

        print(f"Iter {round_no}: eps={err:.4f}, alpha={vote:.4f}, #mis={(wrong==1).sum()}")

    def predict_combo(X):
        """Weighted vote of all trained trees; returns 0/1 per row of X."""
        score = np.zeros(X.shape[0])
        for learner, weight in zip(learners, votes):
            # 2*p - 1 maps 0/1 predictions onto -1/+1 votes.
            score = score + weight * (2*learner.predict(X)-1)
        return (np.sign(score)>0).astype(int)

    return learners, votes, predict_combo

# Train the manual ensemble for 20 rounds and score it on both splits.
models_a, alphas_a, predict_fun = ada_manual(X_train, y_train, X_test, T=20)

y_train_pred = predict_fun(X_train)
y_test_pred = predict_fun(X_test)
print("Manual Ada: train acc", accuracy_score(y_train, y_train_pred))
print("Manual Ada: test  acc", accuracy_score(y_test, y_test_pred))
print("Confusion test:\n", confusion_matrix(y_test, y_test_pred))

# Library AdaBoost on the same split for comparison with the manual ensemble.
# The weak learner is passed positionally: its keyword was renamed from
# `base_estimator` to `estimator` in scikit-learn 1.2 and the old name was
# later removed; the positional form works on both.
adb = AdaBoostClassifier(DecisionTreeClassifier(max_depth=1), n_estimators=100, learning_rate=1.0, random_state=42)
adb.fit(X_train, y_train)
print("Sklearn Ada train acc", accuracy_score(y_train, adb.predict(X_train)))
print("Sklearn Ada test acc", accuracy_score(y_test, adb.predict(X_test)))
print("Confusion test:\n", confusion_matrix(y_test, adb.predict(X_test)))
