In [125]:
%store -r new_def

In [126]:
X=new_def.drop(['smoking'],axis=1)
y=new_def['smoking']


In [127]:
X.head()


Unnamed: 0,waist_winsorized,Gtp_winsorized,triglyceride_winsorized,systolic_winsorized,hemoglobin_winsorized,height_winsorized
0,1.655196,1.27115,1.570003,1.734031,1.279615,1.805236
1,1.70268,1.70268,1.805236,1.777095,1.324422,1.805236
2,1.70268,1.474144,1.808267,1.768118,1.331049,1.819777
3,1.668474,1.27115,1.607232,1.766789,1.332673,1.819777
4,1.700826,1.516302,1.607232,1.77074,1.321028,1.815096


In [128]:
X.shape

(101923, 6)

In [129]:
y.head()

0    0
1    0
2    1
3    1
4    1
Name: smoking, dtype: int64

In [130]:
import numpy as np
from sklearn.tree import DecisionTreeClassifier

class BaggingClassifier:
    def __init__(self, n_estimators=100, max_depth=None):
        self.n_estimators = n_estimators
        self.max_depth = max_depth  
        self.models = [DecisionTreeClassifier(max_depth=self.max_depth) for _ in range(n_estimators)]
        
    def fit(self, X, y):
        for model in self.models:
            indices = np.random.choice(len(X), len(X), replace=True)
            X_subset, y_subset = X.iloc[indices], y.iloc[indices] 
            model.fit(X_subset, y_subset)
    
    def predict(self, X, threshold=0.5):
        predictions = np.zeros((len(X), self.n_estimators))
        for i, model in enumerate(self.models):
            predictions[:, i] = model.predict(X)

        # Calculate the average prediction across all models
        avg_predictions = np.mean(predictions, axis=1)
    
        # Apply threshold to convert to binary predictions
        binary_predictions = (avg_predictions >= threshold).astype(int)
    
        return binary_predictions


In [131]:
class AdaBoostClassifier:
    def __init__(self, n_estimators=100, max_depth=1):
        self.n_estimators = n_estimators
        self.models = []
        self.alphas = []
        self.max_depth = max_depth  
        
    def fit(self, X, y):
        weights = np.ones(len(X)) / len(X)
        
        for _ in range(self.n_estimators):
            model = DecisionTreeClassifier(max_depth=self.max_depth)  
            model.fit(X, y, sample_weight=weights)
            predictions = model.predict(X)
            
            error = np.sum(weights * (predictions != y))
            alpha = 0.5 * np.log((1 - error) / max(error, 1e-10))
            weights *= np.exp(-alpha * y * predictions)
            weights /= np.sum(weights)
            
            self.models.append(model)
            self.alphas.append(alpha)
    
    def predict(self, X):
        predictions = np.zeros(len(X))
        for model, alpha in zip(self.models, self.alphas):
            predictions += alpha * model.predict(X)
        return np.sign(predictions)


In [132]:
class RandomForestClassifier:
    def __init__(self, n_estimators=100, max_features=None, max_depth=None):
        self.n_estimators = n_estimators
        self.max_features = max_features
        self.max_depth = max_depth  
        self.models = [DecisionTreeClassifier(max_features=max_features, max_depth=max_depth) for _ in range(n_estimators)]
        
    def fit(self, X, y):
        for model in self.models:
            indices = np.random.choice(len(X), len(X), replace=True)
            X_subset, y_subset = X.iloc[indices], y.iloc[indices]
            model.fit(X_subset, y_subset)
    
    def predict(self, X, threshold=0.5):
        predictions = np.zeros((len(X), self.n_estimators))
        for i, model in enumerate(self.models):
             predictions[:, i] = model.predict(X)
    
        # Calculate the average prediction across all models
        avg_predictions = np.mean(predictions, axis=1)
    
        # Apply threshold to convert to binary predictions
        binary_predictions = (avg_predictions >= threshold).astype(int)
    
        return binary_predictions


In [133]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Instantiate and fit each model
bagging_model = BaggingClassifier(n_estimators=100, max_depth=1)
bagging_model.fit(X_train, y_train)

boosting_model = AdaBoostClassifier(n_estimators=100, max_depth=1)
boosting_model.fit(X_train, y_train)

random_forest_model = RandomForestClassifier(n_estimators=100, max_depth=1)
random_forest_model.fit(X_train, y_train)

# Make predictions
bagging_predictions = bagging_model.predict(X_test)
boosting_predictions = boosting_model.predict(X_test)
random_forest_predictions = random_forest_model.predict(X_test)

# Evaluate the models
bagging_accuracy = accuracy_score(y_test, bagging_predictions)
boosting_accuracy = accuracy_score(y_test, boosting_predictions)
random_forest_accuracy = accuracy_score(y_test, random_forest_predictions)

print(f"Bagging Accuracy: {bagging_accuracy}")
print(f"Boosting Accuracy: {boosting_accuracy}")
print(f"Random Forest Accuracy: {random_forest_accuracy}")


Bagging Accuracy: 0.7038999264164827
Boosting Accuracy: 0.6875153298994359
Random Forest Accuracy: 0.7038999264164827
