In [5]:
import pandas as pd
df = pd.read_csv('tahkeer_data_cleaned.csv', sep=',')

In [6]:
import numpy as np
from sklearn.tree import DecisionTreeClassifier

class BaggingClassifier:
    def __init__(self, n_estimators=100, max_depth=None):
        self.n_estimators = n_estimators
        self.max_depth = max_depth  
        self.models = [DecisionTreeClassifier(max_depth=self.max_depth) for _ in range(n_estimators)] 
        
        
    def fit(self, x, y):
        for model in self.models:
            indices = np.random.choice(len(x), len(x), replace=True)
            x_subset, y_subset = x.iloc[indices], y.iloc[indices] 
            model.fit(x_subset, y_subset)
            

    def predict(self, x, threshold=0.5):
        pred = np.zeros((len(x), self.n_estimators))
        for i, model in enumerate(self.models):
            pred[:, i] = model.predict(x)
        avg_predictions = np.mean(pred, axis=1)
        binary_predictions = (avg_predictions >= threshold).astype(int)
        return binary_predictions

In [7]:
columns = df.columns.tolist()
columns.remove("smoking")
features_x = df[columns]
class_y = df["smoking"]

In [8]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

xtrain, xtest, ytrain, ytest = train_test_split(features_x, class_y, test_size=0.30, shuffle=False, train_size=0.70)

In [9]:
bagging_model = BaggingClassifier(n_estimators=100, max_depth=60)
bagging_model.fit(xtrain, ytrain)

bagging_predictions = bagging_model.predict(xtest)
bagging_accuracy = accuracy_score(ytest, bagging_predictions)
print(f"Bagging Accuracy: {bagging_accuracy}")

Bagging Accuracy: 0.7319647214967676


In [16]:
class AdaBoost:
    def __init__(self, n_estimators=50):
        self.n_estimators = n_estimators
        self.alphas = []
        self.models = []

    def fit(self, x, y):
        x = np.array(x)
        y = np.array(y)
        n_samples, n_features = x.shape
        weights = np.ones(n_samples) / n_samples

        for _ in range(self.n_estimators):
            model = DecisionStump()
            error = float('inf')
            for feature in range(n_features):
                threshold_values = np.unique(x[:, feature])
                for threshold in threshold_values:
                    pred = model.predict(x, feature, threshold)
                    incorrect = pred != y
                    weighted_error = np.sum(weights[incorrect])
                    if weighted_error < error:
                        error = weighted_error
                        model.feature_index = feature
                        model.threshold = threshold
                        model.prediction = pred.copy()

            if error >= 0.5:
                break

            alpha = 0.5 * np.log((1.0 - error) / max(error, 1e-10))
            self.alphas.append(alpha)
            self.models.append(model)

            # Update weights
            exponent = -alpha * y * model.predict(x, model.feature_index, model.threshold)
            weights *= np.exp(exponent)
            weights /= np.sum(weights)

    def predict(self, x):
        x = np.array(x)
        pred = np.zeros(len(x))
        for alpha, model in zip(self.alphas, self.models):
            pred += alpha * model.predict(x, model.feature_index, model.threshold)
        return np.sign(pred)


class DecisionStump:
    def __init__(self):
        self.feature_index = None
        self.threshold = None
        self.prediction = None

    def predict(self, x, feature=None, threshold=None):
        if feature is not None and threshold is not None:
            return np.where(x[:, feature] < threshold, -1, 1)
        else:
            return self.prediction

In [17]:
adaboost = AdaBoost(n_estimators=300)
adaboost.fit(xtrain, ytrain)

predictions = adaboost.predict(xtest)

accuracy = accuracy_score(ytest, predictions)
print(f'Accuracy: {accuracy}')

TypeError: predict() takes from 2 to 3 positional arguments but 4 were given

In [12]:
class RandomForestClassifier:
    def __init__(self, n_estimators=100, max_depth=None, min_samples_split=2, min_samples_leaf=1):
        self.n_estimators = n_estimators
        self.max_depth = max_depth
        self.min_samples_split = min_samples_split
        self.min_samples_leaf = min_samples_leaf
        self.models = []

    def fit(self, x, y):
        x = np.array(x)
        y = np.array(y)
        n_samples, n_features = x.shape
        for _ in range(self.n_estimators):
            # Randomly select a subset of features
            selected_features = np.random.choice(n_features, size=int(np.sqrt(n_features)), replace=False)
            x_subset = x[:, selected_features]

            # Create a decision tree with random features
            tree = DecisionTreeClassifier(
                max_depth=self.max_depth,
                min_samples_split=self.min_samples_split,
                min_samples_leaf=self.min_samples_leaf
            )
            tree.fit(x_subset, y)
            self.models.append((tree, selected_features))

    def predict(self, x):
        x = np.array(x)
        predictions = np.zeros((x.shape[0], self.n_estimators))
        for i, (tree, selected_features) in enumerate(self.models):
            x_subset = x[:, selected_features]
            predictions[:, i] = tree.predict(x_subset)

        # Use majority voting for the final prediction
        final_predictions = np.apply_along_axis(lambda x: np.bincount(x.astype(int)).argmax(), axis=1, arr=predictions)
        return final_predictions



In [13]:
random_forest = RandomForestClassifier(n_estimators=50, max_depth=None, min_samples_split=2, min_samples_leaf=1)
random_forest.fit(xtrain, ytrain)

# Make predictions on the test set
predictions = random_forest.predict(xtest)

# Evaluate accuracy
accuracy = accuracy_score(ytest, predictions)
print(f'Accuracy: {accuracy}')

Accuracy: 0.696022605642848
