In [1]:
import pandas as pd
df = pd.read_csv('tahkeer_data_cleaned.csv', sep=',')

[[0.56516688 0.92705525 0.28571429 ... 0.61111111 0.56427143 1.        ]
 [0.63358586 0.49312747 0.35714286 ... 0.73333333 0.65321251 0.        ]
 [0.71162294 0.81947715 0.5        ... 0.42222222 0.65321251 1.        ]
 ...
 [0.38907563 0.44179874 0.14285714 ... 0.37777778 0.39794001 0.        ]
 [0.63358586 0.74632457 0.5        ... 0.45555556 0.56427143 1.        ]
 [0.46470946 0.61042446 0.07142857 ... 0.5        0.54406804 0.        ]]


In [2]:
import numpy as np
from sklearn.tree import DecisionTreeClassifier

class BaggingClassifier:
    def __init__(self, n_estimators=100, max_depth=None):
        self.n_estimators = n_estimators
        self.max_depth = max_depth  
        self.models = [DecisionTreeClassifier(max_depth=self.max_depth) for _ in range(n_estimators)]  
    def fit(self, X, y):
        for model in self.models:
            indices = np.random.choice(len(X), len(X), replace=True)
            X_subset, y_subset = X.iloc[indices], y.iloc[indices] 
            model.fit(X_subset, y_subset)
    def predict(self, X, threshold=0.5):
        predictions = np.zeros((len(X), self.n_estimators))
        for i, model in enumerate(self.models):
            predictions[:, i] = model.predict(X)
        avg_predictions = np.mean(predictions, axis=1)
        binary_predictions = (avg_predictions >= threshold).astype(int)
        return binary_predictions

In [3]:
columns = df.columns.tolist()[1:]
print(columns)
columns.remove("smoking")
print(columns)
x=df[columns]
y=df["smoking"]

['triglyceride', 'weight(kg)', 'serum creatinine', 'waist(cm)', 'LDL', 'Cholesterol', 'HDL', 'fasting blood sugar', 'systolic', 'AST', 'smoking']
['triglyceride', 'weight(kg)', 'serum creatinine', 'waist(cm)', 'LDL', 'Cholesterol', 'HDL', 'fasting blood sugar', 'systolic', 'AST']


In [4]:
from sklearn.model_selection import train_test_split

xtrain,xtest,ytrain,ytest=train_test_split(x,y,test_size=0.30,shuffle=False,train_size=0.70)
bagging_model = BaggingClassifier(n_estimators=100, max_depth=60)
bagging_model.fit(xtrain, ytrain)

In [5]:
from sklearn.metrics import accuracy_score

bagging_predictions = bagging_model.predict(xtest)
bagging_accuracy = accuracy_score(ytest, bagging_predictions)
print(f"Bagging Accuracy: {bagging_accuracy}")

Bagging Accuracy: 0.712570107462431


In [None]:
class AdaBoost:
    def __init__(self, n_estimators=50):
        self.n_estimators = n_estimators
        self.alphas = []
        self.models = []

    def fit(self, X, y):
        X = np.array(X)
        y = np.array(y)
        n_samples, n_features = X.shape
        weights = np.ones(n_samples) / n_samples

        for _ in range(self.n_estimators):
            model = DecisionStump()
            error = 0.0
            for feature in range(n_features):
                threshold_values = np.unique(X[:, feature])
                for threshold in threshold_values:
                    pred = model.predict(X, feature, threshold)
                    incorrect = pred != y
                    weighted_error = np.sum(weights[incorrect])
                    if weighted_error < error or error == 0.0:
                        error = weighted_error
                        model.feature_index = feature
                        model.threshold = threshold
                        model.prediction = pred.copy()

            alpha = 0.5 * np.log((1.0 - error) / max(error, 1e-10))
            weights *= np.exp(-alpha * y * model.predict(X))

            normalization_factor = np.sum(weights)
            weights /= normalization_factor

            self.alphas.append(alpha)
            self.models.append(model)

    def predict(self, X):
        X = np.array(X)
        predictions = np.array([model.predict(X) for model in self.models])
        return np.sign(np.dot(self.alphas, predictions))


class DecisionStump:
    def __init__(self):
        self.feature_index = None
        self.threshold = None
        self.prediction = None

    def predict(self, X, feature=None, threshold=None):
        if feature is not None and threshold is not None:
            return np.where(X[:, feature] < threshold, -1, 1)
        else:
            return self.prediction

In [7]:
adaboost = AdaBoost(n_estimators=50)
adaboost.fit(xtrain, ytrain)

# Make predictions on the test set
predictions = adaboost.predict(xtest)

print(predictions)

# Evaluate accuracy
accuracy = accuracy_score(ytest, predictions)
print(f'Accuracy: {accuracy}')

[-1. -1. -1. ... -1. -1. -1.]


ValueError: Found input variables with inconsistent numbers of samples: [46714, 108999]

In [None]:
class RandomForestClassifier:
    def __init__(self, n_estimators=100, max_depth=None, min_samples_split=2, min_samples_leaf=1):
        self.n_estimators = n_estimators
        self.max_depth = max_depth
        self.min_samples_split = min_samples_split
        self.min_samples_leaf = min_samples_leaf
        self.models = []

    def fit(self, X, y):
        X = np.array(X)
        y = np.array(y)
        n_samples, n_features = X.shape
        for _ in range(self.n_estimators):
            # Randomly select a subset of features
            selected_features = np.random.choice(n_features, size=int(np.sqrt(n_features)), replace=False)
            X_subset = X[:, selected_features]

            # Create a decision tree with random features
            tree = DecisionTreeClassifier(
                max_depth=self.max_depth,
                min_samples_split=self.min_samples_split,
                min_samples_leaf=self.min_samples_leaf
            )
            tree.fit(X_subset, y)
            self.models.append((tree, selected_features))

    def predict(self, X):
        X = np.array(X)
        predictions = np.zeros((X.shape[0], self.n_estimators))
        for i, (tree, selected_features) in enumerate(self.models):
            X_subset = X[:, selected_features]
            predictions[:, i] = tree.predict(X_subset)

        # Use majority voting for the final prediction
        final_predictions = np.apply_along_axis(lambda x: np.bincount(x.astype(int)).argmax(), axis=1, arr=predictions)
        return final_predictions



In [None]:
random_forest = RandomForestClassifier(n_estimators=50, max_depth=None, min_samples_split=2, min_samples_leaf=1)
random_forest.fit(xtrain, ytrain)

# Make predictions on the test set
predictions = random_forest.predict(xtest)

# Evaluate accuracy
accuracy = accuracy_score(ytest, predictions)
print(f'Accuracy: {accuracy}')