In [26]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as mt
from sklearn.model_selection import train_test_split
from sklearn.datasets import make_classification
from collections import Counter
from sklearn.metrics import accuracy_score
from sklearn.model_selection import StratifiedKFold
from sklearn.tree import DecisionTreeClassifier
import multiprocessing

In [27]:
df = pd.read_csv('diabetes.csv')
print(df.head())

   Pregnancies  Glucose  BloodPressure  SkinThickness  Insulin   BMI  \
0            6      148             72             35        0  33.6   
1            1       85             66             29        0  26.6   
2            8      183             64              0        0  23.3   
3            1       89             66             23       94  28.1   
4            0      137             40             35      168  43.1   

   DiabetesPedigreeFunction  Age  Outcome  
0                     0.627   50        1  
1                     0.351   31        0  
2                     0.672   32        1  
3                     0.167   21        0  
4                     2.288   33        1  


In [28]:
class LogisticRegressionScratch:
    def __init__(self, learning_rate=0.01, n_iterations=1000):
        self.learning_rate = learning_rate
        self.n_iterations = n_iterations
        self.weights = None
        self.bias = None

    def _sigmoid(self, z):
        """Sigmoid activation function."""
        return 1 / (1 + np.exp(-z))

    def fit(self, X, y):
        """Train the Logistic Regression model."""
        n_samples, n_features = X.shape
        self.weights = np.zeros(n_features)
        self.bias = 0

        for _ in range(self.n_iterations):
            # Linear model
            linear_model = np.dot(X, self.weights) + self.bias
            # Apply sigmoid function
            predictions = self._sigmoid(linear_model)

            # Compute gradients
            dw = (1 / n_samples) * np.dot(X.T, (predictions - y))
            db = (1 / n_samples) * np.sum(predictions - y)

            # Update weights and bias
            self.weights -= self.learning_rate * dw
            self.bias -= self.learning_rate * db

    def predict_proba(self, X):
        """Return predicted probabilities."""
        linear_model = np.dot(X, self.weights) + self.bias
        return self._sigmoid(linear_model)

    def predict(self, X):
        """Return class predictions."""
        probabilities = self.predict_proba(X)
        return (probabilities >= 0.5).astype(int)




# Generate synthetic dataset
X, y = make_classification(n_samples=1000, n_features=8, n_classes=2, random_state=42)
df = pd.DataFrame(X, columns=['Pregnancies', 'Glucose', 'BloodPressure', 'SkinThickness', 'Insulin', 'BMI','DiabetesPedigreeFunction','Age'])
df['Outcome'] = y

# Split dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(df.iloc[:, :-1].values, df['Outcome'].values, test_size=0.2, random_state=42)

# Train Logistic Regression model
model = LogisticRegressionScratch(learning_rate=0.1, n_iterations=1000)
model.fit(X_train, y_train)

# Make predictions
y_pred = model.predict(X_test)

# Evaluate model
accuracy = np.mean(y_pred == y_test) * 100
print(f"Model Accuracy: {accuracy:.2f}%")


Model Accuracy: 87.50%


In [29]:
class KNN:
    def __init__(self, k=4):
        self.k = k

    def fit(self, X, y):
        """Store the training data."""
        self.X_train = X
        self.y_train = y

    def _euclidean_distance(self, x1, x2):
        """Calculate the Euclidean distance between two points."""
        return np.sqrt(np.sum((x1 - x2) ** 2))

    def predict(self, X):
        """Predict the class for each sample in X."""
        predictions = [self._predict_single(x) for x in X]
        return np.array(predictions)

    def _predict_single(self, x):
        """Predict the class for a single sample."""
        # Compute distances between x and all training samples
        distances = [self._euclidean_distance(x, x_train) for x_train in self.X_train]

        # Sort by distance and get the indices of k nearest neighbors
        k_indices = np.argsort(distances)[:self.k]

        # Retrieve the labels of the k nearest neighbors
        k_nearest_labels = [self.y_train[i] for i in k_indices]

        # Return the most common class label
        most_common = Counter(k_nearest_labels).most_common(1)
        return most_common[0][0]


# Create a dataset with 2 classes
X, y = make_classification(n_samples=100, n_features=8, n_classes=2, random_state=42)
df = pd.DataFrame(X, columns=['Pregnancies', 'Glucose', 'BloodPressure', 'SkinThickness', 'Insulin', 'BMI','DiabetesPedigreeFunction','Age'])
df['Outcome'] = y

# Split dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(df.iloc[:, :-1].values, df['Outcome'].values, test_size=0.2, random_state=42)

# Instantiate and train the KNN model
knn = KNN()
knn.fit(X_train, y_train)

# Make predictions
y_pred = knn.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
print(f"KNN Model Accuracy: {accuracy:.2f}")


KNN Model Accuracy: 0.90


In [30]:
skf = StratifiedKFold(n_splits=5)
accuracies = []

for train_index, test_index in skf.split(X, y):
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]

    knn = KNN()
    knn.fit(X_train, y_train)
    y_pred = knn.predict(X_test)
    accuracies.append(np.mean(y_pred == y_test))

print(f"Cross-validated Accuracy: {np.mean(accuracies):.2f}")


Cross-validated Accuracy: 0.95


In [31]:
#implementing naive bayes
class NaiveBayes:
    def fit(self, X, y):
        """Fit the Naive Bayes model."""
        self.classes = np.unique(y)
        self.priors = {c: np.mean(y == c) for c in self.classes}
        self.likelihoods = {}

        for c in self.classes:
            X_c = X[y == c]
            self.likelihoods[c] = {
                "mean": X_c.mean(axis=0),
                "var": X_c.var(axis=0),
            }

    def _gaussian_probability(self, x, mean, var):
        """Calculate Gaussian probability density."""
        eps = 1e-6  # To avoid division by zero
        coeff = 1 / np.sqrt(2 * np.pi * (var + eps))
        exponent = np.exp(-((x - mean) ** 2) / (2 * (var + eps)))
        return coeff * exponent

    def _calculate_posterior(self, x):
        """Calculate posterior probabilities for each class."""
        posteriors = {}
        for c in self.classes:
            prior = self.priors[c]
            likelihood = np.prod(
                self._gaussian_probability(x, self.likelihoods[c]["mean"], self.likelihoods[c]["var"])
            )
            posteriors[c] = prior * likelihood
        return posteriors

    def predict(self, X):
        """Predict the class labels."""
        predictions = []
        for x in X:
            posteriors = self._calculate_posterior(x)
            predictions.append(max(posteriors, key=posteriors.get))
        return np.array(predictions)




X, y = make_classification(n_samples=500, n_features=8, n_classes=2, random_state=42)
df = pd.DataFrame(X, columns=['Pregnancies', 'Glucose', 'BloodPressure', 'SkinThickness', 'Insulin', 'BMI','DiabetesPedigreeFunction','Age'])
df["Outcome"] = y

# Split dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(
    df.iloc[:, :-1].values, df["Outcome"].values, test_size=0.2, random_state=42
)

# Train the Naive Bayes model
nb = NaiveBayes()
nb.fit(X_train, y_train)

# Make predictions
y_pred = nb.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
print(f"Naive Bayes Model Accuracy: {accuracy:.2f}")


Naive Bayes Model Accuracy: 0.91


In [34]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.datasets import make_classification
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
from collections import Counter
from joblib import Parallel, delayed

class AdvancedXGBoost:
    def __init__(self, learning_rate=0.1, n_estimators=100, max_depth=3, gamma=0, lambda_=1, alpha=0):
        self.learning_rate = learning_rate  # Learning rate
        self.n_estimators = n_estimators  # Number of trees
        self.max_depth = max_depth  # Depth of decision trees
        self.gamma = gamma  # Regularization term for tree complexity
        self.lambda_ = lambda_  # L2 regularization on leaf weights (Ridge)
        self.alpha = alpha  # L1 regularization on leaf weights (Lasso)
        self.models = []  # List to store individual trees
        self.y_mean = None  # Initial mean value

    def fit(self, X, y):
        """Fit the model to the training data."""
        # Initialize predictions with a simple constant value (mean of target values)
        self.y_mean = np.mean(y)
        predictions = np.full_like(y, self.y_mean, dtype=np.float32)

        for _ in range(self.n_estimators):
            # Compute residuals (difference between actual and predicted)
            residuals = y - predictions

            # Train a decision tree on the residuals
            model = DecisionTreeClassifier(max_depth=self.max_depth)
            model.fit(X, residuals)

            # Predict the residuals with the trained tree
            residual_preds = model.predict(X)

            # Regularization: Modify the residual predictions with the regularization terms
            residual_preds = self._apply_regularization(residual_preds)

            # Update the model's predictions
            predictions += self.learning_rate * residual_preds

            # Save the model
            self.models.append(model)

    def _apply_regularization(self, residual_preds):
        """Apply L1 and L2 regularization on the leaf values."""
        # L1 regularization (Lasso)
        residual_preds = np.sign(residual_preds) * np.maximum(np.abs(residual_preds) - self.alpha, 0)

        # L2 regularization (Ridge)
        residual_preds -= self.lambda_ * residual_preds
        return residual_preds

    def predict(self, X):
        """Make predictions using the fitted model."""
        predictions = np.full(X.shape[0], self.y_mean, dtype=np.float32)

        # Add the predictions from each tree (scaled by the learning rate)
        for model in self.models:
            residual_preds = model.predict(X)
            residual_preds = self._apply_regularization(residual_preds)
            predictions += self.learning_rate * residual_preds

        return np.round(predictions).astype(int)  # Round to get binary class predictions

    def _parallel_tree_building(self, X, y, start_idx, end_idx):
        """Parallelize tree building (simulated)"""
        model = DecisionTreeClassifier(max_depth=self.max_depth)
        model.fit(X[start_idx:end_idx], y[start_idx:end_idx])
        return model

    def fit_parallel(self, X, y):
        """Fit the model to the training data using parallelism."""
        # Initialize predictions with a simple constant value (mean of target values)
        self.y_mean = np.mean(y)
        predictions = np.full_like(y, self.y_mean, dtype=np.float32)

        # Use parallelism to fit trees
        chunk_size = len(X) // self.n_estimators
        results = Parallel(n_jobs=-1)(delayed(self._parallel_tree_building)(X, y, i*chunk_size, (i+1)*chunk_size) for i in range(self.n_estimators))

        for model in results:
            # Compute residuals (difference between actual and predicted)
            residuals = y - predictions

            # Predict the residuals with the trained tree
            residual_preds = model.predict(X)

            # Regularization: Modify the residual predictions with the regularization terms
            residual_preds = self._apply_regularization(residual_preds)

            # Update the model's predictions
            predictions += self.learning_rate * residual_preds

            # Save the model
            self.models.append(model)


# Generate synthetic dataset for classification
X, y = make_classification(n_samples=500, n_features=8, n_classes=2, random_state=42)
df = pd.DataFrame(X, columns=['Pregnancies', 'Glucose', 'BloodPressure', 'SkinThickness', 'Insulin', 'BMI','DiabetesPedigreeFunction','Age'])
df["Outcome"] = y

# Split dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(df.iloc[:, :-1].values, df["Outcome"].values, test_size=0.2, random_state=42)

# Ensure y_train is an integer type
y_train = y_train.astype(int)
y_test = y_test.astype(int)

# Instantiate and train the Advanced XGBoost model with regularization and parallelism
model = AdvancedXGBoost(learning_rate=0.1, n_estimators=100, max_depth=3, gamma=0, lambda_=1, alpha=0)
model.fit_parallel(X_train, y_train)

# Make predictions on the test set
y_pred = model.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
print(f"Advanced XGBoost Model Accuracy: {accuracy:.2f}")


Advanced XGBoost Model Accuracy: 0.49
