In [None]:
import seaborn as sns
import pandas as pd
import matplotlib.pyplot as plt

# Load the raw CSV file and preview its first 20 rows
df = pd.read_csv(r"C:\Users\sabni\Documents\BML\ObesityDataSet_raw_and_data_sinthetic.csv")
print(df.head(20))
print("\n\n\n\n")

# Drop every row that contains a missing value.
# Rebinding (instead of inplace=True) yields the same frame and keeps the
# statement safe to re-run.
df = df.dropna(axis=0)

# Display info and basic stats
print("DataFrame Info:")
# DataFrame.info() writes its report to stdout itself and returns None;
# wrapping it in print() would append a stray "None" line to the output.
df.info()
print("\n\n\n\n")

print("Descriptive Statistics:")
print(df.describe())
print("\n\n\n\n")

# Check for null values (runs after the dropna above, so this reports on the
# already-cleaned frame)
print("Any Null Values Present:")
print(df.isnull().any())
print("\n\n\n\n")

print("Sum of Null Values per Column:")
print(df.isnull().sum())
print("\n\n\n\n")

# Check for duplicates
print("Any Duplicated Rows:")
print(df.duplicated().any())
print("\n\n\n\n")

# Drop fully duplicated rows, keeping the first occurrence of each
df = df.drop_duplicates()

# Save the cleaned DataFrame (row index is not written to the file)
output_path = r"C:\Users\sabni\Documents\SKILL\hi2.csv"
df.to_csv(output_path, index=False)



In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

# Read the obesity dataset from the current working directory
df = pd.read_csv("ObesityDataSet_raw_and_data_sinthetic.csv")

# Replace every object-typed column (the target included) with integer codes.
# The fitted encoder of each column is kept so the codes can be mapped back later.
label_encoders = {}
for col in df.select_dtypes(include=['object']).columns:
    le = LabelEncoder().fit(df[col])
    df[col] = le.transform(df[col])
    label_encoders[col] = le

# Separate the target column from the feature columns
y = df["NObeyesdad"]  # Target variable
X = df.drop(columns=["NObeyesdad"])  # Features

# Hold out 20% of the rows for testing; stratify on y so both splits keep the
# class proportions, and fix the seed so the split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)



In [None]:
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report

# Imported inside the cell so the cell stays runnable on its own.
from sklearn.multiclass import OneVsRestClassifier

# Standardize features: fit the scaler on the training split only and reuse
# those statistics for the test split.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Train a one-vs-rest logistic regression model.
# LogisticRegression's `multi_class` argument is deprecated in scikit-learn and
# scheduled for removal; wrapping the estimator in OneVsRestClassifier is the
# replacement the scikit-learn documentation recommends for the 'ovr' scheme.
model = OneVsRestClassifier(LogisticRegression(max_iter=500, solver='lbfgs'))
model.fit(X_train_scaled, y_train)

# Predictions
y_pred = model.predict(X_test_scaled)

# Evaluation
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)

print(f"Accuracy: {accuracy:.4f}")
print("Classification Report:\n", report)


In [None]:
from sklearn.naive_bayes import GaussianNB, MultinomialNB, BernoulliNB
from sklearn.metrics import accuracy_score, classification_report

# --- Gaussian variant (continuous features) ---
gnb = GaussianNB().fit(X_train, y_train)
y_pred_gnb = gnb.predict(X_test)
report_gnb = classification_report(y_test, y_pred_gnb)
accuracy_gnb = accuracy_score(y_test, y_pred_gnb)
print(f"Gaussian Naïve Bayes Accuracy: {accuracy_gnb:.4f}")
print("Gaussian Naïve Bayes Classification Report:\n", report_gnb)

# --- Multinomial variant (count-like / discrete features) ---
mnb = MultinomialNB().fit(X_train, y_train)
y_pred_mnb = mnb.predict(X_test)
report_mnb = classification_report(y_test, y_pred_mnb)
accuracy_mnb = accuracy_score(y_test, y_pred_mnb)
print(f"\nMultinomial Naïve Bayes Accuracy: {accuracy_mnb:.4f}")
print("Multinomial Naïve Bayes Classification Report:\n", report_mnb)

# --- Bernoulli variant (binary / boolean features) ---
bnb = BernoulliNB().fit(X_train, y_train)
y_pred_bnb = bnb.predict(X_test)
report_bnb = classification_report(y_test, y_pred_bnb)
accuracy_bnb = accuracy_score(y_test, y_pred_bnb)
print(f"\nBernoulli Naïve Bayes Accuracy: {accuracy_bnb:.4f}")
print("Bernoulli Naïve Bayes Classification Report:\n", report_bnb)


In [None]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, classification_report

# Ensure X_train and X_test are numeric (values that cannot be parsed become NaN)
if isinstance(X_train, pd.DataFrame):
    X_train = X_train.apply(pd.to_numeric, errors='coerce')
    X_test = X_test.apply(pd.to_numeric, errors='coerce')

# Encode categorical labels (if necessary).
# The first label is inspected by POSITION: y_train comes out of a shuffled
# train/test split, so when it is a pandas Series `y_train[0]` is a lookup of
# index label 0 and raises KeyError whenever that row landed in the test split.
# np.asarray(...) makes the check work for Series, arrays and lists alike.
if isinstance(np.asarray(y_train)[0], str):
    label_encoder = LabelEncoder()
    y_train = label_encoder.fit_transform(y_train)
    y_test = label_encoder.transform(y_test)

class CustomGaussianNB:
    """Gaussian Naïve Bayes classifier.

    Every feature is modelled, per class, as an independent normal
    distribution whose mean and variance are estimated from the training data.
    """

    def fit(self, X, y):
        """Estimate class priors and per-class feature means/variances.

        Args:
            X: 2-D array-like of shape (n_samples, n_features), numeric.
            y: 1-D array-like with one class label per row of X.

        Returns:
            self, so calls can be chained (previously returned None).
        """
        X = np.asarray(X, dtype=float)
        y = np.asarray(y)
        self.classes = np.unique(y)
        self.class_priors = {c: np.mean(y == c) for c in self.classes}  # P(y)
        self.means = {c: np.mean(X[y == c], axis=0) for c in self.classes}  # μ
        # σ²; the small constant keeps constant features from producing a zero variance
        self.variances = {c: np.var(X[y == c], axis=0) + 1e-6 for c in self.classes}
        return self

    def gaussian_pdf(self, x, mean, var):
        """Compute the probability density function of a Gaussian distribution."""
        return (1 / np.sqrt(2 * np.pi * var)) * np.exp(-((x - mean) ** 2) / (2 * var))

    def _log_gaussian_pdf(self, x, mean, var):
        """Return log N(x | mean, var), evaluated directly in log space.

        Taking np.log of gaussian_pdf() underflows to log(0) = -inf for points
        far from the mean; the closed form below stays finite.
        """
        return -0.5 * np.log(2 * np.pi * var) - ((x - mean) ** 2) / (2 * var)

    def predict(self, X):
        """Predict class labels for given data.

        Args:
            X: 2-D array-like of shape (n_samples, n_features).

        Returns:
            1-D NumPy array holding the most probable class for every row.
            Ties go to the class that comes first in sorted order.
        """
        X = np.asarray(X, dtype=float)
        if X.size == 0:
            return np.array([])

        # One column of log-posteriors (up to a shared constant) per class:
        # log P(y) + sum_j log P(x_j | y)
        log_posteriors = np.column_stack([
            np.log(self.class_priors[c])
            + np.sum(self._log_gaussian_pdf(X, self.means[c], self.variances[c]), axis=1)
            for c in self.classes
        ])
        return self.classes[np.argmax(log_posteriors, axis=1)]

# Fit the hand-written Gaussian Naïve Bayes on the training split
# (the DataFrame is handed over as a plain NumPy array)
nb_custom = CustomGaussianNB()
nb_custom.fit(X_train.to_numpy(), y_train)

# Label the held-out rows
y_pred_custom = nb_custom.predict(X_test.to_numpy())

# Score the predictions against the true test labels
report_custom = classification_report(y_test, y_pred_custom)
accuracy_custom = accuracy_score(y_test, y_pred_custom)

print(f"Custom Gaussian Naïve Bayes Accuracy: {accuracy_custom:.4f}")
print("Custom Gaussian Naïve Bayes Classification Report:\n", report_custom)


In [None]:
import numpy as np
from sklearn.metrics import accuracy_score, classification_report

class CustomMultinomialNB:
    """Multinomial Naïve Bayes classifier with additive smoothing."""

    def __init__(self, alpha=1):
        # Additive (Laplace) smoothing constant added to every feature total
        self.alpha = alpha

    def fit(self, X, y):
        """Estimate class priors and smoothed per-class feature probabilities.

        X is indexed with a boolean mask, so it must be a NumPy array of
        shape (n_samples, n_features); y holds one label per row.
        """
        labels = np.array(y)  # array form so `labels == c` yields a boolean mask
        self.classes = np.unique(labels)

        self.class_priors = {}
        self.feature_counts = {}
        self.feature_probs = {}
        for c in self.classes:
            in_class = labels == c

            # P(y): share of training rows that belong to this class
            self.class_priors[c] = np.mean(in_class)

            # Smoothed per-feature totals over the rows of this class
            smoothed_totals = np.sum(X[in_class], axis=0) + self.alpha
            self.feature_counts[c] = smoothed_totals

            # P(feature | y): totals normalised so they sum to one
            self.feature_probs[c] = smoothed_totals / smoothed_totals.sum(axis=0, keepdims=True)

    def predict(self, X):
        """Return the class with the highest log-posterior for every row of X."""

        def log_posterior(x, c):
            # Probabilities are floored at 1e-10 so the logarithm never sees 0
            floored = np.clip(self.feature_probs[c], 1e-10, None)
            return np.log(self.class_priors[c]) + np.sum(np.log(floored) * x)

        predictions = []
        for x in X:
            scores = {c: log_posterior(x, c) for c in self.classes}
            predictions.append(max(scores, key=scores.get))

        return np.array(predictions)

# Fit the hand-written Multinomial Naïve Bayes (alpha=1) on NumPy views of the
# training features and labels
nb_multinomial = CustomMultinomialNB(alpha=1)
nb_multinomial.fit(X_train.to_numpy(), y_train.to_numpy())

# Label the held-out rows
y_pred_multinomial = nb_multinomial.predict(X_test.to_numpy())

# Score the predictions against the true test labels
report_multinomial = classification_report(y_test, y_pred_multinomial)
accuracy_multinomial = accuracy_score(y_test, y_pred_multinomial)

print(f"\nCustom Multinomial Naïve Bayes Accuracy: {accuracy_multinomial:.4f}")
print("Custom Multinomial Naïve Bayes Classification Report:\n", report_multinomial)


In [None]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, classification_report

# Ensure numeric X (values that cannot be parsed become NaN)
X_train = X_train.apply(pd.to_numeric, errors='coerce')
X_test = X_test.apply(pd.to_numeric, errors='coerce')

# Encode categorical y (if necessary).
# The first label is inspected by POSITION: y_train comes out of a shuffled
# train/test split, so when it is a pandas Series `y_train[0]` is a lookup of
# index label 0 and raises KeyError whenever that row landed in the test split.
if isinstance(np.asarray(y_train)[0], str):
    le = LabelEncoder()
    y_train, y_test = le.fit_transform(y_train), le.transform(y_test)

class CustomDecisionTree:
    """Binary decision-tree classifier that splits on `feature <= threshold`.

    Labels must be non-negative integers (they are counted with np.bincount).
    A fitted tree is stored as nested dicts with the keys "feature",
    "threshold", "left" and "right"; a leaf is a bare class label.
    """

    def __init__(self, max_depth=5, criterion="gini"):
        self.max_depth = max_depth
        # "gini" selects Gini impurity; any other value selects entropy
        self.criterion = criterion
        self.tree = None

    def _entropy(self, y):
        """Entropy (base 2) of the label distribution in y."""
        class_freq = np.bincount(y) / len(y)
        # The 1e-9 offset keeps log2 defined for classes with zero frequency
        return -np.sum(class_freq * np.log2(class_freq + 1e-9))

    def _gini(self, y):
        """Gini impurity of the label distribution in y."""
        class_freq = np.bincount(y) / len(y)
        return 1 - np.sum(class_freq**2)

    def _best_split(self, X, y):
        """Return the (feature, threshold) pair with the lowest weighted impurity.

        Returns (None, None) when no candidate leaves rows on both sides.
        """
        if self.criterion == "gini":
            impurity = self._gini
        else:
            impurity = self._entropy

        chosen_feature = None
        chosen_threshold = None
        lowest = float('inf')

        for feature in range(X.shape[1]):
            column = X[:, feature]
            for threshold in np.unique(column):
                left = y[column <= threshold]
                right = y[column > threshold]
                # A split that leaves one side empty separates nothing
                if len(left) == 0 or len(right) == 0:
                    continue
                weighted = (len(left) * impurity(left) + len(right) * impurity(right)) / len(y)
                # Strict "<" keeps the earliest candidate when scores tie
                if weighted < lowest:
                    chosen_feature = feature
                    chosen_threshold = threshold
                    lowest = weighted

        return chosen_feature, chosen_threshold

    def _build_tree(self, X, y, depth=0):
        """Recursively grow the tree; returns a dict node or a leaf label."""
        # Stop on a pure node or when the depth budget is used up
        if len(np.unique(y)) == 1 or depth >= self.max_depth:
            return np.bincount(y).argmax()

        best_feature, best_threshold = self._best_split(X, y)
        if best_feature is None:
            return np.bincount(y).argmax()

        goes_left = X[:, best_feature] <= best_threshold
        left_subtree = self._build_tree(X[goes_left], y[goes_left], depth + 1)
        right_subtree = self._build_tree(X[~goes_left], y[~goes_left], depth + 1)
        return {
            "feature": best_feature,
            "threshold": best_threshold,
            "left": left_subtree,
            "right": right_subtree,
        }

    def fit(self, X, y):
        """Grow the tree from X (2-D array) and y (1-D integer labels)."""
        self.tree = self._build_tree(X, y)

    def _predict_one(self, x, tree):
        """Walk from `tree` down to a leaf for sample x and return the leaf label."""
        node = tree
        while isinstance(node, dict):
            if x[node["feature"]] <= node["threshold"]:
                node = node["left"]
            else:
                node = node["right"]
        return node

    def predict(self, X):
        """Predict a label for every row of X."""
        return np.array([self._predict_one(row, self.tree) for row in X])

# Fit the hand-written tree (Gini criterion, depth limit 5) and label the test rows
tree_model = CustomDecisionTree(criterion="gini", max_depth=5)
tree_model.fit(X_train.to_numpy(), y_train)
y_pred_tree = tree_model.predict(X_test.to_numpy())

# Report accuracy and the per-class metrics
print(
    f"Accuracy: {accuracy_score(y_test, y_pred_tree):.4f}"
)
print(
    "Classification Report:\n",
    classification_report(y_test, y_pred_tree),
)


In [None]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, classification_report

# Fit scikit-learn's decision tree (Gini criterion, depth limit 5, fixed seed)
tree_model = DecisionTreeClassifier(max_depth=5, criterion="gini", random_state=42)
tree_model.fit(X_train, y_train)

# Label the held-out rows
y_pred_tree = tree_model.predict(X_test)

# Score the predictions against the true test labels
report_tree = classification_report(y_test, y_pred_tree)
accuracy_tree = accuracy_score(y_test, y_pred_tree)

print(f"Decision Tree Accuracy: {accuracy_tree:.4f}")
print("Decision Tree Classification Report:\n", report_tree)


In [None]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, classification_report

# Fit scikit-learn's k-nearest-neighbours classifier (Euclidean metric)
k = 5  # number of neighbours consulted per prediction
knn_model = KNeighborsClassifier(metric='euclidean', n_neighbors=k)
knn_model.fit(X_train, y_train)

# Label the held-out rows
y_pred_knn = knn_model.predict(X_test)

# Score the predictions against the true test labels
report_knn = classification_report(y_test, y_pred_knn)
accuracy_knn = accuracy_score(y_test, y_pred_knn)

print(f"KNN Accuracy: {accuracy_knn:.4f}")
print("KNN Classification Report:\n", report_knn)


In [None]:
import numpy as np
import pandas as pd
from collections import Counter
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, classification_report

# Ensure X_train and X_test are numeric (values that cannot be parsed become NaN)
if isinstance(X_train, pd.DataFrame):
    X_train = X_train.apply(pd.to_numeric, errors='coerce')
    X_test = X_test.apply(pd.to_numeric, errors='coerce')

# Encode categorical labels (if necessary).
# The first label is inspected by POSITION: y_train comes out of a shuffled
# train/test split, so when it is a pandas Series `y_train[0]` is a lookup of
# index label 0 and raises KeyError whenever that row landed in the test split.
# np.asarray(...) makes the check work for Series, arrays and lists alike.
if isinstance(np.asarray(y_train)[0], str):
    label_encoder = LabelEncoder()
    y_train = label_encoder.fit_transform(y_train)
    y_test = label_encoder.transform(y_test)

class CustomKNN:
    """k-nearest-neighbours classifier using Euclidean distance and majority vote."""

    def __init__(self, k=5):
        self.k = k  # Number of neighbors

    def euclidean_distance(self, x1, x2):
        """Calculate the Euclidean distance between two vectors."""
        return np.sqrt(np.sum((x1 - x2) ** 2))

    def fit(self, X, y):
        """Store the training data (KNN defers all computation to prediction).

        Args:
            X: 2-D array-like of shape (n_samples, n_features).
            y: 1-D array-like with one label per row of X.

        Returns:
            self, so calls can be chained (previously returned None).
        """
        self.X_train = np.array(X)
        self.y_train = np.array(y)
        return self

    def predict_one(self, x):
        """Predict the label of a single sample.

        Vote ties go to the label whose first occurrence is the nearest.
        """
        # Distances to ALL training rows in one vectorised step, instead of a
        # Python-level loop that calls euclidean_distance once per row.
        offsets = self.X_train - np.asarray(x)
        distances = np.sqrt(np.sum(offsets ** 2, axis=1))

        # A stable sort keeps equidistant neighbours in training order, which
        # makes the selected neighbours (and so the vote) deterministic.
        k_neighbors_idx = np.argsort(distances, kind="stable")[:self.k]

        # Get the labels of k nearest neighbors
        k_neighbors_labels = self.y_train[k_neighbors_idx]

        # Return the most common label
        return Counter(k_neighbors_labels).most_common(1)[0][0]

    def predict(self, X):
        """Predict a label for every row of X."""
        return np.array([self.predict_one(x) for x in X])

# Fit the hand-written KNN on a NumPy view of the training features
k = 5  # number of neighbours consulted per prediction
knn_custom = CustomKNN(k=k)
knn_custom.fit(X_train.to_numpy(), y_train)

# Label the held-out rows
y_pred_custom_knn = knn_custom.predict(X_test.to_numpy())

# Score the predictions against the true test labels
report_custom_knn = classification_report(y_test, y_pred_custom_knn)
accuracy_custom_knn = accuracy_score(y_test, y_pred_custom_knn)

print(f"Custom KNN Accuracy: {accuracy_custom_knn:.4f}")
print("Custom KNN Classification Report:\n", report_custom_knn)


In [None]:
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler

# Normalize the data for better clustering performance
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X_train)  # Scale training data

# Train K-Means Model
k = 3  # Number of clusters
kmeans_model = KMeans(n_clusters=k, init='k-means++', max_iter=300, random_state=42)
kmeans_model.fit(X_scaled)

# Cluster Assignments.
# The centroids live in the standardized feature space, so the test rows must
# go through the SAME fitted scaler before being assigned; predicting on the
# raw X_test compares unscaled values against scaled centroids.
clusters = kmeans_model.predict(scaler.transform(X_test))

# Centroid Locations (in standardized units)
centroids = kmeans_model.cluster_centers_

print(f"Cluster Assignments:\n{clusters}")
print(f"Centroids:\n{centroids}")


In [None]:
import numpy as np
from sklearn.preprocessing import StandardScaler

class CustomKMeans:
    """K-Means clustering (Lloyd's algorithm) with a fixed-seed random start."""

    def __init__(self, k=3, max_iters=100, tol=1e-4):
        self.k = k  # Number of clusters
        self.max_iters = max_iters  # Maximum number of iterations
        self.tol = tol  # Largest centroid coordinate shift still counted as converged

    def euclidean_distance(self, x1, x2):
        """Calculate the Euclidean distance between two vectors."""
        return np.sqrt(np.sum((x1 - x2) ** 2))

    def initialize_centroids(self, X):
        """Pick k distinct rows of X as the starting centroids.

        A private RandomState seeded with 42 draws the same indices as seeding
        the global generator would, without resetting NumPy's global random
        state for the rest of the notebook.
        """
        rng = np.random.RandomState(42)
        indices = rng.choice(len(X), self.k, replace=False)
        return X[indices]

    def assign_clusters(self, X, centroids):
        """Return, for each row of X, the index of its nearest centroid."""
        X = np.asarray(X, dtype=float)
        centroids = np.asarray(centroids, dtype=float)
        # Broadcast to (n_samples, k, n_features) so every sample/centroid
        # distance is computed at once instead of in a nested Python loop.
        offsets = X[:, np.newaxis, :] - centroids[np.newaxis, :, :]
        distances = np.sqrt(np.sum(offsets ** 2, axis=2))
        return np.argmin(distances, axis=1)

    def update_centroids(self, X, clusters):
        """Recompute each centroid as the mean of the points assigned to it.

        A cluster that received no points keeps its previous centroid.
        """
        new_centroids = []
        for i in range(self.k):
            members = X[clusters == i]
            if len(members) > 0:
                new_centroids.append(members.mean(axis=0))
            else:
                new_centroids.append(self.centroids[i])
        return np.array(new_centroids)

    def fit(self, X):
        """Train the K-Means model.

        Args:
            X: 2-D array-like of shape (n_samples, n_features).

        Returns:
            self, so calls can be chained (previously returned None).
            `self.centroids` holds the final centroids and `self.clusters` the
            assignment of the training rows to them.
        """
        X = np.asarray(X, dtype=float)
        self.centroids = self.initialize_centroids(X)

        for _ in range(self.max_iters):
            clusters = self.assign_clusters(X, self.centroids)
            new_centroids = self.update_centroids(X, clusters)

            # Check for convergence
            if np.max(np.abs(new_centroids - self.centroids)) < self.tol:
                break

            self.centroids = new_centroids

        # Assign against the FINAL centroids: this keeps `clusters` consistent
        # with predict(X) and is also defined when max_iters is 0 (the loop
        # variable would not exist in that case).
        self.clusters = self.assign_clusters(X, self.centroids)
        return self

    def predict(self, X):
        """Assign new data points to the nearest cluster"""
        return self.assign_clusters(X, self.centroids)


# Normalize the training data (converted to a NumPy array first)
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X_train.to_numpy())

# Train Custom K-Means Model
k = 3  # Number of clusters
kmeans_custom = CustomKMeans(k=k)
kmeans_custom.fit(X_scaled)

# Predictions.
# The centroids were learned in the standardized feature space, so the test
# rows must go through the SAME fitted scaler before being assigned; raw
# X_test values would be compared against scaled centroids.
clusters = kmeans_custom.predict(scaler.transform(X_test.to_numpy()))

print(f"Cluster Assignments:\n{clusters}")
