In [8]:
#Import Required Libraries
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder,LabelEncoder,StandardScaler
from sklearn.metrics import accuracy_score,classification_report,confusion_matrix



In [9]:
#Load the Dataset
url = "https://github.com/mishravipul/data/raw/main/obesity_data.csv"
data = pd.read_csv(url)

print(data.head())

   Gender   Age  Height  Weight family_history_with_overweight FAVC  FCVC  \
0  Female  21.0    1.62    64.0                            yes   no   2.0   
1  Female  21.0    1.52    56.0                            yes   no   3.0   
2    Male  23.0    1.80    77.0                            yes   no   2.0   
3    Male  27.0    1.80    87.0                             no   no   3.0   
4    Male  22.0    1.78    89.8                             no   no   2.0   

   NCP       CAEC SMOKE  CH2O  SCC  FAF  TUE        CALC  \
0  3.0  Sometimes    no   2.0   no  0.0  1.0          no   
1  3.0  Sometimes   yes   3.0  yes  3.0  0.0   Sometimes   
2  3.0  Sometimes    no   2.0   no  2.0  1.0  Frequently   
3  3.0  Sometimes    no   2.0   no  2.0  0.0  Frequently   
4  1.0  Sometimes    no   2.0   no  0.0  0.0   Sometimes   

                  MTRANS           NObeyesdad  
0  Public_Transportation        Normal_Weight  
1  Public_Transportation        Normal_Weight  
2  Public_Transportation        

In [19]:
#Data Preprocessing

import pandas as pd
from sklearn.preprocessing import LabelEncoder, StandardScaler

# Load the dataset
url = "https://github.com/mishravipul/data/raw/main/obesity_data.csv"
data = pd.read_csv(url)

# Add intercept column with all values=1
data['Intercept'] = 1

# Convert 'Gender' column to numbers where 'Female' is 1 and 'Male' is 0
data['Gender'] = data['Gender'].apply(lambda x: 1 if x == 'Female' else 0)

# Convert yes/no columns to 1/0
yes_no_columns = ['family_history_with_overweight', 'FAVC', 'SMOKE', 'SCC']
for col in yes_no_columns:
    data[col] = data[col].apply(lambda x: 1 if x == 'yes' else 0)

# One-Hot encode 'MTRANS' and 'NObeyesdad' columns
data = pd.get_dummies(data, columns=['MTRANS', 'NObeyesdad'], drop_first=False)

# Verify the dataset structure after one-hot encoding
print("Columns after one-hot encoding:", data.columns)

# Label encode 'CAEC' and 'CALC' columns
label_encoders = {}
for col in ['CAEC', 'CALC']:
    if col in data.columns:
        le = LabelEncoder()
        data[col] = le.fit_transform(data[col])
        label_encoders[col] = le
    else:
        raise KeyError(f"Column '{col}' is missing in the dataset.")

# Normalize the dataset
scaler = StandardScaler()

# Identify the target column dynamically
# The target columns are the one-hot encoded columns for 'NObeyesdad'
target_columns = [col for col in data.columns if col.startswith('NObeyesdad_')]

# Use the first one-hot encoded column as the target (or choose a specific one)
target_column = target_columns[0]  # Change this if you want a specific target
print("Target column:", target_column)

# Drop the target columns before scaling
X = data.drop(columns=target_columns).values
y = data[target_column].values

# Normalize the features (X)
X_scaled = scaler.fit_transform(X)

# Print the shape and datatype of both X and y
print("Shape of X:", X_scaled.shape)
print("Shape of y:", y.shape)
print("Datatype of X:", X_scaled.dtype)
print("Datatype of y:", y.dtype)

Columns after one-hot encoding: Index(['Gender', 'Age', 'Height', 'Weight', 'family_history_with_overweight',
       'FAVC', 'FCVC', 'NCP', 'CAEC', 'SMOKE', 'CH2O', 'SCC', 'FAF', 'TUE',
       'CALC', 'Intercept', 'MTRANS_Automobile', 'MTRANS_Bike',
       'MTRANS_Motorbike', 'MTRANS_Public_Transportation', 'MTRANS_Walking',
       'NObeyesdad_Insufficient_Weight', 'NObeyesdad_Normal_Weight',
       'NObeyesdad_Obesity_Type_I', 'NObeyesdad_Obesity_Type_II',
       'NObeyesdad_Obesity_Type_III', 'NObeyesdad_Overweight_Level_I',
       'NObeyesdad_Overweight_Level_II'],
      dtype='object')
Target column: NObeyesdad_Insufficient_Weight
Shape of X: (2111, 21)
Shape of y: (2111,)
Datatype of X: float64
Datatype of y: bool


In [22]:
#Split the Dataset
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

print("Shape of X_train:", X_train.shape)
print("Shape of X_test:", X_test.shape)
print("Shape of y_train:", y_train.shape)
print("Shape of y_test:", y_test.shape)

Shape of X_train: (1688, 21)
Shape of X_test: (423, 21)
Shape of y_train: (1688,)
Shape of y_test: (423,)


In [25]:
#Implement Logistic Regression with Softmax
import numpy as np
import pandas as pd  # Ensure pandas is imported

class LogisticRegression:
    def __init__(self, learning_rate=0.01, max_iterations=1000):
        self.learning_rate = learning_rate
        self.max_iterations = max_iterations
        self.weights = None

    def softmax(self, z):
        print(f"Debug: z type = {type(z)}, shape = {z.shape}")  # Debugging
        z = np.array(z, dtype=np.float64)  # Ensure z is NumPy array
        exp_z = np.exp(z - np.max(z, axis=1, keepdims=True))
        return exp_z / np.sum(exp_z, axis=1, keepdims=True)

    def cross_entropy_loss(self, y, y_pred):
        m = y.shape[0]
        return -np.sum(y * np.log(y_pred + 1e-9)) / m  # Avoid log(0) error

    def fit(self, X, y):
        X = np.array(X, dtype=np.float64)  # Ensure X is NumPy array
        y = np.array(y, dtype=np.float64)  # Ensure y is NumPy array

        if y.ndim == 1:
            y = y.reshape(-1, 1)  # Ensure y is 2D

        m, n = X.shape
        k = y.shape[1]
        self.weights = np.zeros((n, k))

        for i in range(self.max_iterations):
            z = np.dot(X, self.weights)
            y_pred = self.softmax(z)
            gradient = np.dot(X.T, (y_pred - y)) / m
            self.weights -= self.learning_rate * gradient

            if i % 100 == 0:
                loss = self.cross_entropy_loss(y, y_pred)
                print(f"Iteration {i}, Loss: {loss}")

    def predict_proba(self, X):
        X = np.array(X, dtype=np.float64)  # Ensure X is NumPy array
        z = np.dot(X, self.weights)
        return self.softmax(z)

    def predict(self, X):
        y_pred = self.predict_proba(X)
        return np.argmax(y_pred, axis=1)

# One-hot encode the target variable
y_train_onehot = pd.get_dummies(pd.Series(y_train)).values  # Ensure categorical encoding
y_test_onehot = pd.get_dummies(pd.Series(y_test)).values

# Convert input to NumPy array
X_train = np.array(X_train, dtype=np.float64)
y_train_onehot = np.array(y_train_onehot, dtype=np.float64)

# Initialize and train the model
model = LogisticRegression(learning_rate=0.01, max_iterations=1000)
model.fit(X_train, y_train_onehot)


Debug: z type = <class 'numpy.ndarray'>, shape = (1688, 2)
Iteration 0, Loss: 0.6931471785599455
Debug: z type = <class 'numpy.ndarray'>, shape = (1688, 2)
Debug: z type = <class 'numpy.ndarray'>, shape = (1688, 2)
Debug: z type = <class 'numpy.ndarray'>, shape = (1688, 2)
Debug: z type = <class 'numpy.ndarray'>, shape = (1688, 2)
Debug: z type = <class 'numpy.ndarray'>, shape = (1688, 2)
Debug: z type = <class 'numpy.ndarray'>, shape = (1688, 2)
Debug: z type = <class 'numpy.ndarray'>, shape = (1688, 2)
Debug: z type = <class 'numpy.ndarray'>, shape = (1688, 2)
Debug: z type = <class 'numpy.ndarray'>, shape = (1688, 2)
Debug: z type = <class 'numpy.ndarray'>, shape = (1688, 2)
Debug: z type = <class 'numpy.ndarray'>, shape = (1688, 2)
Debug: z type = <class 'numpy.ndarray'>, shape = (1688, 2)
Debug: z type = <class 'numpy.ndarray'>, shape = (1688, 2)
Debug: z type = <class 'numpy.ndarray'>, shape = (1688, 2)
Debug: z type = <class 'numpy.ndarray'>, shape = (1688, 2)
Debug: z type = <c

In [26]:
#Evaluate the Model

# Predict on train and test data
y_train_pred = model.predict(X_train)
y_test_pred = model.predict(X_test)

# Calculate accuracy
train_accuracy = accuracy_score(np.argmax(y_train_onehot, axis=1), y_train_pred)
test_accuracy = accuracy_score(np.argmax(y_test_onehot, axis=1), y_test_pred)

print("Train Accuracy:", train_accuracy)
print("Test Accuracy:", test_accuracy)

# Print classification report and confusion matrix
print("Classification Report:")
print(classification_report(np.argmax(y_test_onehot, axis=1), y_test_pred))

print("Confusion Matrix:")
print(confusion_matrix(np.argmax(y_test_onehot, axis=1), y_test_pred))

Debug: z type = <class 'numpy.ndarray'>, shape = (1688, 2)
Debug: z type = <class 'numpy.ndarray'>, shape = (423, 2)
Train Accuracy: 0.8767772511848341
Test Accuracy: 0.8699763593380615
Classification Report:
              precision    recall  f1-score   support

           0       0.87      1.00      0.93       367
           1       1.00      0.02      0.04        56

    accuracy                           0.87       423
   macro avg       0.93      0.51      0.48       423
weighted avg       0.89      0.87      0.81       423

Confusion Matrix:
[[367   0]
 [ 55   1]]
