In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score
import joblib

# Train and persist a decision-tree classifier that predicts 'BmiWcRiskScore'.
#
# Steps: load the CSV, clean the categorical columns, derive BMI from
# height and weight, build a preprocessing + model pipeline, score it on a
# 20% hold-out split, and save the fitted pipeline with joblib.

# Load the dataset
file_path = 'final_obesity.csv'
data = pd.read_csv(file_path)

# Drop the 'id' column so the row identifier is never used as a feature
data = data.drop(columns=['id'])

# Categorical features that get one-hot encoded
categorical_features = ['Zone', 'Gender']

# Strip leading/trailing spaces so that values differing only by padding
# are not treated as distinct categories
for column in categorical_features:
    data[column] = data[column].str.strip()

# Calculate BMI using the standard formula: weight / height_in_meters ** 2.
# 'Height' is divided by 100 to convert centimeters to meters; 'Weight' is
# presumably in kilograms -- TODO confirm against the dataset.
data['Height_m'] = data['Height'] / 100
data['Bmi_calculated'] = data['Weight'] / (data['Height_m'] ** 2)

# Define features and target variable.
# The target, the original 'Bmi' column, the helper 'Height_m' column and the
# three physical-activity columns are excluded from the feature matrix.
target_column = 'BmiWcRiskScore'
excluded_columns = [
    target_column,
    'Bmi',
    'Height_m',
    'Dailyphysicalactivity',
    'Moderatephysicalactivity',
    'Vigorousphysicalactivity',
]
X = data.drop(columns=excluded_columns)
y = data[target_column]

# NOTE(review): judging by its name, 'BmiWcRiskScore' may be computed from BMI
# and waist circumference, both of which are available to the model
# ('Bmi_calculated', 'Waist'). If so, a near-perfect accuracy reflects the
# model re-deriving the label rather than genuine predictive power -- confirm
# how the target is defined before relying on the reported score.

# Fix the category order explicitly (sorted values seen in the dataset) so the
# one-hot column layout is deterministic.
zone_categories = sorted(data['Zone'].unique())
gender_categories = sorted(data['Gender'].unique())

# Numeric columns that are standardized; every other remaining column is
# passed through unchanged by the ColumnTransformer.
numeric_features = ['Age', 'Waist', 'Weight', 'Bmi_calculated']

# Define the column transformer.
# handle_unknown='ignore' makes the persisted pipeline encode a category that
# was not present in this dataset as an all-zero row instead of raising at
# prediction time; output for the known categories is unchanged.
preprocessor = ColumnTransformer(
    transformers=[
        (
            'cat',
            OneHotEncoder(
                categories=[zone_categories, gender_categories],
                handle_unknown='ignore',
            ),
            categorical_features,
        ),
        ('num', StandardScaler(), numeric_features),
    ],
    remainder='passthrough',
)

# Initialize the Decision Tree model (fixed seed for reproducible trees)
model = DecisionTreeClassifier(random_state=42)

# Create a pipeline that first transforms the data and then fits the model,
# so the exact same preprocessing is applied at training and prediction time
clf = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', model),
])

# Split the data into training (80%) and test (20%) sets
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Train the model
clf.fit(X_train, y_train)

# Evaluate the model on the held-out test set
y_pred = clf.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.2f}")

# Save the fitted pipeline (preprocessing + classifier) for later reuse.
# Inputs passed to the loaded pipeline must already contain 'Bmi_calculated',
# because that column is derived above, outside the pipeline.
joblib.dump(clf, 'decision_tree_model.pkl')




Accuracy: 1.00


['decision_tree_model.pkl']