In [23]:
!pip install pandas scikit-learn



In [24]:
from google.colab import files
uploaded = files.upload()

Saving health_lifestyle_classification.csv to health_lifestyle_classification.csv


In [26]:
import pandas as pd

# Load the CSV file (update the path if using Google Drive)
data = pd.read_csv('health_lifestyle_classification.csv')

# Display the first few rows and data info
print(data.head())
print(data.info())

   age  gender      height     weight        bmi  bmi_estimated  bmi_scaled  \
0   56    Male  173.416872  56.886640  18.915925      18.915925   56.747776   
1   69  Female  163.207380  97.799859  36.716278      36.716278  110.148833   
2   46    Male  177.281966  80.687562  25.673050      25.673050   77.019151   
3   32  Female  172.101255  63.142868  21.318480      21.318480   63.955440   
4   60  Female  163.608816  40.000000  14.943302      14.943302   44.829907   

   bmi_corrected  waist_size  blood_pressure  ...  sunlight_exposure  \
0      18.989117   72.165130      118.264254  ...               High   
1      36.511417   85.598889      117.917986  ...               High   
2      25.587429   90.295030      123.073698  ...               High   
3      21.177109  100.504211      148.173453  ...               High   
4      14.844299   69.021150      150.613180  ...               High   

   meals_per_day  caffeine_intake  family_history  pet_owner  \
0              5         Mod

In [27]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer

# Separate features and target
X = data.drop('target', axis=1)  # All columns except 'target'
y = data['target']

# Identify numerical and categorical columns
numerical_cols = X.select_dtypes(include=['float64', 'int64']).columns
categorical_cols = X.select_dtypes(include=['object']).columns

# Create preprocessing pipelines
numerical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),  # Impute missing values with median
    ('scaler', StandardScaler())  # Scale numerical features
])

categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),  # Impute missing values with mode
    ('onehot', OneHotEncoder(handle_unknown='ignore'))  # One-hot encode categorical variables
])

# Combine transformers using ColumnTransformer
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_cols),
        ('cat', categorical_transformer, categorical_cols)
    ])

# Split data into training and testing sets (80% train, 20% test)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [28]:
from sklearn.ensemble import RandomForestClassifier

# Create a pipeline with preprocessing and model
model = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', RandomForestClassifier(n_estimators=100, random_state=42))
])

# Train the model
model.fit(X_train, y_train)

In [29]:
from sklearn.metrics import accuracy_score, classification_report

# Make predictions on the test set
y_pred = model.predict(X_test)

# Evaluate the model
print("Accuracy:", accuracy_score(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))

Accuracy: 0.6985

Classification Report:
               precision    recall  f1-score   support

    diseased       0.37      0.00      0.00      6023
     healthy       0.70      1.00      0.82     13977

    accuracy                           0.70     20000
   macro avg       0.53      0.50      0.41     20000
weighted avg       0.60      0.70      0.58     20000



In [30]:
# single_sample = single_sample.drop('survey_code', axis=1)

import pickle
with open('health_prediction_model.pkl', 'wb') as f:
    pickle.dump(model, f)

In [None]:
with open('health_prediction_model.pkl', 'rb') as f:
    model = pickle.load(f)

In [None]:
import pandas as pd

# Create single test sample with provided values
single_sample = pd.DataFrame({
    'survey_code': [143],
    'age': [34],
    'gender': ['Male'],
    'height': [160.8129287],
    'weight': [87.99562442],
    'bmi': [34.02664692],
    'bmi_estimated': [34.02664692],
    'bmi_scaled': [102.0799408],
    'bmi_corrected': [34.13719895],
    'waist_size': [98.41608367],
    'blood_pressure': [111.98239],
    'heart_rate': [80.19802017],
    'cholesterol': [128.0392769],
    'glucose': [111.2221661],
    'insulin': [18.11037723],
    'sleep_hours': [7.479686019],
    'sleep_quality': ['Excellent'],
    'work_hours': [8.277346382],
    'physical_activity': [4.50562205],
    'daily_steps': [8502.977659],
    'calorie_intake': [2191.652711],
    'sugar_intake': [65.62886298],
    'alcohol_consumption': ['None'],
    'smoking_level': ['Light'],
    'water_intake': [2.947351133],
    'screen_time': [5.982054146],
    'stress_level': [2],
    'mental_health_score': [2],
    'mental_health_support': ['No'],
    'education_level': ['Bachelor'],
    'job_type': ['Healthcare'],
    'occupation': ['Doctor'],
    'income': [4639.158189],
    'diet_type': ['Omnivore'],
    'exercise_type': ['Mixed'],
    'device_usage': ['Low'],
    'healthcare_access': ['Moderate'],
    'insurance': ['No'],
    'sunlight_exposure': ['Moderate'],
    'meals_per_day': [4],
    'caffeine_intake': ['High'],
    'family_history': ['Yes'],
    'pet_owner': ['Yes'],
    'electrolyte_level': [0],
    'gene_marker_flag': [1],
    'environmental_risk_score': [5.5],
    'daily_supplement_dosage': [8.188736505]
})

# Display sample to verify
print(single_sample)

In [None]:
# Make prediction
prediction = model.predict(single_sample)

# Display the prediction
print("Prediction for single sample:", prediction[0])