In [11]:
# Import necessary libraries
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV

# Load your data
# Assuming 'train.csv' contains your training data and 'test.csv' contains your test data
train_df = pd.read_csv("D:\\Foundathon\\Obesity prediction\\train.csv")
test_df = pd.read_csv("D:\\Foundathon\\Obesity prediction\\test.csv")

# Display basic information about the training data
print("Training Data Info:")
print(train_df.info())

# Display basic information about the test data
print("\nTest Data Info:")
print(test_df.info())

# Specify features and target variable
training_features = [
    'id', 'Gender', 'Age', 'Height', 'Weight',
    'family_history_with_overweight', 'FAVC', 'FCVC', 'NCP', 'CAEC',
    'SMOKE', 'CH2O', 'SCC', 'FAF', 'TUE', 'CALC', 'MTRANS'
]

target_variable = 'NObeyesdad'

# Selecting only the relevant features for training
X_train = train_df[training_features]
y_train = train_df[target_variable]

# Data preprocessing
# You can add more preprocessing steps based on your data
categorical_features = ['Gender', 'family_history_with_overweight', 'FAVC', 'CAEC', 'SMOKE', 'SCC', 'CALC', 'MTRANS']
numeric_features = ['Age', 'Height', 'Weight', 'FCVC', 'NCP', 'CH2O', 'FAF', 'TUE']

# Selecting only the relevant features for training
X_train_categorical = X_train[categorical_features]
X_train_numeric = X_train[numeric_features]

# One-hot encoding for categorical features
X_train_categorical_encoded = pd.get_dummies(X_train_categorical, drop_first=True)

# Concatenate one-hot encoded categorical features with numeric features
X_train_processed = pd.concat([X_train_numeric, X_train_categorical_encoded], axis=1)

scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train_processed)

# Split the data into training and validation sets
X_train_split, X_val, y_train_split, y_val = train_test_split(X_train_scaled, y_train, test_size=0.2, random_state=42)

# Hyperparameter tuning (optional)
# ... (your existing code)
# Define the parameter grid
param_grid = {
    'n_estimators': [50],
    'max_depth': [None],
    'min_samples_split': [2],
    'min_samples_leaf': [1]
}

# Create a RandomForestClassifier
rf_model = RandomForestClassifier(random_state=42)

# Use GridSearchCV to find the best parameters
grid_search = GridSearchCV(estimator=rf_model, param_grid=param_grid, cv=3, scoring='accuracy')
grid_search.fit(X_train_split, y_train_split)

# Get the best parameters
best_params = grid_search.best_params_
print("Best Hyperparameters:")
print(best_params)

# Train the model with the best parameters
best_rf_model = RandomForestClassifier(random_state=42, **best_params)
best_rf_model.fit(X_train_split, y_train_split)

# Model evaluation on the validation set
predictions_val = best_rf_model.predict(X_val)
accuracy_val = accuracy_score(y_val, predictions_val)

print(f'Validation Accuracy: {accuracy_val:.2f}')

# Feature importance (optional)
feature_importance = best_rf_model.feature_importances_
print('\nFeature Importance:')
for feature, importance in zip(training_features, feature_importance):
    print(f'{feature}: {importance:.4f}')

# Preprocess the test data
# Use the same scaler that was fit on the training data
X_test_categorical = test_df[categorical_features]
X_test_numeric = test_df[numeric_features]

# One-hot encoding for categorical features
X_test_categorical_encoded = pd.get_dummies(X_test_categorical, drop_first=True)

# Make sure column names match those in the training data
missing_columns = set(X_train_categorical_encoded.columns) - set(X_test_categorical_encoded.columns)
for column in missing_columns:
    X_test_categorical_encoded[column] = 0

# Concatenate one-hot encoded categorical features with numeric features
X_test_processed = pd.concat([X_test_numeric, X_test_categorical_encoded], axis=1)

# Reorder the columns to match the order during training
# Reorder the columns to match the order during training
X_test_processed = X_test_processed[X_train_processed.columns]


# Use the same scaler that was fit on the training data
X_test_scaled = scaler.transform(X_test_processed)

# Model prediction on the test set
predictions_test = best_rf_model.predict(X_test_scaled)

# Assuming 'output.csv' as the output file for predictions
output_df = pd.DataFrame({'id': test_df['id'], 'Predicted_Label': predictions_test})
output_df.to_csv('output.csv', index=False)


Training Data Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20758 entries, 0 to 20757
Data columns (total 18 columns):
 #   Column                          Non-Null Count  Dtype  
---  ------                          --------------  -----  
 0   id                              20758 non-null  int64  
 1   Gender                          20758 non-null  object 
 2   Age                             20758 non-null  float64
 3   Height                          20758 non-null  float64
 4   Weight                          20758 non-null  float64
 5   family_history_with_overweight  20758 non-null  object 
 6   FAVC                            20758 non-null  object 
 7   FCVC                            20758 non-null  float64
 8   NCP                             20758 non-null  float64
 9   CAEC                            20758 non-null  object 
 10  SMOKE                           20758 non-null  object 
 11  CH2O                            20758 non-null  float64
 12  SCC         

In [17]:
# Train the model with the best parameters
best_rf_model = RandomForestClassifier(random_state=42, **best_params)
best_rf_model.fit(X_train_split, y_train_split)

# Save the trained model to a file
joblib.dump(best_rf_model, 'best_rf_model.joblib')


['best_rf_model.joblib']

In [16]:
# app.py
from flask import Flask, render_template, request
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.externals import joblib  # For scikit-learn versions <= 0.23.2
# from joblib import load  # For scikit-learn versions >= 0.24

app = Flask(__name__)

# Load the trained model and scaler
best_rf_model = joblib.load("rf_model.joblib")
scaler = joblib.load("scaler.joblib")

# Define the route for the home page
@app.route('/')
def home():
    return render_template('index.html')

# Define the route for prediction
@app.route('/predict', methods=['POST'])
def predict():
    if request.method == 'POST':
        # Get input data from the form
        input_data = {
            'Gender': request.form['gender'],
            'Age': float(request.form['age']),
            'Height': float(request.form['height']),
            'Weight': float(request.form['weight']),
            # ... (add other input fields)
        }

        # Create a DataFrame from the input data
        input_df = pd.DataFrame([input_data])

        # Preprocess the input data
        input_categorical = input_df[categorical_features]
        input_numeric = input_df[numeric_features]
        input_categorical_encoded = pd.get_dummies(input_categorical, drop_first=True)
        input_processed = pd.concat([input_numeric, input_categorical_encoded], axis=1)
        input_scaled = scaler.transform(input_processed)

        # Make a prediction using the trained model
        prediction = best_rf_model.predict(input_scaled)[0]

        # Map the prediction to the corresponding label
        labels = {'Insufficient_Weight': 'Underweight', 'Normal_Weight': 'Normal weight', 'Overweight_Level_I': 'Overweight',
                  'Overweight_Level_II': 'Obesity Type I', 'Overweight_Level_III': 'Obesity Type II',
                  'Overweight_Level_III': 'Obesity Type III'}
        predicted_label = labels[prediction]

        return render_template('index.html', prediction=predicted_label)

# Run the Flask app
if __name__ == '__main__':
    app.run(debug=True)


ImportError: cannot import name 'joblib' from 'sklearn.externals' (c:\Users\grove\AppData\Local\Programs\Python\Python311\Lib\site-packages\sklearn\externals\__init__.py)

In [15]:
import joblib

In [2]:
print(train_df.columns)


Index(['id', 'Gender', 'Age', 'Height', 'Weight',
       'family_history_with_overweight', 'FAVC', 'FCVC', 'NCP', 'CAEC',
       'SMOKE', 'CH2O', 'SCC', 'FAF', 'TUE', 'CALC', 'MTRANS', 'NObeyesdad'],
      dtype='object')
