In [10]:
# Import necessary libraries
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score

# Function to collect user input
def get_user_input():
    age = float(input("Enter age: "))
    gender = input("Enter gender (male/female): ").lower()
    weight = float(input("Enter weight (in kg): "))
    height = float(input("Enter height (in cm): "))
    bmi = float(input("Enter BMI: "))
    physical_activity_level = float(input("Enter Physical Activity Level: "))
    return age, gender, weight, height, bmi, physical_activity_level

# Function to preprocess and encode user input
def preprocess_input(age, gender, weight, height, bmi, physical_activity_level):
    gender_encoder = LabelEncoder()
    gender_encoded = gender_encoder.fit_transform([gender])
    features = np.array([age, gender_encoded[0], weight, height, bmi, physical_activity_level])
    features = features.reshape(1, -1)
    return features

# Load your dataset (replace 'your_dataset.csv' with your actual file path)
data = np.genfromtxt(r"D:\Foundathon\obesity_data.csv", delimiter=',', skip_header=1)

# ... (previous code)

# Handle NaN values by removing rows with missing values
data = data[~np.isnan(data).any(axis=1)]

# Check if there are enough samples for a train-test split
if len(data) < 2:
    raise ValueError("Not enough samples for a meaningful train-test split.")
else:
    print(f"Number of samples after removing NaN values: {len(data)}")

# Augment the dataset with small random perturbations
augmented_data = np.copy(data)
augmented_data[:, :5] += np.random.normal(0, 0.1, size=(len(data), 5))

# Separate features and labels for augmented data
X_augmented = augmented_data[:, :6]  # Features: Age, Gender, Height, Weight, BMI, PhysicalActivityLevel
y_augmented = augmented_data[:, 6]    # Labels: ObesityCategory

# Encode gender in the augmented dataset
gender_encoder = LabelEncoder()
X_augmented[:, 1] = gender_encoder.fit_transform(X_augmented[:, 1])

# Combine the original and augmented datasets
X_combined = np.vstack((X, X_augmented))
y_combined = np.hstack((y, y_augmented))

# Split the combined data into training and testing sets
X_train_combined, X_test_combined, y_train_combined, y_test_combined = train_test_split(
    X_combined, y_combined, test_size=0.2, random_state=42
)

# ... (rest of the code)


# Create a logistic regression model
model = LogisticRegression()

# Train the model
model.fit(X_train, y_train)

# Evaluate the model on the test set
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f"Model Accuracy on Test Set: {accuracy}")

# Perform 5-fold cross-validation
cv_scores = cross_val_score(model, X, y, cv=5)

# Print cross-validation scores
print("Cross-validation scores:", cv_scores)
print("Mean Accuracy:", np.mean(cv_scores))

# Get user input for testing the model
user_input = [
    get_user_input() for _ in range(5)  # Collect input from 5 users for testing
]

# Preprocess user input and make predictions
for user_data in user_input:
    user_features = preprocess_input(*user_data)
    prediction = model.predict(user_features)
    print(f"User input: {user_data}, Model predicts Obesity Category: {prediction[0]}")


ValueError: Not enough samples for a meaningful train-test split.