# In [None]:
# Import necessary libraries
import pandas as pd
from sklearn.model_selection import KFold, cross_val_predict, GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
from imblearn.over_sampling import SMOTE
from sklearn.preprocessing import LabelEncoder, StandardScaler
from google.colab import drive

# Attach Google Drive to the Colab runtime so the workbook path below resolves
drive.mount('/content/drive')

# Read the Zero Hunger workbook into a DataFrame
file_path = '/content/drive/My Drive/7 nov as_1/Zero_Hunger.xlsx'
df = pd.read_excel(io=file_path)


# Output: Mounted at /content/drive

# Step 1: Impute missing numeric values
# Every NaN in a numeric column is replaced by the mean of that column;
# non-numeric columns are left untouched.
numeric_df = df.select_dtypes(include='number')
column_means = numeric_df.mean()
df[numeric_df.columns] = numeric_df.fillna(value=column_means)


# Step 2: Integer-encode the 'Entity' column
label_encoder = LabelEncoder()
if 'Entity' not in df.columns:
    # Report the missing column instead of raising a KeyError
    print("Error: 'Entity' column not found in data.")
else:
    df['Entity'] = label_encoder.fit_transform(df['Entity'])



# Step 3: Separate features (X) from the target (y)
target_column = 'Prevalence of undernourishment (% of population)'
drop_columns = ['Entity', 'Code']  # Identifier columns that are not used as features

# The target must be removed from the feature matrix as well; leaving it in X
# hands the label to the models as an input and makes every metric meaningless.
X = df.drop(columns=drop_columns + [target_column], errors='ignore')
y = df[target_column]
# NOTE(review): the column name suggests a continuous percentage, while the
# SMOTE / classifier steps that consume y expect discrete class labels --
# confirm the column is already binned into classes before this point.


# Step 4: Apply SMOTE to handle class imbalance
# The sampler is seeded (same seed as the KFold splitter) so the synthetic
# samples, and therefore every downstream metric, are reproducible across runs.
# NOTE(review): oversampling before cross-validation lets synthetic points that
# were interpolated from validation rows end up in training folds; consider
# moving SMOTE inside an imblearn Pipeline so it is fitted per training fold.
smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(X, y)  # must run before the scaling and CV steps that use these





# Step 5: Standardize the resampled features
# Learn the scaling parameters from the resampled matrix, then apply them to
# that same matrix (X_resampled is replaced by the scaled array).
scaler = StandardScaler().fit(X_resampled)
X_resampled = scaler.transform(X_resampled)



# Step 6: K-Fold Cross Validation and Hyperparameter Tuning
# Shuffled 5-fold splitter with a fixed seed; shared by the grid searches below.
kf = KFold(n_splits=5, shuffle=True, random_state=42)

# Define parameter grids for hyperparameter tuning
param_grid_rf = {
    'n_estimators': [50, 100, 200],
    'max_depth': [None, 10, 20],
    'min_samples_split': [2, 5, 10],
}
param_grid_lr = {'C': [0.01, 0.1, 1, 10, 100]}

# Dictionary of models with GridSearchCV for hyperparameter tuning.
# - RandomForestClassifier is seeded so the search picks the same parameters
#   (and yields the same predictions) on every run, matching the seeded splitter.
# - LogisticRegression gets a larger iteration budget: the default of 100 can
#   stop before convergence for the weakly regularized settings (large C).
models = {
    'Logistic Regression': GridSearchCV(LogisticRegression(max_iter=1000), param_grid_lr, cv=kf),
    'Random Forest': GridSearchCV(RandomForestClassifier(random_state=42), param_grid_rf, cv=kf),
}

# Step 7: Cross-validation and metrics calculation
# For each model, collect out-of-fold class probabilities once, derive the hard
# predictions from them, and compute the evaluation metrics.
results = {}

# cross_val_predict(method='predict_proba') orders its columns by sorted class
# label, so this array maps a column index back to its class.
class_labels = pd.Series(y_resampled).drop_duplicates().sort_values().to_numpy()

for name, model in models.items():
    # One (expensive, nested grid-search) pass yields both scores and labels.
    y_proba = cross_val_predict(model, X_resampled, y_resampled, cv=kf, method='predict_proba')
    y_pred = class_labels[y_proba.argmax(axis=1)]

    accuracy = accuracy_score(y_resampled, y_pred)
    # precision/recall/F1 use the default binary averaging, i.e. they expect a
    # two-class target and raise ValueError otherwise.
    precision = precision_score(y_resampled, y_pred)
    recall = recall_score(y_resampled, y_pred)
    f1 = f1_score(y_resampled, y_pred)
    # ROC AUC needs a ranking score, not hard labels: use the probability of
    # the positive (greater-label) class, which is the second column.
    roc_auc = roc_auc_score(y_resampled, y_proba[:, 1])
    conf_matrix = confusion_matrix(y_resampled, y_pred)

    results[name] = {
        'Accuracy': accuracy,
        'Precision': precision,
        'Recall': recall,
        'F1 Score': f1,
        'ROC AUC': roc_auc,
        'Confusion Matrix': conf_matrix
    }

# Print a per-model report: scalar metrics to four decimals, then the
# confusion matrix, followed by a blank separator.
for model_name, metrics in results.items():
    print(f"Model: {model_name}")
    for metric_label in ('Accuracy', 'Precision', 'Recall', 'F1 Score', 'ROC AUC'):
        print(f"{metric_label}: {metrics[metric_label]:.4f}")
    print("Confusion Matrix:")
    print(metrics['Confusion Matrix'])
    print("\n")