In [9]:

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import joblib
import warnings
warnings.filterwarnings('ignore')

In [10]:
# Try importing visualization libraries
visualization_available = True
try:
    import matplotlib
    matplotlib.use('Agg')  # Use non-interactive backend
    import matplotlib.pyplot as plt
    import seaborn as sns
except ImportError as e:
    print(f"Visualization libraries not available: {e}")
    visualization_available = False

Visualization libraries not available: cannot import name 'axes' from 'matplotlib' (c:\Users\91952\AppData\Local\Programs\Python\Python310\lib\site-packages\matplotlib\__init__.py)


In [11]:
# Try importing XGBoost
xgboost_available = True
try:
    import xgboost as xgb
except ImportError as e:
    print(f"XGBoost not available: {e}")
    xgboost_available = False

In [12]:

# Load the dataset
diabetes_dataset = pd.read_csv('newDiabetes.csv')

# Display basic information
print("Dataset Shape:", diabetes_dataset.shape)
print("\nFirst 5 rows:\n", diabetes_dataset.head())
print("\nDataset Info:\n")
diabetes_dataset.info()


Dataset Shape: (1000, 14)

First 5 rows:
     ID  No_Pation Gender  AGE  Urea  Cr  HbA1c  Chol   TG  HDL  LDL  VLDL  \
0  502      17975      F   50   4.7  46    4.9   4.2  0.9  2.4  1.4   0.5   
1  735      34221      M   26   4.5  62    4.9   3.7  1.4  1.1  2.1   0.6   
2  420      47975      F   50   4.7  46    4.9   4.2  0.9  2.4  1.4   0.5   
3  680      87656      F   50   4.7  46    4.9   4.2  0.9  2.4  1.4   0.5   
4  504      34223      M   33   7.1  46    4.9   4.9  1.0  0.8  2.0   0.4   

    BMI CLASS  
0  24.0     N  
1  23.0     N  
2  24.0     N  
3  24.0     N  
4  21.0     N  

Dataset Info:

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 14 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   ID         1000 non-null   int64  
 1   No_Pation  1000 non-null   int64  
 2   Gender     1000 non-null   object 
 3   AGE        1000 non-null   int64  
 4   Urea       1000 non-null   float

In [13]:

def preprocess_data(df):
    # Drop ID and No_Pation columns as they are not relevant for prediction
    df = df.drop(['ID', 'No_Pation'], axis=1)
    
        # Convert Gender to numeric
    df['Gender'] = df['Gender'].map({'M': 0, 'F': 1})
    
    # Handle CLASS (target variable)
    df['CLASS'] = df['CLASS'].map({'N': 0, 'Y': 1, 'P': 2})
    
    # Check for any invalid values in numeric columns
    numeric_columns = ['AGE', 'Urea', 'Cr', 'HbA1c', 'Chol', 'TG', 'HDL', 'LDL', 'VLDL', 'BMI']
    df[numeric_columns] = df[numeric_columns].apply(pd.to_numeric, errors='coerce')
    
    # Remove rows with any invalid values
    df = df.dropna()
    
    return df

In [14]:
# Apply preprocessing
processed_df = preprocess_data(diabetes_dataset)
print("\nProcessed Dataset Shape:", processed_df.shape)
print("\nClass Distribution:\n", processed_df['CLASS'].value_counts())

# Only show visualizations if libraries are available
if visualization_available:
    # Let's visualize some key features
    plt.figure(figsize=(15, 10))
    plt.subplot(2, 3, 1)
    sns.boxplot(x='CLASS', y='HbA1c', data=processed_df)
    plt.title('HbA1c by CLASS')
    
    plt.subplot(2, 3, 2)
    sns.boxplot(x='CLASS', y='Urea', data=processed_df)
    plt.title('Urea by CLASS')
    
    plt.subplot(2, 3, 3)
    sns.boxplot(x='CLASS', y='BMI', data=processed_df)
    plt.title('BMI by CLASS')
    
    plt.subplot(2, 3, 4)
    sns.boxplot(x='CLASS', y='AGE', data=processed_df)
    plt.title('Age by CLASS')
    
    plt.subplot(2, 3, 5)
    sns.boxplot(x='CLASS', y='Chol', data=processed_df)
    plt.title('Cholesterol by CLASS')
    
    plt.subplot(2, 3, 6)
    sns.boxplot(x='CLASS', y='TG', data=processed_df)
    plt.title('Triglycerides by CLASS')
    
    plt.tight_layout()
    plt.show()
else:
    print("\nVisualization not available - skipping data visualization")




Processed Dataset Shape: (994, 12)

Class Distribution:
 1.0    839
0.0    102
2.0     53
Name: CLASS, dtype: int64

Visualization not available - skipping data visualization


In [15]:

X = processed_df.drop('CLASS', axis=1)
y = processed_df['CLASS']
# Only show correlations if visualization is available
if visualization_available:
    # Calculate feature correlations
    plt.figure(figsize=(12, 10))
    corr = X.corr()
    sns.heatmap(corr, annot=True, cmap='coolwarm', fmt='.2f')
    plt.title('Feature Correlation Matrix')
    plt.show()
# Split the data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, 
                                                    random_state=42, 
                                                    stratify=y)
# Scale the features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)
# Cell 5: Model Training - Random Forest
# Define parameter grid for Random Forest
rf_param_grid = {
    'n_estimators': [100, 200],
    'max_depth': [None, 10, 20],
    'min_samples_split': [2, 5],
    'min_samples_leaf': [1, 2]
}

In [16]:
# Initialize Random Forest classifier
rf = RandomForestClassifier(random_state=42)

# Perform GridSearchCV
rf_grid_search = GridSearchCV(
    estimator=rf,
    param_grid=rf_param_grid,
    cv=5,
    n_jobs=-1,
    verbose=1,
    scoring='accuracy'
)

In [17]:
# Train the Random Forest model
print("\nTraining Random Forest model...")
rf_grid_search.fit(X_train_scaled, y_train)

# Get best Random Forest model
rf_best_model = rf_grid_search.best_estimator_

# Make predictions with Random Forest
rf_y_pred = rf_best_model.predict(X_test_scaled)

# Print Random Forest results
print("\nRandom Forest Best Parameters:", rf_grid_search.best_params_)
print(f"\nRandom Forest Accuracy Score: {accuracy_score(y_test, rf_y_pred):.4f}")
print("\nRandom Forest Classification Report:")
print(classification_report(y_test, rf_y_pred))


Training Random Forest model...
Fitting 5 folds for each of 24 candidates, totalling 120 fits

Random Forest Best Parameters: {'max_depth': None, 'min_samples_leaf': 1, 'min_samples_split': 5, 'n_estimators': 200}

Random Forest Accuracy Score: 0.9698

Random Forest Classification Report:
              precision    recall  f1-score   support

         0.0       0.91      1.00      0.95        20
         1.0       0.98      1.00      0.99       168
         2.0       1.00      0.45      0.62        11

    accuracy                           0.97       199
   macro avg       0.96      0.82      0.86       199
weighted avg       0.97      0.97      0.96       199

