In [30]:
# Cell 1: Import necessary libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import GridSearchCV

In [31]:
# Cell 2: Load and explore data
# Load the dataset
diabetes_dataset = pd.read_csv('newDiabetes.csv')

# Display basic information
print("Dataset Shape:", diabetes_dataset.shape)
print("\nFirst 5 rows:\n", diabetes_dataset.head())
print("\nDataset Info:\n")
diabetes_dataset.info()

Dataset Shape: (1000, 14)

First 5 rows:
     ID  No_Pation Gender  AGE  Urea  Cr  HbA1c  Chol   TG  HDL  LDL  VLDL  \
0  502      17975      F   50   4.7  46    4.9   4.2  0.9  2.4  1.4   0.5   
1  735      34221      M   26   4.5  62    4.9   3.7  1.4  1.1  2.1   0.6   
2  420      47975      F   50   4.7  46    4.9   4.2  0.9  2.4  1.4   0.5   
3  680      87656      F   50   4.7  46    4.9   4.2  0.9  2.4  1.4   0.5   
4  504      34223      M   33   7.1  46    4.9   4.9  1.0  0.8  2.0   0.4   

    BMI CLASS  
0  24.0     N  
1  23.0     N  
2  24.0     N  
3  24.0     N  
4  21.0     N  

Dataset Info:

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 14 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   ID         1000 non-null   int64  
 1   No_Pation  1000 non-null   int64  
 2   Gender     1000 non-null   object 
 3   AGE        1000 non-null   int64  
 4   Urea       1000 non-null   float

In [32]:
# Cell 3: Data Preprocessing
def preprocess_data(df):
    # Drop ID and No_Pation columns as they are not relevant for prediction
    df = df.drop(['ID', 'No_Pation'], axis=1)
    
    # Convert Gender to numeric
    df['Gender'] = df['Gender'].map({'M': 0, 'F': 1})
    
    # Handle CLASS (target variable)
    df['CLASS'] = df['CLASS'].map({'N': 0, 'Y': 1, 'P': 2})
    
    # Check for any invalid values in numeric columns
    numeric_columns = ['AGE', 'Urea', 'Cr', 'HbA1c', 'Chol', 'TG', 'HDL', 'LDL', 'VLDL', 'BMI']
    df[numeric_columns] = df[numeric_columns].apply(pd.to_numeric, errors='coerce')
    
    # Remove rows with any invalid values
    df = df.dropna()
    
    return df

In [33]:
# Apply preprocessing
processed_df = preprocess_data(diabetes_dataset)
print("\nProcessed Dataset Shape:", processed_df.shape)
print("\nClass Distribution:\n", processed_df['CLASS'].value_counts())

# Cell 4: Split features and target
X = processed_df.drop('CLASS', axis=1)
y = processed_df['CLASS']

# Split the data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, 
                                                    random_state=42, 
                                                    stratify=y)
# Scale the features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Cell 5: Model Training
# Define parameter grid for SVM
param_grid = {
    'C': [0.1, 1, 10],
    'kernel': ['linear', 'rbf'],
    'gamma': ['scale', 'auto']
}
# Initialize SVM classifier
svm = SVC(random_state=42, probability=True)

# Perform GridSearchCV
grid_search = GridSearchCV(
    estimator=svm,
    param_grid=param_grid,
    cv=5,
    n_jobs=-1,
    verbose=1
)


Processed Dataset Shape: (994, 12)

Class Distribution:
 1.0    839
0.0    102
2.0     53
Name: CLASS, dtype: int64


In [34]:
# Train the model
print("\nTraining SVM model...")
grid_search.fit(X_train_scaled, y_train)

# Get best model
best_model = grid_search.best_estimator_

# Cell 6: Model Evaluation
# Make predictions
y_pred = best_model.predict(X_test_scaled)


Training SVM model...
Fitting 5 folds for each of 12 candidates, totalling 60 fits


In [35]:
# Print results
print("\nBest Parameters:", grid_search.best_params_)
print(f"\nAccuracy Score: {accuracy_score(y_test, y_pred):.4f}")
print("\nClassification Report:")
print(classification_report(y_test, y_pred))


Best Parameters: {'C': 10, 'gamma': 'scale', 'kernel': 'rbf'}

Accuracy Score: 0.9296

Classification Report:
              precision    recall  f1-score   support

         0.0       0.85      0.85      0.85        20
         1.0       0.95      0.98      0.97       168
         2.0       0.50      0.27      0.35        11

    accuracy                           0.93       199
   macro avg       0.77      0.70      0.72       199
weighted avg       0.92      0.93      0.92       199



In [36]:
# Cell 7: Save the model
import joblib
# Save the model and scaler
joblib.dump(best_model, 'diabetes_model_svm.joblib')
joblib.dump(scaler, 'scaler_svm.joblib')
print("\nModel and scaler saved successfully!")


Model and scaler saved successfully!


In [37]:
def preprocess_data(df, is_training=False):
    """
    Preprocess the data
    
    Parameters:
    df (DataFrame): Input DataFrame
    is_training (bool): Whether preprocessing training data or prediction data
    """
    # Create a copy to avoid modifying original data
    df = df.copy()
    
    # Drop ID and No_Pation columns
    if 'ID' in df.columns:
        df = df.drop(['ID', 'No_Pation'], axis=1)
    
    # Convert Gender to numeric
    df['Gender'] = df['Gender'].map({'M': 0, 'F': 1})
    
    # Handle CLASS only if present (training data)
    if 'CLASS' in df.columns and is_training:
        df['CLASS'] = df['CLASS'].map({'N': 0, 'Y': 1, 'P': 2})
    
    # Convert numeric columns
    numeric_columns = ['AGE', 'Urea', 'Cr', 'HbA1c', 'Chol', 'TG', 'HDL', 'LDL', 'VLDL', 'BMI']
    df[numeric_columns] = df[numeric_columns].apply(pd.to_numeric, errors='coerce')
    
    # Remove rows with any invalid values
    df = df.dropna()
    
    return df

def predict_diabetes(input_data=None):
    """
    Make predictions for new data
    """
    if input_data is None:
        # Use the sample data provided
        input_data = {
            'ID': 421,
            'No_Pation': 34227,
            'Gender': 'M',
            'AGE': 48,
            'Urea': 4.6,
            'Cr': 47,
            'HbA1c': 4,
            'Chol': 2.9,
            'TG': 0.8,
            'HDL': 0.9,
            'LDL': 1.6,
            'VLDL': 0.4,
            'BMI': 24
        }
    
    try:
        # Convert input to DataFrame
        input_df = pd.DataFrame([input_data])
        
        # Preprocess input (set is_training=False)
        input_df = preprocess_data(input_df, is_training=False)
        
        # Scale features
        input_scaled = scaler.transform(input_df)
        
        # Make prediction
        prediction = best_model.predict(input_scaled)
        probabilities = best_model.predict_proba(input_scaled)
        
        # Map prediction to class
        class_map = {0: 'Non-Diabetic', 1: 'Diabetic', 2: 'Pre-Diabetic'}
        result = class_map[prediction[0]]
        
        return {
            'prediction': result,
            'probabilities': {
                'Non-Diabetic': probabilities[0][0],
                'Diabetic': probabilities[0][1],
                'Pre-Diabetic': probabilities[0][2]
            }
        }
    
    except Exception as e:
        return f"Error making prediction: {str(e)}"

# Test the model with the sample data
result = predict_diabetes()
print("\nPrediction Results:")
if isinstance(result, dict):
    print(f"Prediction: {result['prediction']}")
    print("\nProbabilities:")
    for class_name, prob in result['probabilities'].items():
        print(f"{class_name}: {prob:.4f}")
else:
    print(result)


Prediction Results:
Prediction: Non-Diabetic

Probabilities:
Non-Diabetic: 0.9834
Diabetic: 0.0033
Pre-Diabetic: 0.0133
