In [2]:
import pandas as pd

# Load the dataset
df = pd.read_csv('/kaggle/input/diabetes/diabetes_input.csv')

# Display first 5 rows
print("First 5 rows of the dataset:")
print(df.head())

# Dataset Info
print("\nDataset Info:")
print(df.info())

# Summary Statistics
print("\nSummary Statistics:")
print(df.describe())

# Missing Values
print("\nMissing Values:")
print(df.isnull().sum())

# Check for 'Outcome' column and compute class distribution
if 'Outcome' in df.columns:
    print("\nClass Distribution (Outcome):")
    print(df['Outcome'].value_counts())
    print("\nPercentage Distribution:")
    print(df['Outcome'].value_counts(normalize=True) * 100)
else:
    print("\n⚠️ 'Outcome' column not found in dataset.")


First 5 rows of the dataset:
   Pregnancies  Glucose  BloodPressure  SkinThickness  Insulin   BMI  \
0            6      148             72             35        0  33.6   
1            1       85             66             29        0  26.6   
2            8      183             64              0        0  23.3   
3            1       89             66             23       94  28.1   
4            0      137             40             35      168  43.1   

   DiabetesPedigreeFunction  Age  
0                     0.627   50  
1                     0.351   31  
2                     0.672   32  
3                     0.167   21  
4                     2.288   33  

Dataset Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 768 entries, 0 to 767
Data columns (total 8 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   Pregnancies               768 non-null    int64  
 1   Glucose                   768 non-null 

In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Load the dataset (update the file path as needed)
df = pd.read_csv('/kaggle/input/diabetes-with-outcome/diabetes.csv')

# Separate features and target
X = df.drop(columns=['Outcome'])
y = df['Outcome']

# Train-test split (80/20)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Feature scaling
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Initialize models
models = {
    'Logistic Regression': LogisticRegression(),
    'Random Forest': RandomForestClassifier(),
    'SVM': SVC()
}

# Train and evaluate each model
results = []
for name, model in models.items():
    model.fit(X_train_scaled, y_train)
    y_pred = model.predict(X_test_scaled)
    results.append({
        'Model': name,
        'Accuracy': accuracy_score(y_test, y_pred),
        'Precision': precision_score(y_test, y_pred),
        'Recall': recall_score(y_test, y_pred),
        'F1-Score': f1_score(y_test, y_pred)
    })

# Convert results to DataFrame
results_df = pd.DataFrame(results)
print(results_df)


                 Model  Accuracy  Precision  Recall  F1-Score
0  Logistic Regression  0.735931   0.617284  0.6250  0.621118
1        Random Forest  0.740260   0.621951  0.6375  0.629630
2                  SVM  0.744589   0.643836  0.5875  0.614379


In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split, cross_validate
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.metrics import make_scorer, accuracy_score, precision_score, recall_score, f1_score

# Load the dataset
df = pd.read_csv('/kaggle/input/diabetes-with-outcome/diabetes.csv')

# Separate features and target
X = df.drop(columns=['Outcome'])
y = df['Outcome']

# Feature scaling
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Initialize models
models = {
    'Logistic Regression': LogisticRegression(),
    'Random Forest': RandomForestClassifier(),
    'SVM': SVC()
}

# Scoring metrics
scoring = {
    'accuracy': 'accuracy',
    'precision': 'precision',
    'recall': 'recall',
    'f1': 'f1'
}

# Cross-validation
results = []
for name, model in models.items():
    scores = cross_validate(model, X_scaled, y, cv=10, scoring=scoring)
    results.append({
        'Model': name,
        'Accuracy': scores['test_accuracy'].mean(),
        'Precision': scores['test_precision'].mean(),
        'Recall': scores['test_recall'].mean(),
        'F1-Score': scores['test_f1'].mean()
    })

# Convert results to DataFrame
results_df = pd.DataFrame(results)
print(results_df)


                 Model  Accuracy  Precision    Recall  F1-Score
0  Logistic Regression  0.772180   0.731493  0.559687  0.631430
1        Random Forest  0.768182   0.702923  0.596866  0.640942
2                  SVM  0.760390   0.707082  0.548148  0.613324


In [3]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

# Import all required classifiers
from sklearn.neighbors import KNeighborsClassifier as KNN
from sklearn.tree import DecisionTreeClassifier as DTC
from sklearn.naive_bayes import GaussianNB
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA
from sklearn.svm import SVC, LinearSVC
from sklearn.ensemble import AdaBoostClassifier, RandomForestClassifier, BaggingClassifier, GradientBoostingClassifier
from sklearn.linear_model import Perceptron, LogisticRegression
from sklearn.tree import ExtraTreeClassifier

# Load the dataset
df = pd.read_csv('/kaggle/input/diabetes-with-outcome/diabetes.csv')
X = df.drop(columns=['Outcome'])
y = df['Outcome']

# Feature scaling
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.3, random_state=42)

# List of models
mn = [
    KNN(), DTC(), GaussianNB(), LDA(), SVC(), LinearSVC(max_iter=10000),
    AdaBoostClassifier(), RandomForestClassifier(), Perceptron(),
    ExtraTreeClassifier(), BaggingClassifier(), LogisticRegression(max_iter=10000),
    GradientBoostingClassifier()
]

model_names = [
    "KNN", "Decision Tree", "GaussianNB", "LDA", "SVC", "LinearSVC",
    "AdaBoost", "Random Forest", "Perceptron", "Extra Tree", "Bagging",
    "Logistic Regression", "Gradient Boosting"
]

# Train, predict, and evaluate
for i in range(len(mn)):
    print(f"\nModel {i+1}: {model_names[i]}")
    model = mn[i]
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    print("Accuracy:", accuracy_score(y_test, y_pred))
    print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))
    print("Classification Report:\n", classification_report(y_test, y_pred))



Model 1: KNN
Accuracy: 0.6926406926406926
Confusion Matrix:
 [[119  32]
 [ 39  41]]
Classification Report:
               precision    recall  f1-score   support

           0       0.75      0.79      0.77       151
           1       0.56      0.51      0.54        80

    accuracy                           0.69       231
   macro avg       0.66      0.65      0.65       231
weighted avg       0.69      0.69      0.69       231


Model 2: Decision Tree
Accuracy: 0.70995670995671
Confusion Matrix:
 [[110  41]
 [ 26  54]]
Classification Report:
               precision    recall  f1-score   support

           0       0.81      0.73      0.77       151
           1       0.57      0.68      0.62        80

    accuracy                           0.71       231
   macro avg       0.69      0.70      0.69       231
weighted avg       0.73      0.71      0.71       231


Model 3: GaussianNB
Accuracy: 0.7445887445887446
Confusion Matrix:
 [[119  32]
 [ 27  53]]
Classification Report:
     