In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')


In [None]:
# Location of the input CSV.
# NOTE(review): absolute path — looks environment-specific; consider making it configurable.
DATA_PATH = '/content/dataset.csv'

# Read the raw dataset and preview its first rows.
data = pd.read_csv(DATA_PATH)
data.head()

In [None]:
# Summary statistics for the numeric columns of the raw dataset.
data.describe()

In [None]:
# Column names, non-null counts and dtypes of the raw dataset.
data.info()

In [None]:
# Number of missing (NaN) values in each column.
data.isna().sum()

In [None]:
# Count the rows in each Outcome class to see how imbalanced the raw data is.
class_counts = data['Outcome'].value_counts()
print(class_counts)

# Visualize class distribution
sns.set(style="whitegrid")
plt.figure(figsize=(8, 6))
sns.countplot(x='Outcome', data=data, color='pink')
# Replace the 0/1 tick labels with readable class names.
plt.xticks([0, 1], ['Non-Diabetic', 'Diabetic'])
# Title typo fixed ('Disribution' -> 'Distribution').
plt.title('Result Distribution')
plt.xlabel('Result')
plt.ylabel('Count')
plt.show()


In [None]:
# Box plot of every feature, grouped by Outcome, on the raw dataset.
# The feature list is derived from `data` directly: `X` is only created in a
# later cell, so referencing it here fails on a fresh kernel (Restart & Run All).
feature_cols = data.columns.drop('Outcome')

# Size the grid from the number of features instead of assuming at most nine.
n_cols = 3
n_rows = -(-len(feature_cols) // n_cols)  # ceiling division

plt.figure(figsize=(15, 10))
for i, col in enumerate(feature_cols):
  plt.subplot(n_rows, n_cols, i+1)
  sns.boxplot(x='Outcome', y=col, data=data)
  plt.title(col)
plt.tight_layout()
plt.show()

# **Balanced Dataset**

In [None]:
from imblearn.over_sampling import SMOTE

# Separate the predictor columns from the target column.
X, y = data.drop('Outcome', axis=1), data['Outcome']

# Resample with SMOTE; the fixed seed keeps the output repeatable.
# NOTE(review): resampling is done on the full dataset, before any train/test
# split, so the later test split is drawn from the resampled rows — confirm
# this is intended.
sm = SMOTE(random_state=33)
X1, y1 = sm.fit_resample(X, y)

# Recombine the resampled features and target into a single frame.
resampled_parts = [pd.DataFrame(X1), pd.DataFrame(y1, columns=['Outcome'])]
df = pd.concat(resampled_parts, axis=1)

# Class counts after resampling.
count = df['Outcome'].value_counts()
print(count)

In [None]:
# Class distribution of the resampled frame, for comparison with the raw data.
sns.set(style="whitegrid")
plt.figure(figsize=(8, 6))
sns.countplot(x='Outcome', data=df, color='pink')
# Replace the 0/1 tick labels with readable class names.
plt.xticks([0, 1], ['Non-Diabetic', 'Diabetic'])
# Title typo fixed ('Disribution' -> 'Distribution').
plt.title('Result Distribution After Balancing')
plt.xlabel('Result')
plt.ylabel('Count')
plt.show()

In [None]:
# One box plot per feature on the resampled frame, grouped by Outcome.
plt.figure(figsize=(15, 10))
for position, feature in enumerate(X.columns, start=1):
  panel = plt.subplot(3, 3, position)
  sns.boxplot(x='Outcome', y=feature, data=df, ax=panel)
  panel.set_title(feature)
plt.tight_layout()
plt.show()

In [None]:
# Inspect the resampled frame (SMOTE output features plus the 'Outcome' column).
df

In [None]:
# Pairwise column correlations of the resampled frame, shown as an annotated heatmap.
corr = df.corr()
plt.figure(figsize=(8, 8))
sns.heatmap(corr, annot=True)

In [None]:
# Columns removed from the resampled frame before modelling.
balanced_drop_cols = ['BloodPressure', 'Insulin', 'SkinThickness']
# `columns=` already selects the axis, so no separate axis argument is needed.
df1 = df.drop(columns=balanced_drop_cols)

In [None]:
# Inspect the resampled frame after BloodPressure, Insulin and SkinThickness were dropped.
df1

In [None]:
# Number of missing (NaN) values per column in the reduced, resampled frame.
df1.isna().sum()

In [None]:
from sklearn.preprocessing import MinMaxScaler
# Rescale every column of df1 (the 'Outcome' column included) to the range [0, 1].
# NOTE(review): the scaler is fitted on all rows before the train/test split,
# so test rows influence the scaling — confirm this is acceptable.
sc = MinMaxScaler(feature_range=(0, 1))
scaled_values = sc.fit_transform(df1)

# fit_transform returns a bare array; restore the column labels.
df2 = pd.DataFrame(scaled_values, columns=df1.columns)
df2

In [None]:
from sklearn.model_selection import train_test_split

# Features and target taken from the scaled, resampled frame (this rebinds X1 and y1).
X1, y1 = df2.drop('Outcome', axis=1), df2['Outcome']

# Hold out 20% of the rows for testing; the seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X1, y1, test_size=0.2, random_state=33
)

In [None]:
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix

In [None]:
# Linear-kernel SVM trained on the balanced training split
# (fit() returns the fitted estimator, so it can be chained).
svm_model = SVC(kernel='linear').fit(X_train, y_train)

# Predictions for the balanced held-out split.
svm_predictions = svm_model.predict(X_test)

In [None]:
# Score the balanced-data SVM against the held-out labels.
svm_accuracy, svm_precision, svm_recall, svm_f1 = (
    metric(y_test, svm_predictions)
    for metric in (accuracy_score, precision_score, recall_score, f1_score)
)
svm_conf_matrix = confusion_matrix(y_test, svm_predictions)

# Report each metric on its own line.
for label, value in (
    ("SVM Accuracy:", svm_accuracy),
    ("SVM Precision:", svm_precision),
    ("SVM Recall:", svm_recall),
    ("SVM F1-Score:", svm_f1),
    ("SVM Confusion Matrix:\n", svm_conf_matrix),
):
  print(label, value)

In [None]:
# Random forest (seeded for repeatability) trained on the balanced training split
# (fit() returns the fitted estimator, so it can be chained).
rf_model = RandomForestClassifier(random_state=33).fit(X_train, y_train)

# Predictions for the balanced held-out split.
rf_predictions = rf_model.predict(X_test)

In [None]:
# Score the balanced-data random forest against the held-out labels.
rf_accuracy, rf_precision, rf_recall, rf_f1 = (
    metric(y_test, rf_predictions)
    for metric in (accuracy_score, precision_score, recall_score, f1_score)
)
rf_conf_matrix = confusion_matrix(y_test, rf_predictions)

# Report each metric on its own line (the first label starts with a blank line).
for label, value in (
    ("\nRandom Forest Accuracy:", rf_accuracy),
    ("Random Forest Precision:", rf_precision),
    ("Random Forest Recall:", rf_recall),
    ("Random Forest F1-Score:", rf_f1),
    ("Random Forest Confusion Matrix:\n", rf_conf_matrix),
):
  print(label, value)

# Unbalanced Dataset

In [None]:
# Inspect the original (non-resampled) frame used for the unbalanced experiment.
data

In [None]:
# Pairwise column correlations of the original frame, shown as an annotated heatmap.
corr = data.corr()
plt.figure(figsize=(8, 8))
sns.heatmap(corr, annot=True)

In [None]:
# Columns removed from the original frame before modelling.
# NOTE(review): 'Insulin' is kept here although the balanced pipeline drops it,
# so the two experiments use different feature sets — confirm this is intended.
unbalanced_drop_cols = ['BloodPressure', 'SkinThickness']
# `columns=` already selects the axis, so no separate axis argument is needed.
data1 = data.drop(columns=unbalanced_drop_cols)

In [None]:
from sklearn.preprocessing import MinMaxScaler

# Rescale every column of data1 (the 'Outcome' column included) to [0, 1].
# A dedicated scaler is created here instead of re-fitting `sc`: the cell no
# longer depends on a scaler instantiated in the balanced section, and it does
# not overwrite that scaler's fitted state.
# NOTE(review): the scaler is fitted on all rows before the train/test split,
# so test rows influence the scaling — confirm this is acceptable.
sc_ub = MinMaxScaler(feature_range=(0, 1))

# fit_transform returns a bare array; restore the column labels.
data2 = pd.DataFrame(sc_ub.fit_transform(data1), columns=data1.columns)
data2

In [None]:
# Features and target taken from the scaled, original frame (this rebinds X1 and y1).
X1, y1 = data2.drop('Outcome', axis=1), data2['Outcome']

# Hold out 20% of the rows for testing; the seed keeps the split repeatable.
X1_train, X1_test, y1_train, y1_test = train_test_split(
    X1, y1, test_size=0.2, random_state=33
)

In [None]:
# Inspect the held-out features of the unbalanced split.
X1_test

In [None]:
# Linear-kernel SVM trained on the unbalanced training split
# (fit() returns the fitted estimator, so it can be chained).
svm_ub = SVC(kernel='linear').fit(X1_train, y1_train)

# Predictions for the unbalanced held-out split.
svm_ub_pred = svm_ub.predict(X1_test)

In [None]:
# Score the unbalanced-data SVM against the held-out labels.
svm_ub_acc, svm_ub_prec, svm_ub_recall, svm_ub_f1 = (
    metric(y1_test, svm_ub_pred)
    for metric in (accuracy_score, precision_score, recall_score, f1_score)
)
svm_ub_confMatx = confusion_matrix(y1_test, svm_ub_pred)

# Heading first, then each metric on its own line.
print('For Unbalanced Dataset')
for label, value in (
    ("SVM Accuracy:", svm_ub_acc),
    ("SVM Precision:", svm_ub_prec),
    ("SVM Recall:", svm_ub_recall),
    ("SVM F1-Score:", svm_ub_f1),
    ("SVM Confusion Matrix:\n", svm_ub_confMatx),
):
  print(label, value)

In [None]:
# Random forest for the unbalanced experiment (seeded for repeatability).
# It must be fitted and evaluated on the unbalanced split (X1_train / y1_train /
# X1_test). The metrics that follow compare these predictions with y1_test, so
# using the balanced X_train / X_test would train on the wrong data and yield
# predictions whose row count does not match y1_test.
rf_ub = RandomForestClassifier(random_state=33)
rf_ub.fit(X1_train, y1_train)
rf_ub_pred = rf_ub.predict(X1_test)

In [None]:
# Score the unbalanced-data random forest against the unbalanced held-out labels.
rf_ub_acc, rf_ub_prec, rf_ub_recall, rf_ub_f1 = (
    metric(y1_test, rf_ub_pred)
    for metric in (accuracy_score, precision_score, recall_score, f1_score)
)
rf_ub_confMatx = confusion_matrix(y1_test, rf_ub_pred)

# Heading first, then each metric on its own line.
print('For Unbalanced Dataset')
for label, value in (
    ("Random Forest Accuracy:", rf_ub_acc),
    ("Random Forest Precision:", rf_ub_prec),
    ("Random Forest Recall:", rf_ub_recall),
    ("Random Forest F1-Score:", rf_ub_f1),
    ("Random Forest Confusion Matrix:\n", rf_ub_confMatx),
):
  print(label, value)

# Comparison

In [None]:
from prettytable import PrettyTable

# Summary table with one row per (model, dataset) combination.
table = PrettyTable()
table.field_names = ["Model", "Dataset", "Accuracy", "Precision", "Recall", "F1-Score"]

# Rows are listed in display order and added one at a time.
comparison_rows = [
    ["SVM", "Balanced", svm_accuracy, svm_precision, svm_recall, svm_f1],
    ["SVM", "Unbalanced", svm_ub_acc, svm_ub_prec, svm_ub_recall, svm_ub_f1],
    ["Random Forest", "Balanced", rf_accuracy, rf_precision, rf_recall, rf_f1],
    ["Random Forest", "Unbalanced", rf_ub_acc, rf_ub_prec, rf_ub_recall, rf_ub_f1],
]
for row in comparison_rows:
  table.add_row(row)

print(table)

## Balanced Dataset:
In a balanced dataset, the class distribution is adjusted to have roughly equal instances of each class. Imagine a seesaw where both sides carry equal weight. Models trained on balanced data tend to perform better because they learn from a more representative sample of the minority class. Let's break down the observations:

1. **Support Vector Machine (SVM)**:
   - Accuracy: 0.74
   - Precision: 0.78
   - Recall: 0.67
   - F1-Score: 0.72
   - Confusion Matrix:
     ```
     [[82 19]
      [33 66]]
     ```
   - The SVM model achieves decent overall performance. It correctly identifies most negative instances (82 of 101) but misses a third of the positive ones (33 of 99), so there is room for improvement.

2. **Random Forest**:
   - Accuracy: 0.8
   - Precision: 0.79
   - Recall: 0.82
   - F1-Score: 0.80
   - Confusion Matrix:
     ```
     [[79 22]
      [18 81]]
     ```
   - The Random Forest model shines here! It achieves higher recall than the SVM (0.82 vs. 0.67), meaning it captures more of the true positive cases.

## Unbalanced Dataset:
In the unbalanced (original) dataset, one class dominates the other.

1. **SVM (Unbalanced)**:
   - Accuracy: 0.75
   - Precision: 0.74
   - Recall: 0.45
   - F1-Score: 0.56
   - Confusion Matrix:
     ```
     [[90  9]
      [30 25]]
     ```
   - The SVM model struggles. It achieves high precision (few false positives), but recall drops significantly. It misses many positive instances.

2. **Random Forest (Unbalanced)**:
   - Accuracy: 0.75
   - Precision: 0.69
   - Recall: 0.56
   - F1-Score: 0.62
   - Confusion Matrix:
     ```
     [[85 14]
      [24 31]]
     ```
   - The Random Forest also faces challenges. It maintains decent precision but sacrifices recall.

## Insights:
- Here, balancing the data with **SMOTE** (Synthetic Minority Over-sampling Technique) improved recall and F1-score for both models compared with training on the original unbalanced data.
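- Balanced datasets tend to lead to better generalization on the minority class, whereas unbalanced ones tend to bias models toward the majority class.
- Caveat: the balanced and unbalanced models are scored on different test sets (200 vs. 154 samples, per the confusion matrices), and the balanced test set is drawn from the SMOTE-resampled data, so the two sets of metrics are not strictly comparable.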