In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix
from sklearn.svm import SVC
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

In [3]:
df = pd.read_csv('Pima_Indians.csv')
print("Top 10 rows:")
print(df.head())
print(f"Rows: {df.shape[0]}, Columns: {df.shape[1]}")
print("Column Datatypes:")
print(df.dtypes)
print(df.info())

Top 10 rows:
   Pregnancies  Glucose  BloodPressure  SkinThickness  Insulin   BMI  \
0            6      148             72             35        0  33.6   
1            1       85             66             29        0  26.6   
2            8      183             64              0        0  23.3   
3            1       89             66             23       94  28.1   
4            0      137             40             35      168  43.1   

   DiabetesPedigreeFunction  Age  Outcome  
0                     0.627   50        1  
1                     0.351   31        0  
2                     0.672   32        1  
3                     0.167   21        0  
4                     2.288   33        1  
Rows: 768, Columns: 9
Column Datatypes:
Pregnancies                   int64
Glucose                       int64
BloodPressure                 int64
SkinThickness                 int64
Insulin                       int64
BMI                         float64
DiabetesPedigreeFunction    float6

In [4]:
features = ["BMI", "BloodPressure", "Glucose"]

impute = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler", StandardScaler())
])

processer = ColumnTransformer(transformers = [
    ('num', impute, features)
])

In [5]:
X = df.drop(columns=['Outcome'])
y = df['Outcome']
X_processed = processer.fit_transform(X)
X_train, X_test, y_train, y_test = train_test_split(X_processed, y, test_size = 0.2, random_state = 42)

In [6]:
# 1. Training a Gini Index Decision Tree

model_gini = DecisionTreeClassifier(criterion="gini", max_depth=5, random_state=42)
model_gini.fit(X_train, y_train)
y_pred = model_gini.predict(X_test)

In [7]:
metrics_gini = {
    "Accuracy": accuracy_score(y_test, y_pred),
    "Precision": precision_score(y_test, y_pred),
    "Recall": recall_score(y_test, y_pred),
    "F1 Score": f1_score(y_test, y_pred),
    "Confusion Matrix": confusion_matrix(y_test, y_pred)
}

for metric,value in metrics_gini.items():
    print(f"{metric}: {value}")

Accuracy: 0.7727272727272727
Precision: 0.7083333333333334
Recall: 0.6181818181818182
F1 Score: 0.6601941747572816
Confusion Matrix: [[85 14]
 [21 34]]


In [8]:
# 2. Training an SVM classifier using different kernels

svm_linear = SVC(kernel = "linear")
svm_linear.fit(X_train, y_train)
y_pred_Linear = svm_linear.predict(X_test)

svm_rbf = SVC(kernel = "rbf")
svm_rbf.fit(X_train, y_train)
y_pred_rbf = svm_rbf.predict(X_test)

In [9]:
metrics_linear = {
    "Accuracy": accuracy_score(y_test, y_pred_Linear),
    "Precision": precision_score(y_test, y_pred_Linear),
    "Recall": recall_score(y_test, y_pred_Linear),
    "F1 Score": f1_score(y_test, y_pred_Linear),
    "Confusion Matrix": confusion_matrix(y_test, y_pred_Linear)
}

metrics_rbf = {
    "Accuracy": accuracy_score(y_test, y_pred_rbf),
    "Precision": precision_score(y_test, y_pred_rbf),
    "Recall": recall_score(y_test, y_pred_rbf),
    "F1 Score": f1_score(y_test, y_pred_rbf),
    "Confusion Matrix": confusion_matrix(y_test, y_pred_rbf)
}

for metric,value in metrics_linear.items():
    print(f"{metric}: {value}")

for metric,value in metrics_rbf.items():
    print(f"{metric}: {value}")

Accuracy: 0.7597402597402597
Precision: 0.6875
Recall: 0.6
F1 Score: 0.6407766990291263
Confusion Matrix: [[84 15]
 [22 33]]
Accuracy: 0.7597402597402597
Precision: 0.6956521739130435
Recall: 0.5818181818181818
F1 Score: 0.6336633663366337
Confusion Matrix: [[85 14]
 [23 32]]



As can be observed, the Gini Index Decision Tree gave more accuracy as compared to the Linear
and RBF kernel-based SVMs.
The confusion metrics are very similar.

In this case, there is not much difference between the evaluation metrics of SVM and Decision
Tree.

-> Decision Tree (Gini Index)
Accuracy: 0.7727272727272727
Precision: 0.7083333333333334
Recall: 0.6181818181818182
F1 Score: 0.6601941747572816
Confusion Matrix: [[85 14]
 [21 34]]

-> SVM (Linear)
Accuracy: 0.7597402597402597
Precision: 0.6875
Recall: 0.6
F1 Score: 0.6407766990291263
Confusion Matrix: [[84 15]
 [22 33]]

-> SVM (RBF)
Accuracy: 0.7597402597402597
Precision: 0.6956521739130435
Recall: 0.5818181818181818
F1 Score: 0.6336633663366337
Confusion Matrix: [[85 14]
 [23 32]]
