In [10]:
import numpy as np
import pandas as pd
import seaborn as sns
import scipy.io as sio
import matplotlib.pyplot as plt
from sklearn.svm import SVC
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix

%matplotlib inline

# Load the diabetes dataset from CSV file
diabetes_dataset = pd.read_csv('diabetes.csv')

# Print statistical summary of the dataset
diabetes_dataset.describe()

# Count missing values in the dataset
sum(diabetes_dataset.isnull().sum())

# Count how many zero values are present in certain columns
print((diabetes_dataset[['Glucose','BloodPressure','SkinThickness','Insulin','BMI','DiabetesPedigreeFunction','Age']]== 0).sum())

# Replace the zero values with NaN
diabetes_dataset[['Glucose','BloodPressure','SkinThickness','Insulin','BMI','DiabetesPedigreeFunction','Age']] = diabetes_dataset[['Glucose','BloodPressure','SkinThickness','Insulin','BMI','DiabetesPedigreeFunction','Age']].replace(0,np.NAN)

# Count missing values after replacing zero values with NaN
print(diabetes_dataset.isnull().sum())

# Fill the missing values with the mean of the corresponding column
diabetes_dataset.fillna(diabetes_dataset.mean(), inplace = True)

# Count missing values after filling missing values with mean
print(diabetes_dataset.isnull().sum())

# Create a copy of the dataset for transforming data
dataTransform = diabetes_dataset.copy()

# Encode categorical data as numerical data
labelencoder = LabelEncoder()

for data in diabetes_dataset.columns:
    dataTransform[data] = labelencoder.fit_transform(diabetes_dataset[data])

# Split the dataset into input features (X) and target variable (Y)
X = dataTransform.drop(['Outcome'], axis=1)
Y = dataTransform['Outcome']

# List of input features for reference
diabetes_feature_list = list(X.columns)

# Split the dataset into training and testing sets
X_train,X_test,y_train,y_test = train_test_split(X,Y,test_size = 0.2, random_state = 41)

# Initialize an SVM classifier with linear kernel
classifier = SVC(kernel='linear', random_state=0)

# Train the classifier on the training set
classifier.fit(X_train, y_train)

# Predict the target variable on the testing set
y_pred = classifier.predict(X_test)

# Calculate accuracy and confusion matrix of the classifier on the testing set
accuracy = accuracy_score(y_test, y_pred)
confusion_matrix = confusion_matrix(y_test, y_pred)

# Print the accuracy and confusion matrix
print("Accuracy:", accuracy)
print("Confusion matrix:\n", confusion_matrix)


Glucose                       5
BloodPressure                35
SkinThickness               227
Insulin                     374
BMI                          11
DiabetesPedigreeFunction      0
Age                           0
dtype: int64
Pregnancies                   0
Glucose                       5
BloodPressure                35
SkinThickness               227
Insulin                     374
BMI                          11
DiabetesPedigreeFunction      0
Age                           0
Outcome                       0
dtype: int64
Pregnancies                 0
Glucose                     0
BloodPressure               0
SkinThickness               0
Insulin                     0
BMI                         0
DiabetesPedigreeFunction    0
Age                         0
Outcome                     0
dtype: int64
Accuracy: 0.7532467532467533
Confusion matrix:
 [[86 13]
 [25 30]]
