In [2]:
# Import required libraries
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn import svm
from sklearn.metrics import accuracy_score

# Read the diabetes CSV into a DataFrame
diabetes_dataset = pd.read_csv('/content/diabetes.csv')

# --- Quick exploratory summaries, printed in a fixed order ---

# Preview: the leading rows of the frame
preview_rows = diabetes_dataset.head()
print("First 5 rows of the dataset:\n", preview_rows)

# Dimensions as a (rows, columns) tuple
dataset_dims = diabetes_dataset.shape
print("\nDataset shape:", dataset_dims)

# Per-column summary statistics (count, mean, std, quartiles, ...)
summary_stats = diabetes_dataset.describe()
print("\nDescriptive statistics:\n", summary_stats)

# Class balance of the label column: Outcome = 1 is diabetic, Outcome = 0 is not
outcome_counts = diabetes_dataset['Outcome'].value_counts()
print("\nOutcome value counts:\n", outcome_counts)

# Average of every feature within each Outcome class
per_class_means = diabetes_dataset.groupby('Outcome').mean()
print("\nMean values per Outcome group:\n", per_class_means)

# Split the data into features (every column except 'Outcome') and labels.
# X is intentionally left as the raw, unscaled feature DataFrame; only the
# train/test partitions below hold standardized values.
X = diabetes_dataset.drop(columns='Outcome')
Y = diabetes_dataset['Outcome']

# Split the dataset into training and test data BEFORE standardizing, so that
# rows held out for testing cannot influence the scaler's mean/std.
# stratify=Y keeps the class proportions of Outcome in both partitions and
# random_state fixes the shuffle so the split is reproducible.
X_train, X_test, Y_train, Y_test = train_test_split(
    X, Y, test_size=0.2, stratify=Y, random_state=2
)

# Standardize the data: learn the scaling statistics from the training
# partition only, then apply that same fitted transform to the test partition.
# Fitting on the full dataset would leak test-set information into training
# and make the reported test accuracy optimistic.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Report the dimensions of the full feature matrix and of each partition
for _label, _features in (
    ("\nX shape:", X),
    ("Training set shape:", X_train),
    ("Test set shape:", X_test),
):
    print(_label, _features.shape)

# Build a linear-kernel Support Vector Machine classifier and fit it on the
# training partition (fit returns the estimator itself, so the calls chain)
classifier = svm.SVC(kernel='linear').fit(X_train, Y_train)

# Accuracy on the data the model was trained on
X_train_prediction = classifier.predict(X_train)
training_data_accuracy = accuracy_score(
    y_true=Y_train, y_pred=X_train_prediction
)
print("\nTraining data accuracy:", training_data_accuracy)

# Accuracy on the held-out test partition
X_test_prediction = classifier.predict(X_test)
test_data_accuracy = accuracy_score(
    y_true=Y_test, y_pred=X_test_prediction
)
print("Test data accuracy:", test_data_accuracy)

# Make a prediction on new data.
# The values must be listed in the same order as the feature columns of the
# dataset (every column except 'Outcome').
input_data = (5, 166, 72, 19, 175, 25.8, 0.587, 51)

# Convert input to a numpy array shaped as a single sample: (1, n_features)
input_data_as_numpy_array = np.asarray(input_data).reshape(1, -1)

# Attach the training feature names to the sample. The scaler is fitted on a
# pandas DataFrame earlier in this cell, so handing it a bare ndarray makes
# scikit-learn warn that the input "does not have valid feature names" and
# matches the values to columns by position only. Building the frame also
# fails loudly if the number of values does not match the number of features.
feature_names = diabetes_dataset.drop(columns='Outcome').columns
input_data_as_frame = pd.DataFrame(
    input_data_as_numpy_array, columns=feature_names
)

# Standardize the input data with the already-fitted scaler
std_data = scaler.transform(input_data_as_frame)

# Predict using the trained model (the result is an array with one label)
prediction = classifier.predict(std_data)

# Output prediction result: label 0 means not diabetic, anything else diabetic
print("\nPrediction for input data:")
if prediction[0] == 0:
    print('✅ The person is **not diabetic**')
else:
    print('⚠️ The person is **diabetic**')


First 5 rows of the dataset:
    Pregnancies  Glucose  BloodPressure  SkinThickness  Insulin   BMI  \
0            6      148             72             35        0  33.6   
1            1       85             66             29        0  26.6   
2            8      183             64              0        0  23.3   
3            1       89             66             23       94  28.1   
4            0      137             40             35      168  43.1   

   DiabetesPedigreeFunction  Age  Outcome  
0                     0.627   50        1  
1                     0.351   31        0  
2                     0.672   32        1  
3                     0.167   21        0  
4                     2.288   33        1  

Dataset shape: (768, 9)

Descriptive statistics:
        Pregnancies     Glucose  BloodPressure  SkinThickness     Insulin  \
count   768.000000  768.000000     768.000000     768.000000  768.000000   
mean      3.845052  120.894531      69.105469      20.536458   79.7994

