## Importing the Dependencies

In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn import svm
from sklearn.metrics import accuracy_score

## Feature Scaling is important because

(Standardization is one type of feature scaling.)

If you don’t standardize, features with large values will dominate those with small values — biasing the model.

Many algorithms — for example KNN, SVM, and logistic regression — assume or work best with standardized input.

Standardizing also helps models trained with gradient-based optimization (e.g. linear regression fitted by gradient descent, neural networks) converge faster and more smoothly, because all features are on a similar scale.

## Data Collection and Analysis

In [2]:
# Load the diabetes data set from a CSV file in the current working directory.
csv_path = "diabetes.csv"
diabetes_data = pd.read_csv(csv_path)

In [3]:
pd.read_csv?
# The trailing `?` is IPython help syntax: it displays the signature and
# docstring of pd.read_csv. It only works inside IPython/Jupyter and is not
# valid plain-Python syntax.

[31mSignature:[39m
pd.read_csv(
    filepath_or_buffer: [33m'FilePath | ReadCsvBuffer[bytes] | ReadCsvBuffer[str]'[39m,
    *,
    sep: [33m'str | None | lib.NoDefault'[39m = <no_default>,
    delimiter: [33m'str | None | lib.NoDefault'[39m = [38;5;28;01mNone[39;00m,
    header: [33m"int | Sequence[int] | None | Literal['infer']"[39m = [33m'infer'[39m,
    names: [33m'Sequence[Hashable] | None | lib.NoDefault'[39m = <no_default>,
    index_col: [33m'IndexLabel | Literal[False] | None'[39m = [38;5;28;01mNone[39;00m,
    usecols: [33m'UsecolsArgType'[39m = [38;5;28;01mNone[39;00m,
    dtype: [33m'DtypeArg | None'[39m = [38;5;28;01mNone[39;00m,
    engine: [33m'CSVEngine | None'[39m = [38;5;28;01mNone[39;00m,
    converters: [33m'Mapping[Hashable, Callable] | None'[39m = [38;5;28;01mNone[39;00m,
    true_values: [33m'list | None'[39m = [38;5;28;01mNone[39;00m,
    false_values: [33m'list | None'[39m = [38;5;28;01mNone[39;00m,
    skipinitialspac

In [4]:
# Preview the first rows to confirm the columns loaded as expected.
diabetes_data.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


In [5]:
# (rows, columns) of the loaded frame.
diabetes_data.shape

(768, 9)

In [6]:
# Per-column summary statistics.
# NOTE(review): the output shows a minimum of 0 for Glucose, BloodPressure,
# SkinThickness, Insulin and BMI. These look like missing values stored as 0
# rather than real measurements — confirm before relying on the column means.
diabetes_data.describe()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
count,768.0,768.0,768.0,768.0,768.0,768.0,768.0,768.0,768.0
mean,3.845052,120.894531,69.105469,20.536458,79.799479,31.992578,0.471876,33.240885,0.348958
std,3.369578,31.972618,19.355807,15.952218,115.244002,7.88416,0.331329,11.760232,0.476951
min,0.0,0.0,0.0,0.0,0.0,0.0,0.078,21.0,0.0
25%,1.0,99.0,62.0,0.0,0.0,27.3,0.24375,24.0,0.0
50%,3.0,117.0,72.0,23.0,30.5,32.0,0.3725,29.0,0.0
75%,6.0,140.25,80.0,32.0,127.25,36.6,0.62625,41.0,1.0
max,17.0,199.0,122.0,99.0,846.0,67.1,2.42,81.0,1.0


In [7]:
# Class balance of the target column. The two classes are not evenly sized in
# the output shown here; the later train/test split passes stratify=Y, which
# keeps this ratio the same in both parts.
diabetes_data["Outcome"].value_counts()

Outcome
0    500
1    268
Name: count, dtype: int64

In [8]:
# Mean of every feature within each Outcome class, to see which features
# differ between the two groups.
diabetes_data.groupby("Outcome").mean()

Unnamed: 0_level_0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age
Outcome,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
0,3.298,109.98,68.184,19.664,68.792,30.3042,0.429734,31.19
1,4.865672,141.257463,70.824627,22.164179,100.335821,35.142537,0.5505,37.067164


## Separating Data and Labels

In [9]:
# The target is copied so later changes to Y cannot touch diabetes_data;
# the features are every column except the target.
Y = diabetes_data["Outcome"].copy()
X = diabetes_data.drop(columns="Outcome")

## Train Test Split

In [11]:
# Hold out 20% of the rows for testing. stratify=Y keeps the class ratio the
# same in both parts; random_state pins the shuffle so the split is repeatable.
# The split is done on the unscaled features: the scaler is fitted afterwards
# on the training part only.
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.2,
    stratify=Y,
    random_state=2,
)

## Data Standardization

In [13]:
# Learn the per-feature mean and standard deviation from the training rows
# only, then apply that same transformation to both splits. Fitting on the
# full data set instead would let test-set statistics leak into training.
scaler = StandardScaler().fit(X_train)

X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [14]:
# Scaled training features. transform() returned a NumPy array here (see the
# array(...) output), so the DataFrame column names are no longer attached.
X_train_scaled

array([[-1.13796489, -0.07971099, -3.5556072 , ...,  0.02825037,
        -0.98159708, -0.7885233 ],
       [ 0.64067858, -0.52091877,  0.02549599, ..., -0.17184452,
        -1.03823795,  0.31879426],
       [-0.84152431,  2.12632792, -0.48609018, ..., -0.25938604,
        -0.21545477,  2.19271628],
       ...,
       [ 2.12288146, -1.15121561,  0.23013046, ..., -0.25938604,
        -0.50760242,  0.14843771],
       [ 0.04779742, -0.30031488,  0.43476492, ...,  0.90366551,
        -0.69839272,  0.40397253],
       [-1.13796489, -1.11970076, -0.07682125, ...,  0.45345201,
        -0.69243053, -0.70334503]], shape=(614, 8))

In [17]:
# Row/column counts of the scaled train and test arrays.
train_shape = X_train_scaled.shape
test_shape = X_test_scaled.shape
print(train_shape, test_shape)

(614, 8) (154, 8)


## Training the model

Classifiers are machine learning models or algorithms used to categorize data into classes or labels.

A classifier answers the question:

"Given this input, which category (class) does it belong to?"

 Examples of classifiers:
 
| Classifier | Example use case |
| --- | --- |
| LogisticRegression | Spam vs. not-spam email |
| KNeighborsClassifier | Recognizing handwritten digits |
| DecisionTreeClassifier | Predicting whether a patient has diabetes |
| RandomForestClassifier | Classifying images or customer churn |
| SVC (Support Vector Classifier) | Face recognition |
| Naive Bayes | Sentiment analysis (positive/negative) |

In [18]:
# Support vector classifier with a linear kernel.
classifier = svm.SVC(kernel="linear")

In [19]:
# Fit the SVM on the standardized training features and their labels.
classifier.fit(X=X_train_scaled, y=Y_train)

`.transform()` is not used here because SVC (Support Vector Classifier) is a model, not a transformer.

`.fit()` + `.transform()` -> used with transformers (like StandardScaler, PCA): they learn something from the data and then modify the input data.

`.fit()` + `.predict()` -> used with models/classifiers (like SVC, LogisticRegression): they learn from the data and then make predictions.

## Model Evaluation

In [20]:
# Accuracy on the same rows the model was trained on.
# accuracy_score expects (y_true, y_pred): true labels first, predictions
# second. Accuracy itself is symmetric, so the value is unchanged, but the
# documented order keeps this correct if the metric is ever swapped for one
# that is not symmetric.
X_train_prediction = classifier.predict(X_train_scaled)
training_data_accuracy = accuracy_score(Y_train, X_train_prediction)

In [21]:
# Fraction of training rows the classifier labels correctly.
training_data_accuracy

0.7866449511400652

In [22]:
# Accuracy on the held-out test rows, which the model never saw during fit.
# accuracy_score expects (y_true, y_pred): true labels first, predictions
# second. Accuracy itself is symmetric, so the value is unchanged, but the
# documented order keeps this correct if the metric is ever swapped for one
# that is not symmetric.
X_test_prediction = classifier.predict(X_test_scaled)
testing_data_accuracy = accuracy_score(Y_test, X_test_prediction)

In [23]:
# Fraction of held-out test rows the classifier labels correctly.
testing_data_accuracy

0.7727272727272727

## Making Predictions

In [33]:
# Classify one hand-entered sample with the in-memory scaler and classifier.
#
# The scaler was fitted on a DataFrame, so passing a bare NumPy array
# (np.asarray([input_data])) makes scaler.transform() warn about missing
# feature names. Wrapping the sample in a one-row DataFrame that uses the
# training column names, in the same order, avoids the warning.
columns = [
    "Pregnancies",
    "Glucose",
    "BloodPressure",
    "SkinThickness",
    "Insulin",
    "BMI",
    "DiabetesPedigreeFunction",
    "Age",
]
input_data = (1, 85, 66, 29, 0, 26.6, 0.351, 31)

feature_df = pd.DataFrame([input_data], columns=columns)
std_feature = scaler.transform(feature_df)  # same scaling as the training data
prediction = classifier.predict(std_feature)

# Outcome 1 is reported as diabetic, anything else as non-diabetic.
print("Diabetic" if prediction[0] == 1 else "Non-Diabetic")

Non-Diabetic


## Why SVM?

We don’t always know in advance that SVM (Support Vector Machine) is the best model — we usually try multiple algorithms and compare their performance. The choice depends on the type of problem and data characteristics.

 Common reasons to try SVM:
 
Works well for binary classification (like diabetic vs. not diabetic).

Effective in high-dimensional spaces (many features).

Good with small to medium-sized datasets.

Supports different kernels (linear, RBF, etc.) to handle complex relationships.

## Saving the trained model

In [25]:
import pickle

In [29]:
# Persist both fitted objects: the classifier alone is not enough, because new
# samples must be scaled with the same scaler before prediction.
# The context managers make sure each file is flushed and closed once written;
# a bare open(...) passed straight to pickle.dump leaves the handle open.
filename = "trained_model.sav"
with open(filename, "wb") as model_file:
    pickle.dump(classifier, model_file)
with open("scaler.sav", "wb") as scaler_file:
    pickle.dump(scaler, scaler_file)

## Loading the model

In [31]:
# Restore the classifier and scaler from the files written by the save step.
# The context managers close each file as soon as it has been read.
# NOTE: pickle.load can execute arbitrary code — only load files you created
# yourself or otherwise trust.
with open("trained_model.sav", "rb") as model_file:
    loaded_model = pickle.load(model_file)
with open("scaler.sav", "rb") as scaler_file:
    loaded_scaler = pickle.load(scaler_file)

## Predicting from model

In [32]:
# Classify one hand-entered sample using the scaler and classifier restored
# from disk. The sample is wrapped in a one-row DataFrame with the training
# column names, in the same order, so the scaler receives the same kind of
# input it was fitted on.
columns = [
    "Pregnancies",
    "Glucose",
    "BloodPressure",
    "SkinThickness",
    "Insulin",
    "BMI",
    "DiabetesPedigreeFunction",
    "Age",
]
input_data = (1, 85, 66, 29, 0, 26.6, 0.351, 31)

feature_df = pd.DataFrame([input_data], columns=columns)
std_feature = loaded_scaler.transform(feature_df)  # same scaling as training
prediction = loaded_model.predict(std_feature)

# Outcome 1 is reported as diabetic, anything else as non-diabetic.
print("Diabetic" if prediction[0] == 1 else "Non-Diabetic")

Non-Diabetic
