<a href="https://colab.research.google.com/github/SANGRAMLEMBE/MTech/blob/main/Machine_Learning_Algorithm/Practical/SVM_assignment.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# SVM_assignment: Comparing Logistic Regression, KNN and SVM on a Diabetes Dataset


In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, precision_score, recall_score

## Step 1: Load the dataset

In [2]:
# Read the raw, uncleaned diabetes CSV from the Colab workspace into a DataFrame.
df = pd.read_csv(filepath_or_buffer='/content/Diabates_dirty.csv')

# Plain-text dump of the frame; pandas elides the middle rows of long frames.
print(df)

     Age        BMI  Blood_Pressure Cholesterol  Diabetes
0   44.0  19.478891              95         250         0
1   54.0  28.807859              87         155         0
2   60.0  22.253709              83         175         1
3   49.0  12.704033              83         213         1
4   36.0  25.287065             135         208         1
..   ...        ...             ...         ...       ...
95  47.0  29.573028             113         163         0
96  27.0  14.125587              87         209         0
97  60.0  35.160713             119         179         0
98  58.0  16.179365             162         184         1
99  20.0  32.499443             121         154         1

[100 rows x 5 columns]


## Step 2: Data Inspection and Preprocessing

In [3]:
# Column-level overview of the raw frame: dtype and non-null count per column.
# Useful for spotting columns read as `object` and columns that contain NaN.
print("Initial Dataset Info:")
df.info()  # writes its report straight to stdout and returns None

Initial Dataset Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100 entries, 0 to 99
Data columns (total 5 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   Age             99 non-null     float64
 1   BMI             99 non-null     float64
 2   Blood_Pressure  100 non-null    int64  
 3   Cholesterol     100 non-null    object 
 4   Diabetes        100 non-null    int64  
dtypes: float64(2), int64(2), object(1)
memory usage: 4.0+ KB


## Check for missing values

In [4]:
# Tally the NaN entries in each column before any cleaning is applied.
missing_per_column = df.isna().sum()

print("\nMissing values before cleaning:")
print(missing_per_column)


Missing values before cleaning:
Age               1
BMI               1
Blood_Pressure    0
Cholesterol       0
Diabetes          0
dtype: int64


In [5]:
print("\n--- Converting non-numeric strings to numbers ---")

# Every column except the label is forced to a numeric dtype. With
# errors='coerce', an entry that cannot be parsed as a number becomes NaN
# instead of raising, so it can be imputed afterwards.
feature_names = df.columns.drop('Diabetes')
for feature in feature_names:
    df[feature] = pd.to_numeric(df[feature], errors='coerce')


--- Converting non-numeric strings to numbers ---


In [6]:
# Handle the NaNs created by the numeric coercion as well as any that were
# already present in the raw file.
print("Handling missing values created from string conversion...")

# Only feature columns are mean-imputed. The label column is skipped on purpose:
# filling a missing 0/1 label with the column mean would create a fractional
# "class" value that is not a valid category.
for col in df.columns.drop('Diabetes'):
    if df[col].isnull().any():
        # Series.mean() skips NaN, so this is the mean of the observed entries.
        mean_val = df[col].mean()
        df[col] = df[col].fillna(mean_val)
        print(f"Filled missing values in '{col}'.")

# Rows without a label cannot be used for supervised training or evaluation,
# so they are dropped rather than guessed.
if df['Diabetes'].isnull().any():
    n_unlabelled = int(df['Diabetes'].isnull().sum())
    df = df.dropna(subset=['Diabetes'])
    print(f"Dropped {n_unlabelled} row(s) with a missing 'Diabetes' label.")

# NOTE(review): the fill values are computed on the full dataset, i.e. before
# the train/test split performed later in this notebook, so held-out rows
# influence the imputation. For a leakage-free evaluation, compute the means on
# the training rows only and apply them to the test rows.

Handling missing values created from string conversion...
Filled missing values in 'Age'.
Filled missing values in 'BMI'.
Filled missing values in 'Cholesterol'.


## Step 2 (continued): Define Features (X) and Target (y)

In [7]:

# Target vector: the diabetes label column.
y = df['Diabetes']

# Feature matrix: every remaining column of the cleaned frame.
X = df.drop(columns='Diabetes')

## Step 3: Split the data into training and testing sets

In [8]:
# Hold out 20% of the rows for evaluation; the fixed seed makes the
# shuffle, and therefore the split, repeatable across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

## Step 4: Scale the features

In [9]:
# Learn the scaling statistics from the training rows only, then apply
# that same fitted transform to both partitions so the test rows never
# influence the scaler.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

## Step 5: Initialize and train the models

In [10]:
# Logistic Regression: fit on the scaled training rows (fit() returns the
# estimator itself), then predict labels for the scaled test rows.
log_reg = LogisticRegression(random_state=42).fit(X_train_scaled, y_train)
y_pred_log_reg = log_reg.predict(X_test_scaled)


In [11]:
# K-Nearest Neighbors (KNN): each test row is classified from its 5 nearest
# training rows in the scaled feature space.
knn = KNeighborsClassifier(n_neighbors=5).fit(X_train_scaled, y_train)
y_pred_knn = knn.predict(X_test_scaled)

In [12]:
# Support Vector Machine (SVM) with a linear kernel, trained and evaluated
# on the same scaled partitions as the other two models.
svm = SVC(kernel='linear', random_state=42).fit(X_train_scaled, y_train)
y_pred_svm = svm.predict(X_test_scaled)


## Step 6: Evaluate the models


In [13]:
# Logistic Regression Metrics: accuracy, precision and recall are computed in
# that order against the held-out labels.
acc_log_reg, prec_log_reg, rec_log_reg = (
    metric(y_test, y_pred_log_reg)
    for metric in (accuracy_score, precision_score, recall_score)
)

In [14]:
# KNN Metrics: accuracy, precision and recall are computed in that order
# against the held-out labels.
acc_knn, prec_knn, rec_knn = (
    metric(y_test, y_pred_knn)
    for metric in (accuracy_score, precision_score, recall_score)
)

In [15]:
# SVM Metrics: accuracy, precision and recall are computed in that order
# against the held-out labels.
acc_svm, prec_svm, rec_svm = (
    metric(y_test, y_pred_svm)
    for metric in (accuracy_score, precision_score, recall_score)
)

## Step 7: Compare the results

In [16]:

# One row per model, holding its name followed by the three test-set scores.
results_df = pd.DataFrame(
    [
        ('Logistic Regression', acc_log_reg, prec_log_reg, rec_log_reg),
        ('K-Nearest Neighbors (KNN)', acc_knn, prec_knn, rec_knn),
        ('Support Vector Machine (SVM)', acc_svm, prec_svm, rec_svm),
    ],
    columns=['Model', 'Accuracy', 'Precision', 'Recall'],
)

In [17]:
# Plain-text rendering of the comparison table: one row per model, with its
# accuracy, precision and recall on the held-out test rows.
print(results_df)


                          Model  Accuracy  Precision    Recall
0           Logistic Regression      0.45      0.500  0.272727
1     K-Nearest Neighbors (KNN)      0.55      0.625  0.454545
2  Support Vector Machine (SVM)      0.45      0.500  0.272727
