In [None]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler # standardise the data to a common range
from sklearn.model_selection import train_test_split
from sklearn import svm
from sklearn.metrics import accuracy_score

# **1. Collect Data and Data Analysis**

In [None]:
# Location of the diabetes CSV; adjust this when running outside the original environment.
DATA_PATH = '/content/diabetes.csv'
data = pd.read_csv(DATA_PATH)

In [None]:
# Preview the first rows to confirm the file parsed into the expected columns.
data.head()

**Feature Variables:**
- Pregnancies
- Glucose
- Blood Pressure
- Skin Thickness
- Insulin
- BMI
- Diabetes Pedigree Function
- Age

In [None]:
# Boolean Series flagging each row that repeats an earlier row.
data.duplicated()

In [None]:
# Total number of duplicated rows (each True counts as 1).
data.duplicated().sum()

In [None]:
# Summary statistics for every numeric column.
data.describe()

In [None]:
# (rows, columns) of the loaded frame.
data.shape

In [None]:
# Number of rows for each value of the target column.
data['Outcome'].value_counts()

In [None]:
# Class balance as a percentage of all rows. The denominator is taken from the
# frame itself rather than a hard-coded row count, so the figures stay correct
# if the dataset is filtered or replaced.
round(data['Outcome'].value_counts() / len(data) * 100, 2)

**Target Variable:**
- 1: diabetic
- 0: non-diabetic

In [None]:
# Mean of every column within each Outcome group, to compare the two classes.
data.groupby('Outcome').mean()

In [None]:
# Separate the feature columns from the target.
# `columns=` already names what to drop, so no extra `axis` argument is needed.
X = data.drop(columns='Outcome')
Y = data['Outcome']

In [None]:
# Rich preview of the feature matrix (a few rows rather than a plain-text dump).
X.head()

In [None]:
# Display the feature matrix using the notebook's rich DataFrame rendering.
X

In [None]:
from matplotlib import pyplot as plt
import seaborn as sns

x = data['Glucose']
y = data['Outcome']

# Explicit axes with labels and a title so the figure can be read on its own.
fig, ax = plt.subplots()
ax.scatter(x, y)
ax.set(xlabel='Glucose', ylabel='Outcome', title='Outcome vs. Glucose');

In [None]:
x = data['Insulin']
y = data['Outcome']

# Explicit axes with labels and a title so the figure can be read on its own.
fig, ax = plt.subplots()
ax.scatter(x, y)
ax.set(xlabel='Insulin', ylabel='Outcome', title='Outcome vs. Insulin');

In [None]:
x = data['BloodPressure']
y = data['Outcome']

# Explicit axes with labels and a title so the figure can be read on its own.
fig, ax = plt.subplots()
ax.scatter(x, y)
ax.set(xlabel='BloodPressure', ylabel='Outcome', title='Outcome vs. BloodPressure');

# **Data Standardization**

In [None]:
# Scaler used to standardise the feature columns; created with the library defaults.
scaler = StandardScaler()

In [None]:
# Learn the scaling statistics from X and apply them in one step; the result is
# displayed for inspection.
# Note: the statistics here are computed from all rows of X, not from a training subset.
stdata = scaler.fit_transform(X)
stdata

# **Data Splitting (Train/Test)**

In [None]:
# Split the raw features first, then learn the scaling statistics from the
# training rows only. Fitting the scaler on every row before splitting lets
# test-set information leak into training and biases the reported test accuracy.
# The split itself is unchanged: same sizes, seed and stratification.
x_train_raw, x_test_raw, y_train, y_test = train_test_split(
    X, Y, train_size=0.8, random_state=2, stratify=Y
)
x_train = scaler.fit_transform(x_train_raw)  # re-fits `scaler` on the training rows only
x_test = scaler.transform(x_test_raw)        # test rows reuse the training statistics

In [None]:
# Sanity check: shape of the full feature matrix, then the train and test partitions.
shapes = [part.shape for part in (X, x_train, x_test)]
print(*shapes)

# **Model Training**

In [None]:
# Support-vector classifier with a linear kernel; every other setting is the library default.
clf = svm.SVC(kernel = 'linear')

In [None]:
# Fit the classifier on the training features and their labels.
clf.fit(x_train, y_train)

In [None]:
def scatter(ax, axis, name, title):
  """Draw a count plot of one feature on one cell of a subplot grid.

  Despite its name, this draws ``sns.countplot`` bars (one bar per distinct
  value of the column), not a scatter plot.

  Args:
    ax: 2-D array of matplotlib Axes, as returned by ``plt.subplots``.
    axis: ``(row, col)`` position of the target Axes inside ``ax``.
    name: column of the module-level ``data`` frame to count.
    title: title for the subplot.
  """
  target = ax[axis[0], axis[1]]
  # The column has to be passed as `x=`. Current seaborn releases accept only
  # `data` positionally, so a bare positional column name would collide with
  # the `data=` keyword and raise a TypeError.
  sns.countplot(x=name, data=data, ax=target)
  target.set_title(title)

f, ax = plt.subplots(4, 2, figsize=(20, 15))
plt.suptitle("Count plots of features")

# (grid position, column name, subplot title) for each of the eight features.
features = (
  ((0, 0), "Pregnancies", "Pregnancy chart feature"),
  ((0, 1), "Glucose", "Glucose chart feature"),
  ((1, 0), "BloodPressure", "BloodPressure chart feature"),
  ((1, 1), "SkinThickness", "SkinThickness chart feature"),
  ((2, 0), "Insulin", "Insulin chart feature"),
  ((2, 1), "BMI", "BMI chart feature"),
  ((3, 0), "DiabetesPedigreeFunction", "DiabetesPedigreeFunction chart feature"),
  # Was `"Age" "chart feature"`, which silently concatenated to "Agechart feature".
  ((3, 1), "Age", "Age chart feature"),
)

for axis, name, title in features:
  scatter(ax, axis, name, title)

# **Model Evaluation**

In [None]:
# Predictions for the training rows themselves (not held-out data).
X_pred = clf.predict(x_train)

In [None]:
# Training-set accuracy. accuracy_score takes (y_true, y_pred), so the true
# labels go first.
acc = accuracy_score(y_train, X_pred)
acc

In [None]:
# Accuracy on the held-out test rows. accuracy_score takes (y_true, y_pred),
# so the true labels go first.
test_pred = clf.predict(x_test)
acc2 = accuracy_score(y_test, test_pred)
acc2

Accuracy is not that bad considering the small amount of data.

# **Predictor**

In [None]:
# One patient's measurements, in the same order as the feature columns used for training.
input_data = (5,166,72,19,175,25.8,0.587,51)

# changing the input_data to numpy array
input_data_as_numpy_array = np.asarray(input_data)

# reshape to a single row, as we are predicting for one instance
input_data_reshaped = input_data_as_numpy_array.reshape(1,-1)

# The scaler was fitted on a DataFrame, so it remembers the column names. Give
# it a frame with those same columns so scikit-learn does not warn that
# "X does not have valid feature names". If the scaler recorded no names, fall
# back to the bare array.
feature_names = getattr(scaler, 'feature_names_in_', None)
if feature_names is not None:
  scaler_input = pd.DataFrame(input_data_reshaped, columns=feature_names)
else:
  scaler_input = input_data_reshaped

# standardize the input data with the statistics learned during fitting
std_data = scaler.transform(scaler_input)
print(std_data)

prediction = clf.predict(std_data)
print(prediction)

# Label 0 is non-diabetic; anything else is reported as diabetic.
if prediction[0] == 0:
  print('The person is not diabetic')
else:
  print('The person is diabetic')