In [1]:
import pandas as pd
import numpy as np
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt

In [2]:
#a. Load and explore data
df = pd.read_csv('dataR2.csv')
df

Unnamed: 0,Age,BMI,Glucose,Insulin,HOMA,Leptin,Adiponectin,Resistin,MCP.1,Classification
0,48,23.500000,70,2.707,0.467409,8.8071,9.702400,7.99585,417.114,1
1,83,20.690495,92,3.115,0.706897,8.8438,5.429285,4.06405,468.786,1
2,82,23.124670,91,4.498,1.009651,17.9393,22.432040,9.27715,554.697,1
3,68,21.367521,77,3.226,0.612725,9.8827,7.169560,12.76600,928.220,1
4,86,21.111111,92,3.549,0.805386,6.6994,4.819240,10.57635,773.920,1
...,...,...,...,...,...,...,...,...,...,...
111,45,26.850000,92,3.330,0.755688,54.6800,12.100000,10.96000,268.230,2
112,62,26.840000,100,4.530,1.117400,12.4500,21.420000,7.32000,330.160,2
113,65,32.050000,97,5.730,1.370998,61.4800,22.540000,10.33000,314.050,2
114,72,25.590000,82,2.820,0.570392,24.9600,33.750000,3.27000,392.460,2


In [3]:
df.dtypes

Age                 int64
BMI               float64
Glucose             int64
Insulin           float64
HOMA              float64
Leptin            float64
Adiponectin       float64
Resistin          float64
MCP.1             float64
Classification      int64
dtype: object

In [4]:
df.describe()

Unnamed: 0,Age,BMI,Glucose,Insulin,HOMA,Leptin,Adiponectin,Resistin,MCP.1,Classification
count,116.0,116.0,116.0,116.0,116.0,116.0,116.0,116.0,116.0,116.0
mean,57.301724,27.582111,97.793103,10.012086,2.694988,26.61508,10.180874,14.725966,534.647,1.551724
std,16.112766,5.020136,22.525162,10.067768,3.642043,19.183294,6.843341,12.390646,345.912663,0.499475
min,24.0,18.37,60.0,2.432,0.467409,4.311,1.65602,3.21,45.843,1.0
25%,45.0,22.973205,85.75,4.35925,0.917966,12.313675,5.474283,6.881763,269.97825,1.0
50%,56.0,27.662416,92.0,5.9245,1.380939,20.271,8.352692,10.82774,471.3225,2.0
75%,71.0,31.241442,102.0,11.18925,2.857787,37.3783,11.81597,17.755207,700.085,2.0
max,89.0,38.578759,201.0,58.46,25.050342,90.28,38.04,82.1,1698.44,2.0


In [5]:
df.corr()

Unnamed: 0,Age,BMI,Glucose,Insulin,HOMA,Leptin,Adiponectin,Resistin,MCP.1,Classification
Age,1.0,0.00853,0.230106,0.032495,0.127033,0.102626,-0.219813,0.002742,0.013462,-0.043555
BMI,0.00853,1.0,0.138845,0.145295,0.11448,0.569593,-0.302735,0.19535,0.224038,-0.132586
Glucose,0.230106,0.138845,1.0,0.504653,0.696212,0.30508,-0.122121,0.291327,0.264879,0.384315
Insulin,0.032495,0.145295,0.504653,1.0,0.932198,0.301462,-0.031296,0.146731,0.174356,0.276804
HOMA,0.127033,0.11448,0.696212,0.932198,1.0,0.32721,-0.056337,0.231101,0.259529,0.284012
Leptin,0.102626,0.569593,0.30508,0.301462,0.32721,1.0,-0.095389,0.256234,0.014009,-0.001078
Adiponectin,-0.219813,-0.302735,-0.122121,-0.031296,-0.056337,-0.095389,1.0,-0.252363,-0.200694,-0.01949
Resistin,0.002742,0.19535,0.291327,0.146731,0.231101,0.256234,-0.252363,1.0,0.366474,0.22731
MCP.1,0.013462,0.224038,0.264879,0.174356,0.259529,0.014009,-0.200694,0.366474,1.0,0.091381
Classification,-0.043555,-0.132586,0.384315,0.276804,0.284012,-0.001078,-0.01949,0.22731,0.091381,1.0


In [7]:
#Data Preprocessing
df.isna().sum()

Age               0
BMI               0
Glucose           0
Insulin           0
HOMA              0
Leptin            0
Adiponectin       0
Resistin          0
MCP.1             0
Classification    0
dtype: int64

In [8]:
#Converting X,y as numpy arrays
features = df[['Age','BMI','Glucose','Insulin','HOMA','Leptin','Adiponectin','Resistin','MCP.1']]
X = np.asarray(features)
X[0:5]

array([[4.80000000e+01, 2.35000000e+01, 7.00000000e+01, 2.70700000e+00,
        4.67408667e-01, 8.80710000e+00, 9.70240000e+00, 7.99585000e+00,
        4.17114000e+02],
       [8.30000000e+01, 2.06904945e+01, 9.20000000e+01, 3.11500000e+00,
        7.06897333e-01, 8.84380000e+00, 5.42928500e+00, 4.06405000e+00,
        4.68786000e+02],
       [8.20000000e+01, 2.31246704e+01, 9.10000000e+01, 4.49800000e+00,
        1.00965107e+00, 1.79393000e+01, 2.24320400e+01, 9.27715000e+00,
        5.54697000e+02],
       [6.80000000e+01, 2.13675214e+01, 7.70000000e+01, 3.22600000e+00,
        6.12724933e-01, 9.88270000e+00, 7.16956000e+00, 1.27660000e+01,
        9.28220000e+02],
       [8.60000000e+01, 2.11111111e+01, 9.20000000e+01, 3.54900000e+00,
        8.05386400e-01, 6.69940000e+00, 4.81924000e+00, 1.05763500e+01,
        7.73920000e+02]])

In [9]:
y = np.asarray(df['Classification'])
y[0:5]

array([1, 1, 1, 1, 1], dtype=int64)

In [10]:
#Split Data into Training and Testing
X_train, X_test, y_train, y_test = train_test_split( X, y, test_size=0.2, random_state=4)
print ('Train set:', X_train.shape,  y_train.shape)
print ('Test set:', X_test.shape,  y_test.shape)

Train set: (92, 9) (92,)
Test set: (24, 9) (24,)


In [11]:
#Modelling
'''
The SVM algorithm offers a choice of kernel functions for performing its processing. Basically, mapping data into a higher 
dimensional space is called kernelling. The mathematical function used for the transformation is known as the kernel function, 
and can be of different types, such as:

1.Linear
2.Polynomial
3.Radial basis function (RBF)
4.Sigmoid
Each of these functions has its characteristics, its pros and cons, and its equation, but as there's no easy way of 
knowing which function performs best with any given dataset, we usually choose different functions in turn and compare 
the results. Let's just use the default, RBF (Radial Basis Function) for this lab.
'''

from sklearn import svm
clf = svm.SVC(kernel='rbf')
clf.fit(X_train, y_train) 

SVC()

In [12]:
#Predict new values using xtest
yhat = clf.predict(X_test)
yhat [0:5]

array([2, 2, 2, 2, 2], dtype=int64)

In [13]:
#accuracy and classification metrics
from sklearn.metrics import classification_report
print (classification_report(y_test, yhat))

              precision    recall  f1-score   support

           1       0.00      0.00      0.00        14
           2       0.42      1.00      0.59        10

    accuracy                           0.42        24
   macro avg       0.21      0.50      0.29        24
weighted avg       0.17      0.42      0.25        24



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
