# ***SVM Model***

## Import essential libraries

In [None]:
# for data analysis
import numpy as np 
import pandas as pd


# to visualize data
from matplotlib import pyplot as plt
import seaborn as sns; sns.set(font_scale=1.2)

# to split the dataset
from sklearn.model_selection import train_test_split

# to access the svm model
from sklearn import svm

# to test the accuracy of the model
from sklearn.metrics import accuracy_score,confusion_matrix

%matplotlib inline


## Read data from the CSV file

In [None]:
# Load the dataset from Google Drive.
# The file id is the second-to-last path segment of the share link; it is
# rebuilt into a uc?id= URL, which is what gets passed to read_csv.
share_link='https://drive.google.com/file/d/1yFKIb7GCgKPWR-g7q8DZTLzXOCVxd99n/view?usp=sharing'
file_id=share_link.split('/')[-2]
url='https://drive.google.com/uc?id='+file_id
dataset_df=pd.read_csv(url)

In [None]:
# report the dimensions of the dataset
rows=len(dataset_df.index)
cols=len(dataset_df.columns)

print('No of rows = ',rows)
print('No of columns = ',cols)


No of rows =  840
No of columns =  24


In [None]:
# Preview the first 10 data records to sanity-check that the CSV was loaded
# with the expected columns.
dataset_df.head(10)

Unnamed: 0,Sex,Age,Living_Area,Height/m,Weight/kg,vaccine_type,blood_group,Swelling,Redness,Itching,...,Coughing,Diarrhea,Nausea_and_vomiting,Breathlessness,Joint_Pain,Fainted,Anaphylactic_Reaction,Tingling,Swollen_Lymph_Nodes,Symptoms_time
0,Male,51,Downtown,1.71,82,Sinopharm,A+,No,No,No,...,No,No,No,No,No,No,No,No,No,
1,Female,45,Downtown,1.47,55,Astrazeneca,B+,No,No,No,...,No,No,No,No,No,No,No,No,No,
2,Male,56,Downtown,1.51,59,Sinopharm,B-,No,No,No,...,No,No,No,No,No,No,No,No,No,< 24 hours
3,Male,27,Downtown,1.6,90,Astrazeneca,AB-,No,No,No,...,No,No,No,No,No,No,No,No,No,
4,Female,31,Downtown,1.48,83,Sinopharm,AB+,No,No,No,...,Yes,No,No,No,No,No,No,No,No,< 24 hours
5,Male,48,Downtown,1.61,59,Moderna,O-,No,No,No,...,No,No,No,No,No,No,No,No,No,< 24 hours
6,Female,49,Downtown,1.54,59,Moderna,A+,No,No,No,...,No,No,No,No,No,No,No,No,No,
7,Female,57,Downtown,1.65,74,Moderna,AB+,No,No,No,...,No,No,No,No,No,No,No,No,No,
8,Female,36,Downtown,1.87,89,Sinopharm,B-,No,No,No,...,No,No,No,No,No,No,No,No,No,24-72 hours
9,Male,54,Downtown,1.61,97,Sinopharm,O+,No,No,No,...,No,No,No,No,No,No,No,No,No,< 24 hours


In [None]:
# Preview the last 10 data records, to check that the end of the file was
# read the same way as the start.
dataset_df.tail(10)

Unnamed: 0,Sex,Age,Living_Area,Height/m,Weight/kg,vaccine_type,blood_group,Swelling,Redness,Itching,...,Coughing,Diarrhea,Nausea_and_vomiting,Breathlessness,Joint_Pain,Fainted,Anaphylactic_Reaction,Tingling,Swollen_Lymph_Nodes,Symptoms_time
830,Female,30,Downtown,1.59,91,Sinopharm,AB-,No,No,No,...,No,No,No,No,No,No,No,No,No,24-72 hours
831,Male,27,Downtown,1.65,86,Moderna,AB+,No,No,No,...,No,No,No,No,No,No,No,No,No,< 24 hours
832,Male,32,Downtown,1.84,108,Moderna,O-,No,No,No,...,No,No,No,No,No,No,No,No,No,
833,Female,30,Downtown,1.72,73,Pfizer,A-,No,No,No,...,Yes,No,No,No,No,No,No,No,No,
834,Male,27,Downtown,1.83,86,Sinopharm,B+,No,No,No,...,No,No,No,No,No,No,No,No,No,< 24 hours
835,Male,30,Downtown,1.58,43,Astrazeneca,B+,No,No,No,...,No,No,No,No,No,No,No,No,No,24-72 hours
836,Male,30,Downtown,1.61,59,Astrazeneca,A-,No,No,No,...,No,No,No,No,No,No,No,No,No,< 24 hours
837,Female,29,Downtown,1.64,58,Pfizer,A+,No,No,No,...,No,No,No,No,No,No,No,No,No,< 24 hours
838,Female,28,Downtown,1.65,82,Pfizer,AB+,No,No,No,...,Yes,No,No,No,No,No,No,No,No,24-72 hours
839,Male,26,Downtown,1.55,86,Moderna,AB-,No,No,No,...,No,No,No,No,No,No,No,No,No,


In [None]:
# Column names, non-null counts and dtypes. Columns reported as `object`
# still hold text and have to be encoded as numbers before model training.
dataset_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 840 entries, 0 to 839
Data columns (total 24 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   Sex                    840 non-null    object 
 1   Age                    840 non-null    int64  
 2   Living_Area            840 non-null    object 
 3   Height/m               840 non-null    float64
 4   Weight/kg              840 non-null    int64  
 5   vaccine_type           840 non-null    object 
 6   blood_group            840 non-null    object 
 7   Swelling               840 non-null    object 
 8   Redness                840 non-null    object 
 9   Itching                840 non-null    object 
 10  Fever                  840 non-null    object 
 11  Headache               840 non-null    object 
 12  Muscle_Pain            840 non-null    object 
 13  Tiredness              840 non-null    object 
 14  Coughing               840 non-null    object 
 15  Diarrh

In [None]:
# Count the missing values in every column; any non-zero count would have to
# be handled before the data is passed to the model.
dataset_df.isnull().sum()

Sex                      0
Age                      0
Living_Area              0
Height/m                 0
Weight/kg                0
vaccine_type             0
blood_group              0
Swelling                 0
Redness                  0
Itching                  0
Fever                    0
Headache                 0
Muscle_Pain              0
Tiredness                0
Coughing                 0
Diarrhea                 0
Nausea_and_vomiting      0
Breathlessness           0
Joint_Pain               0
Fainted                  0
Anaphylactic_Reaction    0
Tingling                 0
Swollen_Lymph_Nodes      0
Symptoms_time            0
dtype: int64

## Statistical Analysis

In [None]:
# Summary statistics of the numerical columns (at this point only Age,
# Height/m and Weight/kg are numeric; the other columns are still text).
# NOTE(review): a minimum Height/m of 0.48 looks like a data-entry error -- confirm.
dataset_df.describe()

Unnamed: 0,Age,Height/m,Weight/kg
count,840.0,840.0,840.0
mean,36.035714,1.65344,67.425
std,9.907013,0.119138,15.019625
min,18.0,0.48,36.0
25%,28.0,1.57,56.0
50%,33.0,1.65,68.0
75%,42.0,1.74,80.0
max,67.0,1.9,109.0


## Data Pre-processing


### Format Features

In [None]:
# Gender
# Male -> 1, Female -> 0
sex_codes={'Male':1,'Female':0}
# Only encode while the column still holds text: mapping the already-encoded
# integers a second time (e.g. when the cell is re-run) would match no key
# and turn every value into NaN.
if not pd.api.types.is_numeric_dtype(dataset_df['Sex']):
  dataset_df['Sex']=dataset_df['Sex'].map(sex_codes)
dataset_df['Sex'].value_counts()

0    570
1    270
Name: Sex, dtype: int64

In [None]:
# Living Area
# Downtown -> 1, Outskirt -> 0
living_area_codes={'Downtown':1,'Outskirt':0}
# Only encode while the column still holds text: mapping the already-encoded
# integers a second time (e.g. when the cell is re-run) would match no key
# and turn every value into NaN.
if not pd.api.types.is_numeric_dtype(dataset_df['Living_Area']):
  dataset_df['Living_Area']=dataset_df['Living_Area'].map(living_area_codes)
dataset_df['Living_Area'].value_counts()

1    594
0    246
Name: Living_Area, dtype: int64

In [None]:
# Vaccine Type
# 'Pfizer':0  'Moderna':1  'Sinopharm':2  'Astrazeneca':3
# NOTE(review): these integer codes impose an arbitrary order on a nominal
# category; a linear model treats them as magnitudes -- consider one-hot encoding.
vaccine_codes={'Pfizer':0,'Moderna':1,'Sinopharm':2,'Astrazeneca':3}
# Only encode while the column still holds text: mapping the already-encoded
# integers a second time (e.g. when the cell is re-run) would match no key
# and turn every value into NaN.
if not pd.api.types.is_numeric_dtype(dataset_df['vaccine_type']):
  dataset_df['vaccine_type']=dataset_df['vaccine_type'].map(vaccine_codes)
dataset_df['vaccine_type'].value_counts()

2    356
0    186
1    158
3    140
Name: vaccine_type, dtype: int64

In [None]:
# Blood group
# A+ :0  A- :1  AB+ :2  AB- :3  B+ :4  B- :5  O+ :6  O- :7
# NOTE(review): these integer codes impose an arbitrary order on a nominal
# category; a linear model treats them as magnitudes -- consider one-hot encoding.
blood_group_codes={'A+':0,'A-':1,'AB+':2,'AB-':3,'B+':4,'B-':5,'O+':6,'O-':7}
# Only encode while the column still holds text: mapping the already-encoded
# integers a second time (e.g. when the cell is re-run) would match no key
# and turn every value into NaN.
if not pd.api.types.is_numeric_dtype(dataset_df['blood_group']):
  dataset_df['blood_group']=dataset_df['blood_group'].map(blood_group_codes)
dataset_df['blood_group'].value_counts()

3    135
4    120
2    120
1    120
0    105
7     90
5     75
6     75
Name: blood_group, dtype: int64

### Format Targets

In [None]:
# collect the names of the side-effect columns (positions 7 to 22) in a list
sideEffects=list(dataset_df.columns[7:23])
print(sideEffects)
print('size = ' ,len(sideEffects))

['Swelling', 'Redness', 'Itching', 'Fever', 'Headache', 'Muscle_Pain', 'Tiredness', 'Coughing', 'Diarrhea', 'Nausea_and_vomiting', 'Breathlessness', 'Joint_Pain', 'Fainted', 'Anaphylactic_Reaction', 'Tingling', 'Swollen_Lymph_Nodes']
size =  16


In [None]:
# a loop to format the side effects: Yes -> 1, No -> 0
for name in sideEffects:
  # Skip columns that are already numeric so that re-running this cell does
  # not push the encoded 0/1 values through the mapping again (no key would
  # match and the whole column would become NaN).
  if not pd.api.types.is_numeric_dtype(dataset_df[name]):
    dataset_df[name]=dataset_df[name].map({'Yes':1,'No':0})


In [None]:
# print the value counts of every formatted side-effect column
for name in sideEffects:
  print(dataset_df[name].value_counts())
  print('\n')

0    763
1     77
Name: Swelling, dtype: int64


0    823
1     17
Name: Redness, dtype: int64


0    840
Name: Itching, dtype: int64


0    827
1     13
Name: Fever, dtype: int64


0    654
1    186
Name: Headache, dtype: int64


0    507
1    333
Name: Muscle_Pain, dtype: int64


0    539
1    301
Name: Tiredness, dtype: int64


0    774
1     66
Name: Coughing, dtype: int64


0    814
1     26
Name: Diarrhea, dtype: int64


0    827
1     13
Name: Nausea_and_vomiting, dtype: int64


0    829
1     11
Name: Breathlessness, dtype: int64


0    827
1     13
Name: Joint_Pain, dtype: int64


0    839
1      1
Name: Fainted, dtype: int64


0    837
1      3
Name: Anaphylactic_Reaction, dtype: int64


0    782
1     58
Name: Tingling, dtype: int64


0    836
1      4
Name: Swollen_Lymph_Nodes, dtype: int64




## Model Training

In [None]:
# the first seven columns of the dataset are the model inputs (features)
X=features=dataset_df.iloc[:,:7]
print(X)

     Sex  Age  Living_Area  Height/m  Weight/kg  vaccine_type  blood_group
0      1   51            1      1.71         82             2            0
1      0   45            1      1.47         55             3            4
2      1   56            1      1.51         59             2            5
3      1   27            1      1.60         90             3            3
4      0   31            1      1.48         83             2            2
..   ...  ...          ...       ...        ...           ...          ...
835    1   30            1      1.58         43             3            4
836    1   30            1      1.61         59             3            1
837    0   29            1      1.64         58             0            0
838    0   28            1      1.65         82             0            2
839    1   26            1      1.55         86             1            3

[840 rows x 7 columns]


In [None]:


# function to get predictions on side effects
# One side effect per time

def get_predictions(Y,X,input):
  """Train a linear SVM for one side effect and predict it for new inputs.

  Parameters
  ----------
  Y : array-like
      Labels of the side effect, one per row of ``X``.
  X : DataFrame or array-like
      Feature matrix.
  input : array-like
      Rows to predict for, with the features in the same order as the
      columns of ``X``.

  Returns
  -------
  The value returned by ``model.predict(input)``: one predicted label per
  row of ``input``.
  """
  # Work on a plain array (accepts a DataFrame as well as any array-like).
  # NOTE(review): presumably done so that predicting on a plain nested list
  # does not clash with a model fitted on named columns -- confirm.
  X=np.asarray(X)

  # split the dataset 80% train data , 20% test data; only the training part
  # is needed here, the fixed random_state gives the same split on every call
  X_train,_,Y_train,_=train_test_split(X,Y,test_size=0.2,random_state=0)

  # train the svm model with train data
  model=svm.SVC(kernel='linear')
  model.fit(X_train,Y_train)

  # get the predictions for the user inputs
  return model.predict(input)


# a function to get model accuracy on a particular side effect
def get_model_accuracy(Y):
  """Train a linear SVM for one side effect and print its train/test accuracy.

  The feature matrix is read from the notebook-level variable ``X``.

  Parameters
  ----------
  Y : array-like
      Labels of the side effect, one per row of ``X``.

  Returns
  -------
  None
      Both accuracies are printed, not returned.

  Notes
  -----
  Plain accuracy is dominated by the majority class, so a high value for a
  rare side effect does not by itself mean that the positives are detected.
  """
  # split the dataset 80% train data , 20% test data
  # (fixed random_state -> the same split on every call)
  X_train,X_test,Y_train,Y_test=train_test_split(X,Y,test_size=0.2,random_state=0)

  # train the svm model with train data
  model=svm.SVC(kernel='linear')
  model.fit(X_train,Y_train)

  # Accuracy on the training data; accuracy_score expects (y_true, y_pred)
  training_data_accuracy=accuracy_score(Y_train,model.predict(X_train))
  print('Accuracy on trainning data: ',training_data_accuracy)

  # Accuracy on the test data
  test_data_accuracy=accuracy_score(Y_test,model.predict(X_test))
  print('Accuracy on test data: ',test_data_accuracy)


# model accuracy using a confusion matrix
def accuracy_confusion_matrix(Y):
  """Train a linear SVM for one side effect and print its test confusion matrix.

  The feature matrix is read from the notebook-level variable ``X``.

  Parameters
  ----------
  Y : array-like
      Labels of the side effect, one per row of ``X``.

  Returns
  -------
  None
      The confusion matrix is printed, not returned. Rows are the true
      classes and columns the predicted classes.
  """
  # split the dataset 80% train data , 20% test data
  # (fixed random_state -> the same split on every call)
  X_train,X_test,Y_train,Y_test=train_test_split(X,Y,test_size=0.2,random_state=0)

  # train the svm model with train data
  model=svm.SVC(kernel='linear')
  model.fit(X_train,Y_train)

  # confusion_matrix expects (y_true, y_pred); passing the predictions first
  # would print the transposed matrix (false positives and false negatives
  # swapped).
  Y_test_prediction=model.predict(X_test)
  test_confusion_matrix=confusion_matrix(Y_test,Y_test_prediction)
  print(test_confusion_matrix)

  

In [None]:
# predictions on swelling
# Select the target by name rather than by position, so the cell does not
# silently pick another column if the column order changes.
target=dataset_df['Swelling']
Y1=target

# example of user inputs, in the order of the feature columns:
# [Sex, Age, Living_Area, Height/m, Weight/kg, vaccine_type, blood_group]
# (not called `input`, which would shadow the builtin for the rest of the notebook)
user_input=[[1,51,1,1.71,82,2,0]]

output=get_predictions(Y1,X,user_input)

# one label is returned per input row; there is a single row here
result='No' if output[0]==0 else 'Yes'

print('Swelling = ',result)


Swelling =  No


In [None]:
# model accuracy on swelling (prints the accuracy on the training and on the
# test part of the split for the Y1 target)
get_model_accuracy(Y1)

Accuracy on trainning data:  0.90625
Accuracy on test data:  0.9166666666666666


## Predictions on all the side effects

## Accuracies on all side effects

In [None]:
# accuracies on all side effects (using accuracy scores)
# Iterate over the side-effect names directly instead of keeping a manual
# index in step with hard-coded column positions.
for name in sideEffects:
  # 'Itching' contains a single class (every value is 0), and an SVM
  # classifier cannot be fitted on one class only
  if name=='Itching':
    continue
  print('Accuracy on '+name)
  get_model_accuracy(dataset_df[name])
  print('\n')

Accuracy on Swelling
Accuracy on trainning data:  0.90625
Accuracy on test data:  0.9166666666666666


Accuracy on Redness
Accuracy on trainning data:  0.9776785714285714
Accuracy on test data:  0.9880952380952381


Accuracy on Fever
Accuracy on trainning data:  0.9821428571428571
Accuracy on test data:  0.9940476190476191


Accuracy on Headache
Accuracy on trainning data:  0.78125
Accuracy on test data:  0.7678571428571429


Accuracy on Muscle_Pain
Accuracy on trainning data:  0.6101190476190477
Accuracy on test data:  0.5773809523809523


Accuracy on Tiredness
Accuracy on trainning data:  0.6517857142857143
Accuracy on test data:  0.6011904761904762


Accuracy on Coughing
Accuracy on trainning data:  0.9255952380952381
Accuracy on test data:  0.9047619047619048


Accuracy on Diarrhea
Accuracy on trainning data:  0.9672619047619048
Accuracy on test data:  0.9761904761904762


Accuracy on Nausea_and_vomiting
Accuracy on trainning data:  0.9821428571428571
Accuracy on test data:  0.9940

## An example

In [None]:
# predictions on all side effects

# example of user inputs, in the order of the feature columns:
# [Sex, Age, Living_Area, Height/m, Weight/kg, vaccine_type, blood_group]
# (not called `input`, which would shadow the builtin for the rest of the notebook)
user_input=[[1,51,1,1.71,82,2,0]]

# side effects a model is trained for; 'Itching' is left out because it
# contains a single class (every value is 0)
evaluated_side_effects=[name for name in sideEffects if name!='Itching']

# an array to store possible side effects
possible_side_effects=[]

for name in evaluated_side_effects:
  output=get_predictions(dataset_df[name],X,user_input)
  # one label is returned per input row; there is a single row here
  if output[0]!=0:
    possible_side_effects.append(name)

print('Possible Sideeffects =',possible_side_effects)

# Risk = share of the evaluated side effects that were predicted, in percent.
# The denominator is derived from the list instead of a hard-coded 15, so it
# stays correct if the set of side effects changes.
if evaluated_side_effects:
  risk=(len(possible_side_effects)/len(evaluated_side_effects))*100
else:
  risk=0.0
safe=100-risk

print('Risk =',risk)
print('Safe = ',safe)


Possible Sideeffects = []
Risk = 0.0
Safe =  100.0
