In [27]:
#import all libraries and classes used in this notebook
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC

In [2]:
#import the dataset
#the CSV location can be overridden with the MEDICAL_INSURANCE_CSV environment variable,
#so the notebook also runs on machines that do not have this exact folder layout;
#when the variable is unset the original path is used unchanged
DATA_PATH = os.environ.get(
    "MEDICAL_INSURANCE_CSV",
    r"C:\Users\Williams\Desktop\datascience\AiPlusInvasion\Data\Medical_insurance_dataset.csv",
)
medical_insurance = pd.read_csv(DATA_PATH)

In [3]:
#preview the first five records of the raw data
medical_insurance.head(n=5)

Unnamed: 0,User ID,Gender,Age,EstimatedSalary,Purchased
0,15624510,Male,19,19000,not-purchased
1,15810944,Male,35,20000,not-purchased
2,15668575,Female,26,43000,not-purchased
3,15603246,Female,27,57000,not-purchased
4,15804002,Male,19,76000,not-purchased


In [4]:
#check the dimension of the dataset as (rows, columns)
medical_insurance.shape

(400, 5)

In [5]:
#check statistics of the dataset
#only the numeric columns are summarised here; Gender and Purchased hold
#strings (object dtype) and therefore do not appear in the output
medical_insurance.describe()

Unnamed: 0,User ID,Age,EstimatedSalary
count,400.0,400.0,400.0
mean,15691540.0,37.655,69742.5
std,71658.32,10.482877,34096.960282
min,15566690.0,18.0,15000.0
25%,15626760.0,29.75,43000.0
50%,15694340.0,37.0,70000.0
75%,15750360.0,46.0,88000.0
max,15815240.0,60.0,150000.0


In [6]:
#work on an independent (deep) copy so the raw frame stays untouched
medical_insurance_df = medical_insurance.copy(deep=True)

In [7]:
#count the missing values in every column
medical_insurance_df.isna().sum()

User ID            0
Gender             0
Age                0
EstimatedSalary    0
Purchased          0
dtype: int64

In [8]:
#inspect the data type of each column
#Gender and Purchased are object (string) columns, so they are encoded
#to integers before modelling
medical_insurance_df.dtypes

User ID             int64
Gender             object
Age                 int64
EstimatedSalary     int64
Purchased          object
dtype: object

In [9]:
#instantiate the encoder class
#used below to turn string categories into integer codes
le = LabelEncoder()

In [10]:
#transform the categorical columns
#Gender gets its own encoder so its string<->integer mapping stays available;
#refitting one shared encoder on Purchased would discard the Gender classes
gender_encoder = LabelEncoder()
medical_insurance_df["Gender"] = gender_encoder.fit_transform(medical_insurance_df["Gender"])
#le ends up fitted on Purchased, exactly as before, so le.classes_ and
#le.inverse_transform can still decode the target labels
medical_insurance_df["Purchased"] = le.fit_transform(medical_insurance_df["Purchased"])

In [11]:
#inspect the encoded dataset (the display is truncated to the first and last rows)
medical_insurance_df

Unnamed: 0,User ID,Gender,Age,EstimatedSalary,Purchased
0,15624510,1,19,19000,0
1,15810944,1,35,20000,0
2,15668575,0,26,43000,0
3,15603246,0,27,57000,0
4,15804002,1,19,76000,0
...,...,...,...,...,...
395,15691863,0,46,41000,1
396,15706071,1,51,23000,1
397,15654296,0,50,20000,1
398,15755018,1,36,33000,0


In [12]:
#feature selection
#features: every column except the identifier and the target
X = medical_insurance_df.drop(columns=["User ID", "Purchased"])
#target: the encoded Purchased label
y = medical_insurance_df["Purchased"]

In [13]:
#scale the data
scaler = StandardScaler()
X = scaler.fit_transform(X)

In [14]:
#split the data
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.8, random_state=0)

In [15]:
#build the KNN model (5 neighbours, the library default, stated explicitly)
knn = KNeighborsClassifier(n_neighbors=5)

In [16]:
#train the KNN model on the training split
knn.fit(X_train, y_train)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
                     metric_params=None, n_jobs=None, n_neighbors=5, p=2,
                     weights='uniform')

In [54]:
#predict the Purchased label for every row of the test split
knn_pred = knn.predict(X_test)

In [55]:
#inspect the predicted labels
knn_pred

array([0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1,
       0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0,
       1, 0, 0, 1, 0, 1, 1, 0, 0, 1, 1, 1, 0, 0, 1, 0, 0, 1, 0, 1, 0, 1,
       0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 1])

In [56]:
#fraction of test rows whose predicted label matches the true label
knn_accuracy = accuracy_score(y_true=y_test, y_pred=knn_pred)

In [57]:
#display the KNN test accuracy
knn_accuracy

0.95

In [58]:
#class probability estimates for the test split
#(not used by any of the cells shown in this section)
knn_prob = knn.predict_proba(X_test)

In [60]:
from sklearn.metrics import classification_report

In [61]:
#per-class precision, recall and f1 for the KNN predictions
knn_classification_report = classification_report(y_true=y_test, y_pred=knn_pred)

In [62]:
#print the report so its line breaks render as a table
print(knn_classification_report)

              precision    recall  f1-score   support

           0       0.98      0.95      0.96        58
           1       0.88      0.95      0.91        22

    accuracy                           0.95        80
   macro avg       0.93      0.95      0.94        80
weighted avg       0.95      0.95      0.95        80



In [63]:
#build a support vector classifier with a linear kernel
svm = SVC(kernel='linear')

In [64]:
#train the SVM on the same training split used for KNN
svm.fit(X_train, y_train)

SVC(C=1.0, break_ties=False, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='scale', kernel='linear',
    max_iter=-1, probability=False, random_state=None, shrinking=True,
    tol=0.001, verbose=False)

In [65]:
#predict the Purchased label for every row of the test split
svm_pred = svm.predict(X_test)

In [66]:
#fraction of test rows whose predicted label matches the true label
svm_accuracy = accuracy_score(y_true=y_test, y_pred=svm_pred)

In [67]:
#display the SVM test accuracy
svm_accuracy

0.9125

In [68]:
#per-class precision, recall and f1 for the SVM predictions
svm_classification_report = classification_report(y_true=y_test, y_pred=svm_pred)

In [69]:
#print the report so its line breaks render as a table
print(svm_classification_report)

              precision    recall  f1-score   support

           0       0.90      0.98      0.94        58
           1       0.94      0.73      0.82        22

    accuracy                           0.91        80
   macro avg       0.92      0.86      0.88        80
weighted avg       0.91      0.91      0.91        80

