# **Decision Tree Classifier**

### Step 1: Import the necessary libraries

In [1]:
#for data manipulation
import numpy as np
import pandas as pd

#for visualization
import matplotlib.pyplot as plt
import seaborn as sns

#for data modelling
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC

#for finding out the accuracy of the model
from sklearn import metrics

### Step 2: Load the dataset

In [2]:
df=pd.read_csv('insurance.csv')

### Step 3: Introductory Insights

Obtain introductory information such as shape of the data, number of rows, number of columns, etc.

In [3]:
df.head(10)

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,female,27.9,0,yes,southwest,16884.924
1,18,male,33.77,1,no,southeast,1725.5523
2,28,male,33.0,3,no,southeast,4449.462
3,33,male,22.705,0,no,northwest,21984.47061
4,32,male,28.88,0,no,northwest,3866.8552
5,31,female,25.74,0,no,southeast,3756.6216
6,46,female,33.44,1,no,southeast,8240.5896
7,37,female,27.74,3,no,northwest,7281.5056
8,37,male,29.83,2,no,northeast,6406.4107
9,60,female,25.84,0,no,northwest,28923.13692


In [4]:
df.shape

(1338, 7)

In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1338 entries, 0 to 1337
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   age       1338 non-null   int64  
 1   sex       1338 non-null   object 
 2   bmi       1338 non-null   float64
 3   children  1338 non-null   int64  
 4   smoker    1338 non-null   object 
 5   region    1338 non-null   object 
 6   charges   1338 non-null   float64
dtypes: float64(2), int64(2), object(3)
memory usage: 73.3+ KB


### Step 4: Statistical Insights


In [6]:
df.describe()

Unnamed: 0,age,bmi,children,charges
count,1338.0,1338.0,1338.0,1338.0
mean,39.207025,30.663397,1.094918,13270.422265
std,14.04996,6.098187,1.205493,12110.011237
min,18.0,15.96,0.0,1121.8739
25%,27.0,26.29625,0.0,4740.28715
50%,39.0,30.4,1.0,9382.033
75%,51.0,34.69375,2.0,16639.912515
max,64.0,53.13,5.0,63770.42801


### Step 5: Data Cleaning

Handling outliers, duplicates and missing values

**Missing Values**

In [7]:
df.isnull().sum()

age         0
sex         0
bmi         0
children    0
smoker      0
region      0
charges     0
dtype: int64

**Duplicate Values**

In [8]:
df.duplicated().sum()

1

In [9]:
dupes = df[df.duplicated(keep='first')]
 
print("Duplicate Rows :")

dupes

Duplicate Rows :


Unnamed: 0,age,sex,bmi,children,smoker,region,charges
581,19,male,30.59,0,no,northwest,1639.5631


In [10]:
df.drop_duplicates(inplace=True)

In [11]:
# def smokes(row):
#     if row['smoker'] == "yes":
#         return 1 
#     else:
#         return 0
# df['smoker'] = df.apply(smokes,axis=1)
# df


### Step 6:Feature Selection

In [12]:
df=df.drop(['charges'], axis=1)

In [13]:
def gender(row):
    if row['sex'] == "male":
        return 1 
    else:
        return 0
df['sex'] = df.apply(gender,axis=1)
df

Unnamed: 0,age,sex,bmi,children,smoker,region
0,19,0,27.900,0,yes,southwest
1,18,1,33.770,1,no,southeast
2,28,1,33.000,3,no,southeast
3,33,1,22.705,0,no,northwest
4,32,1,28.880,0,no,northwest
...,...,...,...,...,...,...
1333,50,1,30.970,3,no,northwest
1334,18,0,31.920,0,no,northeast
1335,18,0,36.850,0,no,southeast
1336,21,0,25.800,0,no,southwest


In [14]:
def regi(row):
    if row['region'] == "northeast":
        return 1 
    elif row['region'] == "northwest":
        return 2
    elif row['region'] == "southeast":
        return 3
    else:
        return 4
df['region'] = df.apply(regi,axis=1)
df

Unnamed: 0,age,sex,bmi,children,smoker,region
0,19,0,27.900,0,yes,4
1,18,1,33.770,1,no,3
2,28,1,33.000,3,no,3
3,33,1,22.705,0,no,2
4,32,1,28.880,0,no,2
...,...,...,...,...,...,...
1333,50,1,30.970,3,no,2
1334,18,0,31.920,0,no,1
1335,18,0,36.850,0,no,3
1336,21,0,25.800,0,no,4


### Step 7: Data Visualisation

### Step 8: Data Modelling

* Train-Test-Split 
* Fit the model to perform predictions
* Using the **Decision Tree Classifier** Algorithm
  - wo
  - re

In [15]:
x=df[['age','sex','bmi','children','region']].values
x[0:5]

array([[19.   ,  0.   , 27.9  ,  0.   ,  4.   ],
       [18.   ,  1.   , 33.77 ,  1.   ,  3.   ],
       [28.   ,  1.   , 33.   ,  3.   ,  3.   ],
       [33.   ,  1.   , 22.705,  0.   ,  2.   ],
       [32.   ,  1.   , 28.88 ,  0.   ,  2.   ]])

In [16]:
y=df['smoker'].values
y[0:5]

array(['yes', 'no', 'no', 'no', 'no'], dtype=object)

In [17]:
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3, random_state=3)

In [18]:
x_train.shape, y_train.shape, x_test.shape, y_test.shape

((935, 5), (935,), (402, 5), (402,))

In [19]:
svm_model = SVC()
svm_model.fit(x_train, y_train)

In [20]:
y_pred=svm_model.predict(x_test)

In [21]:
print("Accuracy:",metrics.accuracy_score(y_test, y_pred))

Accuracy: 0.7736318407960199


In [25]:
#X_new = np.array([[3, 2, 1, 0.2], [4.9, 2.2, 3.8, 1.1 ], [5.3, 2.5, 4.6, 1.9 ]])
inp = []

inp_len = int(input('Enter the number of sets you want to predict the class of : '))

for val in range(inp_len):
    print('\nSet ', val+1, ': ')
    ele = [int(input('age : ')), int(input('sex : ')), float(input('bmi : ')),int(input('no of children : ')),int(input('region : '))]
    inp.append(ele)
    

X_new = np.array(inp)

#Prediction of the species from the input vector
Y_pred = svm_model.predict(X_new)

print('\nPredicted Class: ')
for ans in range(inp_len):
  print('Set ', ans+1, ': ', Y_pred)



Set  1 : 

Predicted Class: 
Set  1 :  ['no']


In [None]:
outlook=input('Outlook : ')	
temp=input('Temperature: ')	
humidity=input('Humidity: ')	
wind=input('Wind: ')

df=pd.DataFrame({'outlook':[outlook],'temp':[temp],'humidity':[humidity],'wind':[wind]})

y_pred1=clf.predict(df)

In [None]:
y_pred1

In [None]:

from sklearn.pipeline import Pipeline


In [None]:
def step(df):
    encoder = preprocessing.LabelEncoder()

    for i in df.columns:
        if isinstance(df[i][0], str):
            df[i] = encoder.fit_transform(df[i])

In [None]:
pipe = Pipeline(steps=[
    ('step1',step(df)),
    ('step2',DecisionTreeClassifier(criterion="entropy",  max_depth=5))
])

In [None]:

# Train Decision Tree Classifer
pipe.fit(x_train,y_train)

#Predict the response for test dataset
y_pred = pipe.predict(x_test)

In [None]:
print("Accuracy:",metrics.accuracy_score(y_test, y_pred))

In [None]:
outlook=input('Outlook : ')	
temp=input('Temperature: ')	
humidity=input('Humidity: ')	
wind=input('Wind: ')

df=pd.DataFrame({'outlook':[outlook],'temp':[temp],'humidity':[humidity],'wind':[wind]})

y_pred1=pipe.predict(df)
y_pred1