# **Decision Tree Classifier**

The aim of this project is to determine whether a person is a smoker or not.

### Step 1: Import the necessary libraries

In [1]:
#for data manipulation
import numpy as np
import pandas as pd

#for visualization
import matplotlib.pyplot as plt
import seaborn as sns

#for data modelling
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier

#for finding out the accuracy of the model
from sklearn import metrics

### Step 2: Load the dataset

In [2]:
# Location of the CSV file that holds the dataset
DATA_PATH = '/workspaces/MSFT-StudentSummit-2023/data/data.csv'
df = pd.read_csv(DATA_PATH)

### Step 3: Introductory Insights

Obtain introductory information such as shape of the data, number of rows, number of columns, etc.

In [3]:
# Preview the first ten records
df.head(n=10)

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,female,27.9,0,yes,southwest,16884.924
1,18,male,33.77,1,no,southeast,1725.5523
2,28,male,33.0,3,no,southeast,4449.462
3,33,male,22.705,0,no,northwest,21984.47061
4,32,male,28.88,0,no,northwest,3866.8552
5,31,female,25.74,0,no,southeast,3756.6216
6,46,female,33.44,1,no,southeast,8240.5896
7,37,female,27.74,3,no,northwest,7281.5056
8,37,male,29.83,2,no,northeast,6406.4107
9,60,female,25.84,0,no,northwest,28923.13692


In [4]:
# Preview the last records of the dataset
df.tail()

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
1333,50,male,30.97,3,no,northwest,10600.5483
1334,18,female,31.92,0,no,northeast,2205.9808
1335,18,female,36.85,0,no,southeast,1629.8335
1336,21,female,25.8,0,no,southwest,2007.945
1337,61,female,29.07,0,yes,northwest,29141.3603


In [5]:
# (number of rows, number of columns)
df.shape

(1338, 7)

In [6]:
# Column names, non-null counts and dtypes
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1338 entries, 0 to 1337
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   age       1338 non-null   int64  
 1   sex       1338 non-null   object 
 2   bmi       1338 non-null   float64
 3   children  1338 non-null   int64  
 4   smoker    1338 non-null   object 
 5   region    1338 non-null   object 
 6   charges   1338 non-null   float64
dtypes: float64(2), int64(2), object(3)
memory usage: 73.3+ KB


### Step 4: Statistical Insights

Obtain information about various statistical data, such as mean, standard deviation, maximum value, and minimum value

In [7]:
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric columns
df.describe()

Unnamed: 0,age,bmi,children,charges
count,1338.0,1338.0,1338.0,1338.0
mean,39.207025,30.663397,1.094918,13270.422265
std,14.04996,6.098187,1.205493,12110.011237
min,18.0,15.96,0.0,1121.8739
25%,27.0,26.29625,0.0,4740.28715
50%,39.0,30.4,1.0,9382.033
75%,51.0,34.69375,2.0,16639.912515
max,64.0,53.13,5.0,63770.42801


### Step 5: Data Cleaning

Handling outliers, duplicates and missing values

**Missing Values**

In [8]:
# Number of missing values in each column
df.isnull().sum()

age         0
sex         0
bmi         0
children    0
smoker      0
region      0
charges     0
dtype: int64

**Duplicate Values**

In [9]:
# Number of rows that exactly repeat an earlier row
df.duplicated().sum()

1

In [10]:
# Flag every row that repeats an earlier one (the first occurrence is not flagged)
is_repeat = df.duplicated(keep='first')
dupes = df.loc[is_repeat]

print("Duplicate Rows :")

dupes

Duplicate Rows :


Unnamed: 0,age,sex,bmi,children,smoker,region,charges
581,19,male,30.59,0,no,northwest,1639.5631


In [11]:
# Remove repeated rows, keeping the first occurrence of each
df = df.drop_duplicates()

In [12]:
# Check the shape again now that duplicates have been dropped
df.shape

(1337, 7)

### Step 6: Feature Selection

In [13]:
# Discard the `region` and `charges` columns; the remaining columns are used for modelling
df = df.drop(columns=['region', 'charges'])
df.head()

Unnamed: 0,age,sex,bmi,children,smoker
0,19,female,27.9,0,yes
1,18,male,33.77,1,no
2,28,male,33.0,3,no
3,33,male,22.705,0,no
4,32,male,28.88,0,no


In [14]:
def gender(row):
    """Encode a record's ``sex`` value as an integer flag.

    Returns 1 when ``row['sex']`` equals ``'male'`` and 0 for any other value.
    """
    return 1 if row['sex'] == 'male' else 0

In [15]:
# Encode `sex` as 1 (male) / 0 (anything else) by applying `gender` to every row.
# The guard keeps this cell idempotent: once the column is numeric, re-running it
# would compare the 0/1 codes with 'male' and silently reset every row to 0.
if not pd.api.types.is_numeric_dtype(df['sex']):
    df['sex'] = df.apply(gender, axis=1)

In [16]:
def smoke(row):
    """Encode a record's ``smoker`` value as an integer flag.

    Returns 1 when ``row['smoker']`` equals ``'yes'`` and 0 for any other value.
    """
    return 1 if row['smoker'] == 'yes' else 0

In [17]:
# Encode `smoker` as 1 (yes) / 0 (anything else) by applying `smoke` to every row.
# The guard keeps this cell idempotent: once the column is numeric, re-running it
# would compare the 0/1 codes with 'yes' and silently reset the whole target to 0.
if not pd.api.types.is_numeric_dtype(df['smoker']):
    df['smoker'] = df.apply(smoke, axis=1)

### Step 7: Data Visualisation

### Step 8: Data Modelling

* Train-Test-Split 
* Fit the model to perform predictions
* Using the **Decision Tree Classifier** algorithm, then comparing it with SVM and Random Forest classifiers

In [18]:
# Separate the features from the target. The target is selected by name rather
# than by position, so the split stays correct even if the column order changes.
x = df.drop(columns='smoker')
y = df['smoker']

In [19]:
# Display the feature columns
x

Unnamed: 0,age,sex,bmi,children
0,19,0,27.900,0
1,18,1,33.770,1
2,28,1,33.000,3
3,33,1,22.705,0
4,32,1,28.880,0
...,...,...,...,...
1333,50,1,30.970,3
1334,18,0,31.920,0
1335,18,0,36.850,0
1336,21,0,25.800,0


In [20]:
# Display the encoded target column
y

0       1
1       0
2       0
3       0
4       0
       ..
1333    0
1334    0
1335    0
1336    0
1337    1
Name: smoker, Length: 1337, dtype: int64

In [21]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=42,
)

In [22]:
# Create the Decision Tree classifier object.
# A fixed random_state makes the fitted tree, and therefore the reported
# accuracy, reproducible from one run to the next.
clf = DecisionTreeClassifier(random_state=42)

# Train the Decision Tree classifier on the training split
clf.fit(x_train, y_train)

# Predict the response for the test dataset
y_pred = clf.predict(x_test)

In [23]:
# Fraction of test rows the decision tree labelled correctly
tree_accuracy = metrics.accuracy_score(y_test, y_pred)
print("Accuracy:", tree_accuracy)

Accuracy: 0.6492537313432836


In [25]:
# Fit a support vector classifier, with scikit-learn's default settings,
# on the same training split used for the decision tree.
svm_model = SVC()
svm_model.fit(x_train, y_train)

In [27]:
# Predict the test-set labels with the fitted SVM
y_svmpred=svm_model.predict(x_test)

In [28]:
# Fraction of test rows the SVM labelled correctly
svm_accuracy = metrics.accuracy_score(y_test, y_svmpred)
print("Accuracy:", svm_accuracy)

Accuracy: 0.7761194029850746


In [29]:
# Random forest of 100 trees, each limited to a depth of 5.
# random_state fixes the randomness so results are repeatable, and
# oob_score=True additionally requests an out-of-bag score during fitting.
classifier_rf = RandomForestClassifier(
    n_estimators=100,
    max_depth=5,
    oob_score=True,
    random_state=42,
    n_jobs=-1,
)

In [31]:
# Train the random forest on the training split
classifier_rf.fit(x_train, y_train)


In [32]:
# Predict the test-set labels with the fitted random forest
y_rfcpred=classifier_rf.predict(x_test)

In [33]:
# Fraction of test rows the random forest labelled correctly
rf_accuracy = metrics.accuracy_score(y_test, y_rfcpred)
print("Accuracy:", rf_accuracy)

Accuracy: 0.7761194029850746
