## Loading Data

In [243]:
import pandas as pd
from sklearn.tree import DecisionTreeClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn import tree
import seaborn as sns
import matplotlib.pyplot as plt

In [222]:
df = pd.read_csv('vehicles.csv')
df

Unnamed: 0,Age,Gender,Income,Favorite Transport
0,5,female,,bicycle
1,8,male,,scooter
2,10,female,,bicycle
3,14,male,,metro
4,16,male,,metro
5,18,female,,metro
6,20,male,200.0,scooter
7,22,female,500.0,scooter
8,23,male,300.0,scooter
9,25,female,800.0,metro


In [223]:
df.dtypes

Age                     int64
Gender                 object
Income                float64
Favorite Transport     object
dtype: object

## Cleaning

In [224]:
df.isnull()

Unnamed: 0,Age,Gender,Income,Favorite Transport
0,False,False,True,False
1,False,False,True,False
2,False,False,True,False
3,False,False,True,False
4,False,False,True,False
5,False,False,True,False
6,False,False,False,False
7,False,False,False,False
8,False,False,False,False
9,False,False,False,False


In [225]:
df['Income'].fillna(0.0, inplace=True)
df.head(8)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['Income'].fillna(0.0, inplace=True)


Unnamed: 0,Age,Gender,Income,Favorite Transport
0,5,female,0.0,bicycle
1,8,male,0.0,scooter
2,10,female,0.0,bicycle
3,14,male,0.0,metro
4,16,male,0.0,metro
5,18,female,0.0,metro
6,20,male,200.0,scooter
7,22,female,500.0,scooter


In [226]:
df.isnull().sum()

Age                   0
Gender                0
Income                0
Favorite Transport    0
dtype: int64

In [227]:
df

Unnamed: 0,Age,Gender,Income,Favorite Transport
0,5,female,0.0,bicycle
1,8,male,0.0,scooter
2,10,female,0.0,bicycle
3,14,male,0.0,metro
4,16,male,0.0,metro
5,18,female,0.0,metro
6,20,male,200.0,scooter
7,22,female,500.0,scooter
8,23,male,300.0,scooter
9,25,female,800.0,metro


## Encoding

In [228]:
# OPTION 1
# df.replace(['male', 'female'], [0, 1], inplace=True)
# df.dtypes

In [229]:
# OPTION 2
label_encoder = LabelEncoder()
label_encoder.fit_transform(df['Gender'])
df['Gender'] = label_encoder.fit_transform(df['Gender'])
df

Unnamed: 0,Age,Gender,Income,Favorite Transport
0,5,0,0.0,bicycle
1,8,1,0.0,scooter
2,10,0,0.0,bicycle
3,14,1,0.0,metro
4,16,1,0.0,metro
5,18,0,0.0,metro
6,20,1,200.0,scooter
7,22,0,500.0,scooter
8,23,1,300.0,scooter
9,25,0,800.0,metro


## Model

In [230]:
X = df.drop(columns='Favorite Transport')
X.head(3)

Unnamed: 0,Age,Gender,Income
0,5,0,0.0
1,8,1,0.0
2,10,0,0.0


In [231]:
y = df['Favorite Transport']
y.head(3)

0    bicycle
1    scooter
2    bicycle
Name: Favorite Transport, dtype: object

In [232]:
model = DecisionTreeClassifier()
model

In [233]:
model.fit(X, y)

## Prediction

In [234]:
test_df = pd.DataFrame({
    'Age': [42, 30, 75],
    'Gender': [0, 1, 0],
    'Income': [0.0, 4000, 50000]
})
test_df

Unnamed: 0,Age,Gender,Income
0,42,0,0.0
1,30,1,4000.0
2,75,0,50000.0


In [235]:
model.predict(test_df)

array(['metro', 'car', 'helicopter'], dtype=object)

## Exporting to the DOT file

In [236]:
tree.export_graphviz(
    model,
    out_file='decision_tree_model.dot',
    feature_names=['Age', 'Gender', 'Income'],
    filled=True,
    class_names=sorted(y.unique())
    )

## Evaluation

In [237]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
X_train.shape

(20, 3)

In [238]:
X.shape

(26, 3)

In [239]:
model = DecisionTreeClassifier()
model.fit(X_train, y_train)

In [240]:
X_test

Unnamed: 0,Age,Gender,Income
3,14,1,0.0
20,59,0,5500.0
6,20,1,200.0
8,23,1,300.0
23,70,0,2500.0
5,18,0,0.0


In [241]:
predictions = model.predict(X_test)
predictions

array(['metro', 'car', 'scooter', 'scooter', 'helicopter', 'metro'],
      dtype=object)

In [242]:
model_accuracy_score = accuracy_score(y_test, predictions)
model_accuracy_score

0.8333333333333334