In [1]:
!pip install scikit-learn



In [2]:
import pandas as pd
from sklearn.tree import DecisionTreeClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

## Loading data

In [3]:
# Load vehicles.csv from the working directory and preview its first 8 rows.
df = pd.read_csv(filepath_or_buffer='vehicles.csv')
df.head(n=8)

Unnamed: 0,Age,Gender,Income,Favorite Transport
0,5,female,,bicycle
1,8,male,,scooter
2,10,female,,bicycle
3,14,male,,metro
4,16,male,,metro
5,18,female,,metro
6,20,male,200.0,scooter
7,22,female,500.0,scooter


In [4]:
# Show the dtype pandas inferred for each column of the loaded frame.
df.dtypes

Age                     int64
Gender                 object
Income                float64
Favorite Transport     object
dtype: object

## Cleaning

In [5]:
# Count missing values per column (isna is the same check as isnull).
df.isna().sum()

Age                   0
Gender                0
Income                6
Favorite Transport    0
dtype: int64

In [6]:
# Treat a missing income as 0.0.
# The result is assigned back to the column rather than calling
# fillna(inplace=True) on the df['Income'] selection: that chained in-place form
# operates on an intermediate object, is deprecated in pandas 2.x, and leaves
# df unchanged once Copy-on-Write is enabled.
df['Income'] = df['Income'].fillna(0.0)
df.head()

Unnamed: 0,Age,Gender,Income,Favorite Transport
0,5,female,0.0,bicycle
1,8,male,0.0,scooter
2,10,female,0.0,bicycle
3,14,male,0.0,metro
4,16,male,0.0,metro


In [7]:
# Re-count missing values per column to verify the frame's current null state.
df.isna().sum()

Age                   0
Gender                0
Income                0
Favorite Transport    0
dtype: int64

## Encoding

In [8]:
# List the distinct values present in the Gender column before encoding it.
df.loc[:, 'Gender'].unique()

array(['female', 'male'], dtype=object)

In [9]:
# Encode Gender as integers: male -> 0, female -> 1.
# Series.map with a write-back is used instead of DataFrame.replace(..., inplace=True):
# it avoids in-place mutation and does not depend on replace() silently downcasting
# an object column to int (deprecated behaviour in recent pandas).
gender_codes = {'male': 0, 'female': 1}

# Guard so that re-running the cell on an already-encoded column is a no-op.
if not pd.api.types.is_numeric_dtype(df['Gender']):
    # astype raises if any label was not covered by gender_codes (map yields NaN),
    # so unexpected categories fail loudly instead of reaching the model.
    df['Gender'] = df['Gender'].map(gender_codes).astype('int64')
df.head(3)

Unnamed: 0,Age,Gender,Income,Favorite Transport
0,5,1,0.0,bicycle
1,8,0,0.0,scooter
2,10,1,0.0,bicycle


In [10]:
# Re-check the column dtypes after the Gender encoding step.
df.dtypes

Age                     int64
Gender                  int64
Income                float64
Favorite Transport     object
dtype: object

## Model

In [11]:
# Feature matrix: every column except the 'Favorite Transport' target.
X = df.drop(columns=['Favorite Transport'])
X.head(n=3)

Unnamed: 0,Age,Gender,Income
0,5,1,0.0
1,8,0,0.0
2,10,1,0.0


In [12]:
# Target vector: the 'Favorite Transport' label for each row.
y = df.loc[:, 'Favorite Transport']
y.head(n=3)

0    bicycle
1    scooter
2    bicycle
Name: Favorite Transport, dtype: object

In [13]:
# Decision tree classifier for the transport label.
# random_state is fixed because the tree permutes features at each split; without
# a seed, ties between equally good splits can be broken differently on each run.
model = DecisionTreeClassifier(random_state=42)
model

DecisionTreeClassifier()

In [14]:
# Train the tree on the complete dataset (all rows of X against y).
model.fit(X=X, y=y)

DecisionTreeClassifier()

## Prediction

In [15]:
# Three hand-written rows (Age, Gender code, Income) to feed to the fitted model.
test_df = pd.DataFrame(
    [
        (12, 0, 0.0),
        (30, 0, 4000.0),
        (75, 1, 50000.0),
    ],
    columns=['Age', 'Gender', 'Income'],
)
test_df

Unnamed: 0,Age,Gender,Income
0,12,0,0.0
1,30,0,4000.0
2,75,1,50000.0


In [16]:
# Predict the favourite transport for each hand-written row in test_df.
model.predict(X=test_df)

array(['scooter', 'car', 'helicopter'], dtype=object)

## Evaluation

In [17]:
# Hold out 20% of the rows for evaluation.
# random_state pins the shuffle so the split, and therefore the accuracy reported
# below, is the same on every Restart & Run All.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [18]:
# Fresh tree trained only on the training split, so the held-out rows stay unseen.
# Note: this rebinds `model`, replacing the tree fitted on the full dataset earlier.
# random_state is fixed so the fitted tree is reproducible between runs.
model = DecisionTreeClassifier(random_state=42)
model.fit(X_train, y_train)

DecisionTreeClassifier()

In [19]:
# Display the held-out feature rows that the model will be evaluated on.
X_test

Unnamed: 0,Age,Gender,Income
2,10,1,0.0
18,56,1,1400.0
0,5,1,0.0
1,8,0,0.0
9,25,1,800.0
25,75,0,30000.0


In [20]:
# Predicted transport label for every held-out row.
predictions = model.predict(X=X_test)
predictions

array(['metro', 'taxi', 'metro', 'metro', 'scooter', 'helicopter'],
      dtype=object)

In [21]:
# Fraction of held-out rows whose predicted label matches the true label.
model_accuracy_score = accuracy_score(y_true=y_test, y_pred=predictions)
model_accuracy_score

0.3333333333333333

## Exporting to the DOT file

In [22]:
from sklearn import tree

In [23]:
# Write the fitted tree to a Graphviz DOT file.
# class_names must line up one-to-one with the classes the model actually learned.
# The model was fit on y_train, so a label that appears in y but not in y_train
# would shift every name if sorted(y.unique()) were used; model.classes_ is the
# sorted label list of the fitted estimator itself.
# feature_names are taken from the training frame rather than hard-coded so they
# cannot drift from the columns the tree was fit on.
tree.export_graphviz(
    model,
    out_file='decision_tree_mode.dot',
    feature_names=list(X_train.columns),
    class_names=[str(label) for label in model.classes_],
    filled=True,
)

## Charts

In [24]:
# Plotting libraries for the charts below.
# `plt` must be the pyplot submodule: the top-level `matplotlib` package has no
# show(), so `import matplotlib as plt` would make plt.show() fail.
import seaborn as sns
import matplotlib.pyplot as plt


ModuleNotFoundError: No module named 'seaborn'

In [None]:
# Bar chart of row counts per Gender code, split by favourite transport.
sns.countplot(data=df, x='Gender', hue='Favorite Transport')
plt.show()

In [25]:
# Histogram of Income, coloured by favourite transport.
sns.histplot(data=df, x='Income', hue='Favorite Transport')
plt.show()

NameError: name 'sns' is not defined