In [1]:
# Import libraries
import pandas as pd 
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_absolute_error

In [2]:
# Load the seaborn "tips" sample dataset directly from GitHub (needs network access)
TIPS_CSV_URL = 'https://raw.githubusercontent.com/mwaskom/seaborn-data/master/tips.csv'
df = pd.read_csv(TIPS_CSV_URL)

In [4]:
# Preview the first five rows of the freshly loaded data
df.head(n=5)

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
0,16.99,1.01,Female,No,Sun,Dinner,2
1,10.34,1.66,Male,No,Sun,Dinner,3
2,21.01,3.5,Male,No,Sun,Dinner,3
3,23.68,3.31,Male,No,Sun,Dinner,2
4,24.59,3.61,Female,No,Sun,Dinner,4


In [5]:
# Drop the 'smoker' column. The working assumption is that it has very little
# correlation with the dependent variable (tip); that correlation is not
# computed anywhere in this notebook.
# drop(..., errors='ignore') keeps the cell idempotent: re-running it once the
# column is already gone is a no-op, whereas `del df['smoker']` raises KeyError.
df = df.drop(columns=['smoker'], errors='ignore')

In [6]:
# Preview the first five rows to confirm the 'smoker' column is gone
df.head(n=5)

Unnamed: 0,total_bill,tip,sex,day,time,size
0,16.99,1.01,Female,Sun,Dinner,2
1,10.34,1.66,Male,Sun,Dinner,3
2,21.01,3.5,Male,Sun,Dinner,3
3,23.68,3.31,Male,Sun,Dinner,2
4,24.59,3.61,Female,Sun,Dinner,4


In [7]:
# One-hot encode the categorical columns so the model only sees numeric/boolean inputs.
# The order of this list fixes the order of the generated indicator columns.
categorical_columns = ['time', 'day', 'sex']
df = pd.get_dummies(data=df, columns=categorical_columns)

In [8]:
# Preview the first five rows after one-hot encoding
df.head(n=5)

Unnamed: 0,total_bill,tip,size,time_Dinner,time_Lunch,day_Fri,day_Sat,day_Sun,day_Thur,sex_Female,sex_Male
0,16.99,1.01,2,True,False,False,False,True,False,True,False
1,10.34,1.66,3,True,False,False,False,True,False,False,True
2,21.01,3.5,3,True,False,False,False,True,False,False,True
3,23.68,3.31,2,True,False,False,False,True,False,False,True
4,24.59,3.61,4,True,False,False,False,True,False,True,False


In [9]:
# Separate the target (tip) from the predictors (every remaining column)
y = df['tip']
X = df.drop(columns=['tip'])

In [10]:
# Split data into train/test sets (70/30 split) with shuffling.
# random_state pins the shuffle so the split -- and every metric computed
# from it -- is identical on each Restart & Run All.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, shuffle=True, random_state=42
)

In [11]:
# Assign algorithm: a decision tree regressor with default (unconstrained) settings.
# random_state makes the choice between equally good candidate splits
# deterministic, so the fitted tree is reproducible across runs.
model = DecisionTreeRegressor(random_state=42)

In [12]:
# Train the model on the training split only; the test split stays unseen
model.fit(X=X_train, y=y_train)

In [13]:
# Mean absolute error on the data the model was fitted on
train_predictions = model.predict(X_train)
mae_train = mean_absolute_error(y_train, train_predictions)

# Mean absolute error on the held-out test data
test_predictions = model.predict(X_test)
mae_test = mean_absolute_error(y_test, test_predictions)

# Report both errors side by side so they can be compared directly
print("Training Set Mean Absolute Error: %.2f" % mae_train)
print("Test Set Mean Absolute Error: %.2f" % mae_test)


Training Set Mean Absolute Error: 0.00
Test Set Mean Absolute Error: 0.98


In [14]:
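# The model is overfitting the training data: the training MAE is essentially
# zero while the test MAE is far larger, i.e. the tree has memorized the
# training set instead of generalizing. Constraining the tree (for example with
# max_depth or min_samples_leaf) is the usual remedy.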

In [15]:
# Data point to predict, keyed by feature name so that each value is tied to
# its column explicitly rather than by position in a list.
jamie_features = {
    'total_bill': 40,
    'size': 2,
    'time_Dinner': 1,
    'time_Lunch': 0,
    'day_Fri': 1,
    'day_Sat': 0,
    'day_Sun': 0,
    'day_Thur': 0,
    'sex_Female': 1,
    'sex_Male': 0,
}

# Build a one-row frame and align it to the training columns. Selecting by
# X_train.columns raises KeyError if a feature is missing or misspelled, and
# passing a DataFrame (not a bare list) avoids scikit-learn's
# "X does not have valid feature names" warning.
jamie_row = pd.DataFrame([jamie_features])[list(X_train.columns)]

# Make prediction; `jamie` holds the predicted tip as a one-element array
jamie = model.predict(jamie_row)
jamie



array([4.67])