In [4]:
import numpy as np 
import pandas as pd
import seaborn as sns
from sklearn.model_selection import train_test_split


In [11]:
# Ask seaborn for the names of the example datasets it can load.
sns.get_dataset_names()

['anagrams',
 'anscombe',
 'attention',
 'brain_networks',
 'car_crashes',
 'diamonds',
 'dots',
 'exercise',
 'flights',
 'fmri',
 'gammas',
 'geyser',
 'iris',
 'mpg',
 'penguins',
 'planets',
 'taxis',
 'tips',
 'titanic']

In [10]:
# Fetch seaborn's "tips" example dataset and preview its first five rows.
tips_dataframe = sns.load_dataset(name="tips")
tips_dataframe.head(n=5)

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
0,16.99,1.01,Female,No,Sun,Dinner,2
1,10.34,1.66,Male,No,Sun,Dinner,3
2,21.01,3.5,Male,No,Sun,Dinner,3
3,23.68,3.31,Male,No,Sun,Dinner,2
4,24.59,3.61,Female,No,Sun,Dinner,4


#### We will use machine learning algorithms to predict the tip for each record

#### We divide the data into features and target,
#### that is, the remaining columns versus the tip column

In [11]:
# Features: every column except the tip. Target: the tip column, kept as a
# one-column DataFrame rather than a Series.
x = tips_dataframe.drop(columns=["tip"])
y = tips_dataframe.loc[:, ["tip"]]

In [16]:
# Preview the first five feature rows to confirm the tip column is gone.
x.head(n=5)


Unnamed: 0,total_bill,sex,smoker,day,time,size
0,16.99,Female,No,Sun,Dinner,2
1,10.34,Male,No,Sun,Dinner,3
2,21.01,Male,No,Sun,Dinner,3
3,23.68,Male,No,Sun,Dinner,2
4,24.59,Female,No,Sun,Dinner,4


In [34]:
# Preview the first five target values.
y.head(n=5)

Unnamed: 0,tip
0,1.01
1,1.66
2,3.5
3,3.31
4,3.61


In [12]:
## Machine learning algorithms work better with numbers, so the features are split
## into a numeric part (used as-is) and a categorical part (to be converted to numbers).
numeric_columns = ["total_bill", "size"]

x_copy_all = x.copy(deep=True)
x_copy_numerical = x_copy_all[numeric_columns]
x_copy_categorical = x_copy_all.drop(columns=numeric_columns)

x_copy_numerical

Unnamed: 0,total_bill,size
0,16.99,2
1,10.34,3
2,21.01,3
3,23.68,2
4,24.59,4
...,...,...
239,29.03,3
240,27.18,2
241,22.67,2
242,17.82,2


In [13]:
# Display the categorical feature columns (everything except total_bill and size).
x_copy_categorical

Unnamed: 0,sex,smoker,day,time
0,Female,No,Sun,Dinner
1,Male,No,Sun,Dinner
2,Male,No,Sun,Dinner
3,Male,No,Sun,Dinner
4,Female,No,Sun,Dinner
...,...,...,...,...
239,Male,No,Sat,Dinner
240,Female,Yes,Sat,Dinner
241,Male,Yes,Sat,Dinner
242,Male,No,Sat,Dinner


In [14]:
# Convert the categorical variables into dummy/indicator columns; drop_first=True
# leaves out the first level of each variable.
cat_numerical = pd.get_dummies(data=x_copy_categorical, drop_first=True)
cat_numerical.head(n=5)

Unnamed: 0,sex_Female,smoker_No,day_Fri,day_Sat,day_Sun,time_Dinner
0,1,1,0,0,1,1
1,0,1,0,0,1,1
2,0,1,0,0,1,1
3,0,1,0,0,1,1
4,1,1,0,0,1,1


In [47]:
# Join the numeric columns and the indicator columns side by side into the
# final feature table.
x_ready = pd.concat(objs=[x_copy_numerical, cat_numerical], axis="columns")
x_ready.head(n=5)

Unnamed: 0,total_bill,size,sex_Female,smoker_No,day_Fri,day_Sat,day_Sun,time_Dinner
0,16.99,2,1,1,0,0,1,1
1,10.34,3,0,1,0,0,1,1
2,21.01,3,0,1,0,0,1,1
3,23.68,2,0,1,0,0,1,1
4,24.59,4,1,1,0,0,1,1


#### We now divide the data into training and test sets
*The model is trained on the training set and evaluated
on the test set.*

In [20]:
# Display the target frame (the tip column) before it is split.
y

Unnamed: 0,tip
0,1.01
1,1.66
2,3.50
3,3.31
4,3.61
...,...
239,5.92
240,2.00
241,2.00
242,1.75


In [43]:
# Hold out 20% of the rows for testing; the fixed random_state makes the
# split repeatable between runs.
x_ready_train, x_ready_test, y_train, y_test = train_test_split(
    x_ready,
    y,
    test_size=0.2,
    random_state=0,
)

In [46]:
# Preview the first five rows of the full feature table that was just split.
x_ready.head(n=5)

Unnamed: 0,total_bill,size,sex_Female,smoker_No,day_Fri,day_Sat,day_Sun,time_Dinner
0,16.99,2,1,1,0,0,1,1
1,10.34,3,0,1,0,0,1,1
2,21.01,3,0,1,0,0,1,1
3,23.68,2,0,1,0,0,1,1
4,24.59,4,1,1,0,0,1,1


In [48]:
from sklearn.preprocessing import StandardScaler

# Fit the scaler on the training features only, then apply that same fitted
# scaler to both the training set and the test set.
sc = StandardScaler()
sc.fit(x_ready_train)
X_train = sc.transform(x_ready_train)
X_test = sc.transform(x_ready_test)

In [49]:
from sklearn.linear_model import LinearRegression

In [50]:
# Train a linear regression model on the scaled training features.
reg = LinearRegression()
reg.fit(X_train, y_train)

# fit() returns the estimator itself, so `regressor` is the same fitted object as `reg`.
regressor = reg

# Display the scaled training features the model was fitted on.
X_train

array([[ 0.73240914,  1.48790007, -0.69084928, ..., -0.74001287,
         1.48213404,  0.61885275],
       [ 1.36827467, -0.62221275, -0.69084928, ..., -0.74001287,
        -0.67470281, -1.61589329],
       [-0.25318243, -0.62221275, -0.69084928, ..., -0.74001287,
         1.48213404,  0.61885275],
       ...,
       [-1.04691802, -0.62221275,  1.44749373, ..., -0.74001287,
        -0.67470281, -1.61589329],
       [ 1.33757772,  1.48790007, -0.69084928, ..., -0.74001287,
         1.48213404,  0.61885275],
       [-1.41966678, -0.62221275, -0.69084928, ..., -0.74001287,
         1.48213404,  0.61885275]])

In [51]:
# Predict tips for the scaled test features.
y_predicted = regressor.predict(X=X_test)


In [52]:
from sklearn import metrics

# Compare the test-set predictions with the true tips. The MSE is computed once
# and reused for the RMSE instead of being recalculated.
mae = metrics.mean_absolute_error(y_test, y_predicted)
mse = metrics.mean_squared_error(y_test, y_predicted)
rmse = np.sqrt(mse)

print('Mean Absolute Error: ', mae)
# The label previously carried a stray comma inside the string ("Error:,").
print('Mean Squared Error:', mse)
print('Root Mean Squared Error:', rmse)

Mean Absolute Error:  0.708021883297983
Mean Squared Error:, 0.8939195221609613
Root Mean Squared Error: 0.9454731736865734
