# DECISION TREE REGRESSION

In [50]:
from warnings import filterwarnings
filterwarnings('ignore')

In [51]:
import pandas as pd
df = pd.read_csv('50_Startups.csv')

In [52]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50 entries, 0 to 49
Data columns (total 5 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   RND     50 non-null     float64
 1   ADMIN   50 non-null     float64
 2   MKT     50 non-null     float64
 3   STATE   50 non-null     object 
 4   PROFIT  50 non-null     float64
dtypes: float64(4), object(1)
memory usage: 2.1+ KB


In [53]:
df.head()

Unnamed: 0,RND,ADMIN,MKT,STATE,PROFIT
0,165349.2,136897.8,471784.1,New York,192261.83
1,162597.7,151377.59,443898.53,California,191792.06
2,153441.51,101145.55,407934.54,Florida,191050.39
3,144372.41,118671.85,383199.62,New York,182901.99
4,142107.34,91391.77,366168.42,Florida,166187.94


In [54]:
df.isna().sum()

RND       0
ADMIN     0
MKT       0
STATE     0
PROFIT    0
dtype: int64

# Seprate X and Y 

In [55]:
X = df.drop(columns=['PROFIT'])
X.head()

Unnamed: 0,RND,ADMIN,MKT,STATE
0,165349.2,136897.8,471784.1,New York
1,162597.7,151377.59,443898.53,California
2,153441.51,101145.55,407934.54,Florida
3,144372.41,118671.85,383199.62,New York
4,142107.34,91391.77,366168.42,Florida


In [56]:
Y = df[['PROFIT']]
Y.head()

Unnamed: 0,PROFIT
0,192261.83
1,191792.06
2,191050.39
3,182901.99
4,166187.94


# Seprate CAT AND CON

In [57]:
cat = list(X.columns[X.dtypes=='object'])
cat

['STATE']

In [58]:
con = list(X.columns[X.dtypes!='object'])
con

['RND', 'ADMIN', 'MKT']

# PREPROCESSING PIPELINE 

In [59]:
from sklearn.preprocessing import StandardScaler , OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer

In [60]:
num_pipe = Pipeline(steps=[('impute',SimpleImputer(strategy='median')),
                           ('scaler',StandardScaler())])

In [61]:
num_pipe

In [62]:
cat_pipe = Pipeline(steps=[('impute',SimpleImputer(strategy='most_frequent')),
                           ('OHE',OneHotEncoder(handle_unknown='ignore'))])
cat_pipe

In [63]:
pre = ColumnTransformer([('num',num_pipe,con),
                         ('cat',cat_pipe,cat)])
pre

In [64]:
X_pre = pre.fit_transform(X)
X_pre[0:5]

array([[ 2.01641149,  0.56075291,  2.15394309,  0.        ,  0.        ,
         1.        ],
       [ 1.95586034,  1.08280658,  1.9236004 ,  1.        ,  0.        ,
         0.        ],
       [ 1.75436374, -0.72825703,  1.62652767,  0.        ,  1.        ,
         0.        ],
       [ 1.55478369, -0.09636463,  1.42221024,  0.        ,  0.        ,
         1.        ],
       [ 1.5049372 , -1.07991935,  1.28152771,  0.        ,  1.        ,
         0.        ]])

In [65]:
cols = pre.get_feature_names_out()
cols

array(['num__RND', 'num__ADMIN', 'num__MKT', 'cat__STATE_California',
       'cat__STATE_Florida', 'cat__STATE_New York'], dtype=object)

In [66]:
X_pre = pd.DataFrame(X_pre,columns=cols)
X_pre.head()

Unnamed: 0,num__RND,num__ADMIN,num__MKT,cat__STATE_California,cat__STATE_Florida,cat__STATE_New York
0,2.016411,0.560753,2.153943,0.0,0.0,1.0
1,1.95586,1.082807,1.9236,1.0,0.0,0.0
2,1.754364,-0.728257,1.626528,0.0,1.0,0.0
3,1.554784,-0.096365,1.42221,0.0,0.0,1.0
4,1.504937,-1.079919,1.281528,0.0,1.0,0.0


# Train Test Split


In [67]:
from sklearn.model_selection import train_test_split
xtrain , xtest, ytrain, ytest = train_test_split(X_pre, Y, test_size=0.33)

In [68]:
xtrain.shape

(33, 6)

In [69]:
xtest.shape

(17, 6)

# CREATE A DECISON TREE MODEL

In [70]:
from sklearn.tree import DecisionTreeRegressor
model = DecisionTreeRegressor(max_depth=1, min_samples_split=5, min_samples_leaf=6, criterion='squared_error')

In [71]:
model.fit(xtrain,ytrain)

In [72]:
model.score(xtrain,ytrain)

0.6754173351752588

In [73]:
model.score(xtest,ytest)

0.5581204121083325

#HYPERPARAMTER TUNNING

In [74]:
params = {'max_depth':[1,2,3,4,5,6,7,8],
          'min_samples_leaf':[3,4,5,6,7,8],
          'min_samples_split':[5,6,7,8,9,10],
          'criterion':['squared_error','absolute_error']}

In [75]:
from sklearn.model_selection import GridSearchCV
dtr = DecisionTreeRegressor()

In [77]:
gscv = GridSearchCV(dtr, param_grid=params, cv=5, scoring='neg_mean_squared_error')
gscv.fit(xtrain, ytrain)

In [78]:
gscv.best_params_

{'criterion': 'squared_error',
 'max_depth': 3,
 'min_samples_leaf': 3,
 'min_samples_split': 5}

In [79]:
gscv.best_score_

-230424299.0464708

In [81]:
best_dtr = gscv.best_estimator_
best_dtr

# EVALUATE THE MODEL

In [82]:
best_dtr.score(xtrain,ytrain)

0.9496322819067823

In [84]:
best_dtr.score(xtest,ytest)

0.8195230971303326

# Predict the model results

In [86]:
ypred_ts = model.predict(xtest)
ypred_ts

array([ 86428.8255    , 148505.23538462, 148505.23538462, 148505.23538462,
        86428.8255    ,  86428.8255    ,  86428.8255    ,  86428.8255    ,
        86428.8255    ,  86428.8255    , 148505.23538462, 148505.23538462,
        86428.8255    ,  86428.8255    ,  86428.8255    ,  86428.8255    ,
       148505.23538462])

In [87]:
ypred_tr = model.predict(xtrain)
ypred_tr

array([148505.23538462,  86428.8255    ,  86428.8255    ,  86428.8255    ,
       148505.23538462, 148505.23538462, 148505.23538462,  86428.8255    ,
       148505.23538462,  86428.8255    , 148505.23538462,  86428.8255    ,
       148505.23538462,  86428.8255    ,  86428.8255    ,  86428.8255    ,
        86428.8255    ,  86428.8255    , 148505.23538462, 148505.23538462,
        86428.8255    , 148505.23538462,  86428.8255    ,  86428.8255    ,
       148505.23538462, 148505.23538462,  86428.8255    ,  86428.8255    ,
        86428.8255    ,  86428.8255    , 148505.23538462,  86428.8255    ,
        86428.8255    ])

# Check model MSE MAE RMSE R2