In [27]:
import warnings
warnings.filterwarnings("ignore")

# Importing the libraries
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

In [28]:
# Importing the dataset
dataset = pd.read_csv('50_Startups.csv')
print(dataset.columns)
print(dataset.info())

X = dataset.drop('Profit', axis=1)
y = dataset['Profit']

Index(['R&D Spend', 'Administration', 'Marketing Spend', 'State', 'Profit'], dtype='object')
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50 entries, 0 to 49
Data columns (total 5 columns):
R&D Spend          50 non-null float64
Administration     50 non-null float64
Marketing Spend    50 non-null float64
State              50 non-null object
Profit             50 non-null float64
dtypes: float64(4), object(1)
memory usage: 2.0+ KB
None


In [29]:
# Creating dummy varialbe
X_new = pd.get_dummies(X,columns=['State'])
# Avoiding the Dummy Variable Trap
X_new = X_new.drop('State_New York', axis=1)

In [30]:
print(type(X_new))
print(X_new.columns)
#X['State_California'] = X['State_California']
#X['State_Florida'] = X['State_Florida']
print(X_new.info())

<class 'pandas.core.frame.DataFrame'>
Index(['R&D Spend', 'Administration', 'Marketing Spend', 'State_California',
       'State_Florida'],
      dtype='object')
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50 entries, 0 to 49
Data columns (total 5 columns):
R&D Spend           50 non-null float64
Administration      50 non-null float64
Marketing Spend     50 non-null float64
State_California    50 non-null uint8
State_Florida       50 non-null uint8
dtypes: float64(3), uint8(2)
memory usage: 1.3 KB
None


In [31]:
# Splitting the dataset into the Training set and Test set
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X_new, y, test_size = 0.2, random_state = 17)

## Lasso

In [32]:
from sklearn import linear_model
lasso = linear_model.Lasso(alpha=0.1,max_iter=1e6)
lasso.fit(X_train,y_train)
print('Lasso Regression R squared": %.4f' % lasso.score(X_test, y_test))

Lasso Regression R squared": 0.9285


In [33]:
print(lasso.coef_)

[ 8.02499283e-01 -5.56268980e-02  2.35560627e-02 -1.97248086e+03
 -2.55783116e+02]


In [34]:
print(lasso.intercept_)

55595.61447930104


In [51]:
print(dict(zip(X.columns, lasso.coef_)))

{'R&D Spend': 0.802499282538742, 'Administration': -0.055626897982054654, 'Marketing Spend': 0.0235560626583549, 'State': -1972.4808584861253}


In [35]:
y_pred = lasso.predict(X_test)
# Calculate root-mean-square error (RMSE)
import numpy as np
from sklearn.metrics import mean_squared_error
lin_mse = mean_squared_error(y_pred, y_test)
lin_rmse = np.sqrt(lin_mse)
print('Linear Regression RMSE: %.4f' % lin_rmse)

Linear Regression RMSE: 7373.0712


## Decision Tree

In [41]:
from sklearn.tree import DecisionTreeRegressor
regressor = DecisionTreeRegressor(random_state = 0)
regressor.fit(X_train,y_train)
print('Decision Tree Regression R squared": %.4f' % regressor.score(X_test, y_test))

Decision Tree Regression R squared": 0.7716


In [42]:
# Calculate root-mean-square error (RMSE)
import numpy as np
from sklearn.metrics import mean_squared_error
y_pred = regressor.predict(X_test)
lin_mse = mean_squared_error(y_pred, y_test)
lin_rmse = np.sqrt(lin_mse)
print('Linear Regression RMSE: %.4f' % lin_rmse)

Linear Regression RMSE: 13178.3159


In [44]:
importances = regressor.feature_importances_
print(importances)

[0.96161758 0.00599153 0.00917031 0.01999448 0.00322611]


In [50]:
print(dict(zip(X.columns, regressor.feature_importances_)))

{'R&D Spend': 0.961617575654419, 'Administration': 0.005991532740242394, 'Marketing Spend': 0.009170307140578452, 'State': 0.019994476425611495}


## Random Forest

In [45]:
from sklearn.ensemble import RandomForestRegressor
forest_reg = RandomForestRegressor(n_estimators = 1000,random_state = 0)
forest_reg.fit(X_train, y_train)
print('Random Forest R squared": %.4f' % forest_reg.score(X_test, y_test))

Random Forest R squared": 0.8933


In [46]:
import numpy as np
from sklearn.metrics import mean_squared_error
y_pred = forest_reg.predict(X_test)
forest_mse = mean_squared_error(y_pred, y_test)
forest_rmse = np.sqrt(forest_mse)
print('Random Forest RMSE: %.4f' % forest_rmse)

Random Forest RMSE: 9004.8089


In [47]:
importances = forest_reg.feature_importances_
print(importances)

[0.91444553 0.00771274 0.07157854 0.00484884 0.00141434]


In [49]:
print(dict(zip(X.columns, forest_reg.feature_importances_)))

{'R&D Spend': 0.914445532894459, 'Administration': 0.007712743551249497, 'Marketing Spend': 0.07157854075078544, 'State': 0.0048488434799799154}
