# 50 Start-Ups: Exploratory Data Analysis

#### Importing Required Libraries

In [1]:
# Data Analysis and Wrangling:
import pandas as pd
import numpy as np

# Visualization:
import matplotlib.pyplot as plt
%matplotlib inline

# Machine learning:
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor

#### Acquiring Data:

In [2]:
data = pd.read_csv('50_Startups.csv')

#### Previewing the Data:

In [3]:
data.head()

Unnamed: 0,R&D Spend,Administration,Marketing Spend,State,Profit
0,165349.2,136897.8,471784.1,New York,192261.83
1,162597.7,151377.59,443898.53,California,191792.06
2,153441.51,101145.55,407934.54,Florida,191050.39
3,144372.41,118671.85,383199.62,New York,182901.99
4,142107.34,91391.77,366168.42,Florida,166187.94


In [4]:
print(data.columns.values)

['R&D Spend' 'Administration' 'Marketing Spend' 'State' 'Profit']


Out of these, all features are numerical, except for 'State'.

#### Describing the Data:

In [5]:
data.describe()

Unnamed: 0,R&D Spend,Administration,Marketing Spend,Profit
count,50.0,50.0,50.0,50.0
mean,73721.6156,121344.6396,211025.0978,112012.6392
std,45902.256482,28017.802755,122290.310726,40306.180338
min,0.0,51283.14,0.0,14681.4
25%,39936.37,103730.875,129300.1325,90138.9025
50%,73051.08,122699.795,212716.24,107978.19
75%,101602.8,144842.18,299469.085,139765.9775
max,165349.2,182645.56,471784.1,192261.83


Here, we found out the min, max, mean values of our numerical features.

#### Dividing the dataset into Features and Label:

In [6]:
features = data.iloc[:,:-1].values
label = data.iloc[:,[-1]].values

Thus, we have sucessfully divided the data into features and labels.

#### Handling the Categorical Data:

In [7]:
transformer = ColumnTransformer(transformers=[("OneHot",OneHotEncoder(),[3])],
remainder = 'passthrough')
features = transformer.fit_transform(features.tolist())
features

array([[0.0, 0.0, 1.0, 165349.2, 136897.8, 471784.1],
       [1.0, 0.0, 0.0, 162597.7, 151377.59, 443898.53],
       [0.0, 1.0, 0.0, 153441.51, 101145.55, 407934.54],
       [0.0, 0.0, 1.0, 144372.41, 118671.85, 383199.62],
       [0.0, 1.0, 0.0, 142107.34, 91391.77, 366168.42],
       [0.0, 0.0, 1.0, 131876.9, 99814.71, 362861.36],
       [1.0, 0.0, 0.0, 134615.46, 147198.87, 127716.82],
       [0.0, 1.0, 0.0, 130298.13, 145530.06, 323876.68],
       [0.0, 0.0, 1.0, 120542.52, 148718.95, 311613.29],
       [1.0, 0.0, 0.0, 123334.88, 108679.17, 304981.62],
       [0.0, 1.0, 0.0, 101913.08, 110594.11, 229160.95],
       [1.0, 0.0, 0.0, 100671.96, 91790.61, 249744.55],
       [0.0, 1.0, 0.0, 93863.75, 127320.38, 249839.44],
       [1.0, 0.0, 0.0, 91992.39, 135495.07, 252664.93],
       [0.0, 1.0, 0.0, 119943.24, 156547.42, 256512.92],
       [0.0, 0.0, 1.0, 114523.61, 122616.84, 261776.23],
       [1.0, 0.0, 0.0, 78013.11, 121597.55, 264346.06],
       [0.0, 0.0, 1.0, 94657.16, 145077.58

Thus, we have encoded our Categorical variable using OneHotEncoding.

#### Train - Test Split:

In [8]:
X_train,X_test,y_train,y_test = train_test_split(features, label, test_size = 0.2, random_state = 42)

Thus, we have sucessfully splitted the data into Train & Test sets.

###### Now, we will train our models using:
1) Linear Regression
2) Decision Tree Regression
3) Random Forest Regression

#### 1) Linear Regression:

In [9]:
LR = LinearRegression()
LR.fit(X_train,y_train)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)

Printing the scores:

In [10]:
print("The Linear Regression model has scored",LR.score(X_train,y_train)*100,"% on Train set.")
print("The Linear Regression model has scored",LR.score(X_test,y_test)*100,"% on Test set.")

The Linear Regression model has scored 95.37019995248525 % on Train set.
The Linear Regression model has scored 89.87266414318765 % on Test set.


#### 2) Decision Tree Regression:

In [11]:
DTR1 = DecisionTreeRegressor(max_depth=3)
DTR1.fit(X_train,y_train)

DecisionTreeRegressor(criterion='mse', max_depth=3, max_features=None,
                      max_leaf_nodes=None, min_impurity_decrease=0.0,
                      min_impurity_split=None, min_samples_leaf=1,
                      min_samples_split=2, min_weight_fraction_leaf=0.0,
                      presort=False, random_state=None, splitter='best')

Printing the scores:

In [12]:
print("The above Decision Tree Regression model has scored",DTR1.score(X_train,y_train)*100,"% on Train set.")
print("The above Decision Tree Regression model has scored",DTR1.score(X_test,y_test)*100,"% on Test set.")

The above Decision Tree Regression model has scored 96.86293734459093 % on Train set.
The above Decision Tree Regression model has scored 80.01150060564002 % on Test set.


Since, the accuracy is pretty low, we'll check for various values of max_depth.

In [13]:
DTR2 = DecisionTreeRegressor(max_depth=5)
DTR2.fit(X_train,y_train)

DecisionTreeRegressor(criterion='mse', max_depth=5, max_features=None,
                      max_leaf_nodes=None, min_impurity_decrease=0.0,
                      min_impurity_split=None, min_samples_leaf=1,
                      min_samples_split=2, min_weight_fraction_leaf=0.0,
                      presort=False, random_state=None, splitter='best')

In [14]:
print("The above Decision Tree Regression model has scored",DTR2.score(X_train,y_train)*100,"% on Train set.")
print("The above Decision Tree Regression model has scored",DTR2.score(X_test,y_test)*100,"% on Test set.")

The above Decision Tree Regression model has scored 99.95501896140128 % on Train set.
The above Decision Tree Regression model has scored 87.09169218937059 % on Test set.


In [15]:
DTR3 = DecisionTreeRegressor(max_depth=7)
DTR3.fit(X_train,y_train)

DecisionTreeRegressor(criterion='mse', max_depth=7, max_features=None,
                      max_leaf_nodes=None, min_impurity_decrease=0.0,
                      min_impurity_split=None, min_samples_leaf=1,
                      min_samples_split=2, min_weight_fraction_leaf=0.0,
                      presort=False, random_state=None, splitter='best')

In [16]:
print("The above Decision Tree Regression model has scored",DTR3.score(X_train,y_train)*100,"% on Train set.")
print("The above Decision Tree Regression model has scored",DTR3.score(X_test,y_test)*100,"% on Test set.")

The above Decision Tree Regression model has scored 99.99999682547107 % on Train set.
The above Decision Tree Regression model has scored 84.6206188911994 % on Test set.


In [17]:
DTR4 = DecisionTreeRegressor(max_depth=9)
DTR4.fit(X_train,y_train)

DecisionTreeRegressor(criterion='mse', max_depth=9, max_features=None,
                      max_leaf_nodes=None, min_impurity_decrease=0.0,
                      min_impurity_split=None, min_samples_leaf=1,
                      min_samples_split=2, min_weight_fraction_leaf=0.0,
                      presort=False, random_state=None, splitter='best')

In [18]:
print("The above Decision Tree Regression model has scored",DTR4.score(X_train,y_train)*100,"% on Train set.")
print("The above Decision Tree Regression model has scored",DTR4.score(X_test,y_test)*100,"% on Test set.")

The above Decision Tree Regression model has scored 100.0 % on Train set.
The above Decision Tree Regression model has scored 88.56647585965027 % on Test set.


Thus, we get the highest accuracy with DTR4 model, where max_depth is set as 9.

#### 3) Random Forest Regression:

In [19]:
RF1 = RandomForestRegressor(n_estimators = 3)
RF1.fit(X_train,y_train.ravel())

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
                      max_features='auto', max_leaf_nodes=None,
                      min_impurity_decrease=0.0, min_impurity_split=None,
                      min_samples_leaf=1, min_samples_split=2,
                      min_weight_fraction_leaf=0.0, n_estimators=3, n_jobs=None,
                      oob_score=False, random_state=None, verbose=0,
                      warm_start=False)

In [20]:
print("The above Random Forest Regression model has scored",RF1.score(X_train,y_train)*100,"% on Train set.")
print("The above Random Forest Regression model has scored",RF1.score(X_test,y_test)*100,"% on Test set.")

The above Random Forest Regression model has scored 98.16911655414955 % on Train set.
The above Random Forest Regression model has scored 68.98407388267258 % on Test set.


We can try experimenting with the value of n_estimators to check for the most accurate result.

In [21]:
for i in range(4,10):
    RF=RandomForestRegressor(n_estimators = i)
    RF.fit(X_train,y_train.ravel())
    print("n_estimator =",i)
    print("Training Score =",RF.score(X_train,y_train)*100,"%")
    print("Testing Score =",RF.score(X_test,y_test)*100,"%")
    print("\n")

n_estimator = 4
Training Score = 98.576835269917 %
Testing Score = 88.25626962338598 %


n_estimator = 5
Training Score = 98.4768680813041 %
Testing Score = 90.93102069211464 %


n_estimator = 6
Training Score = 98.20192810593517 %
Testing Score = 83.1659740535956 %


n_estimator = 7
Training Score = 97.79397892958497 %
Testing Score = 90.57646972861552 %


n_estimator = 8
Training Score = 98.97998482226248 %
Testing Score = 86.2269609630696 %


n_estimator = 9
Training Score = 98.72252201672461 %
Testing Score = 90.15192888149947 %




Thus, we found out the highest testing score when n_estimators were set to 5.

#### Conclusion:

In [22]:
print("Accuracy of Linear Regression model: 89.87%")
print("Accuracy of Decision Tree Regression model: 88.57%")
print("Accuracy of Random Forest Regression model: 90.93%")

Accuracy of Linear Regression model: 89.87%
Accuracy of Decision Tree Regression model: 88.57%
Accuracy of Random Forest Regression model: 90.93%


Thus, we can conclude that, Random Forest Regression model works the best here.

#### Thank You!