In [20]:
import numpy as np
import pandas as pd

In [21]:
# Load the startup dataset from the CSV file sitting next to the notebook.
startupData = pd.read_csv(filepath_or_buffer='50_Startups.csv')

In [15]:
# Summarise the frame: number of rows, per-column non-null counts and dtypes.
startupData.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50 entries, 0 to 49
Data columns (total 5 columns):
R&D Spend          50 non-null float64
Administration     50 non-null float64
Marketing Spend    50 non-null float64
State              50 non-null object
Profit             50 non-null float64
dtypes: float64(4), object(1)
memory usage: 2.0+ KB


In [8]:
# Perform linear regression.
# First check that the features have a linear relationship with the label.
# Two ways to check this:
#   1. corr()     -- statistical EDA (understanding the data)
#   2. pairplot() -- visual EDA

# Feature elimination using correlation analysis (feature engineering).
# The frame still contains the string-valued 'State' column, which has no
# Pearson correlation. Older pandas dropped it silently, but pandas >= 2.0
# raises instead, so restrict the frame to numeric columns explicitly.
# A list is passed to `include` because very old pandas rejects a bare string.
startupData.select_dtypes(include=['number']).corr()

Unnamed: 0,R&D Spend,Administration,Marketing Spend,Profit
R&D Spend,1.0,0.241955,0.724248,0.9729
Administration,0.241955,1.0,-0.032154,0.200717
Marketing Spend,0.724248,-0.032154,1.0,0.747766
Profit,0.9729,0.200717,0.747766,1.0


In [22]:
# Separate the data into features (model inputs) and label (target).
# scikit-learn regressors expect both to be two-dimensional, so the columns
# are selected with lists of positions, which keeps a column axis even when
# only one column is picked.
FEATURE_COLS = [0, 1, 2, 3]   # R&D Spend, Administration, Marketing Spend, State
LABEL_COLS = [4]              # Profit
features = startupData.iloc[:, FEATURE_COLS].values
label = startupData.iloc[:, LABEL_COLS].values

In [23]:
# Handle categorical data with scikit-learn:
# turn the string values in feature column 3 into integer codes.
from sklearn.preprocessing import LabelEncoder, OneHotEncoder

countryLabel = LabelEncoder()
# Learn the distinct categories first, then overwrite column 3 in place
# with the integer code of each row's category.
countryLabel.fit(features[:, 3])
features[:, 3] = countryLabel.transform(features[:, 3])

In [24]:
# One-hot encode the integer-coded categorical column (index 3).
#
# `OneHotEncoder(categorical_features=...)` was deprecated in scikit-learn 0.20
# and removed in 0.22, so on current versions the original call raises a
# TypeError. ColumnTransformer is the replacement: the encoded columns come
# first and the untouched (passthrough) columns follow, which is the same
# column order the legacy argument produced.
#
# NOTE: `features` is overwritten here, so re-running this cell on its own
# would encode column 3 of the already-expanded array. Re-run the cells that
# build `features` first.
try:
    from sklearn.compose import ColumnTransformer
except ImportError:
    # scikit-learn < 0.20 has no ColumnTransformer; use the legacy argument.
    countryOHE = OneHotEncoder(categorical_features=[3])
else:
    countryOHE = ColumnTransformer(
        [('state', OneHotEncoder(), [3])],
        remainder='passthrough',
    )

encoded = countryOHE.fit_transform(features)
# Depending on version and density the result is sparse or dense; densify if needed.
if hasattr(encoded, 'toarray'):
    encoded = encoded.toarray()
# The passthrough columns come from an object array, so force a float matrix.
features = np.asarray(encoded, dtype=float)
features

array([[0.0000000e+00, 0.0000000e+00, 1.0000000e+00, 1.6534920e+05,
        1.3689780e+05, 4.7178410e+05],
       [1.0000000e+00, 0.0000000e+00, 0.0000000e+00, 1.6259770e+05,
        1.5137759e+05, 4.4389853e+05],
       [0.0000000e+00, 1.0000000e+00, 0.0000000e+00, 1.5344151e+05,
        1.0114555e+05, 4.0793454e+05],
       [0.0000000e+00, 0.0000000e+00, 1.0000000e+00, 1.4437241e+05,
        1.1867185e+05, 3.8319962e+05],
       [0.0000000e+00, 1.0000000e+00, 0.0000000e+00, 1.4210734e+05,
        9.1391770e+04, 3.6616842e+05],
       [0.0000000e+00, 0.0000000e+00, 1.0000000e+00, 1.3187690e+05,
        9.9814710e+04, 3.6286136e+05],
       [1.0000000e+00, 0.0000000e+00, 0.0000000e+00, 1.3461546e+05,
        1.4719887e+05, 1.2771682e+05],
       [0.0000000e+00, 1.0000000e+00, 0.0000000e+00, 1.3029813e+05,
        1.4553006e+05, 3.2387668e+05],
       [0.0000000e+00, 0.0000000e+00, 1.0000000e+00, 1.2054252e+05,
        1.4871895e+05, 3.1161329e+05],
       [1.0000000e+00, 0.0000000e+00,

In [26]:
# Correlation matrix of the encoded feature columns with the label appended
# as the last column.
encoded_frame = pd.DataFrame(features)
label_frame = pd.DataFrame(label)
pd.concat([encoded_frame, label_frame], axis=1).corr()

Unnamed: 0,0,1,2,3,4,5,0.1
0,1.0,-0.492366,-0.515152,-0.143165,-0.015478,-0.168875,-0.145837
1,-0.492366,1.0,-0.492366,0.105711,0.010493,0.205685,0.116244
2,-0.515152,-0.492366,1.0,0.039068,0.005145,-0.03367,0.031368
3,-0.143165,0.105711,0.039068,1.0,0.241955,0.724248,0.9729
4,-0.015478,0.010493,0.005145,0.241955,1.0,-0.032154,0.200717
5,-0.168875,0.205685,-0.03367,0.724248,-0.032154,1.0,0.747766
0,-0.145837,0.116244,0.031368,0.9729,0.200717,0.747766,1.0


In [17]:
# Create the train/test split using the 80-20 rule:
# 20% of the rows are held out for testing, the rest are used for training.
from sklearn.model_selection import train_test_split

split_parts = train_test_split(features, label, test_size=0.2, random_state=10)
X_train, X_test, y_train, y_test = split_parts

In [18]:
# Create and train the model.
from sklearn.linear_model import LinearRegression

model = LinearRegression()
# Fitting estimates the regression equation from the training rows.
model.fit(X=X_train, y=y_train)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)

In [19]:
# Let's check the best score possible.
# Scan random seeds 1..50: for each seed, re-split the data 80/20, fit a fresh
# LinearRegression and report the seeds whose test R^2 exceeds the train R^2.
#
# The scan uses loop-local names so it does not overwrite the notebook-level
# X_train / X_test / y_train / y_test / model. Rebinding them here would leave
# the split and model of the last seed (50) in place after the cell finishes.
#
# NOTE(review): choosing random_state by looking at the test score uses the
# test set for selection, so the reported score is optimistic. Consider
# cross-validation (e.g. sklearn.model_selection.cross_val_score) instead.
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression

for seed in range(1, 51):
    X_tr, X_te, y_tr, y_te = train_test_split(features,
                                              label,
                                              test_size=0.2,
                                              random_state=seed)
    seed_model = LinearRegression()
    seed_model.fit(X_tr, y_tr)

    train_score = seed_model.score(X_tr, y_tr)
    test_score = seed_model.score(X_te, y_te)

    if test_score > train_score:
        print("Testing: {} Training: {} Seed: {}".format(test_score, train_score, seed))

Testing: 0.9646437135748334 Training: 0.9423713608840596 Seed: 1
Testing: 0.9814177491535382 Training: 0.9388772018080951 Seed: 2
Testing: 0.9612876028942208 Training: 0.9459212470646745 Seed: 3
Testing: 0.9674854200887459 Training: 0.9455331844858769 Seed: 4
Testing: 0.9683604384024199 Training: 0.9436198878593198 Seed: 5
Testing: 0.9909864896179557 Training: 0.9382176532996814 Seed: 10
Testing: 0.9565036617363222 Training: 0.9472766838360558 Seed: 12
Testing: 0.9499139926727362 Training: 0.9485793735881266 Seed: 13
Testing: 0.97371375097723 Training: 0.9410506991241665 Seed: 14
Testing: 0.947548050595194 Training: 0.9471972623797911 Seed: 17
Testing: 0.9658516680116018 Training: 0.9457936305980141 Seed: 21
Testing: 0.9764404302143169 Training: 0.9421707561468369 Seed: 22
Testing: 0.9692234650571673 Training: 0.9461648988838399 Seed: 24
Testing: 0.9613574909391511 Training: 0.9452633616069059 Seed: 26
Testing: 0.9574967178299124 Training: 0.9462638680340698 Seed: 29
Testing: 0.9564046