In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import PolynomialFeatures
from sklearn import metrics

%matplotlib inline

##### Importing 50_Startups dataset using pandas

In [2]:
startup_data = pd.read_csv('50_Startups.csv')
startup_data.head()

Unnamed: 0,R&D Spend,Administration,Marketing Spend,State,Profit
0,165349.2,136897.8,471784.1,New York,192261.83
1,162597.7,151377.59,443898.53,California,191792.06
2,153441.51,101145.55,407934.54,Florida,191050.39
3,144372.41,118671.85,383199.62,New York,182901.99
4,142107.34,91391.77,366168.42,Florida,166187.94


In [3]:
startup_data.shape

(50, 5)

##### Data Cleaning Process

##### Finding NaN value in the dataset

In [4]:
startup_data.isnull().sum()

R&D Spend          0
Administration     0
Marketing Spend    0
State              0
Profit             0
dtype: int64

In [5]:
startup_data['State'].unique()

array(['New York', 'California', 'Florida'], dtype=object)

In [6]:
startup_data['State'] = startup_data['State'].replace({'New York':1, 'California':2, 'Florida': 3 })

In [7]:
startup_data.head()

Unnamed: 0,R&D Spend,Administration,Marketing Spend,State,Profit
0,165349.2,136897.8,471784.1,1,192261.83
1,162597.7,151377.59,443898.53,2,191792.06
2,153441.51,101145.55,407934.54,3,191050.39
3,144372.41,118671.85,383199.62,1,182901.99
4,142107.34,91391.77,366168.42,3,166187.94


##### Using LabelEncoder() for converting categorical values into numeric

In [8]:
le = LabelEncoder()

In [9]:
startup_data['R&D Spend'] = le.fit_transform(startup_data['R&D Spend'])
startup_data['Administration'] = le.fit_transform(startup_data['Administration'])
startup_data['Marketing Spend'] = le.fit_transform(startup_data['Marketing Spend'])
startup_data['Profit'] = le.fit_transform(startup_data['Profit'])

In [10]:
startup_data.head()

Unnamed: 0,R&D Spend,Administration,Marketing Spend,State,Profit
0,48,34,47,1,49
1,47,41,46,2,48
2,46,11,45,3,47
3,45,22,44,1,46
4,44,6,43,3,45


##### Dividing the dataset into independent and dependent columns
##### Independent dataset represented by 'X' and dependent represented by 'y'

In [11]:
y = np.array(startup_data['Profit'])
y = y.reshape(-1,1)

In [12]:
X = startup_data.iloc[:, startup_data.columns != 'Profit']
X = X.drop('State', axis = 'columns')
X.head()

Unnamed: 0,R&D Spend,Administration,Marketing Spend
0,48,34,47
1,47,41,46
2,46,11,45
3,45,22,44
4,44,6,43


##### Splitting of the given dataset using train_test_split method
##### Training size = 80 % and Testing size = 20 %

In [13]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2)

In [14]:
len(X_train)

40

In [15]:
len(X_test)

10

In [16]:
X_train.shape

(40, 3)

In [17]:
y_train.shape

(40, 1)

In [18]:
X_test.shape

(10, 3)

##### Standardization of X_train and y_train

In [19]:
scaler = StandardScaler()

In [20]:
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

##### Using linear regression algorithm

In [29]:
linearReg = LinearRegression()

In [30]:
linearReg.fit(X_train, y_train)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)

In [32]:
linearReg.coef_

array([[13.47568818, -0.24841698,  0.85660043]])

In [33]:
linearReg.intercept_

array([22.625])

##### Predicting the Test set result

In [27]:
y_pred = linearReg.predict(X_test)

##### Calculation of r2 score

In [28]:
print('r2 score:',metrics.r2_score(y_test,y_pred))

r2 score: 0.9641028903894596
