# Importing Libraries

In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

# Load Dataset

In [2]:
# Read the CSV file containing the startup data
dataset = pd.read_csv('/content/50_Startups.csv')
# Feature matrix: every column except the last (target) column
X = dataset.iloc[:, :-1].to_numpy()
# Target vector: the last column
y = dataset.iloc[:, -1].to_numpy()
# Display the shapes of the feature matrix and the target vector
(X.shape, y.shape)

((50, 4), (50,))

In [3]:
X[:10]  # Preview the first 10 rows of the feature matrix

array([[165349.2, 136897.8, 471784.1, 'New York'],
       [162597.7, 151377.59, 443898.53, 'California'],
       [153441.51, 101145.55, 407934.54, 'Florida'],
       [144372.41, 118671.85, 383199.62, 'New York'],
       [142107.34, 91391.77, 366168.42, 'Florida'],
       [131876.9, 99814.71, 362861.36, 'New York'],
       [134615.46, 147198.87, 127716.82, 'California'],
       [130298.13, 145530.06, 323876.68, 'Florida'],
       [120542.52, 148718.95, 311613.29, 'New York'],
       [123334.88, 108679.17, 304981.62, 'California']], dtype=object)

# Encoding Categorical data

In [4]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

# One-hot encode the categorical column at index 3 (zero-based, i.e. the
# fourth column, which holds the state names) and pass every other column
# through unchanged.
# sparse_threshold=0 forces a dense result: if the transformer returned a
# sparse matrix, np.array() would wrap it in a 0-d object array instead of
# producing a 2-D feature matrix.
ct = ColumnTransformer(
    transformers=[('encoder', OneHotEncoder(), [3])],
    remainder='passthrough',
    sparse_threshold=0,
)
# NOTE: this overwrites X, so the cell is not idempotent. Re-running it
# without first re-running the data-loading cell would apply the encoder to
# the already-encoded array.
X = np.array(ct.fit_transform(X))

In [5]:
X[:10]  # Preview after encoding: the state column is replaced by three one-hot columns (0 0 1, 1 0 0, etc.)

array([[0.0, 0.0, 1.0, 165349.2, 136897.8, 471784.1],
       [1.0, 0.0, 0.0, 162597.7, 151377.59, 443898.53],
       [0.0, 1.0, 0.0, 153441.51, 101145.55, 407934.54],
       [0.0, 0.0, 1.0, 144372.41, 118671.85, 383199.62],
       [0.0, 1.0, 0.0, 142107.34, 91391.77, 366168.42],
       [0.0, 0.0, 1.0, 131876.9, 99814.71, 362861.36],
       [1.0, 0.0, 0.0, 134615.46, 147198.87, 127716.82],
       [0.0, 1.0, 0.0, 130298.13, 145530.06, 323876.68],
       [0.0, 0.0, 1.0, 120542.52, 148718.95, 311613.29],
       [1.0, 0.0, 0.0, 123334.88, 108679.17, 304981.62]], dtype=object)

# Splitting the Dataset into Train/Test Sets

In [6]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the full dataset (test_size=0.2) as the test set and keep
# the remaining 80% for training; random_state=0 makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

In [7]:
X_train.shape, X_test.shape  # Shapes of the train and test feature matrices

((40, 6), (10, 6))

# Training Multiple Linear Regression model on the training set

In [8]:
# import the linear regression library from scikit-learn
# Bring in scikit-learn's linear regression estimator
from sklearn.linear_model import LinearRegression

# Instantiate the model with its default settings
regressor = LinearRegression()

# Fit the model on the training split; the fitted estimator returned by
# fit() is the cell's displayed output
regressor.fit(X=X_train, y=y_train)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)

# Predicting the Test Results

In [9]:
# Show floats with two decimal places whenever arrays are printed
np.set_printoptions(precision=2)

# Predict the target for the held-out test features
predictions = regressor.predict(X_test)

# Print predicted values (left column) next to the actual values (right column)
print(np.column_stack((predictions, y_test)))

[[103015.2  103282.38]
 [132582.28 144259.4 ]
 [132447.74 146121.95]
 [ 71976.1   77798.83]
 [178537.48 191050.39]
 [116161.24 105008.31]
 [ 67851.69  81229.06]
 [ 98791.73  97483.56]
 [113969.44 110352.25]
 [167921.07 166187.94]]
