# Step 1: Data Preprocessing

### Importing the libraries

In [1]:
import pandas as pd
import numpy as np

#### Importing the dataset

In [2]:
# Read the startups CSV into a DataFrame, then preview its first rows.
dataset = pd.read_csv("data_files/50_Startups.csv")
dataset.head()

Unnamed: 0,R&D Spend,Administration,Marketing Spend,State,Profit
0,165349.2,136897.8,471784.1,New York,192261.83
1,162597.7,151377.59,443898.53,California,191792.06
2,153441.51,101145.55,407934.54,Florida,191050.39
3,144372.41,118671.85,383199.62,New York,182901.99
4,142107.34,91391.77,366168.42,Florida,166187.94


In [3]:
# Preview the last rows of the frame as well, to see the other end of the table.
dataset.tail()

Unnamed: 0,R&D Spend,Administration,Marketing Spend,State,Profit
45,1000.23,124153.04,1903.93,New York,64926.08
46,1315.46,115816.21,297114.46,Florida,49490.75
47,0.0,135426.92,0.0,California,42559.73
48,542.05,51743.15,0.0,New York,35673.41
49,0.0,116983.8,45173.06,California,14681.4


In [4]:
# Collect the distinct values that occur in the categorical State column.
val = {state for state in dataset["State"]}
print(val)

{'California', 'New York', 'Florida'}


In [5]:
# Features are every column except the last one; the target is the last column
# (Profit). Both selections are anchored to the end of the frame so they cannot
# drift apart if the number of columns ever changes.
X = dataset.iloc[:, :-1].to_numpy()
Y = dataset.iloc[:, -1].to_numpy()

#### Encoding Categorical data

In [6]:
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer

# One-hot encode the categorical State column (positional index 3 of the
# feature matrix). The remaining numeric columns pass through unchanged and are
# placed after the encoded columns in the result.
ct = ColumnTransformer(
    transformers=[("encoder", OneHotEncoder(), [3])],
    remainder="passthrough",  # keep other columns as they are
)

# Encode from the raw feature columns rather than from X itself:
# `X = ct.fit_transform(X)` is self-referential, so running this cell a second
# time would one-hot encode a numeric column of the already-encoded matrix.
X = ct.fit_transform(dataset.iloc[:, :-1].to_numpy())


In [7]:
# Inspect the encoded feature matrix: the leading columns are the one-hot State
# indicators, followed by the passed-through numeric columns.
X

array([[0.0, 0.0, 1.0, 165349.2, 136897.8, 471784.1],
       [1.0, 0.0, 0.0, 162597.7, 151377.59, 443898.53],
       [0.0, 1.0, 0.0, 153441.51, 101145.55, 407934.54],
       [0.0, 0.0, 1.0, 144372.41, 118671.85, 383199.62],
       [0.0, 1.0, 0.0, 142107.34, 91391.77, 366168.42],
       [0.0, 0.0, 1.0, 131876.9, 99814.71, 362861.36],
       [1.0, 0.0, 0.0, 134615.46, 147198.87, 127716.82],
       [0.0, 1.0, 0.0, 130298.13, 145530.06, 323876.68],
       [0.0, 0.0, 1.0, 120542.52, 148718.95, 311613.29],
       [1.0, 0.0, 0.0, 123334.88, 108679.17, 304981.62],
       [0.0, 1.0, 0.0, 101913.08, 110594.11, 229160.95],
       [1.0, 0.0, 0.0, 100671.96, 91790.61, 249744.55],
       [0.0, 1.0, 0.0, 93863.75, 127320.38, 249839.44],
       [1.0, 0.0, 0.0, 91992.39, 135495.07, 252664.93],
       [0.0, 1.0, 0.0, 119943.24, 156547.42, 256512.92],
       [0.0, 0.0, 1.0, 114523.61, 122616.84, 261776.23],
       [1.0, 0.0, 0.0, 78013.11, 121597.55, 264346.06],
       [0.0, 0.0, 1.0, 94657.16, 145077.58

In [8]:
# Names of the columns produced by the ColumnTransformer. Each name is prefixed
# with the transformer that produced it ("encoder__" / "remainder__"). The
# inputs appear as positional names (x0..x3) because the transformer was fitted
# on a NumPy array: x3 is the encoded State column, x0..x2 the numeric ones.
feature_names = ct.get_feature_names_out()
print(feature_names)

['encoder__x3_California' 'encoder__x3_Florida' 'encoder__x3_New York'
 'remainder__x0' 'remainder__x1' 'remainder__x2']


#### Avoiding Dummy Variable Trap

In [9]:
# Drop the first dummy column (California) so the remaining State indicators
# are not perfectly collinear with the model's intercept.
# The slice is taken from a fresh transform of the raw feature columns instead
# of from X itself: `X = X[:, 1:]` would strip one more column every time this
# cell is re-run.
X = ct.transform(dataset.iloc[:, :-1].to_numpy())[:, 1:]

# Show only the first rows instead of dumping the whole matrix.
X[:5]

#### Splitting the dataset into the Training set and Test set

In [11]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=0)


# Step 2: Fitting Multiple Linear Regression to the Training set

In [12]:
from sklearn.linear_model import LinearRegression

# Create the linear model and train it on the training split only.
regressor = LinearRegression()
regressor.fit(X=X_train, y=Y_train)


# Step 3: Predicting the Test set results

In [13]:
# Predict the target for the held-out test rows using the fitted model.
y_pred = regressor.predict(X_test)

In [14]:
# Show the fitted coefficients first, then the intercept on its own line.
print(regressor.coef_, regressor.intercept_, sep="\n")


[-9.59284160e+02  6.99369053e+02  7.73467193e-01  3.28845975e-02
  3.66100259e-02]
42554.167617767


#### Quick evaluation metric: R² score on the test set

In [15]:
from sklearn.metrics import r2_score

# R^2 of the predictions against the true test targets. The arguments are
# passed by name because r2_score is not symmetric in its two inputs.
print(r2_score(y_true=Y_test, y_pred=y_pred))


0.9347068473282987
