In [2]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
dataset = pd.read_csv('50_Startups.csv')
dataset.head()

Unnamed: 0,R&D Spend,Administration,Marketing Spend,State,Profit
0,165349.2,136897.8,471784.1,New York,192261.83
1,162597.7,151377.59,443898.53,California,191792.06
2,153441.51,101145.55,407934.54,Florida,191050.39
3,144372.41,118671.85,383199.62,New York,182901.99
4,142107.34,91391.77,366168.42,Florida,166187.94


In [3]:
dataset.tail()

Unnamed: 0,R&D Spend,Administration,Marketing Spend,State,Profit
45,1000.23,124153.04,1903.93,New York,64926.08
46,1315.46,115816.21,297114.46,Florida,49490.75
47,0.0,135426.92,0.0,California,42559.73
48,542.05,51743.15,0.0,New York,35673.41
49,0.0,116983.8,45173.06,California,14681.4


In [4]:
X = dataset.iloc[:,:-1]
Y = dataset.iloc[:, -1].values
dataset.tail()

Unnamed: 0,R&D Spend,Administration,Marketing Spend,State,Profit
45,1000.23,124153.04,1903.93,New York,64926.08
46,1315.46,115816.21,297114.46,Florida,49490.75
47,0.0,135426.92,0.0,California,42559.73
48,542.05,51743.15,0.0,New York,35673.41
49,0.0,116983.8,45173.06,California,14681.4


In [5]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
ct = ColumnTransformer(transformers=[('encoder',OneHotEncoder(),[3])],remainder = 'passthrough')
X = np.array(ct.fit_transform(X))
#ColumnTransformer: This is a utility provided by scikit-learn for applying different transformations to different columns of a dataset.
#transformers=[('encoder', OneHotEncoder(), [3])]: Here, you are specifying a transformation for the columns in the dataset. It looks like you are applying a one-hot encoding transformation (OneHotEncoder) to the column at index 3 ([3]).
#This means that the values in this column will be converted into binary vectors representing categories.
#remainder='passthrough': This parameter specifies that any columns not specified in the transformers list should be left unchanged. In this case, it indicates that columns other than column 3 should remain unchanged.
#X = np.array(ct.fit_transform(X)): This line applies the transformations specified by the ColumnTransformer to the input data X. fit_transform() method fits the transformer to the data and then transforms it. Finally, np.array() is used to convert the transformed data into a NumPy array.

In [6]:
X

array([[0.0000000e+00, 0.0000000e+00, 1.0000000e+00, 1.6534920e+05,
        1.3689780e+05, 4.7178410e+05],
       [1.0000000e+00, 0.0000000e+00, 0.0000000e+00, 1.6259770e+05,
        1.5137759e+05, 4.4389853e+05],
       [0.0000000e+00, 1.0000000e+00, 0.0000000e+00, 1.5344151e+05,
        1.0114555e+05, 4.0793454e+05],
       [0.0000000e+00, 0.0000000e+00, 1.0000000e+00, 1.4437241e+05,
        1.1867185e+05, 3.8319962e+05],
       [0.0000000e+00, 1.0000000e+00, 0.0000000e+00, 1.4210734e+05,
        9.1391770e+04, 3.6616842e+05],
       [0.0000000e+00, 0.0000000e+00, 1.0000000e+00, 1.3187690e+05,
        9.9814710e+04, 3.6286136e+05],
       [1.0000000e+00, 0.0000000e+00, 0.0000000e+00, 1.3461546e+05,
        1.4719887e+05, 1.2771682e+05],
       [0.0000000e+00, 1.0000000e+00, 0.0000000e+00, 1.3029813e+05,
        1.4553006e+05, 3.2387668e+05],
       [0.0000000e+00, 0.0000000e+00, 1.0000000e+00, 1.2054252e+05,
        1.4871895e+05, 3.1161329e+05],
       [1.0000000e+00, 0.0000000e+00,

In [7]:
from sklearn.model_selection import train_test_split
X_train,X_test,Y_train,Y_test = train_test_split(X,Y,test_size=0.2)

In [8]:
from sklearn.linear_model import LinearRegression
regressor = LinearRegression()
regressor.fit(X_train,Y_train)

In [10]:

Y_pred = regressor.predict(X_test)

In [11]:
df = pd.DataFrame({'Real values':Y_test,'Predicted values':Y_pred})
df

Unnamed: 0,Real values,Predicted values
0,69758.98,58821.007653
1,14681.4,50297.716229
2,96479.51,86875.480008
3,105733.54,110861.961208
4,155752.6,159474.23402
5,141585.52,128223.260664
6,144259.4,136169.906286
7,90708.19,74249.499262
8,65200.33,68199.191013
9,81229.06,65389.942186


In [13]:
#rmse
from sklearn import metrics
print(np.sqrt(metrics.mean_squared_error(Y_test,Y_pred)))

15150.244490090985
