In [None]:
## Importing Libraries
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression

In [None]:
## Importing Dataset
df = pd.read_csv("50_Startups.csv")
X = df.iloc[:,:-1].values
y = df.iloc[:,-1].values

In [None]:
print(X)

[[165349.2 136897.8 471784.1 'New York']
 [162597.7 151377.59 443898.53 'California']
 [153441.51 101145.55 407934.54 'Florida']
 [144372.41 118671.85 383199.62 'New York']
 [142107.34 91391.77 366168.42 'Florida']
 [131876.9 99814.71 362861.36 'New York']
 [134615.46 147198.87 127716.82 'California']
 [130298.13 145530.06 323876.68 'Florida']
 [120542.52 148718.95 311613.29 'New York']
 [123334.88 108679.17 304981.62 'California']
 [101913.08 110594.11 229160.95 'Florida']
 [100671.96 91790.61 249744.55 'California']
 [93863.75 127320.38 249839.44 'Florida']
 [91992.39 135495.07 252664.93 'California']
 [119943.24 156547.42 256512.92 'Florida']
 [114523.61 122616.84 261776.23 'New York']
 [78013.11 121597.55 264346.06 'California']
 [94657.16 145077.58 282574.31 'New York']
 [91749.16 114175.79 294919.57 'Florida']
 [86419.7 153514.11 0.0 'New York']
 [76253.86 113867.3 298664.47 'California']
 [78389.47 153773.43 299737.29 'New York']
 [73994.56 122782.75 303319.26 'Florida']
 [67532

In [None]:
print(y)

[192261.83 191792.06 191050.39 182901.99 166187.94 156991.12 156122.51
 155752.6  152211.77 149759.96 146121.95 144259.4  141585.52 134307.35
 132602.65 129917.04 126992.93 125370.37 124266.9  122776.86 118474.03
 111313.02 110352.25 108733.99 108552.04 107404.34 105733.54 105008.31
 103282.38 101004.64  99937.59  97483.56  97427.84  96778.92  96712.8
  96479.51  90708.19  89949.14  81229.06  81005.76  78239.91  77798.83
  71498.49  69758.98  65200.33  64926.08  49490.75  42559.73  35673.41
  14681.4 ]


In [None]:
# ISSUE_1 - use mean to handle missing data, and apply

df.isnull().sum() #checks if there is any null value in each column
#fillna is used to replace the null values
#mean is used to replace the values with the mean of all values present in column
#condition of numeric is entered because some columns have non-numeric datatype
df=df.fillna(df.mean(numeric_only=True))


In [69]:
# ISSUE_2 - Encoding categorical column, using one hot encoding
from sklearn.preprocessing import OneHotEncoder
encoder = OneHotEncoder(sparse_output = False) #converts sparse matrix into dense matrix
encoded_data = encoder.fit_transform(df[['State']]) #encodes data
encoded_df = pd.DataFrame(encoded_data, columns=encoder.get_feature_names_out(['State'])) #organises encoded data into a dataframe
#get_feature_names_out generates meaningful names for new columns
df = pd.concat([df, encoded_df], axis=1) #adds encoded columns to df
df.drop('State', axis=1, inplace=True) #removes State column from df

df.head()

KeyError: "None of [Index(['State'], dtype='object')] are in the [columns]"

In [None]:
# ISSUE_3 - Split data into train and test with 80/20 ratio

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 0)      # Splitting in 80:20 Ratio


In [65]:
print(X_train)

[[5.5493950e+04 1.0305749e+05 2.1463481e+05 9.6778920e+04 0.0000000e+00
  1.0000000e+00]
 [4.6014020e+04 8.5047440e+04 2.0551764e+05 9.6479510e+04 0.0000000e+00
  0.0000000e+00]
 [7.5328870e+04 1.4413598e+05 1.3405007e+05 1.0573354e+05 0.0000000e+00
  1.0000000e+00]
 [4.6426070e+04 1.5769392e+05 2.1079767e+05 9.6712800e+04 1.0000000e+00
  0.0000000e+00]
 [9.1749160e+04 1.1417579e+05 2.9491957e+05 1.2426690e+05 0.0000000e+00
  1.0000000e+00]
 [1.3029813e+05 1.4553006e+05 3.2387668e+05 1.5575260e+05 0.0000000e+00
  1.0000000e+00]
 [1.1994324e+05 1.5654742e+05 2.5651292e+05 1.3260265e+05 0.0000000e+00
  1.0000000e+00]
 [1.0002300e+03 1.2415304e+05 1.9039300e+03 6.4926080e+04 0.0000000e+00
  0.0000000e+00]
 [5.4205000e+02 5.1743150e+04 0.0000000e+00 3.5673410e+04 0.0000000e+00
  0.0000000e+00]
 [6.5605480e+04 1.5303206e+05 1.0713838e+05 1.0100464e+05 0.0000000e+00
  0.0000000e+00]
 [1.1452361e+05 1.2261684e+05 2.6177623e+05 1.2991704e+05 0.0000000e+00
  0.0000000e+00]
 [6.1994480e+04 1.156

In [66]:
print(X_test)

[[6.6051520e+04 1.8264556e+05 1.1814820e+05 1.0328238e+05 0.0000000e+00
  1.0000000e+00]
 [1.0067196e+05 9.1790610e+04 2.4974455e+05 1.4425940e+05 1.0000000e+00
  0.0000000e+00]
 [1.0191308e+05 1.1059411e+05 2.2916095e+05 1.4612195e+05 0.0000000e+00
  1.0000000e+00]
 [2.7892920e+04 8.4710770e+04 1.6447071e+05 7.7798830e+04 0.0000000e+00
  1.0000000e+00]
 [1.5344151e+05 1.0114555e+05 4.0793454e+05 1.9105039e+05 0.0000000e+00
  1.0000000e+00]
 [7.2107600e+04 1.2786455e+05 3.5318381e+05 1.0500831e+05 0.0000000e+00
  0.0000000e+00]
 [2.0229590e+04 6.5947930e+04 1.8526510e+05 8.1229060e+04 0.0000000e+00
  0.0000000e+00]
 [6.1136380e+04 1.5270192e+05 8.8218230e+04 9.7483560e+04 0.0000000e+00
  0.0000000e+00]
 [7.3994560e+04 1.2278275e+05 3.0331926e+05 1.1035225e+05 0.0000000e+00
  1.0000000e+00]
 [1.4210734e+05 9.1391770e+04 3.6616842e+05 1.6618794e+05 0.0000000e+00
  1.0000000e+00]]


In [67]:
print(y_train)

[0. 1. 0. 0. 0. 0. 0. 1. 1. 1. 1. 0. 0. 0. 0. 0. 1. 1. 0. 0. 1. 1. 0. 0.
 0. 0. 0. 1. 0. 0. 0. 1. 1. 0. 0. 0. 1. 1. 0. 0.]


In [68]:
print(y_test)

[0. 0. 0. 0. 0. 1. 1. 1. 0. 0.]


In [71]:
# ISSUE_4 - Training on Train set - use linear regression

# Initializing the Linear Regression Model.
regressor = LinearRegression()

# Training the Model using the Training Datasets X_train & y_train.
regressor.fit(X_train, y_train)


In [None]:
# ISSUE_5 - Testing on test set - with trained linear regression




In [None]:
# ISSUE_6 - Measuring the performance - r2




In [None]:
# ISSUE_7 - print coefficient/weight of the trained model



In [None]:
# ISSUE_8 - now predict values based on trained linear regression model



In [None]:
# ISSUE_9 - plot predicted and actual values




In [None]:
# ISSUE_10 - Use classification_report to compute metrics like Precision, Recall, and F1-Score