In [1]:
#import libraries
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

# Load the data set from the CSV file
dataset = pd.read_csv('raw_data/50_Startups.csv')

# Feature variables: all rows, every column except the last one (Profit)
X = dataset.iloc[:, :-1]
# Target variable: all rows of the last column (Profit), as a NumPy array
y = dataset.iloc[:, -1].values

# Preview the first five rows of the data frame
dataset.head()

Unnamed: 0,R&D Spend,Administration,Marketing Spend,State,Profit
0,165349.2,136897.8,471784.1,New York,192261.83
1,162597.7,151377.59,443898.53,California,191792.06
2,153441.51,101145.55,407934.54,Florida,191050.39
3,144372.41,118671.85,383199.62,New York,182901.99
4,142107.34,91391.77,366168.42,Florida,166187.94


**Note:** The way the data set was split into features and target here is only one of many methods. For example, we could use `dataset.drop(columns="output_column")` for the X value and `dataset["output_column"]` for the y value (here the output column is `Profit`).

In [2]:
# Turn the categorical State column into numerical one-hot columns:
from sklearn.compose import ColumnTransformer # applies a transformer to selected columns only
from sklearn.preprocessing import OneHotEncoder # turns each category into a column of 0s and 1s

# This is a Machine Learning workflow which is discussed later in the notes (added to github profile)
# One-hot encode column index 3 (State); remainder='passthrough' keeps the other columns unchanged
ct = ColumnTransformer(transformers=[('encoder', OneHotEncoder(), [3])], remainder='passthrough')

# Always encode from the data frame's feature columns (not from X itself) so that
# re-running this cell gives the same result instead of encoding an
# already-encoded array. The transformed features are converted to a NumPy array.
X = np.array(ct.fit_transform(dataset.iloc[:, :-1]))

**Note:** You should encode before splitting the data set into training and test sets. This ensures that the same categories are encoded in both sets. If you encode after splitting, you might end up with different categories in the training and test sets, which is a problem because the model must be trained on the same categories that it will be predicting on.

In [3]:
# Display the encoded feature matrix: the three one-hot State columns come first, followed by the numeric columns
X

array([[0.0000000e+00, 0.0000000e+00, 1.0000000e+00, 1.6534920e+05,
        1.3689780e+05, 4.7178410e+05],
       [1.0000000e+00, 0.0000000e+00, 0.0000000e+00, 1.6259770e+05,
        1.5137759e+05, 4.4389853e+05],
       [0.0000000e+00, 1.0000000e+00, 0.0000000e+00, 1.5344151e+05,
        1.0114555e+05, 4.0793454e+05],
       [0.0000000e+00, 0.0000000e+00, 1.0000000e+00, 1.4437241e+05,
        1.1867185e+05, 3.8319962e+05],
       [0.0000000e+00, 1.0000000e+00, 0.0000000e+00, 1.4210734e+05,
        9.1391770e+04, 3.6616842e+05],
       [0.0000000e+00, 0.0000000e+00, 1.0000000e+00, 1.3187690e+05,
        9.9814710e+04, 3.6286136e+05],
       [1.0000000e+00, 0.0000000e+00, 0.0000000e+00, 1.3461546e+05,
        1.4719887e+05, 1.2771682e+05],
       [0.0000000e+00, 1.0000000e+00, 0.0000000e+00, 1.3029813e+05,
        1.4553006e+05, 3.2387668e+05],
       [0.0000000e+00, 0.0000000e+00, 1.0000000e+00, 1.2054252e+05,
        1.4871895e+05, 3.1161329e+05],
       [1.0000000e+00, 0.0000000e+00,

The one-hot vector (0, 0, 1) now means New York (likewise, (1, 0, 0) is California and (0, 1, 0) is Florida).

In [4]:
# Split the data into training and test sets: 20% of the rows are held out
# for testing, and the fixed random_state makes the split reproducible
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [5]:
# Train the model: fit a linear regression on the training set
from sklearn.linear_model import LinearRegression

# Create the (untrained) LinearRegression estimator
regressor = LinearRegression()
# Learn the coefficients from the training features and targets
regressor.fit(X_train, y_train)

In [6]:
# Predict the target values for the test set using the fitted model
y_pred = regressor.predict(X_test)

In [7]:
# Build a data frame that puts the real test-set values next to the
# model's predictions so they can be compared row by row
df = pd.DataFrame(
    {
        'Real Values': y_test,
        'Predicted Values': y_pred,
    }
)
df

Unnamed: 0,Real Values,Predicted Values
0,134307.35,126362.879083
1,81005.76,84608.453836
2,99937.59,99677.494252
3,64926.08,46357.460686
4,125370.37,128750.482885
5,35673.41,50912.417419
6,105733.54,109741.350327
7,107404.34,100643.242816
8,97427.84,97599.275746
9,122776.86,113097.425244


In [8]:
# How good is this model? Compute the root-mean-squared error (RMSE)
# by comparing the outcome the model gave us with what the output should be
from sklearn import metrics

# The `squared=False` argument was deprecated in scikit-learn 1.4 and removed
# in 1.6, so take the square root of the MSE explicitly; this gives the same
# value and works on every scikit-learn version.
print(np.sqrt(metrics.mean_squared_error(y_test, y_pred)))

9055.957323492617


**Note:** The datasets for these models are very small and not realistic. Real-world data needs preprocessing (cleaning the data so the model is more effective), and real-life models are built on much, much larger datasets.

RMSE (root-mean-squared error) is an evaluation metric used to measure how good a model is for regression tasks. It is not the same as accuracy, which is defined differently.