# Setting Up Data

In [22]:
# Import dependencies
import pandas as pd
from pathlib import Path
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression
from sqlalchemy import create_engine
import matplotlib.pyplot as plt

In [23]:
# Read the source CSV into a DataFrame and preview its first rows
data_path = Path('Resources/Data.csv')
data = pd.read_csv(data_path)
data.head()

Unnamed: 0,address,city,zip,utility_provider,average_bill,number_of_panels,panel_model,principal,after_taxcredit,monthly_rate,loan_term(months),montly_payment,if_cash,kW,add_battery,net_with_battery,monthly_battery
0,713 East 9th Street,San Juan,78589,AEP,199.92,34,SunPower,34639.2,25633.01,0.0004,240,112.03,0.0,12.24,48639.2,35993.01,157.31
1,705 Dellwood Drive,Laredo,78045,AEP,217.56,37,SunPower,37695.6,27894.74,0.0004,240,121.92,0.0,13.32,51695.6,38254.74,167.2
2,4310 Boots Drive,Killeen,76549,Oncor,211.68,36,SunPower,36676.8,27140.83,0.0004,240,118.62,0.0,12.96,50676.8,37500.83,163.9
3,3307 Begay,Laredo,78046,AEP,323.4,55,SunPower,56034.0,41465.16,0.0004,240,181.23,0.0,19.8,70034.0,51825.16,226.51
4,11112 Entrada Loop,Laredo,78045,AEP,235.2,40,SunPower,40752.0,30156.48,0.0004,240,131.8,0.0,14.4,54752.0,40516.48,177.09


In [24]:
# Narrow the loaded frame to the columns kept for estimating the panel count
# by dropping everything listed below.
# ('montly_payment' is spelled the way the column appears in the data.)
unneeded_columns = [
    'address', 'principal', 'after_taxcredit', 'monthly_rate', 'loan_term(months)',
    'montly_payment', 'if_cash', 'kW', 'add_battery', 'net_with_battery', 'monthly_battery',
]
necessary_data = data.drop(columns=unneeded_columns)
necessary_data.head()

Unnamed: 0,city,zip,utility_provider,average_bill,number_of_panels,panel_model
0,San Juan,78589,AEP,199.92,34,SunPower
1,Laredo,78045,AEP,217.56,37,SunPower
2,Killeen,76549,Oncor,211.68,36,SunPower
3,Laredo,78046,AEP,323.4,55,SunPower
4,Laredo,78045,AEP,235.2,40,SunPower


In [25]:
# One-hot encode the text columns city, utility_provider, and panel_model.
# dtype=int pins the indicator columns to 0/1 integers: pandas 2.0 changed the
# get_dummies default dtype to bool, which would otherwise produce True/False
# columns instead of the 0/1 indicators this notebook displays.
categorical_columns = ['city', 'utility_provider', 'panel_model']
clean_data = pd.get_dummies(necessary_data, columns=categorical_columns, dtype=int)
clean_data.head()

Unnamed: 0,zip,average_bill,number_of_panels,city_Harlingen,city_Killeen,city_Laredo,city_Los Fresnos,city_San Juan,city_Weslaco,city_Zapata,utility_provider_AEP,utility_provider_Oncor,panel_model_REC,panel_model_SunPower
0,78589,199.92,34,0,0,0,0,1,0,0,1,0,0,1
1,78045,217.56,37,0,0,1,0,0,0,0,1,0,0,1
2,76549,211.68,36,0,1,0,0,0,0,0,0,1,0,1
3,78046,323.4,55,0,0,1,0,0,0,0,1,0,0,1
4,78045,235.2,40,0,0,1,0,0,0,0,1,0,0,1


# Begin Multiple Linear Regression Model

In [27]:
# Separate the regression target from the predictor columns
target_column = 'number_of_panels'
y = clean_data[target_column]
X = clean_data.drop(columns=[target_column])

In [28]:
# Split the features and target into training (80%) and test (20%) sets.
# The fixed random_state makes the shuffle, and so the split, repeatable.
from sklearn.model_selection import train_test_split

split_parts = train_test_split(
    X,
    y,
    train_size=.80,
    random_state=85,
)
X_train, X_test, y_train, y_test = split_parts

X_train.shape

(556, 13)

In [29]:
# Fit a linear regression on the training split.
# fit() returns the estimator itself, so the call can be chained; the bare
# `model` line keeps the fitted estimator as the cell's displayed value.
model = LinearRegression().fit(X_train, y_train)
model

LinearRegression()

In [30]:
# Predict the target for every row of the held-out test features.
y_prediction = model.predict(X_test)
# Bare expression: the notebook displays the full prediction array.
y_prediction

array([40., 55., 55., 44., 30., 25., 33., 43., 33., 46., 32., 46., 29.,
       35., 57., 40., 50., 19., 49., 11., 55., 25., 33., 26., 38., 44.,
       58., 35., 30., 28., 11., 30., 30., 44., 38., 24., 36., 30., 30.,
       40., 32., 20., 35., 38., 29., 50., 24., 30., 28., 30., 37., 50.,
       23., 46., 43., 20., 22., 56., 38., 43., 30., 43., 36., 40., 34.,
       22., 45., 22., 49., 30., 24., 21., 35., 48., 36., 36., 27., 26.,
       11., 18., 23., 32., 57., 41., 45., 46., 46., 25., 60., 32., 27.,
       33., 28., 37., 28., 35., 52., 35., 57., 25., 55., 18., 34., 40.,
       26., 28., 32., 30., 50., 50., 55., 46., 30., 44., 32., 30., 32.,
       23., 30., 41., 38., 20., 35., 20., 60., 36., 35., 36., 41., 27.,
       42., 36., 38., 60., 42., 41., 49., 33., 40., 32.])

In [31]:
# Score the test-set predictions with R^2, MSE, and RMSE.
# Code received from https://www.analyticsvidhya.com/blog/2021/05/multiple-linear-regression-using-python-and-scikit-learn/
import numpy as np
from sklearn.metrics import mean_squared_error, r2_score

# Coefficient of determination between the true and predicted targets
score = r2_score(y_test, y_prediction)
# Compute the mean squared error once and reuse it for the root value
mse = mean_squared_error(y_test, y_prediction)
rmse = np.sqrt(mse)

print('r2_score is', score)
print('mean_sqrd_error is==', mse)
print('root_mean_squared error of is==', rmse)
# NOTE(review): an r2 of exactly 1.0 with a near-zero error suggests the target
# is an exact function of a feature. In the previewed rows,
# average_bill / number_of_panels is 5.88 every time -- confirm whether
# average_bill should be used as a predictor.

r2_score is 1.0
mean_sqrd_error is== 4.2551157227621375e-28
root_mean_squared error of is== 2.0627931846799712e-14
