# <font color=blue>AppDev_Summative </font>

### Import neccessary libraries

In [1]:
from sklearn import linear_model
import pandas as pd

### Load data from all data files

In [2]:
dSolar_1 = pd.read_csv('solar_farm.csv')
dSolar_2 = pd.read_csv('solar_generation_data.csv')
dWind_1 = pd.read_csv('wind_farm.csv')
dWind_2 = pd.read_csv('wind_generation_data.csv')

In [3]:
print(dSolar_2.shape, dWind_2.shape)

(365, 8) (366, 3)


In [4]:
dWind_1.head()

Unnamed: 0,Date Of Month,Capacity Available as %
0,3,70
1,5,60
2,7,50
3,8,45
4,15,55


In [5]:
dSolar_1.head()

Unnamed: 0,Date Of Month,Capacity Available
0,4,3
1,6,5
2,19,2
3,23,50
4,24,20


### View first 5 datasets from the Solar dataframes

In [6]:
dSolar_2.head()

Unnamed: 0,Month,Day,Temp Hi,Temp Low,Solar,Cloud Cover Percentage,Rainfall in mm,Power Generated in MW
0,Jan,1,109°,85°,30.0,9,0.0,9.93
1,Jan,2,106°,71°,30.1,9,0.0,9.97
2,Jan,3,106°,81°,29.5,9,0.0,9.77
3,Jan,4,102°,83°,13.0,4,0.0,4.3
4,Jan,5,105°,80°,30.1,9,0.0,9.97


In [7]:
#remove the degree symbol
dSolar_2['Temp Hi'] = dSolar_2['Temp Hi'].replace('\u00b0','', regex=True)
dSolar_2['Temp Low'] = dSolar_2['Temp Low'].replace('\u00b0','', regex=True)

In [8]:
dSolar_2.head()

Unnamed: 0,Month,Day,Temp Hi,Temp Low,Solar,Cloud Cover Percentage,Rainfall in mm,Power Generated in MW
0,Jan,1,109,85,30.0,9,0.0,9.93
1,Jan,2,106,71,30.1,9,0.0,9.97
2,Jan,3,106,81,29.5,9,0.0,9.77
3,Jan,4,102,83,13.0,4,0.0,4.3
4,Jan,5,105,80,30.1,9,0.0,9.97


In [9]:
#confirm all column names for solar data
dSolar_2.columns

Index(['Month ', 'Day', 'Temp Hi', 'Temp Low', 'Solar',
       'Cloud Cover Percentage', 'Rainfall in mm', 'Power Generated in MW'],
      dtype='object')

In [10]:
#check column data types
dSolar_2.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 365 entries, 0 to 364
Data columns (total 8 columns):
Month                     365 non-null object
Day                       365 non-null int64
Temp Hi                   365 non-null object
Temp Low                  365 non-null object
Solar                     365 non-null float64
Cloud Cover Percentage    365 non-null int64
Rainfall in mm            312 non-null float64
Power Generated in MW     365 non-null float64
dtypes: float64(3), int64(2), object(3)
memory usage: 22.9+ KB


In [11]:
#change Temp Hi and Temp Low to numeric
cols=[i for i in dSolar_2.columns if i in ['Temp Hi', 'Temp Low']]
for col in cols:
    dSolar_2[col]=pd.to_numeric(dSolar_2[col], errors='coerce')
dSolar_2.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 365 entries, 0 to 364
Data columns (total 8 columns):
Month                     365 non-null object
Day                       365 non-null int64
Temp Hi                   365 non-null int64
Temp Low                  365 non-null int64
Solar                     365 non-null float64
Cloud Cover Percentage    365 non-null int64
Rainfall in mm            312 non-null float64
Power Generated in MW     365 non-null float64
dtypes: float64(3), int64(4), object(1)
memory usage: 22.9+ KB


In [12]:
#check for any missing values
dSolar_2.isnull().sum()

Month                      0
Day                        0
Temp Hi                    0
Temp Low                   0
Solar                      0
Cloud Cover Percentage     0
Rainfall in mm            53
Power Generated in MW      0
dtype: int64

In [33]:
#drop remaining missing values
dSolar_clean = dSolar_2.dropna()
dSolar_clean.isnull().sum()
dSolar_clean.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 63 entries, 0 to 62
Data columns (total 9 columns):
Month                     63 non-null object
Day                       63 non-null int64
Temp Hi                   63 non-null int64
Temp Low                  63 non-null int64
Solar                     63 non-null float64
Cloud Cover Percentage    63 non-null int64
Rainfall in mm            63 non-null float64
Power Generated in MW     63 non-null float64
Solar_predictions         63 non-null float64
dtypes: float64(4), int64(4), object(1)
memory usage: 4.9+ KB


### Create a simple ML model which accepts suitable inputs and gives a predicted power output for each power generation plant for any day within the next 7 days. Note that these may be 2 ML models (1 for each plant)

### ML model using solar data

### Split data into training and test sets

In [35]:
X = dSolar_clean.drop(['Month ', 'Power Generated in MW'], axis = 1).values # X are the input (or independent) variables
y = dSolar_clean['Power Generated in MW'].values # Y is output (or dependent) variable

array([[  1.        , 109.        ,  85.        ,  30.        ,
          9.        ,   0.        ,   8.97527571],
       [  2.        , 106.        ,  71.        ,  30.1       ,
          9.        ,   0.        ,   5.49547145],
       [  3.        , 106.        ,  81.        ,  29.5       ,
          9.        ,   0.        ,   8.34453495],
       [  4.        , 102.        ,  83.        ,  13.        ,
          4.        ,   0.        ,   6.19193393],
       [  5.        , 105.        ,  80.        ,  30.1       ,
          9.        ,   0.        ,   6.29154239],
       [  6.        , 107.        ,  84.        ,  26.3       ,
          8.        ,   0.        ,   6.522741  ],
       [  7.        , 104.        ,  82.        ,  30.2       ,
          9.        ,   0.        ,   3.24461867],
       [  8.        , 100.        ,  69.        ,  29.8       ,
          9.        ,   0.        ,   5.96102591],
       [  9.        , 103.        ,  81.        ,  30.1       ,
          9.    

A rule of thumb is to split data into training and test sets by 80/20 or 70/30.

In [15]:
from sklearn.model_selection import train_test_split

In [16]:
# create training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

### Fit the model

The `fit()` function fits a linear model. We'll fit the model on the training data.

In [17]:
lm = linear_model.LinearRegression()
model = lm.fit(X_train,y_train)

### Make Predictions

We'll predict the dependent variable using the linear model we fitted with the test dataset.

In [18]:
y_pred = lm.predict(X_test)

In [19]:
print(y_pred[0:5]) # print the first 5 predictions

[8.97527571 5.49547145 8.34453495 6.19193393 6.29154239]


In [20]:
#side by side of actual values and predicated values
y_pred = lm.predict(X_test)

#connect predictions with actual banking crisis values
for i in range(10):
    print(y_test[i], y_pred[i])

8.97 8.97527570942345
5.5 5.495471445019957
8.34 8.344534950747672
6.19 6.191933930745168
6.29 6.291542387083749
6.52 6.522740999316237
3.25 3.2446186710707936
5.96 5.96102591161664
8.54 8.542860606207594
8.91 8.906687087742071


In [21]:
#add predictions column to the dataFrame
predictions = pd.DataFrame(y_pred)
dSolar_2['Solar_predictions'] = predictions
dSolar_2.head(10)

Unnamed: 0,Month,Day,Temp Hi,Temp Low,Solar,Cloud Cover Percentage,Rainfall in mm,Power Generated in MW,Solar_predictions
0,Jan,1,109,85,30.0,9,0.0,9.93,8.975276
1,Jan,2,106,71,30.1,9,0.0,9.97,5.495471
2,Jan,3,106,81,29.5,9,0.0,9.77,8.344535
3,Jan,4,102,83,13.0,4,0.0,4.3,6.191934
4,Jan,5,105,80,30.1,9,0.0,9.97,6.291542
5,Jan,6,107,84,26.3,8,0.0,8.71,6.522741
6,Jan,7,104,82,30.2,9,0.0,10.0,3.244619
7,Jan,8,100,69,29.8,9,0.0,9.87,5.961026
8,Jan,9,103,81,30.1,9,0.0,9.97,8.542861
9,Jan,10,104,82,27.1,9,0.0,8.97,8.906687


### View first 5 datasets from the Solar dataframes

In [22]:
dWind_2.head()

Unnamed: 0,wind speed,direction,Power Output
0,16.0,218,34.76
1,15.91,218,36.59
2,15.82,218,32.35
3,15.73,218,39.37
4,15.64,218,33.22


In [23]:
#confirm all column names for wind data
dWind_2.columns

Index(['wind speed', 'direction', 'Power Output'], dtype='object')

In [24]:
#check column datatypes
dWind_2.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 366 entries, 0 to 365
Data columns (total 3 columns):
wind speed      366 non-null float64
direction       366 non-null int64
Power Output    366 non-null float64
dtypes: float64(2), int64(1)
memory usage: 8.7 KB


In [25]:
#check for any missing values
dWind_2.isnull().sum()

wind speed      0
direction       0
Power Output    0
dtype: int64

### ML model using wind data

### Split data into training and test sets

In [26]:
X1 = dWind_2.drop(['Power Output'], axis = 1).values # X are the input (or independent) variables
y1 = dWind_2['Power Output'].values # Y is output (or dependent) variable

In [27]:
# create training and testing sets
X1_train, X1_test, y1_train, y1_test = train_test_split(X1, y1, test_size=0.2)

### Fit the model

In [28]:
lm = linear_model.LinearRegression()
model = lm.fit(X1_train,y1_train)

### Make Predictions

In [29]:
y1_pred = lm.predict(X1_test)

In [30]:
print(y1_pred[0:5]) # print the first 5 predictions

[35.64568919 28.97798071 44.32609372 41.89317172 41.99702496]


In [31]:
#side by side of actual values and predicated values
#predict banking crisis
y1_pred = lm.predict(X1_test)

#connect predictions with actual banking crisis values
for i in range(10):
    print(y1_test[i], y1_pred[i])

32.72 35.645689192585564
31.02 28.977980708964708
44.31 44.32609371588214
45.81 41.893171716274296
46.48 41.99702496322075
34.65 35.905926740234605
44.56 43.89430314072891
44.79 43.50227014354499
28.0 29.11050596886237
47.21 43.58564017122396


In [32]:
#add predictions column to the dataFrame
predictions = pd.DataFrame(y1_pred)
dWind_2['Wind_predictions'] = predictions
dWind_2.head(10)

Unnamed: 0,wind speed,direction,Power Output,Wind_predictions
0,16.0,218,34.76,35.645689
1,15.91,218,36.59,28.977981
2,15.82,218,32.35,44.326094
3,15.73,218,39.37,41.893172
4,15.64,218,33.22,41.997025
5,15.55,218,37.09,35.905927
6,15.46,218,41.23,43.894303
7,15.37,218,32.72,43.50227
8,15.28,218,40.39,29.110506
9,15.19,218,37.54,43.58564
