# <font color=blue>AppDev_Summative </font>

### Import necessary libraries

In [1]:
from sklearn import linear_model
import pandas as pd

### Load data from all data files

In [2]:
# Read the four source CSV files (solar farm, solar generation, wind farm,
# wind generation) from the working directory, in that order.
dSolar_1, dSolar_2, dWind_1, dWind_2 = map(
    pd.read_csv,
    (
        'solar_farm.csv',
        'solar_generation_data.csv',
        'wind_farm.csv',
        'wind_generation_data.csv',
    ),
)

In [3]:
# Print the (rows, columns) shape of the solar and wind generation tables
print(dSolar_2.shape, dWind_2.shape)

(365, 8) (366, 3)


In [4]:
# Preview the first rows of the solar farm table.
# NOTE(review): the recorded output shows 'Unnamed: ...' column labels with the
# real headers ('Date Of Month', 'Capacity Available') sitting in row 0, so the
# CSV header row is probably not being parsed correctly; confirm against the file.
dSolar_1.head()

Unnamed: 0.1,Unnamed: 0,Unnamed: 1
0,Date Of Month,Capacity Available
1,4,3
2,6,5
3,19,2
4,23,50


### View the first 5 rows of the solar generation DataFrame

In [5]:
# Preview the first rows of the raw solar generation data
dSolar_2.head()

Unnamed: 0,Month,Day,Temp Hi,Temp Low,Solar,Cloud Cover Percentage,Rainfall in mm,Power Generated in MW
0,Jan,1,109°,85°,30.0,9,0.0,9.93
1,Jan,2,106°,71°,30.1,9,0.0,9.97
2,Jan,3,106°,81°,29.5,9,0.0,9.77
3,Jan,4,102°,83°,13.0,4,0.0,4.3
4,Jan,5,105°,80°,30.1,9,0.0,9.97


In [6]:
# Strip the degree sign (U+00B0) from both temperature columns so the values
# can later be converted to numbers.
for temp_col in ('Temp Hi', 'Temp Low'):
    dSolar_2[temp_col] = dSolar_2[temp_col].replace('\u00b0', '', regex=True)

In [7]:
# Preview the data again to check that the degree signs are gone
dSolar_2.head()

Unnamed: 0,Month,Day,Temp Hi,Temp Low,Solar,Cloud Cover Percentage,Rainfall in mm,Power Generated in MW
0,Jan,1,109,85,30.0,9,0.0,9.93
1,Jan,2,106,71,30.1,9,0.0,9.97
2,Jan,3,106,81,29.5,9,0.0,9.77
3,Jan,4,102,83,13.0,4,0.0,4.3
4,Jan,5,105,80,30.1,9,0.0,9.97


In [8]:
# Confirm the column names of the solar generation data.
# NOTE: the output lists the month column as 'Month ' (with a trailing space);
# later cells must refer to it with that exact label.
dSolar_2.columns

Index(['Month ', 'Day', 'Temp Hi', 'Temp Low', 'Solar',
       'Cloud Cover Percentage', 'Rainfall in mm', 'Power Generated in MW'],
      dtype='object')

In [9]:
# Check each column's dtype and non-null count; the temperature columns are
# still stored as strings (object) at this point.
dSolar_2.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 365 entries, 0 to 364
Data columns (total 8 columns):
Month                     365 non-null object
Day                       365 non-null int64
Temp Hi                   365 non-null object
Temp Low                  365 non-null object
Solar                     365 non-null float64
Cloud Cover Percentage    365 non-null int64
Rainfall in mm            312 non-null float64
Power Generated in MW     365 non-null float64
dtypes: float64(3), int64(2), object(3)
memory usage: 22.9+ KB


In [10]:
# Convert the temperature columns from text to numbers. errors='coerce' turns
# any value that cannot be parsed into NaN instead of raising.
temperature_labels = ('Temp Hi', 'Temp Low')
cols = [label for label in dSolar_2.columns if label in temperature_labels]
for col in cols:
    dSolar_2[col] = pd.to_numeric(dSolar_2[col], errors='coerce')

# Re-inspect the dtypes to confirm the conversion
dSolar_2.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 365 entries, 0 to 364
Data columns (total 8 columns):
Month                     365 non-null object
Day                       365 non-null int64
Temp Hi                   365 non-null int64
Temp Low                  365 non-null int64
Solar                     365 non-null float64
Cloud Cover Percentage    365 non-null int64
Rainfall in mm            312 non-null float64
Power Generated in MW     365 non-null float64
dtypes: float64(3), int64(4), object(1)
memory usage: 22.9+ KB


In [11]:
# Count the missing values in each column of the solar data
dSolar_2.isna().sum()

Month                      0
Day                        0
Temp Hi                    0
Temp Low                   0
Solar                      0
Cloud Cover Percentage     0
Rainfall in mm            53
Power Generated in MW      0
dtype: int64

In [12]:
# Keep only the rows in which every column is populated.
# NOTE(review): per the null counts shown earlier, only 'Rainfall in mm' has
# gaps (53 rows), so those 53 days are removed from the modelling data here.
dSolar_clean = dSolar_2.dropna(axis=0, how='any')

# Verify that no missing values remain
dSolar_clean.isna().sum()

Month                     0
Day                       0
Temp Hi                   0
Temp Low                  0
Solar                     0
Cloud Cover Percentage    0
Rainfall in mm            0
Power Generated in MW     0
dtype: int64

### Create a simple ML model which accepts suitable inputs and gives a predicted power output for each power generation plant for any day within the next 7 days. Note that these may be 2 ML models (1 for each plant)

### ML model using solar data

### Split data into training and test sets

In [13]:
# Inputs (X): every column except the month label and the target.
# Output (y): the generated power.
solar_target = 'Power Generated in MW'
X = dSolar_clean.drop(columns=['Month ', solar_target]).values
y = dSolar_clean[solar_target].values

A rule of thumb is to split data into training and test sets by 80/20 or 70/30.

In [14]:
from sklearn.model_selection import train_test_split

In [15]:
# Create training and testing sets (80 % train / 20 % test).
# A fixed random_state makes the shuffle, and therefore every downstream
# prediction, reproducible across kernel restarts.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

### Fit the model

The `fit()` function fits a linear model. We'll fit the model on the training data.

In [16]:
# Fit an ordinary least-squares linear regression on the solar training split.
# The fitted estimator is kept under a solar-specific name because `lm` and
# `model` are re-bound when the wind model is fitted later in this notebook,
# which would otherwise discard the solar model.
solar_model = linear_model.LinearRegression().fit(X_train, y_train)

# Generic aliases used by the prediction cells that follow (fit() returns the
# estimator itself, so both names refer to the same fitted object).
lm = solar_model
model = solar_model

### Make Predictions

We'll use the linear model fitted on the training data to predict the dependent variable for the test dataset.

In [17]:
# Predict the power output for the held-out solar test features with the fitted model
y_pred = lm.predict(X_test)

In [18]:
# Show the first five solar predictions
print(y_pred[:5])

[6.22525027 9.04051926 2.0850564  5.9609307  5.36346187]


In [19]:
# Side-by-side comparison of actual and predicted solar power output
y_pred = lm.predict(X_test)

# Print up to the first 10 (actual, predicted) pairs. Slicing instead of
# indexing with range(10) avoids an IndexError when the test split holds
# fewer than 10 rows.
for actual, predicted in zip(y_test[:10], y_pred[:10]):
    print(actual, predicted)

6.23 6.225250267632097
9.04 9.04051926091119
2.09 2.085056396917299
5.96 5.960930695350832
5.36 5.363461865506897
5.96 5.960566194827172
5.79 5.793951927365789
8.25 8.245283484727683
7.09 7.086535452400747
6.79 6.787343020124801


### View the first 5 rows of the wind generation DataFrame

In [20]:
# Preview the first rows of the wind generation data
dWind_2.head()

Unnamed: 0,wind speed,direction,Power Output
0,16.0,218,34.76
1,15.91,218,36.59
2,15.82,218,32.35
3,15.73,218,39.37
4,15.64,218,33.22


In [21]:
# Confirm the column names of the wind generation data
dWind_2.columns

Index(['wind speed', 'direction', 'Power Output'], dtype='object')

In [22]:
# Check each column's dtype and non-null count for the wind generation data
dWind_2.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 366 entries, 0 to 365
Data columns (total 3 columns):
wind speed      366 non-null float64
direction       366 non-null int64
Power Output    366 non-null float64
dtypes: float64(2), int64(1)
memory usage: 8.7 KB


In [23]:
# Count the missing values in each column of the wind data
dWind_2.isna().sum()

wind speed      0
direction       0
Power Output    0
dtype: int64

### ML model using wind data

### Split data into training and test sets

In [24]:
# Inputs (X1): every column except the target (wind speed and direction).
# Output (y1): the power output.
wind_target = 'Power Output'
X1 = dWind_2.drop(columns=[wind_target]).values
y1 = dWind_2[wind_target].values

In [25]:
# Create training and testing sets (80 % train / 20 % test).
# A fixed random_state makes the shuffle, and therefore every downstream
# prediction, reproducible across kernel restarts.
X1_train, X1_test, y1_train, y1_test = train_test_split(X1, y1, test_size=0.2, random_state=42)

### Fit the model

In [26]:
# Fit an ordinary least-squares linear regression on the wind training split
# and keep it under a wind-specific name so it can be told apart from the
# solar model.
wind_model = linear_model.LinearRegression().fit(X1_train, y1_train)

# NOTE: these generic names were previously bound to the solar model; from
# here on they refer to the wind model (fit() returns the estimator itself,
# so both names point at the same fitted object).
lm = wind_model
model = wind_model

### Make Predictions

In [27]:
# Predict the power output for the held-out wind test features with the fitted model
y1_pred = lm.predict(X1_test)

In [28]:
# Show the first five wind predictions
print(y1_pred[:5])

[42.43822584 33.4853154  34.92351682 34.72774602 43.24066278]


In [29]:
# Side-by-side comparison of actual and predicted wind power output
y1_pred = lm.predict(X1_test)

# Print up to the first 10 (actual, predicted) pairs. Slicing instead of
# indexing with range(10) avoids an IndexError when the test split holds
# fewer than 10 rows.
for actual, predicted in zip(y1_test[:10], y1_pred[:10]):
    print(actual, predicted)

40.52 42.43822583945581
24.37 33.485315400250215
31.13 34.923516822366366
38.14 34.72774602425082
45.92 43.240662782339996
46.97 42.28799729277921
30.19 37.40062979931575
44.56 44.28539400406589
37.23 40.413693252280154
41.38 44.84942773084188
