# **Predictive Analytics**

- Import Libraries

In [115]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.linear_model import LinearRegression
from sklearn.neighbors import KNeighborsRegressor

- Load Data

Column information:
- Radiation : Solar radiation, in watts per square meter (W/m^2)
- Temperature : Temperature, in degrees Fahrenheit
- Humidity : Relative humidity, in percent
- Barometric Pressure : Hg (inches of mercury)
- Wind Direction : Degrees
- Wind Speed : Miles per hour
- Sunrise & Sunset : Hawaii time

In [116]:
# Read the raw solar-radiation measurements into a DataFrame.
RAW_DATA_PATH = '../data/raw/SolarPrediction.csv'
df = pd.read_csv(RAW_DATA_PATH)

## Exploratory Data Analysis

In [117]:
# Preview the first rows to sanity-check the parsed columns.
df.head()

Unnamed: 0,UNIXTime,Data,Time,Radiation,Temperature,Pressure,Humidity,WindDirection(Degrees),Speed,TimeSunRise,TimeSunSet
0,1475229326,9/29/2016 12:00:00 AM,23:55:26,1.21,48,30.46,59,177.39,5.62,06:13:00,18:13:00
1,1475229023,9/29/2016 12:00:00 AM,23:50:23,1.21,48,30.46,58,176.78,3.37,06:13:00,18:13:00
2,1475228726,9/29/2016 12:00:00 AM,23:45:26,1.23,48,30.46,57,158.75,3.37,06:13:00,18:13:00
3,1475228421,9/29/2016 12:00:00 AM,23:40:21,1.21,48,30.46,60,137.71,3.37,06:13:00,18:13:00
4,1475228124,9/29/2016 12:00:00 AM,23:35:24,1.17,48,30.46,62,104.95,5.62,06:13:00,18:13:00


In [None]:
# Correlation heatmap restricted to the numeric columns: at this point the
# frame still holds object-typed date/time columns, which cannot take part
# in a correlation matrix. The trailing ';' hides the Axes repr.
sns.heatmap(df.select_dtypes(include="number").corr(), annot=True);

In [None]:
# Inspect the column dtypes to see which columns are numeric and which are object-typed.
df.dtypes

UNIXTime                    int64
Data                       object
Time                       object
Radiation                 float64
Temperature                 int64
Pressure                  float64
Humidity                    int64
WindDirection(Degrees)    float64
Speed                     float64
TimeSunRise                object
TimeSunSet                 object
dtype: object

## Data Preprocessing

In [None]:
# Drop the timestamp/date columns; they are not used as model features.
# errors='ignore' keeps this cell safe to re-run once the columns are gone
# (without it a second run raises KeyError).
df = df.drop(
    columns=['UNIXTime', 'Data', 'Time', 'TimeSunRise', 'TimeSunSet'],
    errors='ignore',
)
# Cast the integer-typed readings to float so that later capping at
# fractional IQR bounds never writes a float into an integer column.
df = df.astype({'Temperature': float, 'Humidity': float})

In [None]:
# Count missing values per column.
df.isna().sum()

Radiation                 0
Temperature               0
Pressure                  0
Humidity                  0
WindDirection(Degrees)    0
Speed                     0
dtype: int64

In [None]:
# Labels of every numeric column; the outlier cells below iterate over these.
numeric = df.select_dtypes("number").columns

In [None]:
# Report, for each numeric column, how many rows fall outside the
# 1.5 * IQR fences.
for col in numeric:
    values = df[col]
    q1, q3 = values.quantile(0.25), values.quantile(0.75)
    spread = q3 - q1
    low_fence = q1 - 1.5 * spread
    high_fence = q3 + 1.5 * spread

    is_outlier = (values < low_fence) | (values > high_fence)
    print(f"Kolom {col} : {int(is_outlier.sum())} outlier")

Kolom Radiation : 1965 outlier
Kolom Temperature : 89 outlier
Kolom Pressure : 1662 outlier
Kolom Humidity : 0 outlier
Kolom WindDirection(Degrees) : 1618 outlier
Kolom Speed : 479 outlier


In [None]:
def outlierhandling(series, data=None):
    """Cap the values of one column at its 1.5 * IQR fences.

    Parameters
    ----------
    series : str
        Label of the column to cap (a column name, despite the parameter name).
    data : pandas.DataFrame, optional
        Frame to operate on. When omitted, the notebook-level ``df`` is used,
        which is how the existing single-argument calls behave.

    Returns
    -------
    pandas.DataFrame
        The same frame object that was operated on, with the column capped
        in place.
    """
    frame = df if data is None else data
    Q1 = frame[series].quantile(0.25)
    Q3 = frame[series].quantile(0.75)
    IQR = Q3 - Q1
    lower = Q1 - 1.5 * IQR
    upper = Q3 + 1.5 * IQR
    # clip() caps both tails in one step; replacing the whole column also lets
    # an integer column be upcast when a fence is fractional, instead of
    # writing a float into integer storage through .loc.
    frame[series] = frame[series].clip(lower=lower, upper=upper)
    return frame


In [None]:
# Cap every numeric column at its IQR fences.
# NOTE(review): `numeric` includes the target column (Radiation), and this
# runs on the full frame before the train/test split - confirm both are intended.
for col in numeric:
    df = outlierhandling(col)

# Recount against the fences recomputed on the capped data.
for col in numeric:
    values = df[col]
    q1, q3 = values.quantile(0.25), values.quantile(0.75)
    spread = q3 - q1
    low_fence = q1 - 1.5 * spread
    high_fence = q3 + 1.5 * spread

    is_outlier = (values < low_fence) | (values > high_fence)
    print(f"Kolom {col} : {int(is_outlier.sum())} outlier")


Kolom Radiation : 0 outlier
Kolom Temperature : 0 outlier
Kolom Pressure : 0 outlier
Kolom Humidity : 0 outlier
Kolom WindDirection(Degrees) : 0 outlier
Kolom Speed : 0 outlier


In [None]:
# Separate the target (Radiation) from the feature columns.
y = df['Radiation']
X = df.drop(columns='Radiation')

# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)


In [None]:
# Summary statistics of the target column after outlier capping.
y.describe()

count    32686.000000
mean       201.354050
std        301.682337
min          1.110000
25%          1.230000
50%          2.660000
75%        354.235000
max        883.742500
Name: Radiation, dtype: float64

In [None]:
# Learn the scaling statistics from the training split only, then apply the
# same transformation to both splits.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

## Modelling

In [None]:
# Baseline model: linear regression on the standardized features.
model_linier = LinearRegression().fit(X_train_scaled, y_train)
y_pred_linier = model_linier.predict(X_test_scaled)

# Score the held-out predictions.
mse, r2 = (
    mean_squared_error(y_test, y_pred_linier),
    r2_score(y_test, y_pred_linier),
)
print(f"Mean Squared Error : {mse}")
print(f"R2 Score : {r2}")


Mean Squared Error : 39051.27811234309
R2 Score : 0.567344722914279


In [None]:
# k-nearest-neighbours regressor with 5 neighbours; this cell only constructs it, it is not fitted here.
model_knn = KNeighborsRegressor(n_neighbors=5)

## Testing