In [125]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from abc import abstractmethod
import scipy as sc
from ydata_profiling import ProfileReport
from scipy import stats as sts
import sklearn 
from sklearn.preprocessing import PowerTransformer
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OrdinalEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics import accuracy_score, confusion_matrix, mean_squared_error
%matplotlib

Using matplotlib backend: module://matplotlib_inline.backend_inline


# Features description:

obj_ID = Object Identifier, the unique value that identifies the object in the image catalog used by the CAS

alpha = Right Ascension angle (at J2000 epoch)

delta = Declination angle (at J2000 epoch)

u = Ultraviolet filter in the photometric system

g = Green filter in the photometric system

r = Red filter in the photometric system

i = Near Infrared filter in the photometric system

z = Infrared filter in the photometric system

run_ID = Run Number used to identify the specific scan

rereun_ID = Rerun Number to specify how the image was processed

cam_col = Camera column to identify the scanline within the run

field_ID = Field number to identify each field

spec_obj_ID = Unique ID used for optical spectroscopic objects (this means that 2 different observations with the same spec_obj_ID must share the 
output class)

class = object class (galaxy, star or quasar object)

redshift = redshift value based on the increase in wavelength

plate = plate ID, identifies each plate in SDSS

MJD = Modified Julian Date, used to indicate when a given piece of SDSS data was taken

fiber_ID = fiber ID that identifies the fiber that pointed the light at the focal plane in each observation

In [126]:
df = pd.read_csv('data/weatherAUS.csv')
df

Unnamed: 0,Date,Location,MinTemp,MaxTemp,Rainfall,Evaporation,Sunshine,WindGustDir,WindGustSpeed,WindDir9am,...,Humidity9am,Humidity3pm,Pressure9am,Pressure3pm,Cloud9am,Cloud3pm,Temp9am,Temp3pm,RainToday,RainTomorrow
0,2008-12-01,Albury,13.4,22.9,0.6,,,W,44.0,W,...,71.0,22.0,1007.7,1007.1,8.0,,16.9,21.8,No,No
1,2008-12-02,Albury,7.4,25.1,0.0,,,WNW,44.0,NNW,...,44.0,25.0,1010.6,1007.8,,,17.2,24.3,No,No
2,2008-12-03,Albury,12.9,25.7,0.0,,,WSW,46.0,W,...,38.0,30.0,1007.6,1008.7,,2.0,21.0,23.2,No,No
3,2008-12-04,Albury,9.2,28.0,0.0,,,NE,24.0,SE,...,45.0,16.0,1017.6,1012.8,,,18.1,26.5,No,No
4,2008-12-05,Albury,17.5,32.3,1.0,,,W,41.0,ENE,...,82.0,33.0,1010.8,1006.0,7.0,8.0,17.8,29.7,No,No
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
145455,2017-06-21,Uluru,2.8,23.4,0.0,,,E,31.0,SE,...,51.0,24.0,1024.6,1020.3,,,10.1,22.4,No,No
145456,2017-06-22,Uluru,3.6,25.3,0.0,,,NNW,22.0,SE,...,56.0,21.0,1023.5,1019.1,,,10.9,24.5,No,No
145457,2017-06-23,Uluru,5.4,26.9,0.0,,,N,37.0,SE,...,53.0,24.0,1021.0,1016.8,,,12.5,26.1,No,No
145458,2017-06-24,Uluru,7.8,27.0,0.0,,,SE,28.0,SSE,...,51.0,24.0,1019.4,1016.5,3.0,2.0,15.1,26.0,No,No


In [127]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 145460 entries, 0 to 145459
Data columns (total 23 columns):
 #   Column         Non-Null Count   Dtype  
---  ------         --------------   -----  
 0   Date           145460 non-null  object 
 1   Location       145460 non-null  object 
 2   MinTemp        143975 non-null  float64
 3   MaxTemp        144199 non-null  float64
 4   Rainfall       142199 non-null  float64
 5   Evaporation    82670 non-null   float64
 6   Sunshine       75625 non-null   float64
 7   WindGustDir    135134 non-null  object 
 8   WindGustSpeed  135197 non-null  float64
 9   WindDir9am     134894 non-null  object 
 10  WindDir3pm     141232 non-null  object 
 11  WindSpeed9am   143693 non-null  float64
 12  WindSpeed3pm   142398 non-null  float64
 13  Humidity9am    142806 non-null  float64
 14  Humidity3pm    140953 non-null  float64
 15  Pressure9am    130395 non-null  float64
 16  Pressure3pm    130432 non-null  float64
 17  Cloud9am       89572 non-null

In [128]:
df.describe()

Unnamed: 0,MinTemp,MaxTemp,Rainfall,Evaporation,Sunshine,WindGustSpeed,WindSpeed9am,WindSpeed3pm,Humidity9am,Humidity3pm,Pressure9am,Pressure3pm,Cloud9am,Cloud3pm,Temp9am,Temp3pm
count,143975.0,144199.0,142199.0,82670.0,75625.0,135197.0,143693.0,142398.0,142806.0,140953.0,130395.0,130432.0,89572.0,86102.0,143693.0,141851.0
mean,12.194034,23.221348,2.360918,5.468232,7.611178,40.03523,14.043426,18.662657,68.880831,51.539116,1017.64994,1015.255889,4.447461,4.50993,16.990631,21.68339
std,6.398495,7.119049,8.47806,4.193704,3.785483,13.607062,8.915375,8.8098,19.029164,20.795902,7.10653,7.037414,2.887159,2.720357,6.488753,6.93665
min,-8.5,-4.8,0.0,0.0,0.0,6.0,0.0,0.0,0.0,0.0,980.5,977.1,0.0,0.0,-7.2,-5.4
25%,7.6,17.9,0.0,2.6,4.8,31.0,7.0,13.0,57.0,37.0,1012.9,1010.4,1.0,2.0,12.3,16.6
50%,12.0,22.6,0.0,4.8,8.4,39.0,13.0,19.0,70.0,52.0,1017.6,1015.2,5.0,5.0,16.7,21.1
75%,16.9,28.2,0.8,7.4,10.6,48.0,19.0,24.0,83.0,66.0,1022.4,1020.0,7.0,7.0,21.6,26.4
max,33.9,48.1,371.0,145.0,14.5,135.0,130.0,87.0,100.0,100.0,1041.0,1039.6,9.0,9.0,40.2,46.7


In [129]:
df[df.isna().any()[df.isna().any() == True].index]

Unnamed: 0,MinTemp,MaxTemp,Rainfall,Evaporation,Sunshine,WindGustDir,WindGustSpeed,WindDir9am,WindDir3pm,WindSpeed9am,...,Humidity9am,Humidity3pm,Pressure9am,Pressure3pm,Cloud9am,Cloud3pm,Temp9am,Temp3pm,RainToday,RainTomorrow
0,13.4,22.9,0.6,,,W,44.0,W,WNW,20.0,...,71.0,22.0,1007.7,1007.1,8.0,,16.9,21.8,No,No
1,7.4,25.1,0.0,,,WNW,44.0,NNW,WSW,4.0,...,44.0,25.0,1010.6,1007.8,,,17.2,24.3,No,No
2,12.9,25.7,0.0,,,WSW,46.0,W,WSW,19.0,...,38.0,30.0,1007.6,1008.7,,2.0,21.0,23.2,No,No
3,9.2,28.0,0.0,,,NE,24.0,SE,E,11.0,...,45.0,16.0,1017.6,1012.8,,,18.1,26.5,No,No
4,17.5,32.3,1.0,,,W,41.0,ENE,NW,7.0,...,82.0,33.0,1010.8,1006.0,7.0,8.0,17.8,29.7,No,No
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
145455,2.8,23.4,0.0,,,E,31.0,SE,ENE,13.0,...,51.0,24.0,1024.6,1020.3,,,10.1,22.4,No,No
145456,3.6,25.3,0.0,,,NNW,22.0,SE,N,13.0,...,56.0,21.0,1023.5,1019.1,,,10.9,24.5,No,No
145457,5.4,26.9,0.0,,,N,37.0,SE,WNW,9.0,...,53.0,24.0,1021.0,1016.8,,,12.5,26.1,No,No
145458,7.8,27.0,0.0,,,SE,28.0,SSE,N,13.0,...,51.0,24.0,1019.4,1016.5,3.0,2.0,15.1,26.0,No,No


In [130]:
# profile = ProfileReport(df)
# profile.to_file('report.html')

In [131]:
df = df.drop(index=42358)

In [132]:
df.dropna().describe()


Unnamed: 0,MinTemp,MaxTemp,Rainfall,Evaporation,Sunshine,WindGustSpeed,WindSpeed9am,WindSpeed3pm,Humidity9am,Humidity3pm,Pressure9am,Pressure3pm,Cloud9am,Cloud3pm,Temp9am,Temp3pm
count,56420.0,56420.0,56420.0,56420.0,56420.0,56420.0,56420.0,56420.0,56420.0,56420.0,56420.0,56420.0,56420.0,56420.0,56420.0,56420.0
mean,13.46477,24.219206,2.130397,5.503135,7.735626,40.877366,15.667228,19.786778,65.874123,49.601985,1017.239505,1014.79558,4.241705,4.326515,18.204961,22.710333
std,6.416689,6.970676,7.014822,3.696282,3.758153,13.335232,8.317005,8.51018,18.513289,20.19704,6.909357,6.870892,2.797162,2.647251,6.567991,6.836543
min,-6.7,4.1,0.0,0.0,0.0,9.0,2.0,2.0,0.0,0.0,980.5,977.1,0.0,0.0,-0.7,3.7
25%,8.6,18.7,0.0,2.8,5.0,31.0,9.0,13.0,55.0,35.0,1012.7,1010.1,1.0,2.0,13.1,17.4
50%,13.2,23.9,0.0,5.0,8.6,39.0,15.0,19.0,67.0,50.0,1017.2,1014.7,5.0,5.0,17.8,22.4
75%,18.4,29.7,0.6,7.4,10.7,48.0,20.0,26.0,79.0,63.0,1021.8,1019.4,7.0,7.0,23.3,27.9
max,31.4,48.1,206.2,81.2,14.5,124.0,67.0,76.0,100.0,100.0,1040.4,1038.9,8.0,9.0,39.4,46.1


In [133]:
df.select_dtypes(include=['object']).isna().any()

Date            False
Location        False
WindGustDir      True
WindDir9am       True
WindDir3pm       True
RainToday        True
RainTomorrow     True
dtype: bool

In [134]:
df.select_dtypes(include=['object']) 

Unnamed: 0,Date,Location,WindGustDir,WindDir9am,WindDir3pm,RainToday,RainTomorrow
0,2008-12-01,Albury,W,W,WNW,No,No
1,2008-12-02,Albury,WNW,NNW,WSW,No,No
2,2008-12-03,Albury,WSW,W,WSW,No,No
3,2008-12-04,Albury,NE,SE,E,No,No
4,2008-12-05,Albury,W,ENE,NW,No,No
...,...,...,...,...,...,...,...
145455,2017-06-21,Uluru,E,SE,ENE,No,No
145456,2017-06-22,Uluru,NNW,SE,N,No,No
145457,2017-06-23,Uluru,N,SE,WNW,No,No
145458,2017-06-24,Uluru,SE,SSE,N,No,No


In [135]:
df['WindGustDir'].unique()

array(['W', 'WNW', 'WSW', 'NE', 'NNW', 'N', 'NNE', 'SW', nan, 'ENE',
       'SSE', 'S', 'NW', 'SE', 'ESE', 'E', 'SSW'], dtype=object)

In [136]:
def convert__wind_direction(x):
    if x == 'N':
        x = 0
    elif x  == 'NNE':
        x = 22.5
    elif x == 'NE':
        x = 45
    elif x == 'ENE':
        x = 77.5
    elif x  == 'E':
        x = 90
    elif x == 'ESE':
        x = 112.5
    elif x == 'SE':
        x = 135
    elif x  == 'SSE':
        x = 157.5
    elif x == 'S':
        x = 180
    elif x == 'SSW':
        x = 202.5
    elif x  == 'SW':
        x = 225
    elif x == 'WSW':
        x = 247.5
    elif x == 'W':
        x = 270
    elif x  == 'WNW':
        x = 292.5
    elif x == 'NW':
        x = 315
    elif x == 'NNW':
        x = 337.5
    else:
        x
    return x

def convert_binary(x):
    if x == 'Yes':
        x = 1
    elif x == 'No':
        x = 0
    else:
        return np.nan
    return x

In [137]:
df[['WindGustDir', 'WindDir9am', 'WindDir3pm']] = df[['WindGustDir', 'WindDir9am', 'WindDir3pm']].map(convert__wind_direction)

In [138]:
df[['RainToday', 'RainTomorrow']] = df[['RainToday', 'RainTomorrow']].map(convert_binary)

In [139]:
df

Unnamed: 0,Date,Location,MinTemp,MaxTemp,Rainfall,Evaporation,Sunshine,WindGustDir,WindGustSpeed,WindDir9am,...,Humidity9am,Humidity3pm,Pressure9am,Pressure3pm,Cloud9am,Cloud3pm,Temp9am,Temp3pm,RainToday,RainTomorrow
0,2008-12-01,Albury,13.4,22.9,0.6,,,270.0,44.0,270.0,...,71.0,22.0,1007.7,1007.1,8.0,,16.9,21.8,0.0,0.0
1,2008-12-02,Albury,7.4,25.1,0.0,,,292.5,44.0,337.5,...,44.0,25.0,1010.6,1007.8,,,17.2,24.3,0.0,0.0
2,2008-12-03,Albury,12.9,25.7,0.0,,,247.5,46.0,270.0,...,38.0,30.0,1007.6,1008.7,,2.0,21.0,23.2,0.0,0.0
3,2008-12-04,Albury,9.2,28.0,0.0,,,45.0,24.0,135.0,...,45.0,16.0,1017.6,1012.8,,,18.1,26.5,0.0,0.0
4,2008-12-05,Albury,17.5,32.3,1.0,,,270.0,41.0,77.5,...,82.0,33.0,1010.8,1006.0,7.0,8.0,17.8,29.7,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
145455,2017-06-21,Uluru,2.8,23.4,0.0,,,90.0,31.0,135.0,...,51.0,24.0,1024.6,1020.3,,,10.1,22.4,0.0,0.0
145456,2017-06-22,Uluru,3.6,25.3,0.0,,,337.5,22.0,135.0,...,56.0,21.0,1023.5,1019.1,,,10.9,24.5,0.0,0.0
145457,2017-06-23,Uluru,5.4,26.9,0.0,,,0.0,37.0,135.0,...,53.0,24.0,1021.0,1016.8,,,12.5,26.1,0.0,0.0
145458,2017-06-24,Uluru,7.8,27.0,0.0,,,135.0,28.0,157.5,...,51.0,24.0,1019.4,1016.5,3.0,2.0,15.1,26.0,0.0,0.0
