# Import required libraries

In [337]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier,GradientBoostingClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.metrics import accuracy_score
import numpy as np

# Get rid of null target column values and use them to test our model later

In [338]:
df=pd.read_csv(r"weatherAUS.csv")
df.dropna(inplace=True,axis=0)

labels=df[~((df.RainToday.isna())|(df.RainTomorrow.isna()))].RainTomorrow
train=df[~((df.RainToday.isna())|(df.RainTomorrow.isna()))].drop(["RainTomorrow","Date"],axis=1)
print(train.Location.unique())

['Cobar' 'CoffsHarbour' 'Moree' 'NorfolkIsland' 'Sydney' 'SydneyAirport'
 'WaggaWagga' 'Williamtown' 'Canberra' 'Sale' 'MelbourneAirport'
 'Melbourne' 'Mildura' 'Portland' 'Watsonia' 'Brisbane' 'Cairns'
 'Townsville' 'MountGambier' 'Nuriootpa' 'Woomera' 'PerthAirport' 'Perth'
 'Hobart' 'AliceSprings' 'Darwin']


In [339]:
X_train,X_val,y_train,y_val=train_test_split(train,labels,train_size=0.9)
s = (X_train.dtypes == 'object')
print(y_train.unique())
object_cols = list(s[s].index)
print(object_cols)
d = (X_train.dtypes == 'float64')
float_cols = list(d[d].index)
print(float_cols)

['Yes' 'No']
['Location', 'WindGustDir', 'WindDir9am', 'WindDir3pm', 'RainToday']
['MinTemp', 'MaxTemp', 'Rainfall', 'Evaporation', 'Sunshine', 'WindGustSpeed', 'WindSpeed9am', 'WindSpeed3pm', 'Humidity9am', 'Humidity3pm', 'Pressure9am', 'Pressure3pm', 'Cloud9am', 'Cloud3pm', 'Temp9am', 'Temp3pm']


# Impute

print(X_train.shape)
print(y_train.shape)
my_imputer = SimpleImputer(missing_values=np.nan, strategy='mean')
imputed_X_train = pd.DataFrame(my_imputer.fit_transform(X_train[float_cols]))
imputed_X_val = pd.DataFrame(my_imputer.transform(X_val[float_cols]))
# Imputation removed column names; put them back
print(X_train.shape)
print(y_train.shape)
imputed_X_train.columns= X_train[float_cols].columns
imputed_X_val.columns = X_val[float_cols].columns

# concat object_cols
print(X_val.head)
print(y_val.shape)
X_train=X_train[object_cols].join(imputed_X_train)
X_val=X_val[object_cols].join(imputed_X_val)


# OneHot

In [340]:
OH_encoder = OneHotEncoder(handle_unknown='ignore', sparse=False)

# OneHot encode the training and val dataset categorical columns
OH_cols_train = pd.DataFrame(OH_encoder.fit_transform(X_train[object_cols]))
OH_cols_val = pd.DataFrame(OH_encoder.transform(X_val[object_cols]))
# OneHot encode our 1D Target feratures
final_y_train= pd.DataFrame(OH_encoder.fit_transform(pd.DataFrame(y_train)))
final_y_val= pd.DataFrame(OH_encoder.transform(pd.DataFrame(y_val)))
# Restore lost index series to to all data
final_y_train.index=y_train.index
final_y_val.index=y_val.index
OH_cols_train.index=X_train.index
OH_cols_val.index=X_val.index
# Drop old categorical columns and concat with the OneHot encoded dataframe
#print(X_train.head())
X_train=X_train.drop(object_cols,axis=1)
#print(X_train.head())
X_val=X_val.drop(object_cols,axis=1)
final_X_train=X_train.join(OH_cols_train)
final_X_val=X_val.join(OH_cols_val)
print(final_X_train.head())

        MinTemp  MaxTemp  Rainfall  Evaporation  Sunshine  WindGustSpeed  \
62176      12.9     15.4       0.4          1.2       0.0           26.0   
130311      7.1     15.4       1.6          2.0       7.0           30.0   
76697      11.5     16.9       0.2          6.6       3.6           52.0   
23115      12.7     18.8       0.0          4.0       2.2           33.0   
80338       9.0     14.9       0.0          1.8       3.8           22.0   

        WindSpeed9am  WindSpeed3pm  Humidity9am  Humidity3pm  ...   66   67  \
62176           13.0          13.0         92.0         91.0  ...  0.0  1.0   
130311          13.0          15.0         64.0         64.0  ...  0.0  0.0   
76697           28.0          31.0         78.0         75.0  ...  0.0  0.0   
23115           17.0          15.0         87.0         82.0  ...  0.0  0.0   
80338            9.0           7.0         93.0         58.0  ...  1.0  0.0   

         68   69   70   71   72   73   74   75  
62176   0.0  0.0  0

In [341]:
def score_dataset(n,X_train, X_valid, y_train, y_valid):
    model = RandomForestClassifier(n_estimators=100, random_state=0)
    model.fit(X_train, y_train)
    preds = model.predict(X_valid)
    return accuracy_score(y_valid, preds)

In [342]:

print("{} for {} estimators.".format(score_dataset(j,final_X_train,final_X_val,final_y_train,final_y_val)*100,100))