# Import required libraries

In [1]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier,GradientBoostingClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.metrics import accuracy_score
import numpy as np
from xgboost import XGBClassifier

# Set aside rows with a missing RainToday or RainTomorrow value (kept in `test` for later) and train on the remaining rows

In [2]:
# Load the raw table and build the "RainToday or RainTomorrow is missing"
# mask once; it decides which rows are held back and which are used for training.
df = pd.read_csv(r"weatherAUS.csv")
target_missing = df.RainToday.isna() | df.RainTomorrow.isna()

# Rows with a missing RainToday/RainTomorrow value are kept aside in `test`.
test = df[target_missing]

# Everything else becomes the training table; RainTomorrow is the label and
# is removed from the features together with the Date column.
complete_rows = df[~target_missing]
labels = complete_rows.RainTomorrow
train = complete_rows.drop(["RainTomorrow", "Date"], axis=1)
print(train.Location.unique())

['Albury' 'BadgerysCreek' 'Cobar' 'CoffsHarbour' 'Moree' 'Newcastle'
 'NorahHead' 'NorfolkIsland' 'Penrith' 'Richmond' 'Sydney' 'SydneyAirport'
 'WaggaWagga' 'Williamtown' 'Wollongong' 'Canberra' 'Tuggeranong'
 'MountGinini' 'Ballarat' 'Bendigo' 'Sale' 'MelbourneAirport' 'Melbourne'
 'Mildura' 'Nhil' 'Portland' 'Watsonia' 'Dartmoor' 'Brisbane' 'Cairns'
 'GoldCoast' 'Townsville' 'Adelaide' 'MountGambier' 'Nuriootpa' 'Woomera'
 'Albany' 'Witchcliffe' 'PearceRAAF' 'PerthAirport' 'Perth' 'SalmonGums'
 'Walpole' 'Hobart' 'Launceston' 'AliceSprings' 'Darwin' 'Katherine'
 'Uluru']


In [3]:
# Mean-impute every float64 column of `train`.
# NOTE(review): the means are computed on the full table, i.e. before the
# train/validation split below, so validation rows contribute to the fill value.
d = (train.dtypes == 'float64')
float_cols = list(d[d].index)
for i in float_cols:
    print(i)
    avg = train[i].mean()
    # Assign the filled column back instead of calling fillna(inplace=True) on
    # the `train[i]` selection: the chained in-place form only touches a
    # temporary under pandas Copy-on-Write and warns on pandas 2.x.
    train[i] = train[i].fillna(avg)

# Hold out 5% of the rows for validation; a fixed seed makes the split (and
# every score derived from it) reproducible across kernel restarts.
X_train, X_val, y_train, y_val = train_test_split(
    train, labels, train_size=0.95, random_state=0
)

# Columns with object dtype are the categorical ones that get encoded later.
s = (X_train.dtypes == 'object')
object_cols = list(s[s].index)
print(object_cols)

# Remaining missing values per column / in the labels after imputation.
print(X_train.isna().sum())
print(X_val.isna().sum())
print(y_val.isna().sum())
print(y_train.isna().sum())

MinTemp
MaxTemp
Rainfall
Evaporation
Sunshine
WindGustSpeed
WindSpeed9am
WindSpeed3pm
Humidity9am
Humidity3pm
Pressure9am
Pressure3pm
Cloud9am
Cloud3pm
Temp9am
Temp3pm
['Location', 'WindGustDir', 'WindDir9am', 'WindDir3pm', 'RainToday']
Location            0
MinTemp             0
MaxTemp             0
Rainfall            0
Evaporation         0
Sunshine            0
WindGustDir      8729
WindGustSpeed       0
WindDir9am       9182
WindDir3pm       3499
WindSpeed9am        0
WindSpeed3pm        0
Humidity9am         0
Humidity3pm         0
Pressure9am         0
Pressure3pm         0
Cloud9am            0
Cloud3pm            0
Temp9am             0
Temp3pm             0
RainToday           0
dtype: int64
Location           0
MinTemp            0
MaxTemp            0
Rainfall           0
Evaporation        0
Sunshine           0
WindGustDir      434
WindGustSpeed      0
WindDir9am       478
WindDir3pm       171
WindSpeed9am       0
WindSpeed3pm       0
Humidity9am        0
Humidity3pm    

# One-hot encode the categorical features and the target

In [4]:
def _make_dense_encoder():
    """Return a dense-output OneHotEncoder that ignores unseen categories.

    scikit-learn 1.2 renamed the ``sparse`` argument to ``sparse_output`` and
    1.4 removed the old name, so the new keyword is tried first and the old
    one is used as a fallback on earlier releases.
    """
    try:
        return OneHotEncoder(handle_unknown='ignore', sparse_output=False)
    except TypeError:
        return OneHotEncoder(handle_unknown='ignore', sparse=False)


# Use one encoder per job. Refitting a single encoder on the target would
# throw away the categories learned from the feature columns, making it
# impossible to encode further feature frames afterwards.
# `OH_encoder` ends up fitted on the target, exactly as before.
feature_encoder = _make_dense_encoder()
OH_encoder = _make_dense_encoder()

# One-hot encode the training and validation categorical columns
OH_cols_train = pd.DataFrame(feature_encoder.fit_transform(X_train[object_cols]))
OH_cols_val = pd.DataFrame(feature_encoder.transform(X_val[object_cols]))
# Give the encoded columns string names ("<column>_<category>"). Left as the
# default integers they would be joined with string-named numeric columns, and
# recent scikit-learn releases refuse to fit on mixed str/int column names.
OH_col_names = [
    "{}_{}".format(col, cat)
    for col, cats in zip(object_cols, feature_encoder.categories_)
    for cat in cats
]
OH_cols_train.columns = OH_col_names
OH_cols_val.columns = OH_col_names
# One-hot encode the 1D target feature
final_y_train = pd.DataFrame(OH_encoder.fit_transform(pd.DataFrame(y_train)))
final_y_val = pd.DataFrame(OH_encoder.transform(pd.DataFrame(y_val)))
# Restore the original row index, which the encoder output does not carry
final_y_train.index = y_train.index
final_y_val.index = y_val.index
OH_cols_train.index = X_train.index
OH_cols_val.index = X_val.index
# Drop the old categorical columns and join with the one-hot encoded dataframe
X_train = X_train.drop(object_cols, axis=1)
X_val = X_val.drop(object_cols, axis=1)
final_X_train = X_train.join(OH_cols_train)
final_X_val = X_val.join(OH_cols_val)
# Sanity check: features and targets must have matching row counts
print(len(final_X_train))
print(len(final_y_train))
print(len(final_X_val))
print(len(final_y_val))

133747
133747
7040
7040


In [5]:
def arg(l):
    """Return, for each row of ``l``, the position of its largest value.

    ``l`` must expose ``.values.tolist()`` (e.g. a pandas DataFrame); each
    element of that list is passed to ``np.argmax`` and the results are
    collected into a NumPy array.
    """
    return np.array([np.argmax(row) for row in l.values.tolist()])

In [6]:
def arg2(l):
    """Return ``np.argmax`` of every element of the iterable ``l`` as an array.

    Unlike ``arg`` this iterates ``l`` directly, so it accepts plain lists or
    arrays of rows.
    """
    return np.array([np.argmax(item) for item in l])

In [7]:
def score_dataset_RandomForest_OneHot(X_train, X_valid, y_train, y_valid,n):
    """Fit a random forest on the targets as given and return its accuracy.

    The targets are passed to the model unchanged (no decoding), and the
    validation predictions are compared against ``y_valid`` with
    ``accuracy_score``. ``n`` is the number of trees.
    """
    forest = RandomForestClassifier(n_estimators=n, random_state=0)
    forest.fit(X_train, y_train)
    return accuracy_score(y_valid, forest.predict(X_valid))

In [8]:
def score_dataset_RandomForest(X_train, X_valid, y_train, y_valid,n):
    """Fit a random forest on decoded class indices and return its accuracy.

    Parameters
    ----------
    X_train, X_valid : feature tables for fitting and validation.
    y_train, y_valid : targets accepted by ``arg`` (one row per sample);
        each row is reduced to the index of its largest entry.
    n : number of trees in the forest.

    Returns
    -------
    float
        Fraction of validation rows whose predicted class index matches.
    """
    model = RandomForestClassifier(n_estimators=n, random_state=0)
    model.fit(X_train, arg(y_train))
    # The model was fitted on 1-D class indices, so predict() already returns
    # 1-D class indices. Passing them through arg2() would take np.argmax of
    # each scalar, which is always 0, and score an all-zero prediction.
    preds = model.predict(X_valid)
    return accuracy_score(arg(y_valid), preds)

In [9]:
def score_dataset_XGB(X_train, X_valid, y_train, y_valid,n):
    """Fit an XGBoost classifier on decoded class indices and return its accuracy.

    Parameters
    ----------
    X_train, X_valid : feature tables for fitting and validation.
    y_train, y_valid : targets accepted by ``arg`` (one row per sample);
        each row is reduced to the index of its largest entry.
    n : number of boosting rounds (``n_estimators``).

    Returns
    -------
    float
        Fraction of validation rows whose predicted class index matches.
    """
    model = XGBClassifier(n_estimators=n, random_state=0,use_label_encoder=False)
    model.fit(X_train, arg(y_train))
    # The model was fitted on 1-D class indices, so predict() already returns
    # 1-D class indices. Passing them through arg2() would take np.argmax of
    # each scalar, which is always 0, and score an all-zero prediction.
    preds = model.predict(X_valid)
    print(len(preds))
    return accuracy_score(arg(y_valid), preds)

In [10]:
def score_dataset_GradientBoosting(X_train, X_valid, y_train, y_valid,n):
    """Fit a gradient-boosting classifier and return its validation accuracy.

    Parameters
    ----------
    X_train, X_valid : feature tables for fitting and validation.
    y_train, y_valid : targets, either 1-D class labels (used unchanged) or a
        2-D one-hot table with more than one column (decoded to the index of
        the largest entry in each row).
    n : number of boosting stages (``n_estimators``).

    Returns
    -------
    float
        Fraction of validation rows predicted correctly.
    """
    def _as_class_labels(y):
        # GradientBoostingClassifier only accepts 1-D targets, while the other
        # scorers in this notebook are fed one-hot frames; decode those here
        # and leave anything that is already 1-D (or a single column) alone.
        y_arr = np.asarray(y)
        if y_arr.ndim == 2 and y_arr.shape[1] > 1:
            return y_arr.argmax(axis=1)
        return y

    model = GradientBoostingClassifier(n_estimators=n, random_state=0)
    model.fit(X_train, _as_class_labels(y_train))
    preds = model.predict(X_valid)
    return accuracy_score(_as_class_labels(y_valid), preds)

In [11]:
# Score the one-hot random forest with a fixed number of trees and report the
# accuracy as a percentage.
n = 200
accuracy = score_dataset_RandomForest_OneHot(
    final_X_train, final_X_val, final_y_train, final_y_val, n
)
print("{}% for {} estimators.".format(accuracy * 100, n))

85.68181818181819% for 200 estimators.
