# Rain in Australia
    
    Predict next-day rain in Australia 

# Context

    Predict next-day rain by training classification models on the target variable RainTomorrow.

# Content

    This dataset contains about 10 years of daily weather observations from many locations across Australia.

    RainTomorrow is the target variable to predict. It answers the question: did it rain the next day (Yes or No)? The value is Yes if the rainfall recorded for that next day was 1 mm or more.

# Source & Acknowledgements

    Observations were drawn from numerous weather stations. The daily observations are available from http://www.bom.gov.au/climate/data.
    An example of latest weather observations in Canberra: http://www.bom.gov.au/climate/dwo/IDCJDW2801.latest.shtml

    Definitions adapted from http://www.bom.gov.au/climate/dwo/IDCJDW0000.shtml
**Data source:** http://www.bom.gov.au/climate/dwo/ and http://www.bom.gov.au/climate/data.

    Copyright Commonwealth of Australia 2010, Bureau of Meteorology.



In [1]:
import random
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder

In [2]:
# Load the weather observations into a DataFrame.
# NOTE(review): the path is absolute and machine-specific (Windows drive D:);
# point it at the local copy of weatherAUS.csv before running.
df = pd.read_csv(r"D:\data\Aus\weatherAUS.csv")

In [3]:
# Print each column's dtype and non-null count to see where data is missing.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 145460 entries, 0 to 145459
Data columns (total 23 columns):
 #   Column         Non-Null Count   Dtype  
---  ------         --------------   -----  
 0   Date           145460 non-null  object 
 1   Location       145460 non-null  object 
 2   MinTemp        143975 non-null  float64
 3   MaxTemp        144199 non-null  float64
 4   Rainfall       142199 non-null  float64
 5   Evaporation    82670 non-null   float64
 6   Sunshine       75625 non-null   float64
 7   WindGustDir    135134 non-null  object 
 8   WindGustSpeed  135197 non-null  float64
 9   WindDir9am     134894 non-null  object 
 10  WindDir3pm     141232 non-null  object 
 11  WindSpeed9am   143693 non-null  float64
 12  WindSpeed3pm   142398 non-null  float64
 13  Humidity9am    142806 non-null  float64
 14  Humidity3pm    140953 non-null  float64
 15  Pressure9am    130395 non-null  float64
 16  Pressure3pm    130432 non-null  float64
 17  Cloud9am       89572 non-null

In [4]:
# Number of missing values per column.
df.isnull().sum()

Date                 0
Location             0
MinTemp           1485
MaxTemp           1261
Rainfall          3261
Evaporation      62790
Sunshine         69835
WindGustDir      10326
WindGustSpeed    10263
WindDir9am       10566
WindDir3pm        4228
WindSpeed9am      1767
WindSpeed3pm      3062
Humidity9am       2654
Humidity3pm       4507
Pressure9am      15065
Pressure3pm      15028
Cloud9am         55888
Cloud3pm         59358
Temp9am           1767
Temp3pm           3609
RainToday         3261
RainTomorrow      3267
dtype: int64

In [6]:
def Fill_null_values(df,float_columns,str_columns,wind_list,rain_list,choice,Wind_dir):
    """Impute the missing values of ``df`` in place with random draws.

    Every missing cell receives its own independent draw:

    * columns in ``float_columns``: a value drawn uniformly between the
      column's observed minimum and maximum, rounded to one decimal place;
    * string columns listed in ``wind_list``: a random entry of ``Wind_dir``;
    * string columns listed in ``rain_list``: a random entry of ``choice``.

    Args:
        df: DataFrame to modify in place.
        float_columns: labels of the numeric columns to impute.
        str_columns: labels of the string columns; the first two entries are
            skipped and never imputed.
        wind_list: string columns to fill from ``Wind_dir``.
        rain_list: string columns to fill from ``choice``.
        choice: candidate values for the ``rain_list`` columns.
        Wind_dir: candidate values for the ``wind_list`` columns.

    Returns:
        ``df.isnull().sum()`` computed after imputation, i.e. the number of
        nulls still present in each column.
    """
    for column in float_columns:
        missing = df[column].isnull()
        n_missing = int(missing.sum())
        # Nothing to fill, or no observed value to derive a range from.
        if n_missing == 0 or missing.all():
            continue
        low, high = df[column].min(), df[column].max()
        # One draw per missing cell; assigning through .loc writes to the
        # frame itself rather than to a temporary column object.
        df.loc[missing, column] = [
            round(random.uniform(low, high), 1) for _ in range(n_missing)
        ]

    # The first two string columns are skipped, as in the original code.
    for column in str_columns[2:]:
        if column in wind_list:
            candidates = Wind_dir
        elif column in rain_list:
            candidates = choice
        else:
            continue
        missing = df[column].isnull()
        n_missing = int(missing.sum())
        if n_missing:
            df.loc[missing, column] = random.choices(candidates, k=n_missing)

    return df.isnull().sum()

In [None]:
# Split the column labels by dtype: floats are imputed numerically,
# object (string) columns categorically.
float_columns = df.select_dtypes(include=['float']).columns
str_columns = df.select_dtypes(include=['object']).columns

# Categorical columns to impute and the values each group may take.
wind_list = ['WindGustDir','WindDir9am','WindDir3pm']
Wind_Dir = ['W', 'WNW', 'WSW', 'NE', 'NNW', 'N', 'NNE', 'SW', 'ENE','SSE', 'S','NW', 'SE', 'ESE', 'E', 'SSW']
rain_list = ['RainToday','RainTomorrow']
choice = ['Yes',"No"]

# NOTE(review): rain_list contains the prediction target RainTomorrow, so
# missing labels are filled with random values before the train/test split;
# confirm this is intended rather than dropping those rows.
# The returned per-column null counts are shown as the cell output.
Fill_null_values(
    df=df,
    float_columns=float_columns,
    str_columns=str_columns,
    wind_list=wind_list,
    rain_list=rain_list,
    choice=choice,
    Wind_dir=Wind_Dir,
)

In [None]:
# Re-check dtypes and non-null counts after the imputation step.
df.info()

In [None]:
def encode(df):
    """Label-encode every object-dtype column of ``df`` in place.

    Each object column is replaced by the integer codes produced by
    ``LabelEncoder.fit_transform``. A column the encoder rejects is left
    unchanged and reported on stdout together with the error.

    Args:
        df: DataFrame whose object columns should be encoded; it is modified
            in place.

    Returns:
        The same ``df`` object that was passed in.
    """
    object_columns = list(df.select_dtypes(include=['object']).columns)
    # fit_transform refits on every call, so one encoder can serve all columns.
    encoder = LabelEncoder()
    for column in object_columns:
        try:
            df[column] = encoder.fit_transform(df[column])
        except (TypeError, ValueError) as err:
            # Only encoding failures are tolerated; anything else (including
            # KeyboardInterrupt) propagates instead of being swallowed.
            print(f'Some Columns have null values or it not a Object dtype: {column!r} ({err})')
    return df

In [None]:
# Label-encode the object columns of df in place; encode() returns the same
# frame, which is displayed as the cell output.
encode(df=df)

In [None]:
# Plot the pairwise column correlations of df as an annotated heatmap
# (red-yellow-green colour map) on a 20x10 inch figure.
plt.figure(figsize=(20,10))
sns.heatmap(df.corr(),annot=True, cmap='RdYlGn')

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 30% of the rows for evaluation. Features are every column except
# the target RainTomorrow; the seed is fixed so the split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    df.drop(columns=['RainTomorrow']),
    df['RainTomorrow'],
    test_size=0.3,
    random_state=42,
)

In [None]:
X_train.head()

In [None]:
from sklearn.feature_selection import mutual_info_classif

# Estimate the mutual information between each training feature and the
# target. The estimator adds small random noise to continuous features, so
# random_state is pinned (42, as in the train/test split) to make the scores
# reproducible between runs.
mutual_info = mutual_info_classif(X_train, y_train, random_state=42)
mutual_info

In [None]:
# Label each mutual-information score with its feature name and show the
# features from most to least informative.
mutual_info = pd.Series(mutual_info, index=X_train.columns)
mutual_info.sort_values(ascending=False)

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC

from sklearn.metrics import classification_report

In [None]:
%time 
model = LogisticRegression()
model.fit(X_train,y_train)
prd = model.predict(X_test)
print(classification_report(y_test,prd))

In [None]:
%time 
model = KNeighborsClassifier()
model.fit(X_train,y_train)
prd = model.predict(X_test)
print(classification_report(y_test,prd))

In [None]:
%time 
model = SVC(kernel='linear')
model.fit(X_train,y_train)
prd = model.predict(X_test)
print(classification_report(y_test,prd))

In [None]:
%time 
model = RandomForestClassifier(random_state=42)
model.fit(X_train,y_train)
prd = model.predict(X_test)
print(classification_report(y_test,prd))