In [None]:
import pandas as pd
import numpy as np

In [None]:
# Location of the raw Australian weather observations (Kaggle input directory)
weather_csv_path = "../input/weather-dataset-rattle-package/weatherAUS.csv"
df = pd.read_csv(weather_csv_path)
# Preview the first rows
df.head()

In [None]:
# DataFrame.info() writes its report to stdout itself and returns None, so it
# must not be wrapped in print() (that would emit a stray "None" line).
df.info()
print("----------------------------")
# (rows, columns) of the raw frame
print(df.shape)

In [None]:
# Parse the Date column once (vectorised) instead of calling pd.to_datetime
# row by row, then split it into calendar components via the .dt accessor.
parsed_dates = pd.to_datetime(df['Date'])
# Cast to int64 so the dtype is the same on every pandas version (.dt returns
# int32 on pandas >= 2.0).
# NOTE(review): assumes Date has no missing values - a NaT would make the cast
# raise; confirm against the dataset.
df['month'] = parsed_dates.dt.month.astype('int64')
df['day'] = parsed_dates.dt.day.astype('int64')
df['year'] = parsed_dates.dt.year.astype('int64')
# Drop the original column by keyword: the positional axis form
# `drop(labels, 1)` was removed in pandas 2.0 and raises TypeError there.
df = df.drop(columns=['Date'])


In [None]:
# Preview the frame after the Date column was replaced by month/day/year
df.head()

In [None]:
# Number of missing values per column
df.isna().sum()

In [None]:
# Summary statistics with one row per column, styled for quick scanning:
# bar for the mean, red gradient for the std, diverging gradient for the median.
summary_stats = df.describe().T
styled_summary = summary_stats.style.bar(subset=['mean'], color='#205ff2')
styled_summary = styled_summary.background_gradient(subset=['std'], cmap='Reds')
styled_summary = styled_summary.background_gradient(subset=['50%'], cmap='coolwarm')
styled_summary

In [None]:
# List the distinct values of every column whose dtype is neither int64 nor float64
non_numeric_columns = df.select_dtypes(exclude=['int64', 'float64']).columns
for feature in non_numeric_columns:
    print(f"{feature}:{df[feature].unique()}")

In [None]:
# Summary statistics for Sunshine, used to choose its imputation strategy
df['Sunshine'].describe()

The 75th percentile of Sunshine is 10.6, so we can impute its missing values with the mean.

In [None]:
# Summary statistics for Evaporation, used to choose its imputation strategy
df["Evaporation"].describe()

The 75th percentile of Evaporation is 7.4, so we can impute its missing values with the median.

Similarly, after checking, I will impute the missing values for Cloud3pm and Cloud9am.

# Preprocessing

In [None]:
from sklearn.impute import SimpleImputer

# One mean-based and one median-based imputer; each is re-fitted on the single
# column it is applied to.
imputer1 = SimpleImputer(missing_values=np.nan, strategy='mean')
imputer2 = SimpleImputer(missing_values=np.nan, strategy='median')

# (column, imputer) pairs, applied in this order
imputation_plan = [
    ('Evaporation', imputer2),
    ('Sunshine', imputer1),
    ('Cloud3pm', imputer2),
    ('Cloud9am', imputer1),
    # mean and median are the same for Pressure9am and Pressure3pm
    ('Pressure9am', imputer1),
    ('Pressure3pm', imputer1),
]
for column_name, column_imputer in imputation_plan:
    df[[column_name]] = column_imputer.fit_transform(df[[column_name]])

In [None]:
# Missing values remaining per column after imputation
df.isna().sum()

In [None]:
from sklearn.preprocessing import LabelEncoder

# Drop the rows that still contain missing values
df = df.dropna()

# Encode categorical (object-dtype) columns as integer codes; numeric columns
# are left untouched.
for column_name in df.columns:
    if df[column_name].dtype != 'object':
        continue
    encoder = LabelEncoder()
    df[column_name] = encoder.fit_transform(list(df[column_name].values))

In [None]:
# Preview the frame after dropping missing rows and encoding categoricals
df.head()

In [None]:
# Class counts of the target column
df['RainTomorrow'].value_counts()

In [None]:
import matplotlib.pyplot as plt
%matplotlib inline

df.hist(figsize=(20,16), color = 'r');
plt.show();  #showing the charts of different columns
#This also helps in finding number of counts in each column

Our target column **RainTomorrow** is highly **imbalanced**: 94906 `No` values versus 26884 `Yes` values.

We will also balance the RainToday column.

In [None]:
# Value counts of the RainToday feature
df['RainToday'].value_counts()

In [None]:
from sklearn.utils import resample, shuffle

# Split the rows by the encoded target value
zero = df[df['RainTomorrow'] == 0]  # rows where RainTomorrow is 0
one = df[df['RainTomorrow'] == 1]   # rows where RainTomorrow is 1

# Oversample the RainTomorrow == 1 rows with replacement. A fixed seed makes
# the drawn rows (and every downstream metric) reproducible across kernel
# restarts; 42 matches the seed used for train_test_split.
# NOTE(review): n_samples is hard-coded - confirm it is the intended size
# relative to len(zero).
# NOTE(review): this oversampling runs before the train/test split, so copies
# of the same row can land in both partitions and inflate the test metrics.
df_minority_upsampled = resample(one, replace=True, n_samples=80537, random_state=42)

# Concatenate the untouched class-0 rows with the oversampled class-1 rows
df = pd.concat([zero, df_minority_upsampled])

# Shuffle so that the rows are in no particular order (seeded for reproducibility)
df = shuffle(df, random_state=42)

In [None]:
from sklearn.utils import resample, shuffle

# Split the rows by the encoded RainToday value (a feature, not the target)
zero = df[df['RainToday'] == 0]  # rows where RainToday is 0
one = df[df['RainToday'] == 1]   # rows where RainToday is 1

# Oversample the RainToday == 1 rows with replacement. A fixed seed makes the
# drawn rows (and every downstream metric) reproducible across kernel restarts;
# 42 matches the seed used for train_test_split.
# NOTE(review): n_samples is hard-coded - confirm it is the intended size
# relative to len(zero).
# NOTE(review): resampling on a feature also changes the RainTomorrow class
# ratio, and it runs before the train/test split, so copies of the same row
# can land in both partitions.
df_minority_upsampled = resample(one, replace=True, n_samples=108000, random_state=42)

# Concatenate the untouched RainToday == 0 rows with the oversampled rows
df = pd.concat([zero, df_minority_upsampled])

# Shuffle so that the rows are in no particular order (seeded for reproducibility)
df = shuffle(df, random_state=42)

In [None]:
# Re-plot the per-column histograms after resampling
df.hist(figsize=(20, 16), color='r')
plt.show()

In [None]:
# Rank every column by the absolute value of its correlation with the target
target_correlation = df.corr()['RainTomorrow']
target_correlation.abs().sort_values(ascending=False)

In [None]:
# Separate the target from the predictor columns
y = df['RainTomorrow']
X = df.drop(columns=['RainTomorrow'])

# Feature Selection

In [None]:
from sklearn.feature_selection import SelectKBest, f_classif

# Score every feature against the target with f_classif and keep the top 15
fs = SelectKBest(score_func=f_classif, k=15)
fs.fit(X, y)
X_selected = fs.transform(X)
print(X_selected.shape)

In [None]:
# Positional indices of the columns kept by the selector
cols = fs.get_support(indices=True)
# Build a frame holding only those columns, keeping their original names
selected_column_names = X.columns[cols]
X_new = X.loc[:, selected_column_names]

In [None]:
# Display the frame that holds only the selected feature columns
X_new

# Scaling and Splitting

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Hold out 25% of the rows for testing (fixed seed for a repeatable split)
X_train, X_test, y_train, y_test = train_test_split(
    X_new, y, test_size=0.25, random_state=42
)

# Fit the scaler on the training split only, then apply it to both splits
sc = StandardScaler()
sc.fit(X_train)
X_train = sc.transform(X_train)
X_test = sc.transform(X_test)

# Model building

In [None]:
import keras
from keras.models import Sequential
from keras.layers import Dense

# Create model: two ReLU hidden layers feeding a single sigmoid unit for
# binary classification. The input width is read from the training matrix so
# it stays in sync with the number of selected features.
model = Sequential()
model.add(Dense(1024, input_dim=X_train.shape[1], activation='relu'))
model.add(Dense(712, activation='relu'))
model.add(Dense(1, activation='sigmoid'))

# Stop training once the monitored metric (Keras default: val_loss) has not
# improved by at least 0.001 for 12 consecutive epochs, then restore the
# best weights seen.
early_stopping = keras.callbacks.EarlyStopping(
    patience=12,
    min_delta=0.001,
    restore_best_weights=True,
)

# Compile model
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['binary_accuracy'])

# Fit the model. The early-stopping callback must be passed to fit();
# previously it was created but never used, so training always ran all epochs.
# NOTE(review): the validation data is the test split, so early stopping
# selects weights using test data - consider a separate validation split.
history = model.fit(
    X_train,
    y_train,
    epochs=50,
    batch_size=100,
    validation_data=(X_test, y_test),
    callbacks=[early_stopping],
    verbose=1,
)

# Evaluate the model on the held-out split and report the accuracy metric
scores = model.evaluate(X_test, y_test, verbose=0)
print("%s: %.2f%%" % (model.metrics_names[1], scores[1]*100))


In [None]:
# Loss and accuracy on the test split
model.evaluate(x=X_test, y=y_test)

In [None]:
# Loss and accuracy on the training split, for comparison with the test split
model.evaluate(x=X_train, y=y_train)

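There is not much difference between the train and test results, which suggests the model is not overfitting. Note, however, that the data was upsampled before the train/test split, so duplicated rows can appear in both sets and make the test score look better than it would be on truly unseen data.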

In [None]:
from sklearn.metrics import classification_report

# Threshold the predicted values at 0.5 to obtain hard 0/1 class labels
predicted_scores = model.predict(X_test)
predictions = (predicted_scores > 0.5).astype("int32")
print(classification_report(y_test, predictions))

#### 93% Accuracy

## Upvote if you like it or fork it! This helps motivate us to produce more notebooks for the community 😊