***PART 1***

In [1]:
#importing required libraries
import pandas as pd
import numpy as np
from time import time
from sklearn.impute import SimpleImputer
from sklearn import preprocessing
from sklearn.preprocessing import MinMaxScaler
from sklearn.ensemble import IsolationForest
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from collections import Counter
from imblearn.under_sampling import TomekLinks
from multiprocessing import Process
from time import time
from sklearn.neural_network import MLPClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.gaussian_process.kernels import RBF
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix,accuracy_score
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report
from imblearn.over_sampling import SMOTE

In [2]:
#Loading the data and removing unwanted columns specified below
df = pd.read_csv('weatherAUS.csv')
df = df.drop(df.columns[0],axis = 1)#Dropping the Dates
df =df.drop(df.columns[-2], axis = 1)#Dropping the RISK_MM column
limitPer = len(df) * .7
df_2 = df.dropna(thresh=limitPer,axis=1)
data = df_2.copy()

In [3]:
#Any features with more than 30% of empty data are removed
limitPer = len(df_2) * .7
df_2 = df_2.dropna(thresh=limitPer,axis=1)
df_3 = df_2.copy()

In [4]:
#Performing imputation on missing values
#Using Median values to fill up missing numerical features - to avoid taking mean consisting of outliers
imp1 = SimpleImputer(strategy="median")
df_knn_imputed_float_columns = imp1.fit_transform(df_2[df_2.columns[df_2.dtypes=='float64']])
df_knn_imputed_float_columns = pd.DataFrame(df_knn_imputed_float_columns,columns = list(data.columns[data.dtypes=='float64']))

#Using most frequent values to fill up missing categorical features - to keep the process simple at this point
imp2 = SimpleImputer(strategy="most_frequent")
df_knn_imputed_categorical_columns = imp2.fit_transform(df[df.columns[df.dtypes!='float64']])
df_knn_imputed_categorical_columns = pd.DataFrame(df_knn_imputed_categorical_columns,columns = list(data.columns[data.dtypes!='float64']))

#Combining and arranging the above 2 numerical and categorical dataframes into 1
df_new = pd.concat([df_knn_imputed_float_columns, df_knn_imputed_categorical_columns], axis=1, sort=False)
df_new = df_new[data.columns]

#Converting each column's data type into the most appropriate one
df_new = df_new.convert_dtypes()

In [5]:
%%time
#One Hot Encoding each of the categorical types
from sklearn.preprocessing import OneHotEncoder
df_encoded = df_new.iloc[:,:-1]
encoding_info = {}#stores the individual column's transformational models so that it can be inversed later

# creating instance of one-hot-encoder
enc = OneHotEncoder(handle_unknown='ignore')

#One Hot encode the categorical features and join with the dataframe
for i in df_encoded.columns[df_encoded.dtypes=='string']:
    enc_df = pd.DataFrame(enc.fit_transform(df_new[[i]]).toarray())
    df_encoded = df_encoded.join(enc_df,how = 'left', lsuffix = '_left', rsuffix = '_right')
    encoding_info[i]=enc_df
df_encoded = df_encoded.select_dtypes(exclude=['string'])

#Label Encodeing for target feature
le = preprocessing.LabelEncoder()
df_encoded['Rain_Tomorrow'] = le.fit_transform(df_new.iloc[:,-1:])

CPU times: user 759 ms, sys: 196 ms, total: 956 ms
Wall time: 959 ms


  return f(**kwargs)


In [6]:
%%time
df_outlied = df_encoded
X = df_outlied
#Performing Isolation Forest algorithm for 2% outlier elimination
clf = IsolationForest(contamination = 0.02,n_jobs=-1)
clf.fit(X)
results = clf.predict(X)
outliers = X[results == -1]
normal = X[results == 1]
df_outlied = pd.DataFrame(normal,columns = list(df_outlied.columns))

CPU times: user 44.8 s, sys: 8.38 s, total: 53.2 s
Wall time: 41.5 s


In [7]:
#Splitting the data for training and testing
X,y = df_outlied.iloc[:,:-1], df_outlied.iloc[:,-1:]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [8]:
%%time
#Performing Min Max scaling of the data
scaler = MinMaxScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

CPU times: user 1.05 s, sys: 440 ms, total: 1.49 s
Wall time: 1.48 s


In [9]:
%%time
#Checking the imbalance ratio of 0
imbalance = y[y==0].count()
print("Imbalance ratio '0':all is",imbalance[0]/y.count()[0])

#Handling imbalance using oversampling - SMOTE
smote = SMOTE(n_jobs=-1)
start = time()
X_res, y_res = smote.fit_resample(X_train, y_train)
print("Elapsed time:",time()-start)
print("Before SMOTE Over Sampling:",y_train.value_counts(),'\n\n')
print("After SMOTE Over Sampling:",y_res.value_counts(),'\n\n')

Imbalance ratio '0':all is 0.7863350293148856
Elapsed time: 169.81941986083984
Before SMOTE Over Sampling: Rain_Tomorrow
0                87737
1                23742
dtype: int64 


After SMOTE Over Sampling: Rain_Tomorrow
1                87737
0                87737
dtype: int64 


CPU times: user 2min 49s, sys: 0 ns, total: 2min 49s
Wall time: 2min 49s


In [10]:
#function to test out the selected and tuned ML algorithms with given dataset
X_train,y_train = np.array(X_res),np.array(y_res).reshape(-1,)
X_test,y_test = np.array(X_test),np.array(y_test).reshape(-1,)
def run_models(X_train,y_train,X_test,y_test):
    QDA_best_params = {'reg_param': 0.001}
    LR_best_params = {'C': 10, 'penalty': 'l2'}
    ABC_best_params = {'learning_rate': 1.0, 'n_estimators': 500}
    Tuned_CLS = [(QuadraticDiscriminantAnalysis(**QDA_best_params),"Tuned QDA"),
                 (LogisticRegression(**LR_best_params),"Tuned LR"),
                 (AdaBoostClassifier(**ABC_best_params),"Tuned AB")]

    for clf,name in Tuned_CLS:
        start = time()
        print("\n\n",name)
        clf.fit(X_train,y_train)
        y_true, y_pred = y_test, clf.predict(X_test)
        print("Elapsed time:", time()-start)
        print(classification_report(y_true, y_pred))
        print()
        score = accuracy_score(y_test,y_pred)
        confusionmatrix = confusion_matrix(y_test,y_pred)
        print(confusionmatrix,name,score,'\n')

In [11]:
#Testing Just after SMOTE
run_models(X_train,y_train,X_test,y_test)



 Tuned QDA




Elapsed time: 4.676492929458618
              precision    recall  f1-score   support

           0       0.89      0.57      0.69     21838
           1       0.32      0.75      0.45      6032

    accuracy                           0.61     27870
   macro avg       0.61      0.66      0.57     27870
weighted avg       0.77      0.61      0.64     27870


[[12439  9399]
 [ 1525  4507]] Tuned QDA 0.6080373161105131 



 Tuned LR


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Elapsed time: 7.721967458724976
              precision    recall  f1-score   support

           0       0.92      0.79      0.85     21838
           1       0.50      0.75      0.60      6032

    accuracy                           0.78     27870
   macro avg       0.71      0.77      0.73     27870
weighted avg       0.83      0.78      0.80     27870


[[17325  4513]
 [ 1499  4533]] Tuned LR 0.7842841765339075 



 Tuned AB
Elapsed time: 528.7824823856354
              precision    recall  f1-score   support

           0       0.89      0.89      0.89     21838
           1       0.61      0.60      0.61      6032

    accuracy                           0.83     27870
   macro avg       0.75      0.75      0.75     27870
weighted avg       0.83      0.83      0.83     27870


[[19539  2299]
 [ 2415  3617]] Tuned AB 0.8308575529242913 



In [12]:
from sklearn.feature_selection import SelectKBest

In [13]:
%%time
#Function to do k-best feature selection on given dataset
def select_features(k=10):
    print("\n\nK = ",k)
    k_best_fs = SelectKBest(k=k).fit(X_train, y_train)
    X_train_featured = k_best_fs.transform(X_train)
    X_test_featured = k_best_fs.transform(X_test)
    print("X_train shape:",X_train.shape,"X_train_featured shape",X_train_featured.shape)
    print("X_test shape:",X_test.shape,"X_test_featured shape",X_test_featured.shape)
    return X_train_featured,y_train,X_test_featured,y_test

CPU times: user 10 µs, sys: 1e+03 ns, total: 11 µs
Wall time: 17.2 µs


In [14]:
%%time
#Testing out a range of number of features
for i in [1,5,10,20,50,100]:
    run_models(*select_features(i))



K =  1
X_train shape: (175474, 111) X_train_featured shape (175474, 1)
X_test shape: (27870, 111) X_test_featured shape (27870, 1)


 Tuned QDA
Elapsed time: 0.0582427978515625
              precision    recall  f1-score   support

           0       0.90      0.69      0.79     21838
           1       0.40      0.73      0.52      6032

    accuracy                           0.70     27870
   macro avg       0.65      0.71      0.65     27870
weighted avg       0.79      0.70      0.73     27870


[[15167  6671]
 [ 1626  4406]] Tuned QDA 0.7022963760315751 



 Tuned LR
Elapsed time: 0.31101274490356445
              precision    recall  f1-score   support

           0       0.90      0.71      0.80     21838
           1       0.41      0.71      0.52      6032

    accuracy                           0.71     27870
   macro avg       0.65      0.71      0.66     27870
weighted avg       0.79      0.71      0.74     27870


[[15586  6252]
 [ 1727  4305]] Tuned LR 0.713706494438464



              precision    recall  f1-score   support

           0       0.91      0.74      0.82     21838
           1       0.44      0.73      0.55      6032

    accuracy                           0.74     27870
   macro avg       0.67      0.73      0.68     27870
weighted avg       0.81      0.74      0.76     27870


[[16193  5645]
 [ 1639  4393]] Tuned QDA 0.738643702906351 



 Tuned LR
Elapsed time: 0.8497419357299805
              precision    recall  f1-score   support

           0       0.91      0.75      0.83     21838
           1       0.45      0.73      0.56      6032

    accuracy                           0.75     27870
   macro avg       0.68      0.74      0.69     27870
weighted avg       0.81      0.75      0.77     27870


[[16487  5351]
 [ 1623  4409]] Tuned LR 0.7497667743092932 



 Tuned AB
Elapsed time: 52.58892750740051
              precision    recall  f1-score   support

           0       0.90      0.80      0.85     21838
           1       0.48 



[[17336  4502]
 [ 1770  4262]] Tuned QDA 0.7749551489056333 



 Tuned LR
Elapsed time: 2.2748842239379883
              precision    recall  f1-score   support

           0       0.92      0.78      0.84     21838
           1       0.48      0.74      0.58      6032

    accuracy                           0.77     27870
   macro avg       0.70      0.76      0.71     27870
weighted avg       0.82      0.77      0.79     27870


[[17051  4787]
 [ 1561  4471]] Tuned LR 0.7722282023681378 



 Tuned AB
Elapsed time: 77.49974370002747
              precision    recall  f1-score   support

           0       0.89      0.87      0.88     21838
           1       0.57      0.63      0.60      6032

    accuracy                           0.82     27870
   macro avg       0.73      0.75      0.74     27870
weighted avg       0.82      0.82      0.82     27870


[[18959  2879]
 [ 2235  3797]] Tuned AB 0.8165052027269465 



K =  20
X_train shape: (175474, 111) X_train_featured shape (175474, 



Elapsed time: 0.25484490394592285
              precision    recall  f1-score   support

           0       0.93      0.48      0.63     21838
           1       0.31      0.86      0.46      6032

    accuracy                           0.56     27870
   macro avg       0.62      0.67      0.55     27870
weighted avg       0.79      0.56      0.59     27870


[[10444 11394]
 [  831  5201]] Tuned QDA 0.5613562970936491 



 Tuned LR


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Elapsed time: 2.388089656829834
              precision    recall  f1-score   support

           0       0.92      0.79      0.85     21838
           1       0.49      0.74      0.59      6032

    accuracy                           0.78     27870
   macro avg       0.70      0.76      0.72     27870
weighted avg       0.82      0.78      0.79     27870


[[17145  4693]
 [ 1545  4487]] Tuned LR 0.7761750986724076 



 Tuned AB
Elapsed time: 82.05676913261414
              precision    recall  f1-score   support

           0       0.89      0.89      0.89     21838
           1       0.60      0.58      0.59      6032

    accuracy                           0.83     27870
   macro avg       0.74      0.74      0.74     27870
weighted avg       0.82      0.83      0.83     27870


[[19511  2327]
 [ 2508  3524]] Tuned AB 0.8265159669895945 



K =  50
X_train shape: (175474, 111) X_train_featured shape (175474, 50)
X_test shape: (27870, 111) X_test_featured shape (27870, 50)


 Tuned Q



Elapsed time: 0.6908657550811768
              precision    recall  f1-score   support

           0       0.89      0.47      0.61     21838
           1       0.29      0.79      0.43      6032

    accuracy                           0.54     27870
   macro avg       0.59      0.63      0.52     27870
weighted avg       0.76      0.54      0.57     27870


[[10235 11603]
 [ 1269  4763]] Tuned QDA 0.5381413706494439 



 Tuned LR


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Elapsed time: 2.8129236698150635
              precision    recall  f1-score   support

           0       0.92      0.79      0.85     21838
           1       0.49      0.75      0.59      6032

    accuracy                           0.78     27870
   macro avg       0.70      0.77      0.72     27870
weighted avg       0.83      0.78      0.79     27870


[[17174  4664]
 [ 1535  4497]] Tuned LR 0.7775744528166487 



 Tuned AB
Elapsed time: 98.09155488014221
              precision    recall  f1-score   support

           0       0.89      0.89      0.89     21838
           1       0.60      0.59      0.60      6032

    accuracy                           0.83     27870
   macro avg       0.75      0.74      0.74     27870
weighted avg       0.83      0.83      0.83     27870


[[19507  2331]
 [ 2484  3548]] Tuned AB 0.8272335844994618 



K =  100
X_train shape: (175474, 111) X_train_featured shape (175474, 100)
X_test shape: (27870, 111) X_test_featured shape (27870, 100)


 Tun



Elapsed time: 2.381138324737549
              precision    recall  f1-score   support

           0       0.89      0.56      0.68     21838
           1       0.32      0.75      0.45      6032

    accuracy                           0.60     27870
   macro avg       0.60      0.65      0.57     27870
weighted avg       0.77      0.60      0.63     27870


[[12153  9685]
 [ 1518  4514]] Tuned QDA 0.5980265518478651 



 Tuned LR


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Elapsed time: 3.3500680923461914
              precision    recall  f1-score   support

           0       0.92      0.79      0.85     21838
           1       0.50      0.75      0.60      6032

    accuracy                           0.78     27870
   macro avg       0.71      0.77      0.73     27870
weighted avg       0.83      0.78      0.80     27870


[[17304  4534]
 [ 1496  4536]] Tuned LR 0.7836383207750269 



 Tuned AB
Elapsed time: 139.05390453338623
              precision    recall  f1-score   support

           0       0.89      0.90      0.89     21838
           1       0.61      0.60      0.61      6032

    accuracy                           0.83     27870
   macro avg       0.75      0.75      0.75     27870
weighted avg       0.83      0.83      0.83     27870


[[19547  2291]
 [ 2396  3636]] Tuned AB 0.8318263365626122 

CPU times: user 10min 20s, sys: 1min 5s, total: 11min 26s
Wall time: 8min 56s


In [15]:
#From the above we discover that k=10 is the best balance in-between accuracy and temporal cost

In [19]:
#Run the best configuration
run_models(*select_features(10))



K =  10
X_train shape: (175474, 111) X_train_featured shape (175474, 10)
X_test shape: (27870, 111) X_test_featured shape (27870, 10)


 Tuned QDA




Elapsed time: 0.07977771759033203
              precision    recall  f1-score   support

           0       0.91      0.79      0.85     21838
           1       0.49      0.71      0.58      6032

    accuracy                           0.77     27870
   macro avg       0.70      0.75      0.71     27870
weighted avg       0.82      0.77      0.79     27870


[[17336  4502]
 [ 1770  4262]] Tuned QDA 0.7749551489056333 



 Tuned LR
Elapsed time: 2.4172635078430176
              precision    recall  f1-score   support

           0       0.92      0.78      0.84     21838
           1       0.48      0.74      0.58      6032

    accuracy                           0.77     27870
   macro avg       0.70      0.76      0.71     27870
weighted avg       0.82      0.77      0.79     27870


[[17051  4787]
 [ 1561  4471]] Tuned LR 0.7722282023681378 



 Tuned AB
Elapsed time: 60.113999128341675
              precision    recall  f1-score   support

           0       0.89      0.87      0.8

In [20]:
data.shape #Data before encoding

(142193, 18)

In [21]:
df_encoded.shape # data after one-hot encoding

(142193, 112)