***PART 1***

In [1]:
#importing required libraries
import pandas as pd
import numpy as np
from time import time
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import SimpleImputer,IterativeImputer
from sklearn import preprocessing
from sklearn.impute import KNNImputer
from sklearn.preprocessing import MinMaxScaler
from sklearn.ensemble import IsolationForest
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from collections import Counter
from imblearn.under_sampling import TomekLinks
from multiprocessing import Process
from time import time
from sklearn.neural_network import MLPClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.gaussian_process.kernels import RBF
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix,accuracy_score
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report
from imblearn.over_sampling import SMOTE

In [2]:
#Loading the data and removing unwanted columns specified below
df = pd.read_csv('weatherAUS.csv')#,nrows=10000)
df = df.drop(df.columns[0],axis = 1)#Dropping the Dates
df = df.drop(df.columns[-2], axis = 1)#Dropping the RISK_MM column
data = df.copy()
df_2 = data.copy()

In [3]:
#Any features with more than 30% of empty data are removed
limitPer = len(df_2) * .7
df_2 = df_2.dropna(thresh=limitPer,axis=1)
df_3 = df_2.copy()

In [4]:
%%time
#Label Encoding each of the categorical types
df_2 = df_2.convert_dtypes()
df_encoded = df_2.copy()###########################
encoding_info = {}#stores the individual column's transformational models so that it can be inversed later
le = preprocessing.LabelEncoder()
for i in df_encoded.columns[df_encoded.dtypes=='string']:
    fit_by = pd.Series([j for j in df_2.loc[:,i] if isinstance(j,str)])
    le.fit(fit_by)
    encoding_info[i]=le
    df_encoded[i] = fit_by.apply(lambda x: le.transform([x])[0] if type(x) == str else x)

CPU times: user 1min 17s, sys: 465 ms, total: 1min 18s
Wall time: 1min 17s


In [5]:
%%time
#Performing imputation on missing values
imp1 = KNNImputer()
df_knn_imputed_float_columns = imp1.fit_transform(df_encoded)
df_knn_imputed_float_columns = pd.DataFrame(df_knn_imputed_float_columns, columns = list(df_2.columns))
df_new = df_knn_imputed_float_columns

##Converting the imputed continuous data back into catagorical by rounding them
for i in encoding_info.keys():
    df_new[i] = np.round(df_new[i].values,0).astype(int)

CPU times: user 9min 41s, sys: 4min 23s, total: 14min 5s
Wall time: 7min 57s


In [6]:
%%time
df_outlied = df_new.copy()
X = df_outlied
#Performing Isolation Forest algorithm for 2% outlier elimination
clf = IsolationForest(contamination = 0.02,n_jobs=-1)
clf.fit(X)
results = clf.predict(X)
outliers = X[results == -1]
normal = X[results == 1]
df_outlied = pd.DataFrame(normal,columns = list(df_outlied.columns))

CPU times: user 10.6 s, sys: 241 ms, total: 10.8 s
Wall time: 8.42 s


In [20]:
%%time
#Splitting the data for training and testing
X,y = df_outlied.iloc[:,:-1], df_outlied.iloc[:,-1:]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

CPU times: user 20.8 ms, sys: 0 ns, total: 20.8 ms
Wall time: 19.2 ms


In [21]:
%%time
#Performing Min Max scaling of the data
scaler = MinMaxScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

CPU times: user 34.5 ms, sys: 0 ns, total: 34.5 ms
Wall time: 32.3 ms


In [22]:
%%time
#Checking the imbalance ratio of 0
imbalance = y[y==0].count()
print("Imbalance ratio '0':all is",imbalance[0]/y.count()[0])

#Handling imbalance using undersampling - Tomek Links
tl = TomekLinks(n_jobs=-1)
start = time()
X_res, y_res = tl.fit_resample(X_train, y_train)
print("Elapsed time:",time()-start)
print("Before Tomek Under Sampling:",y_train.value_counts(),'\n\n')
print("After Tomek Under Sampling:",y_res.value_counts(),'\n\n')

Imbalance ratio '0':all is 0.7862632670489204
Elapsed time: 16.38613796234131
Before Tomek Under Sampling: RainTomorrow
0               87617
1               23862
dtype: int64 


After Tomek Under Sampling: RainTomorrow
0               82556
1               23862
dtype: int64 


CPU times: user 2min 54s, sys: 345 ms, total: 2min 54s
Wall time: 16.4 s


In [10]:
#function to test out the selected and tuned ML algorithms with given dataset
X_train,y_train = np.array(X_res),np.array(y_res).reshape(-1,)
X_test,y_test = np.array(X_test),np.array(y_test).reshape(-1,)
def run_models(X_train,y_train,X_test,y_test):
    QDA_best_params = {'reg_param': 0.001}
    LR_best_params = {'C': 10, 'penalty': 'l2'}
    ABC_best_params = {'learning_rate': 1.0, 'n_estimators': 500}
    Tuned_CLS = [(QuadraticDiscriminantAnalysis(**QDA_best_params),"Tuned QDA"),
                 (LogisticRegression(**LR_best_params),"Tuned LR"),
                 (AdaBoostClassifier(**ABC_best_params),"Tuned AB")]

    for clf,name in Tuned_CLS:
        start = time()
        print("\n\n",name)
        clf.fit(X_train,y_train)
        y_true, y_pred = y_test, clf.predict(X_test)
        print("Elapsed time:", time()-start)
        print(classification_report(y_true, y_pred))
        print()
        confusionmatrix = confusion_matrix(y_test,y_pred)
        score = accuracy_score(y_test,y_pred)
        print()
        print(confusionmatrix,name,score,'\n')

In [11]:
#Run ML just after Tomek links
run_models(X_train,y_train,X_test,y_test)



 Tuned QDA
Elapsed time: 0.14851927757263184
              precision    recall  f1-score   support

           0       0.87      0.93      0.90     21948
           1       0.65      0.46      0.54      5922

    accuracy                           0.83     27870
   macro avg       0.76      0.70      0.72     27870
weighted avg       0.82      0.83      0.82     27870



[[20487  1461]
 [ 3181  2741]] Tuned QDA 0.8334409759598134 



 Tuned LR


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Elapsed time: 2.2076144218444824
              precision    recall  f1-score   support

           0       0.87      0.94      0.90     21948
           1       0.67      0.49      0.56      5922

    accuracy                           0.84     27870
   macro avg       0.77      0.71      0.73     27870
weighted avg       0.83      0.84      0.83     27870



[[20550  1398]
 [ 3044  2878]] Tuned LR 0.8406171510584858 



 Tuned AB
Elapsed time: 98.28537440299988
              precision    recall  f1-score   support

           0       0.88      0.94      0.91     21948
           1       0.69      0.51      0.59      5922

    accuracy                           0.85     27870
   macro avg       0.78      0.73      0.75     27870
weighted avg       0.84      0.85      0.84     27870



[[20592  1356]
 [ 2888  3034]] Tuned AB 0.8477215644061715 



In [12]:
from sklearn.feature_selection import SelectKBest

In [13]:
%%time
#Function to do k-best feature selection on given dataset
def select_features(k=10):
    print("\n\nK = ",k)
    k_best_fs = SelectKBest(k=k).fit(X_train, y_train)
    X_train_featured = k_best_fs.transform(X_train)
    X_test_featured = k_best_fs.transform(X_test)
    print("X_train shape:",X_train.shape,"X_train_featured shape",X_train_featured.shape)
    print("X_test shape:",X_test.shape,"X_test_featured shape",X_test_featured.shape)
    return X_train_featured,y_train,X_test_featured,y_test

CPU times: user 7 µs, sys: 2 µs, total: 9 µs
Wall time: 15.3 µs


In [14]:
%%time
# for i in [1,5,10,20,50,100]:
#     run_models(*select_features(i))

CPU times: user 5 µs, sys: 0 ns, total: 5 µs
Wall time: 9.78 µs


In [15]:
#From the above we discover that k=10 is the best balance 

In [16]:
#Run the best configuration
run_models(*select_features(10))



K =  10
X_train shape: (106418, 17) X_train_featured shape (106418, 10)
X_test shape: (27870, 17) X_test_featured shape (27870, 10)


 Tuned QDA
Elapsed time: 0.20652151107788086
              precision    recall  f1-score   support

           0       0.86      0.94      0.90     21948
           1       0.67      0.45      0.54      5922

    accuracy                           0.84     27870
   macro avg       0.77      0.69      0.72     27870
weighted avg       0.82      0.84      0.82     27870



[[20626  1322]
 [ 3266  2656]] Tuned QDA 0.835378543236455 



 Tuned LR


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Elapsed time: 3.2776601314544678
              precision    recall  f1-score   support

           0       0.87      0.94      0.90     21948
           1       0.68      0.49      0.57      5922

    accuracy                           0.84     27870
   macro avg       0.77      0.71      0.73     27870
weighted avg       0.83      0.84      0.83     27870



[[20566  1382]
 [ 3043  2879]] Tuned LR 0.841227125941873 



 Tuned AB
Elapsed time: 105.02947926521301
              precision    recall  f1-score   support

           0       0.88      0.94      0.91     21948
           1       0.69      0.50      0.58      5922

    accuracy                           0.85     27870
   macro avg       0.78      0.72      0.74     27870
weighted avg       0.84      0.85      0.84     27870



[[20595  1353]
 [ 2941  2981]] Tuned AB 0.8459275206315034 



In [17]:
#Own function that does robust KNN imputation for both categorical and numerical features
def knn_own_imputer(dat):
    #to convert the given data into a pandas DataFrame
    if isinstance(dat,pd.core.frame.DataFrame):
        df_2 = dat
    else:
        df_2 = pd.DataFrame(dat)
    #To print the number of empty values
    try:
        init_null = df_2.isnull().sum()
        print("NULL Value report of input:", init_null)
    except:
        pass
    #dataframes being converted in to appropriate data types
    df_2 = df_2.convert_dtypes()
    df_encoded = df_2.copy()
    encoding_info = {}#stores the individual column's transformational models so that it can be inversed later
    encoding_info_classes = {}#stores the individual column's encoding details
    #Loading the label encoder
    le = preprocessing.LabelEncoder()
    #Encoding the catagorical features excluding the empty values
    for i in df_encoded.columns[df_encoded.dtypes=='string']:
        fit_by = pd.Series([j for j in df_2.loc[:,i] if isinstance(j,str)])
        le.fit(fit_by)
        encoding_info[i]=le
        encoding_info_classes[i] = le.classes_
        df_encoded[i] = fit_by.apply(lambda x: le.transform([x])[0] if type(x) == str else x)
    #KNN Imputing the empty values
    imp1 = KNNImputer()
    df_knn_imputed_float_columns = imp1.fit_transform(df_encoded)
    df_knn_imputed_float_columns = pd.DataFrame(df_knn_imputed_float_columns, columns = list(df_2.columns))
    df_new = df_knn_imputed_float_columns
    #Function to match a value to the closest value in the list
    def closest(lst, K): 
        return lst[min(range(len(lst)), key = lambda i: abs(lst[i]-K))] 
    #Converting the imputed continuous data back into catagorical by rounding them
    for i in encoding_info.keys():
        df_new[i] = np.round(df_new[i].values,0).astype(int)
    #Matching the rounded values to the closest available classes and decoding them
    for i in encoding_info_classes.keys():
        encoding_info[i].classes_ = encoding_info_classes[i]
        df_new[i] = df_new[i].apply(lambda x: encoding_info[i]\
                .inverse_transform([x])[0] if x in np.arange(len(encoding_info[i].classes_))\
                                    else closest(np.arange(len(encoding_info[i].classes_)),x))    
    
    #To print the number of empty values
    try:
        final_null = df_new.isnull().sum()
        print("\n\nNULL Value report of input:", final_null)
    except:
        pass

    return df_new

In [18]:
#Testing out our function with sample 1000 values
a = knn_own_imputer(df_3.iloc[:1000,:])

NULL Value report of input: Location           0
MinTemp            5
MaxTemp            1
Rainfall          10
WindGustDir        3
WindGustSpeed      3
WindDir9am       197
WindDir3pm        22
WindSpeed9am       3
WindSpeed3pm       3
Humidity9am        0
Humidity3pm        0
Pressure9am        0
Pressure3pm        0
Temp9am            0
Temp3pm            0
RainToday         10
RainTomorrow       0
dtype: int64


NULL Value report of input: Location         0
MinTemp          0
MaxTemp          0
Rainfall         0
WindGustDir      0
WindGustSpeed    0
WindDir9am       0
WindDir3pm       0
WindSpeed9am     0
WindSpeed3pm     0
Humidity9am      0
Humidity3pm      0
Pressure9am      0
Pressure3pm      0
Temp9am          0
Temp3pm          0
RainToday        0
RainTomorrow     0
dtype: int64


In [19]:
a

Unnamed: 0,Location,MinTemp,MaxTemp,Rainfall,WindGustDir,WindGustSpeed,WindDir9am,WindDir3pm,WindSpeed9am,WindSpeed3pm,Humidity9am,Humidity3pm,Pressure9am,Pressure3pm,Temp9am,Temp3pm,RainToday,RainTomorrow
0,Albury,13.4,22.9,0.6,W,44.0,W,WNW,20.0,24.0,71.0,22.0,1007.7,1007.1,16.9,21.8,No,No
1,Albury,7.4,25.1,0.0,WNW,44.0,NNW,WSW,4.0,22.0,44.0,25.0,1010.6,1007.8,17.2,24.3,No,No
2,Albury,12.9,25.7,0.0,WSW,46.0,W,WSW,19.0,26.0,38.0,30.0,1007.6,1008.7,21.0,23.2,No,No
3,Albury,9.2,28.0,0.0,NE,24.0,SE,E,11.0,9.0,45.0,16.0,1017.6,1012.8,18.1,26.5,No,No
4,Albury,17.5,32.3,1.0,W,41.0,ENE,NW,7.0,20.0,82.0,33.0,1010.8,1006.0,17.8,29.7,No,No
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,Albury,5.6,17.8,0.0,WSW,19.0,S,SSE,4.0,13.0,89.0,59.0,1020.5,1016.4,10.2,17.4,No,No
996,Albury,10.2,16.0,0.0,WSW,13.0,SSW,SE,9.0,4.0,87.0,80.0,1014.8,1012.6,13.0,15.0,No,Yes
997,Albury,11.1,21.4,4.2,NW,41.0,NNW,S,7.0,7.0,88.0,53.0,1013.2,1010.1,13.3,19.9,No,No
998,Albury,8.7,21.8,0.0,NW,33.0,S,NNW,4.0,24.0,79.0,50.0,1012.2,1009.5,13.5,21.3,No,No
