<div style="display: block; height:200px; overflow:hidden;position: relative">
     <img src="https://imgur.com/cSyszHt.jpg" style="position: absolute;top: -250px;">
</div>

<div style="text-align:center; font-size: 30pt; font-weight:700">Rain in Australia</div>

<div>The goal is to train a model that predicts whether it will rain tomorrow in Australia. A simple LogisticRegression model is used as the baseline for this analysis.</div>
<br>
<b>
<ol >
    <li>
        <b>Imports</b>
    </li>
    <li>
        <b>Loading dataset</b>
    </li>
    <li>
        <b>Check duplicates</b>
    </li>
    <li>
        <b>Check missing values</b>
    </li>
    <li>
        <b>Coordinates</b>
    </li>
    <li>
        <b>Explore data</b>
    </li>
    <li>
        <b>Training</b>
    </li>
</ol>
</b>

# 1. Imports

In [None]:
import warnings
warnings.filterwarnings('ignore')
#
from ipywidgets import interact
#
import pandas as pd
pd.set_option('display.max_columns', None)
import numpy as np
from scipy import stats
#
import matplotlib.pyplot as plt
import seaborn as sns
# Matplotlib renamed its bundled seaborn styles to 'seaborn-v0_8-*'; use the
# legacy name when this install still lists it, otherwise the renamed one.
_white_style = 'seaborn-white' if 'seaborn-white' in plt.style.available else 'seaborn-v0_8-white'
plt.style.use(_white_style)
plt.rcParams['font.size'] = 14
# Set the default figure size through rcParams rather than opening an empty
# figure in the import cell (which rendered as a blank output).
plt.rcParams['figure.figsize'] = (12, 5)
palette = sns.color_palette('Paired', 10)
#
from geopy.geocoders import Nominatim
import folium
from folium.plugins import HeatMap
import calendar
#
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.compose import make_column_selector
from sklearn.preprocessing import StandardScaler, RobustScaler, MinMaxScaler
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_validate
from sklearn.model_selection import cross_val_predict
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import LinearSVC
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import GridSearchCV
from sklearn.inspection import permutation_importance
from sklearn import set_config; set_config(display='diagram')

# 2. Loading dataset

In [None]:
# Absolute path of the CSV as mounted in a Kaggle kernel; adjust it when running elsewhere.
data = pd.read_csv("/kaggle/input/weather-dataset-rattle-package/weatherAUS.csv")
data.head()

# 3. Check duplicates

In [None]:
# Compare the row count before and after de-duplication to report how many rows went away.
size_before = len(data)
data = data.drop_duplicates()
size_after = len(data)
print(f"{size_before - size_after} duplicates were removed.")

# 4. Check missing values
<div style="font-weight:700">Dropping features that have too many missing values</div>

In [None]:
100 * data.isnull().sum().sort_values(ascending=False)/len(data)

<div style="font-weight:700">Let's remove features with more than 30% missing values:</div>
<br/>
<div style="font-weight:700">> <span style="color:royalblue">Sunshine</span>, <span style="color:royalblue">Evaporation</span>, <span style="color:royalblue">Cloud3pm</span>, <span style="color:royalblue">Cloud9am</span></div>

In [None]:
# Columns listed in the markdown above as having more than 30% missing values.
sparse_features = ["Sunshine", "Evaporation", "Cloud3pm", "Cloud9am"]
data = data.drop(columns=sparse_features)
data.head()

# 5. Coordinates
<br/>
<div style="font-weight:700">To retrieve correct coordinates with Nominatim, the city names need to be written correctly</div>

In [None]:
# Dataset spelling (words run together) -> spelling with spaces that a geocoder can resolve.
badly_named = {"AliceSprings":"Alice Springs",
               "BadgerysCreek":"Badgerys Creek",
               "CoffsHarbour": "Coffs Harbour",
               "GoldCoast": "Gold Coast",
               "MelbourneAirport": "Melbourne Airport",
               "MountGambier": "Mount Gambier",
               "MountGinini": "Mount Ginini",
               "NorahHead": "Norah Head",
               "NorfolkIsland": "Norfolk Island",
               "PearceRAAF": "Pearce RAAF",
               "PerthAirport": "Perth Airport",
               "SalmonGums": "Salmon Gums",
               "SydneyAirport": "Sydney Airport",
               "WaggaWagga": "Wagga Wagga"}
# Vectorised exact-match replacement; names not in the mapping are left untouched.
data["Location"] = data["Location"].replace(badly_named)

In [None]:
# NOTE(review): Nominatim asks clients to send a user agent that identifies the
# application — consider replacing "null" with a descriptive value.
geolocator = Nominatim(user_agent="null")

# Geocode every distinct location once and build name -> coordinate lookups.
latitude = {}
longitude = {}
unresolved = []
for name in data["Location"].unique():
    place = geolocator.geocode(name + ", Australia")
    if place is None:
        # geocode() yields None when nothing matches; collect the name so the
        # failure is reported clearly instead of as an AttributeError.
        unresolved.append(name)
        continue
    latitude[name] = place.latitude
    longitude[name] = place.longitude

if unresolved:
    raise ValueError("Nominatim could not geocode: " + ", ".join(map(str, unresolved)))

In [None]:
# Look up each row's coordinates from the per-location dictionaries built above.
data = data.assign(
    longitude=data["Location"].map(longitude),
    latitude=data["Location"].map(latitude),
)

# 6. Explore data

In [None]:
# [latitude, longitude] used as the initial centre of the map.
center_location = [-25.3455545, 131.036961]
# Uses folium's default tiles (a tiles="Stamen terrain" argument was previously commented out here).
m = folium.Map(location=center_location, control_scale=True, zoom_start=4)

In [None]:
# Sum rainfall per coordinate pair and hand it to folium as [latitude, longitude, weight] rows.
rain_by_site = (
    data[['latitude', 'longitude', 'Rainfall']]
    .groupby(['latitude', 'longitude'])
    .sum()
    .reset_index()
)
heatmap_data = rain_by_site.values.tolist()
gradient = {0.2: 'blue', 0.4: 'lime', 0.6: 'orange', 1: 'red'}
HeatMap(data=heatmap_data, radius=15, gradient=gradient, max_zoom=1).add_to(m)
m

In [None]:
def barplot_rainTomorrow(df):
    """Draw a stacked per-location count of rows, split by the "RainTomorrow" value.

    Note that ``df`` is sorted by "Location" in place, so the caller's frame
    is reordered as a side effect of plotting.
    """
    axis_font = {"fontsize": 25}
    df.sort_values(by="Location", inplace=True)
    sns.displot(
        data=df,
        x="Location",
        hue="RainTomorrow",
        multiple="stack",
        aspect=2.2,
        height=7,
        legend=False,
    )
    # Axis captions first, then tick styling for each axis.
    plt.xlabel("", fontdict=axis_font)
    plt.ylabel("Days", fontdict=axis_font, loc="bottom")
    plt.tick_params(axis='y', rotation=0, size=10, labelsize=18)
    plt.tick_params(axis='x', rotation=90, size=20, labelsize=20, top=False)
    plt.tick_params(axis='y', right=False)
    # The automatic legend is disabled above and replaced by a hand-labelled one.
    plt.legend(["Yes","No"], title="Rain Tomorrow", title_fontsize=20, fontsize=20, loc='right', bbox_to_anchor=(1.05, 1))
    plt.title("Days of Rain Tomorrow per city \n over the covered period", fontsize=30)

In [None]:
barplot_rainTomorrow(data)

In [None]:
data["Date"] = pd.to_datetime(data["Date"])
# Vectorised month extraction through the .dt accessor instead of a per-row lambda.
data["Month"] = data["Date"].dt.month

In [None]:
@interact
def plot_climate(city=sorted(list(data.Location.unique()))):
    """Plot the monthly rainfall and temperature profile of one location.

    Bars show the mean of the "Rainfall" values recorded in each calendar month.
    Thick lines show the monthly mean of "MaxTemp" and "MinTemp"; the thin lines
    and shaded bands span the lowest and highest value recorded in each month.

    Parameters
    ----------
    city : str
        Location to plot. ``interact`` turns the sorted list of locations used
        as the default into a selection widget.
    """
    months = np.arange(1, 13)
    # Aggregate the selected city's rows once (instead of grouping the whole
    # frame seven times); reindex so every calendar month has a row even when
    # the city has no data for it.
    monthly = (
        data.loc[data["Location"] == city, ["Month", "Rainfall", "MaxTemp", "MinTemp"]]
        .groupby("Month")
        .agg(["mean", "min", "max"])
        .reindex(months)
    )

    fig, ax1 = plt.subplots(figsize=(13,7))
    ax2 = ax1.twinx()

    # Temperature curves on the secondary axis: max temperatures first, then min.
    for column, line_color, band_color, kind in (("MaxTemp", "red", "orangered", "max"),
                                                 ("MinTemp", "blue", "darkblue", "min")):
        summary = monthly[column]
        ax2.plot(months, summary["mean"], 'o-', color=line_color, linewidth=3, markersize=12, label=f"Avg {kind} temp")
        ax2.plot(months, summary["min"], 'o-', color=band_color, linewidth=1, markersize=5, alpha=0.3)
        ax2.plot(months, summary["max"], 'o-', color=band_color, linewidth=1, markersize=5, alpha=0.3)
        ax2.fill_between(months,
                         summary["min"],
                         summary["max"],
                         color=band_color,
                         alpha=0.1,
                         label=f"{kind.capitalize()} temp range")
    ax2.tick_params(axis='y', size=5, labelsize=13)
    ax2.set_ylabel('Temperature [°C]', fontsize=15, labelpad=15)
    ax2.legend(title='', bbox_to_anchor=(1.05, 1), loc='upper left')
    for side in ("top", "right", "left"):
        ax2.spines[side].set_visible(False)

    # Rainfall bars on the primary axis, with month names as tick labels.
    ax1.bar(x=months, height=monthly[("Rainfall", "mean")], label="Avg monthly rainfall")
    plt.xticks(months, [calendar.month_name[int(month)] for month in months])
    ax1.tick_params(axis='x', rotation=45, size=13, labelsize=13, top=False)
    ax1.tick_params(axis='y', size=5, labelsize=13)
    ax1.set_ylabel('Average monthly rainfall [mm]', fontsize=15, labelpad=15)
    # The locations span the whole country, so the title names Australia
    # rather than a single state.
    ax1.set_title(f"{city}, Australia", pad=20, fontdict={"fontsize":30, "color":"black"})
    for side in ("top", "right", "left"):
        ax1.spines[side].set_visible(False)
    ax1.legend(title='', bbox_to_anchor=(1.05, 1.0), loc='lower left')
    plt.tight_layout()

In [None]:
data.head()

# 7. Training

## 7.1. Clean data
<div style="font-weight:700">Let's first remove the features that were only needed for exploration and won't be useful for training:</div> <ul><li><span style="color:royalblue">latitude</span>, <span style="color:royalblue">longitude</span>: because we have the feature <span style="color:royalblue">Location</span></li><li><span style="color:royalblue">Date</span>: because we have the feature <span style="color:royalblue">Month</span></li></ul>

In [None]:
# Remove the exploration-only columns and renumber the rows from zero.
exploration_only = ["Date", "longitude", "latitude"]
data = data.drop(columns=exploration_only).reset_index(drop=True)

In [None]:
def clean_data(df):
    """Return a cleaned copy of the weather frame, ready for modelling.

    Rows missing "RainTomorrow" or "RainToday" are dropped, "Month" numbers are
    replaced by their month names, and both rain flags are encoded as int64
    (1 for "Yes", 0 for anything else). The input frame is not modified.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with "RainTomorrow", "RainToday" and a numeric "Month" column.

    Returns
    -------
    pandas.DataFrame
        The filtered and re-encoded copy; the original row index is kept.
    """
    # Work on an explicit copy so the assignments below never write into a
    # slice of the caller's frame.
    cleaned = df.dropna(subset=["RainTomorrow", "RainToday"]).copy()
    cleaned["Month"] = cleaned["Month"].map(lambda month: calendar.month_name[month])
    for flag in ("RainTomorrow", "RainToday"):
        cleaned[flag] = cleaned[flag].eq("Yes").astype("int64")
    return cleaned
#
data = clean_data(data)

In [None]:
data["MinTemp"].max()

## 7.2. Model inputs

In [None]:
# Split the cleaned frame into the feature matrix and the label column.
target = "RainTomorrow"
X_train = data.drop(columns=[target]).copy()
y_train = data[target].copy()

In [None]:
X_train.head()

## 7.3. Pipelines

### 7.3.1. Pipeline for numeric features
<div style="font-weight:700">Let's define <span style="color:green; font-variant:small-caps">mean</span> as the default impute strategy</div>

In [None]:
# Numeric branch: fill missing values with the column mean.
pipe_numeric = Pipeline([
    ('imputer', SimpleImputer(strategy = 'mean'))
])
pipe_numeric

### 7.3.2. Pipeline for binary features

In [None]:
# Binary branch: dense one-hot encoding; drop='if_binary' keeps a single column for two-valued features.
# NOTE(review): newer scikit-learn releases renamed `sparse` to `sparse_output` — confirm against the installed version.
pipe_binary = Pipeline([
    ('encoder', OneHotEncoder(sparse=False, drop='if_binary'))
])
pipe_binary

### 7.3.3. Pipeline for multiclass features
<div style="font-weight:700">Let's define <span style="color:green; font-variant:small-caps">most frequent</span> as the default impute strategy</div>

In [None]:
# Multiclass branch: fill gaps with the most frequent value, then dense one-hot encoding.
# handle_unknown="ignore" lets transform accept categories that were absent at fit time.
# NOTE(review): newer scikit-learn releases renamed `sparse` to `sparse_output` — confirm against the installed version.
pipe_multiclass = Pipeline([
    ('imputer', SimpleImputer(strategy = 'most_frequent')),
    ('encoder', OneHotEncoder(sparse=False, handle_unknown="ignore"))
])
pipe_multiclass

### 7.3.4. Impute and encode pipeline combination

In [None]:
# Route each column to a branch by dtype: float64 -> numeric, int64 -> binary, object -> multiclass.
# The output columns follow this branch order.
impute_and_encode = ColumnTransformer([
    ('numeric', pipe_numeric, make_column_selector(dtype_include="float64")),
    ('binary', pipe_binary, make_column_selector(dtype_include="int64")),
    ('multiclass', pipe_multiclass, make_column_selector(dtype_include="object"))])
impute_and_encode

### 7.3.5. Preprocessor pipeline
<div style="font-weight:700">Let's define <span style="color:green; font-variant:small-caps">StandardScaler()</span> as the default scaler</div>

In [None]:
# Impute/encode first, then scale every resulting column.
# The "scaler" step name is what the search grids address as "preprocessor__scaler".
preprocessor = Pipeline([("preproc", impute_and_encode), 
                         ("scaler", StandardScaler())])

<div style="font-weight:700">Overview of the preprocessor pipeline output:</div>

In [None]:
preprocessor.fit(X_train)

<div style="font-weight:700">The output columns will be in the order of appearance in the ColumnTransformer pipe:</div>

In [None]:
def def_col_names(a, colnames):
    """Return one label per entry of ``colnames``, each prefixed with ``a`` and an underscore."""
    return [a + "_" + str(suffix) for suffix in colnames]

In [None]:
# Labels for the preprocessor output, in ColumnTransformer branch order:
# float64 columns, then int64 columns, then one label per one-hot category.
output_pipe_columns = list(make_column_selector(dtype_include="float64")(X_train))
output_pipe_columns += list(make_column_selector(dtype_include="int64")(X_train))
for k in make_column_selector(dtype_include="object")(X_train):
    # OneHotEncoder emits each feature's categories in sorted order, not in
    # order of first appearance, so the labels must be sorted to line up
    # with the encoded columns.
    output_pipe_columns += def_col_names(k, sorted(X_train[k].dropna().unique()))

In [None]:
# Run the preprocessing once and wrap the resulting array with readable column labels.
preprocessed = preprocessor.fit_transform(X_train)
X_train_scaled = pd.DataFrame(preprocessed, columns=output_pipe_columns)
X_train_scaled.head()

### 7.3.6. Final pipeline
<div style="font-weight:700">Let's add a baseline model - <span style="color:green; font-variant:small-caps">LogisticRegression()</span> - to the pipeline as the default model</div>

In [None]:
# Full model: preprocessing followed by the classifier.
# The search grids replace the "classifier" step to try other estimators.
final_pipe = Pipeline([
    ("preprocessor", preprocessor),
    ('classifier', LogisticRegression(max_iter=10000))])
final_pipe

## 7.4. Baseline performance with a LogisticRegression classifier

### 7.4.1. Baseline recall score

In [None]:
# 10-fold cross-validated recall of the baseline pipeline, reported as a percentage.
cv_baseline_LR = cross_validate(final_pipe, X_train, y_train, scoring="recall", cv=10)
baseline_recall = round(cv_baseline_LR["test_score"].mean()*100, 2)
print(f"Baseline recall score for LogisticRegression: {baseline_recall}%")

### 7.4.2. Baseline confusion matrix

In [None]:
# Out-of-fold predictions of the baseline pipeline, stored next to the true labels.
baseline_predictions = cross_val_predict(estimator=final_pipe, X=X_train, y=y_train, cv=10)
prediction_dataFrame = pd.DataFrame(y_train).assign(predictions=baseline_predictions)

In [None]:
# Rename the two columns (true label, prediction) to the names used by the plotting cell below.
prediction_dataFrame.columns = ["y_true", "y_pred"]
prediction_dataFrame.head()

In [None]:
# Two panels: confusion matrix on the left, headline metrics as text on the right.
fig = plt.figure(figsize=(18,6))
gs = fig.add_gridspec(1,2)
ax1 = fig.add_subplot(gs[0, 0])
disp = ConfusionMatrixDisplay(
    confusion_matrix=confusion_matrix(prediction_dataFrame["y_true"], prediction_dataFrame["y_pred"])
)
disp.plot(cmap="Blues", ax=ax1);
ax2 = fig.add_subplot(gs[0, 1])
# The right panel only carries text, so hide its frame and tick labels.
for side in ('right', 'top', 'bottom', 'left'):
    ax2.spines[side].set_visible(False)
ax2.set_xticklabels([])
ax2.set_yticklabels([])
# One line of text per metric, stacked from top to bottom.
for y_pos, label, metric in ((0.6, 'Accuracy', accuracy_score),
                             (0.4, 'Recall', recall_score),
                             (0.2, 'Precision', precision_score)):
    value = round(metric(prediction_dataFrame["y_true"], prediction_dataFrame["y_pred"])*100, 1)
    ax2.text(0.1, y_pos, f"{label} = {value}%", fontdict= {"fontsize":30});

## 7.5. Model optimisation

### 7.5.1. Pearson Correlation
<div style="font-weight:700">Let's see which features are redundant</div>

In [None]:
# Preprocess the training features and label the columns for the correlation analysis.
preprocessed = preprocessor.fit_transform(X_train)
X_train_scaled = pd.DataFrame(preprocessed, columns=output_pipe_columns)
X_train_scaled.head()

In [None]:
# Long-format table of pairwise correlations, strongest first, without the self-pairs.
corr_df = X_train_scaled.corr().unstack().reset_index()
corr_df.columns = ['feature_1','feature_2', 'correlation']
corr_df = (
    corr_df
    .sort_values(by="correlation", ascending=False)
    .loc[lambda pairs: pairs['feature_1'] != pairs['feature_2']]
    .reset_index(drop=True)
)
corr_df.head()

<div style="font-weight:700">We remove features which have a high correlation coefficient ( above 0.9 or below -0.9 )</div>

In [None]:
# Keep the pairs whose correlation magnitude exceeds 0.9 in either direction.
high_correlation_coeff = corr_df[corr_df["correlation"].abs() > 0.9]
high_correlation_coeff

In [None]:
# Drop the features singled out from the correlation table above. Reassigning
# (instead of inplace=True) and ignoring already-missing columns keeps this
# cell safe to re-run.
X_train = X_train.drop(columns=["Temp3pm", "Pressure3pm"], errors="ignore")

### 7.5.2. Randomized search cv for an optimized model
<div style="font-weight:700">We want to see if we can increase the recall score to capture more of the minority class, that is to say days with rain tomorrow</div>

#### 7.5.2.1. Baseline model
<div style="font-weight:700">What is the model providing the best recall score with its default parameters ?</div>

In [None]:
# Five candidate classifiers crossed with three scalers, i.e. 15 pipelines.
# All classifiers use their defaults apart from max_iter on LogisticRegression.
# NOTE(review): no random_state is set on any estimator, so the winning
# combination may differ between runs — confirm before relying on it below.
grid_model = {'classifier': [RandomForestClassifier(), 
                             LogisticRegression(max_iter=10000), 
                             GradientBoostingClassifier(),
                             DecisionTreeClassifier(),
                             LinearSVC()],
              "preprocessor__scaler": [StandardScaler(), RobustScaler(), MinMaxScaler()]
             }

# Exhaustive search scored on recall with 5-fold cross-validation, using all cores.
search_model = GridSearchCV(final_pipe,
                            grid_model,
                            scoring="recall",
                            cv=5,
                            n_jobs=-1,
                            verbose=1)
search_model.fit(X_train, y_train);

In [None]:
search_model.best_params_

In [None]:
search_model.best_score_

In [None]:
# Out-of-fold predictions of the best pipeline from the grid search, next to the true labels.
pred_best_model = pd.DataFrame({
    "y_true": y_train,
    "y_pred": cross_val_predict(estimator=search_model.best_estimator_, X=X_train, y=y_train, cv=5),
})

In [None]:
# Two panels: confusion matrix on the left, headline metrics as text on the right.
fig = plt.figure(figsize=(18,6))
gs = fig.add_gridspec(1,2)
ax1 = fig.add_subplot(gs[0, 0])
disp = ConfusionMatrixDisplay(
    confusion_matrix=confusion_matrix(pred_best_model["y_true"], pred_best_model["y_pred"])
)
disp.plot(cmap="Blues", ax=ax1);
ax2 = fig.add_subplot(gs[0, 1])
# The right panel only carries text, so hide its frame and tick labels.
for side in ('right', 'top', 'bottom', 'left'):
    ax2.spines[side].set_visible(False)
ax2.set_xticklabels([])
ax2.set_yticklabels([])
# One line of text per metric, stacked from top to bottom.
for y_pos, label, metric in ((0.6, 'Accuracy', accuracy_score),
                             (0.4, 'Recall', recall_score),
                             (0.2, 'Precision', precision_score)):
    value = round(metric(pred_best_model["y_true"], pred_best_model["y_pred"])*100, 1)
    ax2.text(0.1, y_pos, f"{label} = {value}%", fontdict= {"fontsize":30});

#### 7.5.2.2. Best parameters for the best baseline model: DecisionTreeClassifier()
<div style="font-weight:700">Now what are the best parameters for DecisionTreeClassifier():</div>

In [None]:
# Search space for the decision tree: categorical options are lists, integer
# hyper-parameters are sampled from scipy distributions; the scaler is fixed.
grid_DTC = {'classifier': [DecisionTreeClassifier()],
            'classifier__criterion': ["gini", "entropy"],
            'classifier__splitter': ["best", "random"],
            'classifier__max_depth': stats.randint(1, 300),
            'classifier__min_samples_split': stats.randint(2, 30),
            # class_weight stays at its default here; the balanced variant is searched in a later cell.
            "preprocessor__scaler": [RobustScaler()]
            }

# 20 random draws from the space, scored on recall with 5-fold cross-validation.
# NOTE(review): no random_state is passed, so the draws are not reproducible.
search_DTC = RandomizedSearchCV(final_pipe,
                                grid_DTC,
                                scoring="recall",
                                n_iter=20,
                                cv=5,
                                n_jobs=-1,
                                verbose=True)
search_DTC.fit(X_train, y_train);

In [None]:
search_DTC.best_params_

In [None]:
search_DTC.best_score_

In [None]:
# Out-of-fold predictions of the tuned decision-tree pipeline, next to the true labels.
pred_dtc_opt = pd.DataFrame({
    "y_true": y_train,
    "y_pred": cross_val_predict(estimator=search_DTC.best_estimator_, X=X_train, y=y_train, cv=5),
})

In [None]:
# Two panels: confusion matrix on the left, headline metrics as text on the right.
fig = plt.figure(figsize=(18,6))
gs = fig.add_gridspec(1,2)
ax1 = fig.add_subplot(gs[0, 0])
disp = ConfusionMatrixDisplay(
    confusion_matrix=confusion_matrix(pred_dtc_opt["y_true"], pred_dtc_opt["y_pred"])
)
disp.plot(cmap="Blues", ax=ax1);
ax2 = fig.add_subplot(gs[0, 1])
# The right panel only carries text, so hide its frame and tick labels.
for side in ('right', 'top', 'bottom', 'left'):
    ax2.spines[side].set_visible(False)
ax2.set_xticklabels([])
ax2.set_yticklabels([])
# One line of text per metric, stacked from top to bottom.
for y_pos, label, metric in ((0.6, 'Accuracy', accuracy_score),
                             (0.4, 'Recall', recall_score),
                             (0.2, 'Precision', precision_score)):
    value = round(metric(pred_dtc_opt["y_true"], pred_dtc_opt["y_pred"])*100, 1)
    ax2.text(0.1, y_pos, f"{label} = {value}%", fontdict= {"fontsize":30});

<div style="font-weight:700">Let's see what happens if we use the class_weight = balanced parameter</div>

In [None]:
# Same search space as grid_DTC, with class_weight pinned to "balanced".
grid_DTC_opt = {'classifier': [DecisionTreeClassifier()],
                'classifier__criterion': ["gini", "entropy"],
                'classifier__splitter': ["best", "random"],
                'classifier__max_depth': stats.randint(1, 300),
                'classifier__min_samples_split': stats.randint(2, 30),
                'classifier__class_weight': ["balanced"],
                "preprocessor__scaler": [RobustScaler()]
                }

# 20 random draws from the space, scored on recall with 5-fold cross-validation.
# NOTE(review): no random_state is passed, so the draws are not reproducible.
search_DTC_opt = RandomizedSearchCV(final_pipe,
                                    grid_DTC_opt,
                                    scoring="recall",
                                    n_iter=20,
                                    cv=5,
                                    n_jobs=-1,
                                    verbose=True)
search_DTC_opt.fit(X_train, y_train);

In [None]:
search_DTC_opt.best_params_

In [None]:
search_DTC_opt.best_score_

In [None]:
# Out-of-fold predictions of the class-weighted decision-tree pipeline, next to the true labels.
pred_dtc_opt_balanced = pd.DataFrame({
    "y_true": y_train,
    "y_pred": cross_val_predict(estimator=search_DTC_opt.best_estimator_, X=X_train, y=y_train, cv=5),
})

In [None]:
# Two panels: confusion matrix on the left, headline metrics as text on the right.
fig = plt.figure(figsize=(18,6))
gs = fig.add_gridspec(1,2)
ax1 = fig.add_subplot(gs[0, 0])
disp = ConfusionMatrixDisplay(
    confusion_matrix=confusion_matrix(pred_dtc_opt_balanced["y_true"], pred_dtc_opt_balanced["y_pred"])
)
disp.plot(cmap="Blues", ax=ax1);
ax2 = fig.add_subplot(gs[0, 1])
# The right panel only carries text, so hide its frame and tick labels.
for side in ('right', 'top', 'bottom', 'left'):
    ax2.spines[side].set_visible(False)
ax2.set_xticklabels([])
ax2.set_yticklabels([])
# One line of text per metric, stacked from top to bottom.
for y_pos, label, metric in ((0.6, 'Accuracy', accuracy_score),
                             (0.4, 'Recall', recall_score),
                             (0.2, 'Precision', precision_score)):
    value = round(metric(pred_dtc_opt_balanced["y_true"], pred_dtc_opt_balanced["y_pred"])*100, 1)
    ax2.text(0.1, y_pos, f"{label} = {value}%", fontdict= {"fontsize":30});

<div style="font-weight:700">Losing a bit of accuracy but greatly increasing the recall score.</div>

### 7.5.3. Model simplification
<div style="font-weight:700">We remove features with low prediction score from permutation importance</div>

In [None]:
# Fit the best class-weighted pipeline on the full training set.
# NOTE(review): despite its name, `log_model` holds the decision-tree pipeline from search_DTC_opt, not a logistic regression.
log_model = search_DTC_opt.best_estimator_.fit(X_train, y_train)

In [None]:
# Permutation importance of each input column of X_train, with 50 repeats per column.
# NOTE(review): no `scoring` is passed, so this presumably measures the pipeline's default score rather than recall — confirm.
permutation_score = permutation_importance(log_model, X_train, y_train, n_repeats=50)

In [None]:
# Build the table column-wise so the scores keep a numeric dtype (stacking
# names and floats into one array turns everything into objects).
importance_df = pd.DataFrame({"feature": X_train.columns,
                              "score decrease": permutation_score.importances_mean})

In [None]:
importance_df.columns=['feature','score decrease']

In [None]:
importance_df.sort_values(by="score decrease", ascending = False)

In [None]:
# Hand-picked subset of features kept for the simplified model.
selected_features = ["Humidity3pm", "WindGustSpeed", "Location", "Pressure9am", "MinTemp"]
X_reduced = X_train[selected_features]

In [None]:
# Cross-validated recall of the tuned decision-tree pipeline on the reduced feature set.
cv_reduced_DTC = cross_validate(search_DTC_opt.best_estimator_, X_reduced, y_train, scoring= "recall", cv=2)
# The estimator comes from the DecisionTreeClassifier search, so the message names that model.
print("Reduced recall score for DecisionTreeClassifier: " + str(round(cv_reduced_DTC["test_score"].mean()*100, 2)) + "%")

In [None]:
# Out-of-fold predictions of the tuned pipeline on the reduced feature set, next to the true labels.
pred_dtc_opt_reduced = pd.DataFrame({
    "y_true": y_train,
    "y_pred": cross_val_predict(estimator=search_DTC_opt.best_estimator_, X=X_reduced, y=y_train, cv=5),
})

In [None]:
# Two panels: confusion matrix on the left, headline metrics as text on the right.
fig = plt.figure(figsize=(18,6))
gs = fig.add_gridspec(1,2)
ax1 = fig.add_subplot(gs[0, 0])
disp = ConfusionMatrixDisplay(
    confusion_matrix=confusion_matrix(pred_dtc_opt_reduced["y_true"], pred_dtc_opt_reduced["y_pred"])
)
disp.plot(cmap="Blues", ax=ax1);
ax2 = fig.add_subplot(gs[0, 1])
# The right panel only carries text, so hide its frame and tick labels.
for side in ('right', 'top', 'bottom', 'left'):
    ax2.spines[side].set_visible(False)
ax2.set_xticklabels([])
ax2.set_yticklabels([])
# One line of text per metric, stacked from top to bottom.
for y_pos, label, metric in ((0.6, 'Accuracy', accuracy_score),
                             (0.4, 'Recall', recall_score),
                             (0.2, 'Precision', precision_score)):
    value = round(metric(pred_dtc_opt_reduced["y_true"], pred_dtc_opt_reduced["y_pred"])*100, 1)
    ax2.text(0.1, y_pos, f"{label} = {value}%", fontdict= {"fontsize":30});

### 7.5.4. Undersampling Majority target class / Oversampling Minority target class

In [None]:
# Preprocess the whole training set up front; the samplers below work on this numeric array.
# NOTE(review): the preprocessor is fitted on all rows here, before any cross-validation split.
X_train_resampled = preprocessor.fit_transform(X_train)
pd.DataFrame(X_train_resampled).head()

In [None]:
''' Imbalanced Classes'''
import imblearn
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
from imblearn.pipeline import Pipeline as imPipe

In [None]:
# Sampling strategies
over = SMOTE(sampling_strategy=0.4)
under = RandomUnderSampler(sampling_strategy=0.7)

In [None]:
# Pipelining the two strategies
steps =  [('o', over), ('u', under)]
sampling_pipe = imPipe(steps=steps)


# Rebalance the dataset
X_resampled, y_resampled = sampling_pipe.fit_resample(X_train_resampled, y_train)

In [None]:
# plotting
warnings.filterwarnings('ignore')
fig, ax = plt.subplots(1, 2, figsize=(18,4))

''' Before rebalancing classes'''

sns.countplot(y_train, ax=ax[0], color='b')
ax[0].set_title('Y before balancing', fontsize=14)

''' After rebalancing classes'''

sns.countplot(y_resampled, ax=ax[1], color='b')
ax[1].set_title('Y after balancing', fontsize=14);

In [None]:
# Decision tree with hard-coded hyper-parameters.
# NOTE(review): these values were presumably copied from an earlier search run — confirm
# they still match search_DTC_opt.best_params_, since the search is not seeded.
resampled_classifier = DecisionTreeClassifier(class_weight='balanced',
                                              max_depth=19,
                                              min_samples_split=7, 
                                              splitter='random', 
                                              criterion="gini")

In [None]:
# Run preprocessing and resampling inside the cross-validated pipeline: the
# scaler and the samplers are then fitted on each training fold only, and
# recall is measured on validation rows that were never resampled. Scoring
# the pre-resampled array instead lets synthetic neighbours of validation
# rows leak into training and inflates the score.
# The preprocessor's steps are spliced in individually because an imblearn
# pipeline does not accept a nested Pipeline as an intermediate step.
resampled_dtc_pipe = imPipe(steps=[*preprocessor.steps,
                                   ('o', over),
                                   ('u', under),
                                   ('classifier', resampled_classifier)])
cv_resampled = cross_validate(resampled_dtc_pipe, X_train, y_train, scoring= "recall", cv=10)
print("Resampled dataset recall score for DTC: " + str(round(cv_resampled["test_score"].mean()*100, 2)) + "%")

In [None]:
# Same leakage-free setup as for the decision tree: preprocessing and both
# samplers are fitted per training fold, and validation folds stay untouched.
# The preprocessor's steps are spliced in individually because an imblearn
# pipeline does not accept a nested Pipeline as an intermediate step.
resampled_lr_pipe = imPipe(steps=[*preprocessor.steps,
                                  ('o', over),
                                  ('u', under),
                                  ('classifier', LogisticRegression(class_weight="balanced", max_iter=10000))])
cv_resampled = cross_validate(resampled_lr_pipe, X_train, y_train, scoring= "recall", cv=10)
# The estimator is a LogisticRegression, so the message names that model rather than "DTC".
print("Resampled dataset recall score for LogisticRegression: " + str(round(cv_resampled["test_score"].mean()*100, 2)) + "%")