In [None]:
## Importing required packages and functions.
import os
import pickle
import random
import subprocess
from zipfile import ZipFile

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.decomposition import PCA
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.feature_selection import SelectPercentile, chi2
from sklearn.feature_selection import VarianceThreshold
from sklearn.metrics import accuracy_score
from sklearn.metrics import ConfusionMatrixDisplay
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import make_pipeline
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC

In [None]:
## This data was obtained from an open source at: https://www.kaggle.com/datasets/zaraavagyan/weathercsv?resource=download
## I do not claim ownership of the data, all rights belong to the respective owner(s).

## Downloading the data with the kaggle command line tool.
## check=True stops the cell with an error if the download command fails.
subprocess.run(['kaggle', 'datasets', 'download', '-d', 'zaraavagyan/weathercsv'], check=True)
with ZipFile("weathercsv.zip", 'r') as ZIPPED:
    ZIPPED.extractall("./Assets/")

## Removing the downloaded archive.
## os.remove works on every platform; the shell command `del` only exists on Windows,
## and os.system would not raise when it fails elsewhere.
os.remove("weathercsv.zip")

## Loading the data and dropping every row that contains an NA value.
weatherData = pd.read_csv("./Assets/weather.csv").dropna()

In [None]:
## Previewing the first rows of the loaded data.
## (The correlation heatmap is drawn further down, after the Yes/No columns are encoded.)
weatherData.head(n=5)

In [None]:
## Collecting the names of the columns stored with the object ('O') dtype.

isObjectColumn = weatherData.dtypes == 'O'
categoricalFeatures = weatherData.columns[isObjectColumn].tolist()
categoricalCount = len(categoricalFeatures)
print("Amount of Categorical Features: {}".format(categoricalCount))
print("Categorical Features: ",categoricalFeatures)

In [None]:
## Cardinality check: number of distinct values held by each categorical column.
## dropna=False counts a missing value as its own category, like len(unique()) does.
cardinalityPerFeature = weatherData[categoricalFeatures].nunique(dropna=False)
for each_feature, unique_values in cardinalityPerFeature.items():
   print("Cardinality(no. of unique values) of {} are: {}".format(each_feature, unique_values))


In [None]:
## Dropping high cardinality columns.
## Re-assigning the result (instead of inplace=True) together with errors="ignore" keeps
## this cell safe to re-run: a second run finds the columns already gone and changes nothing.
highCardinalityColumns = ['WindGustDir', 'WindDir9am', 'WindDir3pm']
weatherData = weatherData.drop(columns=highCardinalityColumns, errors="ignore")
weatherData.head()

In [None]:
## Checking for any NULL values in data, should be 0 everywhere as all NA/NULL rows were dropped.
categoricalFeatures = [column_name for column_name in weatherData.columns if weatherData[column_name].dtype == 'O']
numericalFeatures = [column_name for column_name in weatherData.columns if weatherData[column_name].dtype != 'O']

## A notebook cell only displays its last expression, so both groups are counted in one
## expression (categorical columns first, then the numerical ones).
weatherData[categoricalFeatures + numericalFeatures].isnull().sum()

In [None]:
## Collecting the names of the columns whose dtype is anything other than object ('O').

isNonObjectColumn = weatherData.dtypes != 'O'
numericalFeatures = weatherData.columns[isNonObjectColumn].tolist()
numericalCount = len(numericalFeatures)
print("Amount of Numerical Features: {}".format(numericalCount))
print("Numerical Features: ",numericalFeatures)

In [None]:
## Feature encoding: turning the Yes/No columns into 1/0.
## The encoded column is assigned back to the DataFrame. Calling
## replace(..., inplace=True) on a selected column acts on a temporary object under
## pandas copy-on-write and would leave weatherData itself unchanged.
yesNoEncoding = {'No': 0, 'Yes': 1}

for binaryColumn in ('RainToday', 'RainTomorrow'):
    ## .get(value, value) leaves values that are already encoded untouched, so the cell
    ## can be re-run; astype(int) fails loudly if anything other than Yes/No/0/1 is present.
    weatherData[binaryColumn] = (
        weatherData[binaryColumn]
        .map(lambda value: yesNoEncoding.get(value, value))
        .astype(int)
    )

In [None]:
## Drawing the pairwise correlations between the columns of weatherData as a heatmap.
fig, ax = plt.subplots(figsize=(20, 20))
correlationMatrix = weatherData.corr()
sns.heatmap(correlationMatrix, ax=ax, cmap='viridis', linewidths=0.5, annot=False, fmt=".2f")

In [None]:
## Quantifying feature importance with an extra-trees ensemble.
## NOTE(review): if the data contains a RISK_MM column (the dummy data set further down
## suggests it does), check whether it is derived from next-day rainfall; if so it leaks
## the RainTomorrow target into X and should be excluded - confirm against the data docs.

X = weatherData.drop(['RainTomorrow'],axis=1)
y = weatherData['RainTomorrow']

## random_state is fixed so the importance ranking is the same on every run.
etr_model = ExtraTreesRegressor(random_state=0)
etr_model.fit(X,y)

## Visualizing the ten largest feature importances.
feature_imp = pd.Series(etr_model.feature_importances_,index=X.columns)
feature_imp.nlargest(10).plot(kind='barh', title="Top 10 feature importances (ExtraTreesRegressor)")

In [None]:
## Splitting the data: 20% of the rows are held back for testing, with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=0,
    test_size=0.2,
)
trainingRowCount = len(X_train)
testingRowCount = len(X_test)
print("Length of Training Data: {}".format(trainingRowCount))
print("Length of Testing Data: {}".format(testingRowCount))

# KN classifier

In [None]:
## Building the KN model: feature scaling, then PCA, then a 10-neighbour classifier.
knnSteps = make_pipeline(StandardScaler(), PCA(), KNeighborsClassifier(n_neighbors=10))

## The grid search tries 1 to 9 PCA components.
pcaComponentGrid = {"pca__n_components": range(1, 10)}
KNC_Pipeline = GridSearchCV(knnSteps, pcaComponentGrid)

## Fitting the model, then reporting the test score and the selected number of PCA components.
KNC_Pipeline.fit(X_train, y_train)
print(f"Score for K-Neighbors Classifier is: {KNC_Pipeline.score(X_test, y_test):.3}")
print(f"From PCA analysis, the best estimator is: {KNC_Pipeline.best_estimator_['pca'].n_components_}")


In [None]:
## Predicting the test rows with the fitted KN grid search.
y_PredictionKNC = KNC_Pipeline.predict(X_test)

## Accuracy of those predictions against the true labels.
knnAccuracy = accuracy_score(y_test, y_PredictionKNC)
print("Accuracy Score: {:.3}".format(knnAccuracy))

## Confusion matrix of true vs predicted labels, drawn as a plot.
cm = confusion_matrix(y_test, y_PredictionKNC)
cm_display = ConfusionMatrixDisplay(confusion_matrix=cm).plot()

In [None]:
## Checking the KN model for over/under fitting by comparing train and test scores.
print("Train Data Score: {}".format(KNC_Pipeline.score(X_train, y_train)))
print("Test Data Score: {}".format(KNC_Pipeline.score(X_test, y_test)))

## Estimating how stable the KN score is with 5-fold cross-validation on the training data.
## cross_val_score is imported with the other packages at the top of the notebook.
scores = cross_val_score(KNC_Pipeline, X_train, y_train, cv = 5, scoring='accuracy')
print('Cross-validation scores:{}'.format(scores))
print('Average cross-validation score: {}'.format(scores.mean()))

In [None]:
## Saving out the trained KN classifier.
## The target folder is created first so open() does not fail when it does not exist yet.
os.makedirs('./Classifiers', exist_ok=True)

with open('./Classifiers/KNC_Rain.pkl', 'wb') as file:
    pickle.dump(KNC_Pipeline, file)

# SVC classifier

In [None]:
## Checking how the cross-validated accuracy changes with the percentile of features kept.
## Steps: scale to [0, 1], keep the top percentile of features by chi2 score, linear SVC.
SVC_CheckPipeline = Pipeline(
    steps=[
        ("scaler", MinMaxScaler()),
        ("anova", SelectPercentile(chi2)),
        ("svc", SVC(kernel="linear", gamma="auto", decision_function_shape='ovr', random_state=42)),
    ]
)
percentiles = (1, 3, 6, 10, 15, 20, 30, 40, 60, 80, 100)
scoremeans = []
scoreSTDs = []

## One cross-validation run per percentile; the mean and spread of each run are recorded.
for percentile in percentiles:
    SVC_CheckPipeline.set_params(anova__percentile=percentile)
    this_scores = cross_val_score(SVC_CheckPipeline, X, y)
    scoremeans.append(np.mean(this_scores))
    scoreSTDs.append(np.std(this_scores))

## Plotting mean accuracy per percentile, with the standard deviation as error bars.
plt.errorbar(percentiles, scoremeans, yerr=np.array(scoreSTDs))
plt.title("How the performance of the SVC-Anova changes, \nbased on varying the percentile of features selected")
plt.xticks(np.linspace(0, 100, 11))
plt.xlabel("Percentile")
plt.ylabel("Accuracy")
plt.axis("tight")
plt.show()


In [None]:
## Training the SVC on every column of X.
## The percentile sweep in the previous cell is exploratory only: no feature selection is
## applied here, the PCA step inside the pipeline handles the dimensionality reduction.

X = weatherData.drop(['RainTomorrow'],axis=1)
y = weatherData['RainTomorrow']

## NOTE(review): 60% of the rows are held out here, while the KN classifier above was
## trained with 20% held out, and this call overwrites the X_train/X_test/y_train/y_test
## created for KN. Confirm the different split is intended before comparing the two scores.
X_train, X_test, y_train, y_test = train_test_split(X,y, test_size = 0.6, random_state = 0)

## The grid search tries 1 to 7 PCA components.
## NOTE(review): gamma is documented as the coefficient of the rbf/poly/sigmoid kernels,
## so it presumably has no effect with kernel="linear" - confirm and drop it if so.
SVC_Pipeline = GridSearchCV(
    make_pipeline(
        StandardScaler(),
        PCA(),
        SVC(gamma=10,random_state = 42,decision_function_shape='ovr',kernel="linear",tol=0.1,C=0.5),
    ),
    {
        "pca__n_components" : range(1, 8),
    }
)


## Fitting the model, then reporting the test score and the selected number of PCA components.
SVC_Pipeline.fit(X_train, y_train)
print(f"Score for SVC Classifier is: {SVC_Pipeline.score(X_test, y_test):.3}")
print(f"From PCA analysis, the best estimator is: {SVC_Pipeline.best_estimator_['pca'].n_components_}")

## Checking correct and incorrect predictions with a confusion matrix.
y_Prediction = SVC_Pipeline.predict(X_test)
confusionMatrix_SVC = confusion_matrix(y_test, y_Prediction)
cm_display = ConfusionMatrixDisplay(confusionMatrix_SVC).plot()

In [None]:
## Checking the SVC model for over/under fitting by comparing train and test scores.
print("Train Data Score: {}".format(SVC_Pipeline.score(X_train, y_train)))
print("Test Data Score: {}".format(SVC_Pipeline.score(X_test, y_test)))

## Estimating how stable the SVC score is with 5-fold cross-validation on the training data.
## cross_val_score is imported with the other packages at the top of the notebook.
scores = cross_val_score(SVC_Pipeline, X_train, y_train, cv = 5, scoring='accuracy')
print('Cross-validation scores:{}'.format(scores))
print('Average cross-validation score: {}'.format(scores.mean()))

In [None]:
## Saving out the trained SVC classifier.
## The folder name uses the same capitalisation as the one the KN classifier is saved to,
## so both models end up in one directory on case-sensitive file systems as well.
## The folder is created first so open() does not fail when it does not exist yet.
os.makedirs('./Classifiers', exist_ok=True)

with open('./Classifiers/SVC_Rain.pkl', 'wb') as file:
    pickle.dump(SVC_Pipeline, file)

---
# Conclusions

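The SVC was found to be the better model, with a test score of 0.944 versus 0.894 for the KN classifier. Note that the two models were scored on different train/test splits (60% of the data held out for the SVC, 20% for KN), so the comparison is indicative only.
If your scores differ from these, the downloaded data set may have changed since this notebook was written.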

---

## Making predictions

### Dummy data set

In [None]:
## Making a fake data set to try the trained models on.
rangeToUse = 20  ## number of fake rows to generate

## A dedicated, seeded generator makes the fake rows identical on every run
## without touching the state of the global `random` module.
fakeDataSeed = 42
fakeRng = random.Random(fakeDataSeed)

## Define column names
columns = [
    'MinTemp', 'MaxTemp', 'Rainfall', 'Evaporation', 'Sunshine', 
    'WindGustSpeed', 'WindSpeed9am', 'WindSpeed3pm', 'Humidity9am', 
    'Humidity3pm', 'Pressure9am', 'Pressure3pm', 'Cloud9am', 'Cloud3pm', 
    'Temp9am', 'Temp3pm', 'RainToday', 'RISK_MM'
]

## Defining the (minimum, maximum) range for each column
ranges = {
    'MinTemp': (-10, 40), 'MaxTemp': (-10, 40), 'Rainfall': (0, 10), 
    'Evaporation': (0, 12), 'Sunshine': (0, 15), 'WindGustSpeed': (0, 100), 
    'WindSpeed9am': (0, 50), 'WindSpeed3pm': (0, 50), 'Humidity9am': (70, 100), 
    'Humidity3pm': (70, 100), 'Pressure9am': (980, 1100), 'Pressure3pm': (980, 1100), 
    'Cloud9am': (0, 9), 'Cloud3pm': (0, 9), 'Temp9am': (-10, 40), 'Temp3pm': (-10, 40), 
    'RainToday': (0, 1), 'RISK_MM': (0, 100)
}

## Generate random values for each column.
## RainToday is a 0/1 flag, so it gets whole numbers; every other column gets a
## value rounded to one decimal place. Both read their limits from `ranges`.
fakeValues = {}
for col in columns:
    minVal, maxVal = ranges[col]
    if col == "RainToday":
        fakeValues[col] = [fakeRng.randint(minVal, maxVal) for _ in range(rangeToUse)]
    else:
        fakeValues[col] = [round(fakeRng.uniform(minVal, maxVal), 1) for _ in range(rangeToUse)]

## Building the DataFrame in one go, with the columns in the order listed above.
fakeDf = pd.DataFrame(fakeValues, columns=columns)

fakeDf.head(2)

### Actual predictions

In [None]:

## Predicting the forecast for the fake rows with both trained models.
## Only the feature columns the models were trained on (same names and order as X) are
## passed in, so this cell still works when it is re-run after the prediction columns
## below have been added to fakeDf.
fakeFeatures = fakeDf[list(X.columns)]

## Using the SVC model:
PredictedForecastSVC = SVC_Pipeline.best_estimator_.predict(fakeFeatures)


## Using the KN model:
PredictedForecastKN = KNC_Pipeline.best_estimator_.predict(fakeFeatures)


fakeDf['PredictedForecastSVC'] = PredictedForecastSVC
fakeDf['PredictedForecastKN'] = PredictedForecastKN

## True where both models predict the same value (one vectorised comparison
## instead of a row-by-row loop).
fakeDf['PredictionMatch'] = fakeDf['PredictedForecastKN'] == fakeDf['PredictedForecastSVC']


fakeDf.head(20)


In [None]:
## Share of fake rows on which the two models agree.
matchFlags = fakeDf['PredictionMatch']
falseCount = int(matchFlags.eq(False).sum())
trueCount = int(matchFlags.eq(True).sum())
percentageMatch =(trueCount/rangeToUse) * 100
print(f"Match percent: {percentageMatch}")