DATA COLLECTION

In [None]:
#Import the necessary libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn import svm
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn import metrics
from sklearn.metrics import accuracy_score, jaccard_score, f1_score, log_loss, mean_absolute_error, mean_squared_error, r2_score

In [None]:
#Load the Dataset
rainfall_dataset = pd.read_csv('/content/Australian_Rainfall.csv')

In [None]:
rainfall_dataset.head()

Unnamed: 0,row ID,Location,MinTemp,MaxTemp,Rainfall,Evaporation,Sunshine,WindGustDir,WindGustSpeed,WindDir9am,...,Humidity9am,Humidity3pm,Pressure9am,Pressure3pm,Cloud9am,Cloud3pm,Temp9am,Temp3pm,RainToday,RainTomorrow
0,Row0,Albury,13.4,22.9,0.6,,,W,44.0,W,...,71.0,22.0,1007.7,1007.1,8.0,,16.9,21.8,No,0
1,Row1,Albury,7.4,25.1,0.0,,,WNW,44.0,NNW,...,44.0,25.0,1010.6,1007.8,,,17.2,24.3,No,0
2,Row2,Albury,17.5,32.3,1.0,,,W,41.0,ENE,...,82.0,33.0,1010.8,1006.0,7.0,8.0,17.8,29.7,No,0
3,Row3,Albury,14.6,29.7,0.2,,,WNW,56.0,W,...,55.0,23.0,1009.2,1005.4,,,20.6,28.9,No,0
4,Row4,Albury,7.7,26.7,0.0,,,W,35.0,SSE,...,48.0,19.0,1013.4,1010.1,,,16.3,25.5,No,0


DATA ANALYSIS

In [None]:
#Check for missing values
rainfall_dataset.isnull().sum()

row ID               0
Location             0
MinTemp            443
MaxTemp            230
Rainfall           979
Evaporation      42531
Sunshine         47317
WindGustDir       6521
WindGustSpeed     6480
WindDir9am        7006
WindDir3pm        2648
WindSpeed9am       935
WindSpeed3pm      1835
Humidity9am       1233
Humidity3pm       2506
Pressure9am       9748
Pressure3pm       9736
Cloud9am         37572
Cloud3pm         40002
Temp9am            614
Temp3pm           1904
RainToday          979
RainTomorrow         0
dtype: int64

In [None]:
#Clean the dataset
df_cleaned = rainfall_dataset.dropna()
df_cleaned.to_csv('cleaned_dataset.csv', index=False)
new_rainfall_dataset = pd.read_csv('cleaned_dataset.csv')
new_rainfall_dataset.isnull().sum()

row ID           0
Location         0
MinTemp          0
MaxTemp          0
Rainfall         0
Evaporation      0
Sunshine         0
WindGustDir      0
WindGustSpeed    0
WindDir9am       0
WindDir3pm       0
WindSpeed9am     0
WindSpeed3pm     0
Humidity9am      0
Humidity3pm      0
Pressure9am      0
Pressure3pm      0
Cloud9am         0
Cloud3pm         0
Temp9am          0
Temp3pm          0
RainToday        0
RainTomorrow     0
dtype: int64

In [None]:
#Removing columns containing "String" attributes
df_modified = new_rainfall_dataset.drop(columns=['row ID', 'Location', 'WindGustDir', 'WindDir9am', 'WindDir3pm'])
df_modified.to_csv('modified_dataset.csv', index=False)
new_rainfall_dataset = pd.read_csv('modified_dataset.csv')
new_rainfall_dataset.head()

Unnamed: 0,MinTemp,MaxTemp,Rainfall,Evaporation,Sunshine,WindGustSpeed,WindSpeed9am,WindSpeed3pm,Humidity9am,Humidity3pm,Pressure9am,Pressure3pm,Cloud9am,Cloud3pm,Temp9am,Temp3pm,RainToday,RainTomorrow
0,17.9,35.2,0.0,12.0,12.3,48.0,6.0,20.0,20.0,13.0,1006.3,1004.4,2.0,5.0,26.6,33.4,No,0
1,27.1,36.1,0.0,13.0,0.0,43.0,7.0,20.0,26.0,19.0,1007.7,1007.4,8.0,8.0,30.7,34.3,No,0
2,23.3,34.0,0.0,9.8,12.6,41.0,17.0,19.0,33.0,15.0,1011.3,1009.9,3.0,1.0,25.0,31.5,No,0
3,16.1,34.2,0.0,14.6,13.2,37.0,15.0,6.0,25.0,9.0,1013.3,1009.2,1.0,1.0,20.7,32.8,No,0
4,19.0,35.5,0.0,12.0,12.3,48.0,30.0,9.0,46.0,28.0,1008.3,1004.0,1.0,5.0,23.4,33.3,No,0


In [None]:
#Label Binarization i.e., converting Yes/No into 1/0
df_modified['RainToday'] = new_rainfall_dataset['RainToday'].replace({'Yes': 1, 'No': 0})
df_modified.to_csv('modified_dataset.csv', index=False)
new_rainfall_dataset = pd.read_csv('modified_dataset.csv')
new_rainfall_dataset.head()

Unnamed: 0,MinTemp,MaxTemp,Rainfall,Evaporation,Sunshine,WindGustSpeed,WindSpeed9am,WindSpeed3pm,Humidity9am,Humidity3pm,Pressure9am,Pressure3pm,Cloud9am,Cloud3pm,Temp9am,Temp3pm,RainToday,RainTomorrow
0,17.9,35.2,0.0,12.0,12.3,48.0,6.0,20.0,20.0,13.0,1006.3,1004.4,2.0,5.0,26.6,33.4,0,0
1,27.1,36.1,0.0,13.0,0.0,43.0,7.0,20.0,26.0,19.0,1007.7,1007.4,8.0,8.0,30.7,34.3,0,0
2,23.3,34.0,0.0,9.8,12.6,41.0,17.0,19.0,33.0,15.0,1011.3,1009.9,3.0,1.0,25.0,31.5,0,0
3,16.1,34.2,0.0,14.6,13.2,37.0,15.0,6.0,25.0,9.0,1013.3,1009.2,1.0,1.0,20.7,32.8,0,0
4,19.0,35.5,0.0,12.0,12.3,48.0,30.0,9.0,46.0,28.0,1008.3,1004.0,1.0,5.0,23.4,33.3,0,0


In [None]:
#Statistical Analysis
new_rainfall_dataset.describe()

Unnamed: 0,MinTemp,MaxTemp,Rainfall,Evaporation,Sunshine,WindGustSpeed,WindSpeed9am,WindSpeed3pm,Humidity9am,Humidity3pm,Pressure9am,Pressure3pm,Cloud9am,Cloud3pm,Temp9am,Temp3pm,RainToday,RainTomorrow
count,39574.0,39574.0,39574.0,39574.0,39574.0,39574.0,39574.0,39574.0,39574.0,39574.0,39574.0,39574.0,39574.0,39574.0,39574.0,39574.0,39574.0,39574.0
mean,13.461414,24.211985,2.163137,5.495004,7.723311,40.891115,15.689518,19.79436,65.870622,49.565548,1017.265985,1014.821302,4.256052,4.343812,18.196958,22.705092,0.222292,0.222419
std,6.389018,6.96598,7.02519,3.697796,3.757568,13.3164,8.327292,8.531193,18.553322,20.185655,6.929267,6.894293,2.798105,2.644123,6.549293,6.829223,0.415792,0.415876
min,-6.7,7.0,0.0,0.0,0.0,11.0,2.0,2.0,0.0,0.0,980.5,978.2,0.0,0.0,-0.7,4.3,0.0,0.0
25%,8.6,18.7,0.0,2.8,5.0,31.0,9.0,13.0,55.0,35.0,1012.7,1010.1,1.0,2.0,13.1,17.4,0.0,0.0
50%,13.2,23.9,0.0,5.0,8.6,39.0,15.0,19.0,67.0,50.0,1017.2,1014.7,5.0,5.0,17.7,22.4,0.0,0.0
75%,18.4,29.7,0.6,7.4,10.7,48.0,20.0,26.0,79.0,63.0,1021.9,1019.5,7.0,7.0,23.3,27.9,0.0,0.0
max,31.4,48.1,183.0,81.2,14.5,124.0,67.0,76.0,100.0,100.0,1040.2,1037.3,8.0,9.0,39.0,46.1,1.0,1.0


In [None]:
#Check for Target value distribution
new_rainfall_dataset['RainTomorrow'].value_counts()

RainTomorrow
0    30772
1     8802
Name: count, dtype: int64

DATA PREPROCESSING

In [None]:
#Separating "RainTomorrow" column from other columns
X = new_rainfall_dataset.drop(columns=['RainTomorrow'])
Y = new_rainfall_dataset['RainTomorrow']

In [None]:
print(X)

       MinTemp  MaxTemp  Rainfall  Evaporation  Sunshine  WindGustSpeed  \
0         17.9     35.2       0.0         12.0      12.3           48.0   
1         27.1     36.1       0.0         13.0       0.0           43.0   
2         23.3     34.0       0.0          9.8      12.6           41.0   
3         16.1     34.2       0.0         14.6      13.2           37.0   
4         19.0     35.5       0.0         12.0      12.3           48.0   
...        ...      ...       ...          ...       ...            ...   
39569     21.9     33.0       0.0          5.2      10.9           44.0   
39570     19.3     33.4       0.0          6.0      11.0           35.0   
39571     21.2     32.6       0.0          7.6       8.6           37.0   
39572     20.7     32.8       0.0          5.6      11.0           33.0   
39573     20.2     31.7       0.0          5.6      10.7           30.0   

       WindSpeed9am  WindSpeed3pm  Humidity9am  Humidity3pm  Pressure9am  \
0               6.0    

In [None]:
print(Y)

0        0
1        0
2        0
3        0
4        0
        ..
39569    0
39570    0
39571    0
39572    0
39573    0
Name: RainTomorrow, Length: 39574, dtype: int64


TRAIN/TEST SPLITTING

In [None]:
#Split the Dataset into four variables i.e., X,Y Train and X,Y Test
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=2)

TRAINING ALL THE MODELS AND EVALUATING THEM ONE BY ONE

LINEAR REGRESSION

In [None]:
linr_model = LinearRegression()

In [None]:
linr_model.fit(X_train, Y_train)

In [None]:
testing_data_prediction = linr_model.predict(X_test)

In [None]:
#R Squared value
r2_test = metrics.r2_score(Y_test, testing_data_prediction)
print(f"R Squared value: {r2_test}")

R Squared value: 0.35382951941765484


In [None]:
#Mean Absolute Error value
mae_linr = mean_absolute_error(Y_test, testing_data_prediction)
print(f"Mean Absolute Error: {mae_linr}")

Mean Absolute Error: 0.24817768058881004


In [None]:
#Mean Squared Error value
mse_linr = mean_squared_error(Y_test, testing_data_prediction)
print(f"Mean Squared Error: {mse_linr}")

Mean Squared Error: 0.1111886999361474


K-NEAREST NEIGHBOURS

In [None]:
knn_model = KNeighborsRegressor(n_neighbors=5)

In [None]:
knn_model.fit(X_train, Y_train)

In [None]:
testing_data_prediction = knn_model.predict(X_test)

In [None]:
#R Squared value
r2_knn = r2_score(Y_test, testing_data_prediction)
print(f"R2 Score: {r2_knn}")

R2 Score: 0.2964552785012423


In [None]:
#Mean Absolute Error value
mae_knn = mean_absolute_error(Y_test, testing_data_prediction)
print(f"Mean Absolute Error: {mae_knn}")

Mean Absolute Error: 0.2054579911560329


In [None]:
#Mean Squared Error value
mse_knn = mean_absolute_error(Y_test, testing_data_prediction)
print(f"Mean Absolute Error: {mae_knn}")

Mean Absolute Error: 0.2054579911560329


DECISION TREES

In [None]:
dt_model = DecisionTreeRegressor(random_state=42)

In [None]:
dt_model.fit(X_train, Y_train)

In [None]:
testing_data_prediction = dt_model.predict(X_test)

In [None]:
#R Squared value
r2_tree = r2_score(Y_test, testing_data_prediction)
print(f"R2 Score: {r2_tree}")

R2 Score: -0.22690798332751383


In [None]:
#Mean Absolute Error value
mae_tree = mean_absolute_error(Y_test, testing_data_prediction)
print(f"Mean Absolute Error: {mae_tree}")

Mean Absolute Error: 0.2111181301326595


In [None]:
#Mean Squared Error value
mse_tree = mean_squared_error(Y_test, testing_data_prediction)
print(f"Mean Squared Error: {mse_tree}")

Mean Squared Error: 0.2111181301326595


LOGISTIC REGRESSION

In [None]:
logr_model = LogisticRegression()

In [None]:
logr_model.fit(X_train, Y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [None]:
#Accuracy scores of Testing Data
X_test_prediction = logr_model.predict(X_test)
testing_data_accuracy = accuracy_score(X_test_prediction, Y_test)
print(testing_data_accuracy)

0.8453569172457359


In [None]:
#Jaccard Index
jaccard = jaccard_score(Y_test, testing_data_prediction)
print(f"Jaccard Index: {jaccard}")

Jaccard Index: 0.36608497723823974


In [None]:
#F1-Score
f1 = f1_score(Y_test, testing_data_prediction)
print(f"F1-Score: {f1}")

F1-Score: 0.5359622327131354


In [None]:
#Log Loss
logloss = log_loss(Y_test, testing_data_prediction)
print(f"Log Loss: {logloss}")

Log Loss: 7.609468706660109


SUPPORT VECTOR MACHINE

In [None]:
svm_model = svm.SVC(kernel='linear')

In [None]:
svm_model.fit(X_train, Y_train)

In [None]:
#Accuracy scores of Testing Data
X_test_prediction = svm_model.predict(X_test)
testing_data_accuracy_svm = accuracy_score(X_test_prediction, Y_test)
print(f"Testing Accuracy: {testing_data_accuracy_svm}")

Testing Accuracy: 0.8478837650031585


In [None]:
#Jaccard Index
jaccard = jaccard_score(Y_test, testing_data_prediction)
print(f"Jaccard Index: {jaccard}")

Jaccard Index: 0.36608497723823974


In [None]:
#F1-Score
f1 = f1_score(Y_test, testing_data_prediction)
print(f"F1-Score: {f1}")

F1-Score: 0.5359622327131354


CREATE A FINAL REPORT ON ALL THE EVALUATION METRICS

In [None]:
metrics_dict = {
    'Model': ['Linear Regression', 'K-Nearest Neighbors', 'Decision Trees', 'Logistic Regression', 'Support Vector Machine'],
    'Accuracy': [None, None, None, testing_data_accuracy, testing_data_accuracy],
    'R2-Score': [r2_test, r2_knn, r2_tree, None, None],
    'Mean Absolute Error': [mae_linr, mae_knn, mae_tree, None, None],
    'Mean Squared Error': [mse_linr, mse_knn, mse_tree, None, None],
    'Jaccard Index': [None, None, None, jaccard, jaccard],
    'F1-Score': [None, None, None, f1, f1],
    'Log Loss': [None, None, None, logloss, None]
}

# Convert the dictionary into a DataFrame
metrics_df = pd.DataFrame(metrics_dict)

# Display the DataFrame
print(metrics_df)

                    Model  Accuracy  R2-Score  Mean Absolute Error  \
0       Linear Regression       NaN  0.353830             0.205458   
1     K-Nearest Neighbors       NaN  0.296455             0.205458   
2          Decision Trees       NaN -0.226908             0.211118   
3     Logistic Regression  0.845357       NaN                  NaN   
4  Support Vector Machine  0.845357       NaN                  NaN   

   Mean Squared Error  Jaccard Index  F1-Score  Log Loss  
0            0.121061            NaN       NaN       NaN  
1            0.205458            NaN       NaN       NaN  
2            0.211118            NaN       NaN       NaN  
3                 NaN       0.366085  0.535962  7.609469  
4                 NaN       0.366085  0.535962       NaN  
