In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import precision_score, recall_score, f1_score
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.utils.class_weight import compute_class_weight
from imblearn.under_sampling import RandomUnderSampler
from imblearn.over_sampling import RandomOverSampler, SMOTE
from imblearn.metrics import classification_report_imbalanced
from sklearn.model_selection import GridSearchCV

In [None]:
import warnings
# Install a global "ignore" filter: no warning of any category is shown after
# this point. This also hides library warnings (for example convergence or
# failed-fit notices), so problems in later cells can go unnoticed.
warnings.filterwarnings("ignore")

In [None]:
# Read the weather CSV from the working directory and preview its first rows
df = pd.read_csv(filepath_or_buffer='SriLanka_Weather_Dataset.csv')
df.head(n=5)

Unnamed: 0,time,weathercode,temperature_2m_max,temperature_2m_min,temperature_2m_mean,apparent_temperature_max,apparent_temperature_min,apparent_temperature_mean,sunrise,sunset,...,precipitation_hours,windspeed_10m_max,windgusts_10m_max,winddirection_10m_dominant,et0_fao_evapotranspiration,latitude,longitude,elevation,country,city
0,2010-01-01,2,30.0,22.7,26.1,34.4,25.2,29.2,2010-01-01T00:52,2010-01-01T12:35,...,0.0,11.7,27.4,20,4.58,7.0,79.899994,16.0,Sri Lanka,Colombo
1,2010-01-02,51,29.9,23.5,26.2,33.8,26.2,29.8,2010-01-02T00:52,2010-01-02T12:36,...,1.0,13.0,27.0,24,3.84,7.0,79.899994,16.0,Sri Lanka,Colombo
2,2010-01-03,51,29.5,23.2,26.0,34.3,26.3,29.9,2010-01-03T00:53,2010-01-03T12:36,...,3.0,12.3,27.4,16,3.65,7.0,79.899994,16.0,Sri Lanka,Colombo
3,2010-01-04,2,28.9,21.9,25.3,31.6,23.4,27.8,2010-01-04T00:53,2010-01-04T12:37,...,0.0,17.0,34.6,356,3.79,7.0,79.899994,16.0,Sri Lanka,Colombo
4,2010-01-05,1,28.1,21.3,24.5,30.1,23.1,26.1,2010-01-05T00:53,2010-01-05T12:37,...,0.0,18.7,37.1,355,4.97,7.0,79.899994,16.0,Sri Lanka,Colombo


## Data Preprocessing

In [None]:
# Number of missing values in each column of the raw frame
df.isna().sum()

time                          0
weathercode                   0
temperature_2m_max            0
temperature_2m_min            0
temperature_2m_mean           0
apparent_temperature_max      0
apparent_temperature_min      0
apparent_temperature_mean     0
sunrise                       0
sunset                        0
shortwave_radiation_sum       0
precipitation_sum             0
rain_sum                      0
snowfall_sum                  0
precipitation_hours           0
windspeed_10m_max             0
windgusts_10m_max             0
winddirection_10m_dominant    0
et0_fao_evapotranspiration    0
latitude                      0
longitude                     0
elevation                     0
country                       0
city                          0
dtype: int64

In [None]:
# The 'time' column is read as text; convert it to real datetimes first
df['time'] = pd.to_datetime(df['time'])

# Keep only the calendar month, stored as a string label rather than a number
df['Month'] = df['time'].dt.month.astype(str)

# Show the dtype of the new column, then the number of rows per month
print(df['Month'].dtypes, df['Month'].value_counts(), sep='\n')

object
Month
1     13020
3     13020
5     13020
4     12600
6     12210
7     12090
8     12090
10    12090
12    12090
2     11850
9     11700
11    11700
Name: count, dtype: int64


In [None]:
# Function to recode wind directions
def recode_wind_direction(degrees):
    """Map a wind bearing in degrees to one of eight compass sectors.

    Each sector spans 45 degrees and is centred on its compass point, so
    "North" covers [337.5, 360) together with [0, 22.5), "Northeast" covers
    [22.5, 67.5), and so on clockwise up to "Northwest" for [292.5, 337.5).

    The bearing is first normalised modulo 360, so 360 is treated as 0
    ("North") and negative or >360 bearings wrap around instead of silently
    being labelled "Northwest".

    Parameters
    ----------
    degrees : int or float
        Bearing in degrees.

    Returns
    -------
    str or None
        The sector name, or None when the bearing is missing (None, NaN) or
        not finite.
    """
    if degrees is None:
        return None

    bearing = degrees % 360
    # NaN is the only value that is not equal to itself; NaN and infinite
    # inputs both end up as NaN after the modulo.
    if bearing != bearing:
        return None

    sectors = (
        "North", "Northeast", "East", "Southeast",
        "South", "Southwest", "West", "Northwest",
    )
    # Shifting by half a sector (22.5 degrees) makes each 45-degree bin start on
    # a sector boundary; the final "% 8" folds [337.5, 360) back onto "North".
    return sectors[int((bearing + 22.5) // 45) % 8]

# Label every row with the compass sector of its dominant wind bearing
df['Wind_direction'] = df['winddirection_10m_dominant'].map(recode_wind_direction)

# Show the dtype of the new column, then the number of rows per sector
print(df['Wind_direction'].dtypes, df['Wind_direction'].value_counts(), sep='\n')

object
Wind_direction
Southwest    48483
West         28613
Northeast    20907
South        16019
North        13780
East          8915
Northwest     5986
Southeast     4777
Name: count, dtype: int64


In [None]:
# Drop, in place, the columns that are not used as model features.
# 'time' and 'winddirection_10m_dominant' were already recoded into
# 'Month' and 'Wind_direction' above.
df.drop(
    columns=[
        'time',
        'temperature_2m_mean',
        'apparent_temperature_max',
        'apparent_temperature_min',
        'apparent_temperature_mean',
        'sunrise',
        'sunset',
        'precipitation_sum',
        'snowfall_sum',
        'winddirection_10m_dominant',
        'et0_fao_evapotranspiration',
        'latitude',
        'longitude',
        'elevation',
        'country',
    ],
    inplace=True,
)

In [None]:
# Check the dtype of every column that remains after the drop
df.dtypes

weathercode                  int64
temperature_2m_max         float64
temperature_2m_min         float64
shortwave_radiation_sum    float64
rain_sum                   float64
precipitation_hours        float64
windspeed_10m_max          float64
windgusts_10m_max          float64
city                        object
Month                       object
Wind_direction              object
dtype: object

In [None]:
# Preview the first rows of the reduced frame
df.head(n=5)

Unnamed: 0,weathercode,temperature_2m_max,temperature_2m_min,shortwave_radiation_sum,rain_sum,precipitation_hours,windspeed_10m_max,windgusts_10m_max,city,Month,Wind_direction
0,2,30.0,22.7,20.92,0.0,0.0,11.7,27.4,Colombo,1,North
1,51,29.9,23.5,17.71,0.1,1.0,13.0,27.0,Colombo,1,Northeast
2,51,29.5,23.2,17.76,0.6,3.0,12.3,27.4,Colombo,1,North
3,2,28.9,21.9,16.5,0.0,0.0,17.0,34.6,Colombo,1,North
4,1,28.1,21.3,23.61,0.0,0.0,18.7,37.1,Colombo,1,North


In [None]:
# List the column names that remain in the frame
df.columns

Index(['weathercode', 'temperature_2m_max', 'temperature_2m_min',
       'shortwave_radiation_sum', 'rain_sum', 'precipitation_hours',
       'windspeed_10m_max', 'windgusts_10m_max', 'city', 'Month',
       'Wind_direction'],
      dtype='object')

In [None]:
# Re-check for missing values now that the derived columns have been added
df.isna().sum()

weathercode                0
temperature_2m_max         0
temperature_2m_min         0
shortwave_radiation_sum    0
rain_sum                   0
precipitation_hours        0
windspeed_10m_max          0
windgusts_10m_max          0
city                       0
Month                      0
Wind_direction             0
dtype: int64

### Splitting the dataset


In [None]:
# Target: the weather code; features: every other column of the frame
y = df['weathercode']
X = df.drop('weathercode', axis=1)

# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=42,
    test_size=0.2,
)

In [None]:
# Categorical columns that are expanded into one-hot indicator columns
cols_to_encode = ['city', 'Month', 'Wind_direction']

# Learn the set of indicator columns from the training data only. The first
# level of each variable is dropped and acts as the reference category.
encoded_cities_train = pd.get_dummies(X_train[cols_to_encode], prefix_sep='_', drop_first=True)

# Encode the test data with *all* of its levels and then align it to the
# training columns. Encoding the two sets independently with drop_first=True
# can yield different columns (or a different reference level) whenever a
# category is missing from one split. After the reindex, levels unseen in
# training are discarded and levels absent from the test set become all-zero
# columns.
encoded_cities_test = (
    pd.get_dummies(X_test[cols_to_encode], prefix_sep='_')
    .reindex(columns=encoded_cities_train.columns, fill_value=0)
)

# Remove the original categorical columns from X_train and X_test
X_train = X_train.drop(cols_to_encode, axis = 1)
X_test = X_test.drop(cols_to_encode, axis = 1)

# Put the numeric features and the indicator columns side by side
X_train_encoded = pd.concat([X_train, encoded_cities_train], axis=1)
X_test_encoded = pd.concat([X_test, encoded_cities_test], axis=1)

# The model requires identical features, in identical order, in both sets
assert list(X_train_encoded.columns) == list(X_test_encoded.columns), \
    "train and test feature columns differ after encoding"

# Initialize the StandardScaler
scaler = StandardScaler()

# Continuous columns to standardise (the one-hot indicator columns are left as they are)
numerical_columns = ['temperature_2m_max', 'temperature_2m_min', 'precipitation_hours', 'shortwave_radiation_sum', 'rain_sum', 'windspeed_10m_max', 'windgusts_10m_max']

# Work on copies so the unscaled encoded frames stay available
X_train_scaled = X_train_encoded.copy()
X_test_scaled = X_test_encoded.copy()

# Fit the scaler on the training data only, then apply the same
# transformation to the test data so no test statistics leak into training
X_train_scaled[numerical_columns] = scaler.fit_transform(X_train_scaled[numerical_columns])
X_test_scaled[numerical_columns] = scaler.transform(X_test_scaled[numerical_columns])


In [None]:
# List the feature columns of the encoded and scaled test set
X_test_scaled.columns

Index(['temperature_2m_max', 'temperature_2m_min', 'shortwave_radiation_sum',
       'rain_sum', 'precipitation_hours', 'windspeed_10m_max',
       'windgusts_10m_max', 'city_Badulla', 'city_Bentota', 'city_Colombo',
       'city_Galle', 'city_Gampaha', 'city_Hambantota', 'city_Hatton',
       'city_Jaffna', 'city_Kalmunai', 'city_Kalutara', 'city_Kandy',
       'city_Kesbewa', 'city_Kolonnawa', 'city_Kurunegala', 'city_Mabole',
       'city_Maharagama', 'city_Mannar', 'city_Matale', 'city_Matara',
       'city_Moratuwa', 'city_Mount Lavinia', 'city_Negombo', 'city_Oruwala',
       'city_Pothuhera', 'city_Puttalam', 'city_Ratnapura',
       'city_Sri Jayewardenepura Kotte', 'city_Trincomalee', 'city_Weligama',
       'Month_10', 'Month_11', 'Month_12', 'Month_2', 'Month_3', 'Month_4',
       'Month_5', 'Month_6', 'Month_7', 'Month_8', 'Month_9',
       'Wind_direction_North', 'Wind_direction_Northeast',
       'Wind_direction_Northwest', 'Wind_direction_South',
       'Wind_direction_S

In [None]:
# Per-column count of missing values in the encoded and scaled test set
X_test_scaled.isna().sum(axis=0)

temperature_2m_max                0
temperature_2m_min                0
shortwave_radiation_sum           0
rain_sum                          0
precipitation_hours               0
windspeed_10m_max                 0
windgusts_10m_max                 0
city_Badulla                      0
city_Bentota                      0
city_Colombo                      0
city_Galle                        0
city_Gampaha                      0
city_Hambantota                   0
city_Hatton                       0
city_Jaffna                       0
city_Kalmunai                     0
city_Kalutara                     0
city_Kandy                        0
city_Kesbewa                      0
city_Kolonnawa                    0
city_Kurunegala                   0
city_Mabole                       0
city_Maharagama                   0
city_Mannar                       0
city_Matale                       0
city_Matara                       0
city_Moratuwa                     0
city_Mount Lavinia          

## Multinomial Logistic Regression

In [None]:
# Baseline classifier: multinomial logistic regression fitted with L-BFGS,
# allowing up to 1000 iterations
model1 = LogisticRegression(
    solver='lbfgs',
    multi_class='multinomial',
    max_iter=1000,
)

# Fit on the encoded and scaled training features
model1.fit(X=X_train_scaled, y=y_train)

LogisticRegression(max_iter=1000, multi_class='multinomial')

In [None]:
# Make predictions on the scaled training data
y_train_pred = model1.predict(X_train_scaled)

# scikit-learn metrics expect (y_true, y_pred) in that order. Precision and
# recall are not symmetric, so passing the arguments the other way round
# swaps the two scores.
precision = precision_score(y_train, y_train_pred, average='weighted')
recall = recall_score(y_train, y_train_pred, average='weighted')
f1 = f1_score(y_train, y_train_pred, average='weighted')

print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F1-Score: {f1:.4f}")

# Calculate accuracy on the training data
accuracy = accuracy_score(y_train, y_train_pred)
print(f"Accuracy: {accuracy:.4f}")

# Per-class precision / recall / F1 on the training data
print("Classification Report:")
print(classification_report(y_train, y_train_pred))

# Rows are true classes, columns are predicted classes
print("Confusion Matrix:")
print(confusion_matrix(y_train, y_train_pred))

Precision:, 0.7530
Recall:, 0.7076
F1-Score:, 0.7264
Accuracy: 0.7076
Classification Report:
              precision    recall  f1-score   support

           0       0.17      0.01      0.01       815
           1       0.56      0.66      0.61      7555
           2       0.51      0.60      0.56      8387
           3       0.64      0.38      0.47      4689
          51       0.87      0.94      0.90     30105
          53       0.64      0.69      0.66     17730
          55       0.30      0.04      0.07      6198
          61       0.61      0.71      0.66     18147
          63       0.79      0.78      0.78     22014
          65       0.77      0.42      0.54      2344

    accuracy                           0.71    117984
   macro avg       0.59      0.52      0.53    117984
weighted avg       0.69      0.71      0.69    117984

Confusion Matrix:
[[    6   763    40     6     0     0     0     0     0     0]
 [   13  4997  2398   147     0     0     0     0     0     0]
 [  

In [None]:
# Score the fitted baseline model on the held-out test features
y_pred = model1.predict(X_test_scaled)

# Support-weighted averages over all weather codes, plus overall accuracy
precision = precision_score(y_true=y_test, y_pred=y_pred, average='weighted')
recall = recall_score(y_true=y_test, y_pred=y_pred, average='weighted')
f1 = f1_score(y_true=y_test, y_pred=y_pred, average='weighted')
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)

print("Precision:", precision)
print("Recall:", recall)
print("F1-Score:", f1)
print(f"Accuracy: {accuracy:.4f}")

# Per-class breakdown for the test set
print("Classification Report:", classification_report(y_test, y_pred), sep="\n")

# Rows are true classes, columns are predicted classes
print("Confusion Matrix:", confusion_matrix(y_test, y_pred), sep="\n")

Precision: 0.6887538450259109
Recall: 0.7062652563059398
F1-Score: 0.6875303490430733
Accuracy: 0.7063
Classification Report:
              precision    recall  f1-score   support

           0       0.22      0.01      0.02       209
           1       0.56      0.64      0.60      1883
           2       0.50      0.60      0.55      2001
           3       0.65      0.39      0.49      1188
          51       0.87      0.94      0.90      7454
          53       0.65      0.70      0.67      4578
          55       0.32      0.05      0.08      1596
          61       0.61      0.70      0.65      4543
          63       0.79      0.78      0.78      5463
          65       0.74      0.41      0.53       581

    accuracy                           0.71     29496
   macro avg       0.59      0.52      0.53     29496
weighted avg       0.69      0.71      0.69     29496

Confusion Matrix:
[[   2  195   11    1    0    0    0    0    0    0]
 [   1 1206  632   44    0    0    0    0   

In [None]:
#Print model parameters
#print('Intercept: \n', model1.intercept_)
#print('Coefficients: \n', model1.coef_)


## Balancing Techniques

### Resampling

In [None]:
# Resamplers to compare, keyed by the label used in the printed results.
# Each one uses the same fixed seed.
sampling_methods = {
    'Random Undersampling': RandomUnderSampler(random_state=42),
    'Random Oversampling': RandomOverSampler(random_state=42),
    'SMOTE': SMOTE(random_state=42),
}

# A single estimator object, refitted from scratch for every resampler
model = LogisticRegression(
    solver='lbfgs',
    multi_class='multinomial',
    max_iter=1000,
)

for method_name, sampler in sampling_methods.items():
    # Only the training split is resampled; the test split is left untouched
    X_train_resampled, y_train_resampled = sampler.fit_resample(X_train_scaled, y_train)

    # fit() returns the estimator itself, so training and prediction can be chained
    y_pred = model.fit(X_train_resampled, y_train_resampled).predict(X_test_scaled)

    # Overall accuracy and support-weighted precision / recall / F1 on the test split
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred, average='weighted')
    recall = recall_score(y_test, y_pred, average='weighted')
    f1 = f1_score(y_test, y_pred, average='weighted')

    # Report the results for this resampler
    print(f"Results after {method_name}:")
    for label, value in (
        ("Accuracy:", accuracy),
        ("Precision:", precision),
        ("Recall:", recall),
        ("F1-Score:", f1),
        ("Classification Report:\n", classification_report_imbalanced(y_test, y_pred)),
        ("Confusion Matrix:\n", confusion_matrix(y_test, y_pred)),
    ):
        print(label, value)
    print("\n")


Results after Random Undersampling:
Accuracy: 0.5986574450772986
Precision: 0.6401648972545032
Recall: 0.5986574450772986
F1-Score: 0.6107336127491608
Classification Report:
                    pre       rec       spe        f1       geo       iba       sup

          0       0.15      0.77      0.97      0.25      0.86      0.73       209
          1       0.44      0.35      0.97      0.39      0.58      0.32      1883
          2       0.44      0.38      0.96      0.41      0.61      0.35      2001
          3       0.40      0.62      0.96      0.49      0.77      0.57      1188
         51       0.82      0.78      0.94      0.80      0.86      0.73      7454
         53       0.61      0.53      0.94      0.57      0.71      0.48      4578
         55       0.28      0.46      0.93      0.35      0.66      0.41      1596
         61       0.60      0.55      0.93      0.57      0.71      0.49      4543
         63       0.80      0.62      0.97      0.70      0.77      0.58     

### Adjusting Class Weights

In [None]:
# Same multinomial model as before, with class_weight='balanced' so that the
# classes are re-weighted automatically instead of resampling the data
model2 = LogisticRegression(
    solver='lbfgs',
    multi_class='multinomial',
    class_weight='balanced',
    max_iter=1000,
)
model2.fit(X_train_scaled, y_train)
y_pred_model2 = model2.predict(X_test_scaled)

# Test-set metrics (weighted averages across classes)
precision = precision_score(y_test, y_pred_model2, average='weighted')
recall = recall_score(y_test, y_pred_model2, average='weighted')
f1 = f1_score(y_test, y_pred_model2, average='weighted')
accuracy_model2 = accuracy_score(y_test, y_pred_model2)
report_model2 = classification_report(y_test, y_pred_model2)
confusion_matrix_model2 = confusion_matrix(y_test, y_pred_model2)

# Print the results
print("Class Weights Adjusted Model:")
print("Precision:", precision)
print("Recall:", recall)
print("F1-Score:", f1)
print("Accuracy:", accuracy_model2)
print("Classification Report:\n", report_model2)
print("Confusion Matrix:\n", confusion_matrix_model2)

Class Weights Adjusted Model:
Precision: 0.695873693115155
Recall: 0.657309465690263
F1-Score: 0.6671361356164036
Accuracy: 0.657309465690263
Classification Report:
               precision    recall  f1-score   support

           0       0.16      0.75      0.26       209
           1       0.53      0.39      0.45      1883
           2       0.52      0.38      0.44      2001
           3       0.50      0.64      0.57      1188
          51       0.90      0.91      0.90      7454
          53       0.70      0.64      0.67      4578
          55       0.29      0.54      0.38      1596
          61       0.61      0.55      0.58      4543
          63       0.83      0.63      0.71      5463
          65       0.41      0.83      0.55       581

    accuracy                           0.66     29496
   macro avg       0.54      0.63      0.55     29496
weighted avg       0.70      0.66      0.67     29496

Confusion Matrix:
 [[ 156   42    5    6    0    0    0    0    0    0]
 [ 

## Hyperparameter Tuning (Grid Search CV)

In [None]:
# Parameters to tune. The grid is split by solver because not every solver
# supports every penalty: 'lbfgs' only supports the L2 penalty, so pairing it
# with 'l1' makes those fits fail immediately (and the failure is hidden while
# warnings are suppressed). 'saga' supports L1 for multinomial models.
# Note: the 'saga' fits can take noticeably longer than the 'lbfgs' ones.
param_grid = [
    {'solver': ['lbfgs'], 'penalty': ['l2'], 'C': [0.01, 0.1, 1, 10, 100]},
    {'solver': ['saga'], 'penalty': ['l1'], 'C': [0.01, 0.1, 1, 10, 100]},
]

# Initialize the grid search (5-fold cross-validation) and fit all candidates.
# random_state only affects the stochastic 'saga' solver and keeps it repeatable.
model = LogisticRegression(multi_class='multinomial', max_iter=1000, random_state=42)
grid_search = GridSearchCV(model, param_grid, cv=5, verbose=2)
grid_search.fit(X_train_scaled, y_train)

# Best parameter combination and the estimator refitted on the full training set
best_params = grid_search.best_params_
best_estimator = grid_search.best_estimator_

# Evaluate the best model on the held-out test set
y_pred_best = best_estimator.predict(X_test_scaled)
accuracy_best = accuracy_score(y_test, y_pred_best)
# Computed from this model's own predictions, rather than reusing the
# `precision` variable left over from an earlier cell
precision_best = precision_score(y_test, y_pred_best, average='weighted')
recall_best = recall_score(y_test, y_pred_best, average='weighted')
f1_best = f1_score(y_test, y_pred_best, average='weighted')
report_best = classification_report(y_test, y_pred_best)
confusion_matrix_best = confusion_matrix(y_test, y_pred_best)

print("Best Model:")
print("Best Parameters:", best_params)
print("Best Accuracy:", accuracy_best)
print("Precision:", precision_best)
print("Recall:", recall_best)
print("F1-Score:", f1_best)
print("Classification Report:\n", report_best)
print("Confusion Matrix:\n", confusion_matrix_best)

Fitting 5 folds for each of 10 candidates, totalling 50 fits
[CV] END .................................C=0.01, penalty=l1; total time=   0.0s
[CV] END .................................C=0.01, penalty=l1; total time=   0.0s
[CV] END .................................C=0.01, penalty=l1; total time=   0.0s
[CV] END .................................C=0.01, penalty=l1; total time=   0.0s
[CV] END .................................C=0.01, penalty=l1; total time=   0.0s
[CV] END .................................C=0.01, penalty=l2; total time=  14.8s
[CV] END .................................C=0.01, penalty=l2; total time=  13.7s
[CV] END .................................C=0.01, penalty=l2; total time=  13.8s
[CV] END .................................C=0.01, penalty=l2; total time=  14.0s
[CV] END .................................C=0.01, penalty=l2; total time=  14.7s
[CV] END ..................................C=0.1, penalty=l1; total time=   0.0s
[CV] END ..................................C=0.1