# Part I - Data Exploration Analysis

In [1]:
# import all packages and set plots to be embedded inline
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from tabulate import tabulate

In [2]:
# ================== functions =========================================
# 1. Data types
def data_types_list(data):
    # Get data types of all colums
    data_types = data.dtypes

    # 
    datetime_columns = []
    categorical_columns = []
    numerical_columns = []
    # Separate columns into categorical and numeric
    for col in data.columns:
        if data[col].dtype == 'datetime64[ns]':
            datetime_columns.append(col)
        elif data[col].dtype == 'object':
            categorical_columns.append(col)
        else:
            numerical_columns.append(col)
            
    return datetime_columns,categorical_columns, numerical_columns
    
    #date_time_columns = data_types[data.dtypes == 'datetime64[ns]'].index.tolist()
    #categorical_columns = data_types[data_types == 'object'].index.tolist()
    #numerical_columns = data_types[(data_types != 'object') or (data_types != 'datetime64[ns]')].index.tolist()
    

# 2. Are there null values in dataset
def data_null_check(data):
    
    if data.isnull().any().any():
        # calculate the total number of rows in the data
        total_rows = len(data)
        # create an empty list called 'columns_with_null' to store information about columns with nill values
        columns_with_null = []
        # Loop through each column to count the number of null value and calculate the percentage of null values relative to the total number of rows.
        for column in data.columns:
            null_count = data[column].isnull().sum()
            if null_count >0:
                columns_with_null.append(column, null_count, null_count/total_rows*100)
        # convert 'columns_with_null' to a new DataFrame called 'columns_with_null_df' for easier viewing
        columns_with_null_df = pd.DataFrame(columns_with_null, columns = ['Column N','Null Count','Null Percentage'])
    
        # Return result
        return columns_with_null_df
        print(columns_with_null_df)
    
    else:
        print('There are no null values in the dataset')
# . Results
def show_results(data):
    # 1. Data_types => return 2 lists of categorical and numeric features in the dataset
    datetime_columns, categorical_columns, numerical_columns = data_types_list(data)
    print('1. Overview the structure of dataset')
    print(' - Total columns in the dataset: {} , in which:'.format(data.shape[1]))
    print(' - Datetime Columns: {} are list in a variable name - datetime_columns '.format(len(datetime_columns)))
    print(' - Categorical Columns: {} are list in a variable name - categorical_columns '.format(len(categorical_columns)))
    print(' - Numeric Columns    : {} are list in a variable name - numerical_columns '.format(len(numerical_columns)))
    print('----------------------------------------')
    print('2. Check null values')
    columns_with_null_df = data_null_check(data)
    print(columns_with_null_df)


In [3]:
# Load data set
dataset = 'df_day.csv'
day_weather = pd.read_csv(dataset)
day_weather.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 335 entries, 0 to 334
Data columns (total 16 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   Unnamed: 0      335 non-null    object 
 1   maxtemp_c       335 non-null    float64
 2   maxtemp_f       335 non-null    float64
 3   mintemp_c       335 non-null    float64
 4   mintemp_f       335 non-null    float64
 5   avgtemp_c       335 non-null    float64
 6   avgtemp_f       335 non-null    float64
 7   maxwind_mph     335 non-null    float64
 8   maxwind_kph     335 non-null    float64
 9   totalprecip_mm  335 non-null    float64
 10  totalprecip_in  335 non-null    float64
 11  avgvis_km       335 non-null    float64
 12  avgvis_miles    335 non-null    int64  
 13  avghumidity     335 non-null    int64  
 14  condition       335 non-null    object 
 15  uv              335 non-null    int64  
dtypes: float64(11), int64(3), object(2)
memory usage: 42.0+ KB


In [4]:
# Rename the 'Unnamed: 0' column to 'date'
day_weather.rename(columns={'Unnamed: 0': 'date'}, inplace=True)
day_weather['date'] = pd.to_datetime(day_weather['date'])
day_weather.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 335 entries, 0 to 334
Data columns (total 16 columns):
 #   Column          Non-Null Count  Dtype         
---  ------          --------------  -----         
 0   date            335 non-null    datetime64[ns]
 1   maxtemp_c       335 non-null    float64       
 2   maxtemp_f       335 non-null    float64       
 3   mintemp_c       335 non-null    float64       
 4   mintemp_f       335 non-null    float64       
 5   avgtemp_c       335 non-null    float64       
 6   avgtemp_f       335 non-null    float64       
 7   maxwind_mph     335 non-null    float64       
 8   maxwind_kph     335 non-null    float64       
 9   totalprecip_mm  335 non-null    float64       
 10  totalprecip_in  335 non-null    float64       
 11  avgvis_km       335 non-null    float64       
 12  avgvis_miles    335 non-null    int64         
 13  avghumidity     335 non-null    int64         
 14  condition       335 non-null    object        
 15  uv    

In [5]:
# describe the dataset (len, shape)

shape = day_weather.shape
print('-----------------------------')
print('The structure of the dataset:')
print('-----------------------------')
print('The number of observations: ', shape[0])
print('the number of features: ', shape[1])
print('')
# Execute the step: summary statistics
sum_stats = day_weather.describe()
print('Summary Statistics:\n',sum_stats)

-----------------------------
The structure of the dataset:
-----------------------------
The number of observations:  335
the number of features:  16

Summary Statistics:
         maxtemp_c   maxtemp_f   mintemp_c   mintemp_f   avgtemp_c   avgtemp_f  \
count  335.000000  335.000000  335.000000  335.000000  335.000000  335.000000   
mean    32.901194   91.205373   24.399701   75.917910   27.817015   82.071642   
std      2.868495    5.159443    1.761506    3.168025    1.775510    3.193229   
min     24.200000   75.600000   19.600000   67.300000   23.000000   73.500000   
25%     31.200000   88.150000   23.300000   73.900000   26.600000   79.900000   
50%     32.600000   90.700000   24.300000   75.700000   27.900000   82.100000   
75%     34.500000   94.150000   25.700000   78.300000   29.050000   84.250000   
max     40.100000  104.200000   28.800000   83.800000   32.300000   90.100000   

       maxwind_mph  maxwind_kph  totalprecip_mm  totalprecip_in   avgvis_km  \
count   335.000000

In [6]:
# ================= results ===============================================
show_results(day_weather)

1. Overview the structure of dataset
 - Total columns in the dataset: 16 , in which:
 - Datetime Columns: 1 are list in a variable name - datetime_columns 
 - Categorical Columns: 1 are list in a variable name - categorical_columns 
 - Numeric Columns    : 14 are list in a variable name - numerical_columns 
----------------------------------------
2. Check null values
There are no null values in the dataset
None


# Forecast weather condition

## Create a baseline model

In [218]:
# Step 1: Import necessary libraries
import pandas as pd
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

In [219]:
day_weather

Unnamed: 0,date,maxtemp_c,maxtemp_f,mintemp_c,mintemp_f,avgtemp_c,avgtemp_f,maxwind_mph,maxwind_kph,totalprecip_mm,totalprecip_in,avgvis_km,avgvis_miles,avghumidity,condition,uv
0,2022-10-01,28.9,83.9,24.2,75.6,25.6,78.1,5.8,9.4,8.4,0.33,8.5,5,90,Moderate or heavy rain shower,6
1,2022-10-02,27.3,81.1,23.9,75.0,25.0,77.1,4.9,7.9,6.7,0.26,7.9,4,92,Moderate rain at times,6
2,2022-10-03,31.5,88.7,23.8,74.8,26.9,80.4,7.2,11.5,9.0,0.35,9.9,6,82,Moderate or heavy rain shower,7
3,2022-10-04,31.6,88.9,24.2,75.6,26.9,80.4,6.7,10.8,4.6,0.18,9.8,6,83,Heavy rain at times,7
4,2022-10-05,31.3,88.3,24.0,75.2,27.1,80.7,8.1,13.0,12.9,0.51,10.0,6,81,Moderate or heavy rain shower,7
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
330,2023-08-27,33.2,91.8,26.1,79.0,28.8,83.8,10.5,16.9,8.6,0.34,9.8,6,74,Light rain shower,7
331,2023-08-28,34.6,94.3,25.7,78.3,28.7,83.6,13.6,22.0,4.7,0.19,9.8,6,74,Light rain shower,7
332,2023-08-29,33.8,92.8,25.6,78.1,28.7,83.7,12.5,20.2,14.4,0.57,9.7,6,72,Moderate or heavy rain shower,7
333,2023-08-30,32.7,90.9,25.0,77.0,28.5,83.2,12.1,19.4,3.1,0.12,9.9,6,73,Light rain shower,7


In [253]:
distinct_conditions = day_weather['condition'].unique()
print(distinct_conditions)

['Moderate or heavy rain shower' 'Moderate rain at times'
 'Heavy rain at times' 'Partly cloudy' 'Patchy rain possible' 'Overcast'
 'Cloudy' 'Sunny' 'Light rain shower']


In [244]:
# Step 2: Prepare the data
# Select features (e.g., temperature, humidity, wind speed) and the target variable (weather condition)
#features = day_weather[['maxtemp_c', 'avghumidity', 'maxwind_kph']]
features = day_weather.drop(['condition','date'], axis=1)
target = day_weather['condition']

In [245]:
# Step 3: Split the data into training and Testing Sets
X_train, X_test, y_train, y_test = train_test_split(features, target, test_size=0.2, random_state=42)

In [246]:
# Step 4: Build the Decision Tree Model
# Create a Decision Tree model
model = DecisionTreeClassifier()

# Train the model on the training data
model.fit(X_train, y_train)

DecisionTreeClassifier()

In [247]:
# Step 5: Make Preditions
y_pred = model.predict(X_test)

In [248]:
# Step 6: Evaluate the Model
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy}')

Accuracy: 0.6567164179104478


In [249]:
day_weather_add_pred = pd.DataFrame(pd.concat([day_weather,pd.DataFrame({'y_pred':y_pred})],axis=1))
#print(day_weather_add_pred.head(5))
#print(day_weather_add_pred.tail(5))
day_weather_add_pred.head(5)

Unnamed: 0,date,maxtemp_c,maxtemp_f,mintemp_c,mintemp_f,avgtemp_c,avgtemp_f,maxwind_mph,maxwind_kph,totalprecip_mm,totalprecip_in,avgvis_km,avgvis_miles,avghumidity,condition,uv,y_pred
0,2022-10-01,28.9,83.9,24.2,75.6,25.6,78.1,5.8,9.4,8.4,0.33,8.5,5,90,Moderate or heavy rain shower,6,Moderate or heavy rain shower
1,2022-10-02,27.3,81.1,23.9,75.0,25.0,77.1,4.9,7.9,6.7,0.26,7.9,4,92,Moderate rain at times,6,Partly cloudy
2,2022-10-03,31.5,88.7,23.8,74.8,26.9,80.4,7.2,11.5,9.0,0.35,9.9,6,82,Moderate or heavy rain shower,7,Cloudy
3,2022-10-04,31.6,88.9,24.2,75.6,26.9,80.4,6.7,10.8,4.6,0.18,9.8,6,83,Heavy rain at times,7,Moderate or heavy rain shower
4,2022-10-05,31.3,88.3,24.0,75.2,27.1,80.7,8.1,13.0,12.9,0.51,10.0,6,81,Moderate or heavy rain shower,7,Moderate or heavy rain shower


## Using XGBoost model

In [None]:
# Step 1: Import neccesary librabries
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
import xgboost as xgb
import lightgbm as lgb
from sklearn.metrics import accuracy_score

#  Step 2: Encode the 'condition' column with numerical labels
label_encoder = LabelEncoder()
day_weather['condition_encoded'] = label_encoder.fit_transform(day_weather['condition'])

# Step 3: Define your features and target
X = day_weather.drop(['condition','condition_encoded','date'], axis=1)
y = day_weather['condition_encoded']

# Step 4: Split the data into training and testing sets
X_train_XGB, X_test_XGB, y_train_XGB, y_test_XGB = train_test_split(X, y, test_size=0.2, random_state=42)

# Train an XGBoost model
xgb_model = xgb.XGBClassifier()
xgb_model.fit(X_train_XGB, y_train_XGB)

# Make predictions
xgb_predictions = xgb_model.predict(X_test_XGB)

# Decode the predictions back to original labels
xgb_predictions_decoded = label_encoder.inverse_transform(xgb_predictions)

# Calculate accuracy scores
xgb_accuracy = accuracy_score(y_test_XGB, xgb_predictions)
print(f'XGBoost Accuracy: {xgb_accuracy}')

In [254]:
# Step 1: Import neccesary librabries
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
import xgboost as xgb
import lightgbm as lgb
from sklearn.metrics import accuracy_score

In [276]:
# Step 3: Define your features and target
X = day_weather.drop(['condition','condition_encoded','date'], axis=1)
y = day_weather['condition_encoded']

In [259]:
# Step 4: Split the data into training and testing sets
X_train_XGB, X_test_XGB, y_train_XGB, y_test_XGB = train_test_split(X, y, test_size=0.2, random_state=42)

In [260]:
# Train an XGBoost model
xgb_model = xgb.XGBClassifier()
xgb_model.fit(X_train_XGB, y_train_XGB)



XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, gamma=0, gpu_id=-1,
              importance_type='gain', interaction_constraints='',
              learning_rate=0.300000012, max_delta_step=0, max_depth=6,
              min_child_weight=1, missing=nan, monotone_constraints='()',
              n_estimators=100, n_jobs=8, num_parallel_tree=1,
              objective='multi:softprob', random_state=0, reg_alpha=0,
              reg_lambda=1, scale_pos_weight=None, subsample=1,
              tree_method='exact', validate_parameters=1, verbosity=None)

In [261]:
# Make predictions
xgb_predictions = xgb_model.predict(X_test_XGB)

In [262]:
# Decode the predictions back to original labels
xgb_predictions_decoded = label_encoder.inverse_transform(xgb_predictions)

In [264]:
# Calculate accuracy scores
xgb_accuracy = accuracy_score(y_test_XGB, xgb_predictions)
print(f'XGBoost Accuracy: {xgb_accuracy}')

XGBoost Accuracy: 0.7313432835820896


In [282]:
y_pred_prob_XGB

array([7.1019602e-01, 1.0166485e-04, 7.1777515e-03, 3.5381636e-03,
       1.8136176e-01, 3.4984879e-03, 2.8448252e-03, 2.0324417e-04,
       1.1887125e-03, 5.8543351e-02, 4.4780897e-04, 6.4935006e-02,
       3.1244985e-04, 1.7945523e-03, 5.6190579e-04, 5.0684478e-04,
       2.9206480e-04, 2.7685304e-04, 4.0529476e-04, 1.5155153e-03,
       1.3066878e-03, 2.6038397e-04, 9.5953111e-04, 2.3678724e-01,
       9.4257474e-01, 1.9734947e-04, 5.1357285e-03, 1.0067774e-03,
       9.1714336e-04, 8.9904014e-03, 5.5433944e-04, 2.6110010e-02,
       3.5971072e-01, 1.2174237e-03, 5.7720337e-03, 1.4776360e-03,
       1.0702189e-03, 4.8784753e-03, 6.3027218e-02, 1.7875786e-01,
       3.1758424e-02, 4.3633617e-03, 5.1042665e-04, 2.9268991e-02,
       1.1040721e-02, 5.0041103e-01, 1.2311463e-03, 1.9018742e-04,
       4.9017707e-01, 3.7015940e-03, 1.7588595e-03, 7.4069540e-05,
       1.8906742e-03, 6.0655773e-05, 1.0661612e-02, 6.4243026e-02,
       8.7995571e-04, 2.3829358e-04, 6.3587070e-01, 4.2943472e

## Using LightGBM model

In [266]:
# Train a LightGBM model
lgb_model = lgb.LGBMClassifier()
lgb_model.fit(X_train_XGB, y_train)

LGBMClassifier()

In [269]:
# Make predictions
lgb_predictions = lgb_model.predict(X_test_XGB)

# Decode the predictions back to original labels
lgb_predictions_decoded = label_encoder.inverse_transform(lgb_predictions)

# Calculate accuracy scores
lgb_accuracy = accuracy_score(y_test_XGB, lgb_predictions)
print(f'LightGBM Accuracy: {lgb_accuracy}')

LightGBM Accuracy: 0.7164179104477612


## Improve the Model

To use K-Fold cross-validation to build an optimal XGBoost model, you need to perform hyperparameter tuning during the cross-validation process. 

## 1. Find the best model

In [290]:
import pandas as pd
import numpy as np
import xgboost as xgb
from sklearn.model_selection import KFold, GridSearchCV
from sklearn.metrics import mean_squared_error

#  Step 2: Encode the 'condition' column with numerical labels
label_encoder = LabelEncoder()
day_weather['condition_encoded'] = label_encoder.fit_transform(day_weather['condition'])

# Step 3: Define your features and target
X = day_weather.drop(['condition','condition_encoded','date'], axis=1)
y = day_weather['condition_encoded']

# Define the hyperparameters grid for XGBoost.

param_grid = {'n_estimators': [100, 200, 300],
              'max_depth': [3, 4, 5],
             'learning_rate': [0.05, 0.1, 0.2]
             }

# Create a K-Fold cross-validation iterator
num_folds = 5  # Number of cross-validation folds
kf = KFold(n_splits=num_folds, shuffle=True, random_state=42)

# Initialize a GridSearchCV object to perform hyperparameter tuning within each fold
xgb_model = xgb.XGBRegressor()
grid_search = GridSearchCV(xgb_model, param_grid, cv=kf, scoring='neg_mean_squared_error')

# Iteration through the folds, perform hyperparameter tuning, and train XGBoost models with optimal hyperparameters

best_models = []  # Store the best models for each fold

for train_index, test_index in kf.split(X):
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]

    # Perform hyperparameter tuning using GridSearchCV
    grid_search.fit(X_train, y_train)

    # Get the best model with optimal hyperparameters
    best_model = grid_search.best_estimator_
    best_models.append(best_model)

    # Train the best model on the full training data
    best_model.fit(X_train, y_train)
    
# Evaluate the performance of optimal models and select the best one
best_mse = np.inf
best_model = None

for model in best_models:
    y_pred = model.predict(X_test)
    mse = mean_squared_error(y_test, y_pred)
    if mse < best_mse:
        best_mse = mse
        best_model = model

print("Best Model MSE:", best_mse)

Best Model MSE: 0.6653898459360889


Now we have a trained XGBoost model with optimal hyperparameters selected through K-Fold cross-validation. This model should provide improved performance compared to using default hyperparameters.

## 2. Calculate the accuracy

In [294]:
# Calculate the mean of the true target values (y)
y_mean = y.mean()

# Calculate the variance of the true target values (y)
variance_y = ((y - y_mean) ** 2).mean()

# Calculate the accuracy
accuracy = 1 - best_mse / variance_y

print("Accuracy:", accuracy)

Accuracy: 0.8756272893734554


The best model with optimal hyperparameters selected throuh K-Fold cross-validation show the better accuracy and better MSE

## 3. Estimatie the next 30 days