In [1]:
import numpy as np
import pandas as pd

In [2]:
weather_hist = pd.read_csv('weatherHistory.csv')

In [3]:

duplicates = weather_hist[weather_hist.duplicated()]

# Print the number of duplicate rows, if any
num_duplicates = len(duplicates)
if num_duplicates > 0:
    print("\n")
    print(f"{num_duplicates} duplicate rows found in Weathe History Dataset.")
    print("\n")
    print(duplicates)
else:
    print("No duplicate rows found in WeatherHistory.")



24 duplicate rows found in Weathe History Dataset.


                      Formatted Date        Summary Precip Type  \
36071  2010-08-02 00:00:00.000 +0200          Clear        rain   
36072  2010-08-02 01:00:00.000 +0200          Clear        rain   
36073  2010-08-02 02:00:00.000 +0200          Clear        rain   
36074  2010-08-02 03:00:00.000 +0200          Clear        rain   
36075  2010-08-02 04:00:00.000 +0200          Clear        rain   
36076  2010-08-02 05:00:00.000 +0200          Clear        rain   
36077  2010-08-02 06:00:00.000 +0200          Clear        rain   
36078  2010-08-02 07:00:00.000 +0200          Clear        rain   
36079  2010-08-02 08:00:00.000 +0200          Clear        rain   
36080  2010-08-02 09:00:00.000 +0200          Clear        rain   
36081  2010-08-02 10:00:00.000 +0200          Clear        rain   
36082  2010-08-02 11:00:00.000 +0200          Clear        rain   
36083  2010-08-02 12:00:00.000 +0200          Clear        rain   
36084  

In [4]:
# Removing duplicated values
weather_hist.drop_duplicates(inplace=True)
# Again checking for duplicated values
print("Duplicated Values: ", weather_hist.duplicated().sum())

Duplicated Values:  0


In [5]:
# Checking missing values

# Check for null values in the DataFrame
null_values = weather_hist.isnull().sum()

# Print the count of null values in each column
print('Checking the NULL Values in  WeatherHistory Dataset..\n')

# Check if there are any null values
if null_values.sum() == 0:
    print(f'\n There are no null values in the given dataset.')
else:
    columns_with_null_values = null_values[null_values > 0].index.tolist()
    # Loop over the columns with null values and print the count of null values for each column
    for column in columns_with_null_values:
        count = null_values[column]
        print(f"Column '{column}' has {count} null values.")

Checking the NULL Values in  WeatherHistory Dataset..

Column 'Precip Type' has 520 null values.
Column 'Wind Speed (km/h)' has 5 null values.
Column 'Visibility (km)' has 8 null values.
Column 'Pressure (millibars)' has 7 null values.


In [6]:
# Remove rows with missing values in 'Precip Type' column
weather_hist = weather_hist[(weather_hist['Precip Type'] != '?') & (weather_hist['Precip Type'] != '%')]

# Verify the changes
weather_hist['Precip Type'].value_counts()


rain    85191
snow    10712
Name: Precip Type, dtype: int64

In [7]:
# Re-check Handling Missing Values
missing_values = weather_hist.isnull().sum()
print("Missing Values:\n", missing_values)

Missing Values:
 Formatted Date                0
Summary                       0
Precip Type                 520
Temperature (C)               0
Apparent Temperature (C)      0
Humidity                      0
Wind Speed (km/h)             5
Wind Bearing (degrees)        0
Visibility (km)               8
Loud Cover                    0
Pressure (millibars)          7
Daily Summary                 0
dtype: int64


In [8]:
# Remove the 'Daily Summary' column from the DataFrame
weather_hist = weather_hist.drop('Daily Summary', axis=1)


In [9]:
# Define a dictionary mapping current weather summaries to new categories
weather_mapping = {
   'Clear': 'Clear Skies',
    'Partly Cloudy': 'Cloudy Skies',
    'Mostly Cloudy': 'Cloudy Skies',
    'Overcast': 'Cloudy Skies',
    'Foggy': 'Foggy Day',
    'Breezy and Foggy': 'Foggy Day',
    'Windy and Foggy': 'Foggy Day',
    'Breezy': 'Breezy Day',
    'Breezy and Mostly Cloudy': 'Breezy Day',
    'Breezy and Partly Cloudy': 'Breezy Day',
    'Breezy and Overcast': 'Breezy Day',
    'Windy': 'Windy Day',
    'Windy and Mostly Cloudy': 'Windy Day',
    'Windy and Partly Cloudy': 'Windy Day',
    'Windy and Overcast': 'Windy Day',
    'Humid and Mostly Cloudy': 'Humid Day',
    'Humid and Partly Cloudy': 'Humid Day',
    'Humid and Overcast': 'Humid Day',
    'Dry': 'Dry Day',
    'Dry and Mostly Cloudy': 'Dry Day',
    'Dry and Partly Cloudy': 'Dry Day',
    'Light Rain': 'Rainy Day',
    'Drizzle': 'Rainy Day',
    'Rain': 'Rainy Day'
}

# Create a new column 'Weather Category' based on the mapping
weather_hist['Summary'] = weather_hist['Summary'].map(weather_mapping)

# Display the value counts of the new column
print(weather_hist['Summary'].value_counts())


Cloudy Skies    76415
Clear Skies     10872
Foggy Day        7187
Breezy Day       1484
Windy Day         155
Dry Day           134
Rainy Day         112
Humid Day          64
Name: Summary, dtype: int64


In [10]:
from datetime import datetime

# Define the format string for the date
format_string = "%Y-%m-%d %H:%M:%S.%f %z"

# Function to check if a value is in the correct format
def check_date_format(value):
    try:
        datetime.strptime(value, format_string)
        return True
    except ValueError:
        return False

# Apply the function to each value in the 'Formatted Date' column and create a new column 'is_valid_date'
weather_hist['is_valid_date'] = weather_hist['Formatted Date'].apply(check_date_format)

# Filter the DataFrame to show only rows where 'is_valid_date' is False
invalid_dates = weather_hist[~weather_hist['is_valid_date']]

# Display the rows with invalid dates
print("Rows with invalid dates:")
print(invalid_dates[['Formatted Date', 'is_valid_date']])


Rows with invalid dates:
Empty DataFrame
Columns: [Formatted Date, is_valid_date]
Index: []


In [11]:
from sklearn.preprocessing import LabelEncoder

# Initialize LabelEncoder
label_encoder = LabelEncoder()

# Label encode object columns
object_columns = ['Summary', 'Precip Type']
for col in object_columns:
    weather_hist[col+'_encoded'] = label_encoder.fit_transform(weather_hist[col])

# Convert 'Formatted Date' to datetime format with utc=True
weather_hist['Formatted Date'] = pd.to_datetime(weather_hist['Formatted Date'], utc=True)

# Extract year, month, day, and hour
weather_hist['Year'] = weather_hist['Formatted Date'].dt.year
weather_hist['Month'] = weather_hist['Formatted Date'].dt.month
weather_hist['Day'] = weather_hist['Formatted Date'].dt.day
weather_hist['Hour'] = weather_hist['Formatted Date'].dt.hour

# Create dictionaries for mapping encoded values to original labels
summary_mapping = dict(zip(weather_hist['Summary_encoded'], weather_hist['Summary']))
precip_mapping = dict(zip(weather_hist['Precip Type_encoded'], weather_hist['Precip Type']))

# Display the updated DataFrame and the mapping dictionaries
print("Weather History DataFrame:")
weather_hist[['Summary', 'Precip Type', 'Summary_encoded', 'Precip Type_encoded', 'Year', 'Month', 'Day', 'Hour']].head()





Weather History DataFrame:


Unnamed: 0,Summary,Precip Type,Summary_encoded,Precip Type_encoded,Year,Month,Day,Hour
0,Cloudy Skies,rain,2,0,2006,3,31,22
1,Cloudy Skies,rain,2,0,2006,3,31,23
2,Cloudy Skies,rain,2,0,2006,4,1,0
3,Cloudy Skies,rain,2,0,2006,4,1,1
4,Cloudy Skies,rain,2,0,2006,4,1,2


In [12]:
# Assuming 'weather_hist' is your DataFrame

# Use factorize to encode 'Summary' and 'Precip Type' columns
weather_hist['Summary_encoded'], summary_mapping_index = weather_hist['Summary'].factorize()
weather_hist['Precip Type_encoded'], precip_mapping_index = weather_hist['Precip Type'].factorize()

# Display the mapping indexes
print("Summary Mapping Index:")
print(summary_mapping_index)

print("\nPrecip Type Mapping Index:")
print(precip_mapping_index)


Summary Mapping Index:
Index(['Cloudy Skies', 'Foggy Day', 'Breezy Day', 'Clear Skies', 'Humid Day',
       'Windy Day', 'Dry Day', 'Rainy Day'],
      dtype='object')

Precip Type Mapping Index:
Index(['rain', 'snow'], dtype='object')


In [13]:
# Assuming df is your DataFrame

# Get the unique values in the 'Summary' column before encoding
summary_unique_values = weather_hist['Summary'].unique()

# Get the unique values in the 'Precip Type' column before encoding
precip_unique_values = weather_hist['Precip Type'].unique()

# Display the unique values
print("Unique values in 'Summary' column before encoding:")
print(summary_unique_values)

print("\nUnique values in 'Precip Type' column before encoding:")
print(precip_unique_values)


Unique values in 'Summary' column before encoding:
['Cloudy Skies' 'Foggy Day' 'Breezy Day' 'Clear Skies' 'Humid Day'
 'Windy Day' 'Dry Day' 'Rainy Day']

Unique values in 'Precip Type' column before encoding:
['rain' nan 'snow']


In [14]:
# Drop original categorical columns
weather_hist = weather_hist.drop(columns=['Summary', 'Precip Type', 'Formatted Date'])

# Display the updated DataFrame
print("Encoded Weather History DataFrame:")
weather_hist.head()


Encoded Weather History DataFrame:


Unnamed: 0,Temperature (C),Apparent Temperature (C),Humidity,Wind Speed (km/h),Wind Bearing (degrees),Visibility (km),Loud Cover,Pressure (millibars),is_valid_date,Summary_encoded,Precip Type_encoded,Year,Month,Day,Hour
0,9.472222,7.388889,0.89,14.1197,251,15.8263,0,1015.13,True,0,0,2006,3,31,22
1,9.355556,7.227778,0.86,14.2646,259,15.8263,0,1015.63,True,0,0,2006,3,31,23
2,9.377778,9.377778,0.89,3.9284,204,14.9569,0,1015.94,True,0,0,2006,4,1,0
3,8.288889,5.944444,0.83,14.1036,269,15.8263,0,1016.41,True,0,0,2006,4,1,1
4,8.755556,6.977778,0.83,11.0446,259,15.8263,0,1016.51,True,0,0,2006,4,1,2


In [15]:
# Assuming df is your DataFrame
columns_to_remove = ['is_valid_date']
weather_hist = weather_hist.drop(columns=columns_to_remove)

# Display the updated DataFrame
weather_hist.head()


Unnamed: 0,Temperature (C),Apparent Temperature (C),Humidity,Wind Speed (km/h),Wind Bearing (degrees),Visibility (km),Loud Cover,Pressure (millibars),Summary_encoded,Precip Type_encoded,Year,Month,Day,Hour
0,9.472222,7.388889,0.89,14.1197,251,15.8263,0,1015.13,0,0,2006,3,31,22
1,9.355556,7.227778,0.86,14.2646,259,15.8263,0,1015.63,0,0,2006,3,31,23
2,9.377778,9.377778,0.89,3.9284,204,14.9569,0,1015.94,0,0,2006,4,1,0
3,8.288889,5.944444,0.83,14.1036,269,15.8263,0,1016.41,0,0,2006,4,1,1
4,8.755556,6.977778,0.83,11.0446,259,15.8263,0,1016.51,0,0,2006,4,1,2


In [16]:
# Rename the columns
weather_hist = weather_hist.rename(columns={'Summary_encoded': 'Summary', 'Precip Type_encoded': 'Precip Type'})

# Display the updated DataFrame
print("Updated Weather History DataFrame:")
weather_hist.head()


Updated Weather History DataFrame:


Unnamed: 0,Temperature (C),Apparent Temperature (C),Humidity,Wind Speed (km/h),Wind Bearing (degrees),Visibility (km),Loud Cover,Pressure (millibars),Summary,Precip Type,Year,Month,Day,Hour
0,9.472222,7.388889,0.89,14.1197,251,15.8263,0,1015.13,0,0,2006,3,31,22
1,9.355556,7.227778,0.86,14.2646,259,15.8263,0,1015.63,0,0,2006,3,31,23
2,9.377778,9.377778,0.89,3.9284,204,14.9569,0,1015.94,0,0,2006,4,1,0
3,8.288889,5.944444,0.83,14.1036,269,15.8263,0,1016.41,0,0,2006,4,1,1
4,8.755556,6.977778,0.83,11.0446,259,15.8263,0,1016.51,0,0,2006,4,1,2


In [17]:
missing_values = weather_hist.isnull().sum()
print("Missing values:")
missing_values


Missing values:


Temperature (C)             0
Apparent Temperature (C)    0
Humidity                    0
Wind Speed (km/h)           5
Wind Bearing (degrees)      0
Visibility (km)             8
Loud Cover                  0
Pressure (millibars)        7
Summary                     0
Precip Type                 0
Year                        0
Month                       0
Day                         0
Hour                        0
dtype: int64

In [18]:
duplicate_rows = weather_hist[weather_hist.duplicated()]
print("\nDuplicate rows:")
duplicate_rows



Duplicate rows:


Unnamed: 0,Temperature (C),Apparent Temperature (C),Humidity,Wind Speed (km/h),Wind Bearing (degrees),Visibility (km),Loud Cover,Pressure (millibars),Summary,Precip Type,Year,Month,Day,Hour


In [19]:
null_values = weather_hist.isna().sum()
print("\nNull values:")
null_values



Null values:


Temperature (C)             0
Apparent Temperature (C)    0
Humidity                    0
Wind Speed (km/h)           5
Wind Bearing (degrees)      0
Visibility (km)             8
Loud Cover                  0
Pressure (millibars)        7
Summary                     0
Precip Type                 0
Year                        0
Month                       0
Day                         0
Hour                        0
dtype: int64

In [20]:
from sklearn.model_selection import train_test_split
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import GridSearchCV
import optuna
from sklearn.model_selection import cross_val_score

# Split the dataset into features (X) and target variable (y)
X = weather_hist.drop(['Summary'], axis=1)
y = weather_hist['Summary']



# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


def objective(trial):
    param = {
        'objective': 'multi:softmax',
        'num_class': 8,
        'eval_metric': 'mlogloss',
        'eta': trial.suggest_float('eta', 0.01, 0.3),
        'max_depth': trial.suggest_int('max_depth', 3, 10),
        'min_child_weight': trial.suggest_int('min_child_weight', 1, 10),
        'subsample': trial.suggest_float('subsample', 0.5, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.5, 1.0),
        'gamma': trial.suggest_float('gamma', 0, 0.5),
        'verbosity': 0
    }

    model = XGBClassifier(**param)
    scores = cross_val_score(model, X, y, cv=5)
    accuracy = scores.mean()
    return accuracy

In [21]:
study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=50)

[I 2024-04-07 09:47:11,125] A new study created in memory with name: no-name-d44b0e16-07bc-4a7e-aa0e-10282963d679
[I 2024-04-07 09:47:29,696] Trial 0 finished with value: 0.5625766468614513 and parameters: {'eta': 0.21303105011619095, 'max_depth': 7, 'min_child_weight': 7, 'subsample': 0.7161423909739228, 'colsample_bytree': 0.5963130802598842, 'gamma': 0.14216896277218122}. Best is trial 0 with value: 0.5625766468614513.
[I 2024-04-07 09:47:49,791] Trial 1 finished with value: 0.6877096099474487 and parameters: {'eta': 0.013588155018901656, 'max_depth': 3, 'min_child_weight': 10, 'subsample': 0.8492770435853396, 'colsample_bytree': 0.6020379933372533, 'gamma': 0.24176381317042112}. Best is trial 1 with value: 0.6877096099474487.
[I 2024-04-07 09:48:16,348] Trial 2 finished with value: 0.5189654199012756 and parameters: {'eta': 0.2596895165579383, 'max_depth': 7, 'min_child_weight': 4, 'subsample': 0.7252815960000816, 'colsample_bytree': 0.845308857150419, 'gamma': 0.11005317289975336}

[I 2024-04-07 09:58:37,620] Trial 26 finished with value: 0.6364667806460125 and parameters: {'eta': 0.05047459725462048, 'max_depth': 6, 'min_child_weight': 7, 'subsample': 0.8716791991206305, 'colsample_bytree': 0.7060773412178093, 'gamma': 0.08295305012093263}. Best is trial 12 with value: 0.720099117501713.
[I 2024-04-07 09:59:02,182] Trial 27 finished with value: 0.6806467486227317 and parameters: {'eta': 0.024264045598104182, 'max_depth': 4, 'min_child_weight': 9, 'subsample': 0.9428108196322648, 'colsample_bytree': 0.5267313037587191, 'gamma': 0.03656256669376784}. Best is trial 12 with value: 0.720099117501713.
[I 2024-04-07 09:59:31,200] Trial 28 finished with value: 0.5507930809148485 and parameters: {'eta': 0.08455291310955762, 'max_depth': 8, 'min_child_weight': 8, 'subsample': 0.8222026536518506, 'colsample_bytree': 0.623176882032771, 'gamma': 0.16572480876181106}. Best is trial 12 with value: 0.720099117501713.
[I 2024-04-07 09:59:56,029] Trial 29 finished with value: 0.6

In [22]:
from sklearn.metrics import classification_report, confusion_matrix

# Use the best hyperparameters found by Optuna
best_params = study.best_params
model = XGBClassifier(**best_params)

# Fit the model on the training data
model.fit(X_train, y_train)

# Predict on the training and test sets
y_train_pred = model.predict(X_train)
y_test_pred = model.predict(X_test)

# Calculate accuracies
train_accuracy = accuracy_score(y_train, y_train_pred)
test_accuracy = accuracy_score(y_test, y_test_pred)

# Generate confusion matrices
train_conf_matrix = confusion_matrix(y_train, y_train_pred)
test_conf_matrix = confusion_matrix(y_test, y_test_pred)

# Generate classification reports
train_classification_report = classification_report(y_train, y_train_pred)
test_classification_report = classification_report(y_test, y_test_pred)

# Print the results
print("Train Accuracy:", train_accuracy)
print("Test Accuracy:", test_accuracy)

print("\nTrain Confusion Matrix:")
print(train_conf_matrix)

print("\nTest Confusion Matrix:")
print(test_conf_matrix)

print("\nTrain Classification Report:")
print(train_classification_report)

print("\nTest Classification Report:")
print(test_classification_report)


Train Accuracy: 0.8896393476626306
Test Accuracy: 0.8864920922997148

Train Confusion Matrix:
[[60912    58    85   126     0     0     0     0]
 [   50  5691     0     3     0     0     0     0]
 [    2     0  1158     0     0    21     1     0]
 [ 7878    27     3   751     0     0     1     0]
 [   49     0     0     0     0     0     0     0]
 [    0     1    60     0     0    60     0     0]
 [   86     0     0     2     0     0    21     0]
 [   56     4     0     0     0     0     0    32]]

Test Confusion Matrix:
[[15154    19    30    29     0     0     0     2]
 [   15  1427     0     0     0     0     0     1]
 [    4     0   286     0     0    12     0     0]
 [ 1998     6     0   207     0     0     1     0]
 [   15     0     0     0     0     0     0     0]
 [    0     0    22     0     0    12     0     0]
 [   19     0     0     0     0     0     6     0]
 [   16     0     0     0     0     0     0     4]]

Train Classification Report:
              precision    recall 

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [23]:
import pickle
filename = 'savemodel.sav'
pickle.dump(model, open(filename, 'wb'))

In [24]:
load_model = pickle.load(open(filename, 'rb'))

In [27]:

# Make predictions
predictions = model.predict([[9.5, 7.4, 0.8, 10.2, 200, 
               16.0, 0, 1016.0, 0, 2024, 4, 1, 3]])
