## Import Packages

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

pd.set_option("display.max_columns", None)
import warnings
warnings.filterwarnings("ignore")

color_pal = sns.color_palette()
plt.style.use('fivethirtyeight')

# Modelling
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier, ExtraTreesClassifier
from xgboost import XGBClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from catboost import CatBoostClassifier

# Preprocessing
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

# Metrics
from sklearn.metrics import f1_score, accuracy_score

# Saving the model
import joblib

## Load the Data

In [None]:
# Location of the Khulna weather CSV, relative to the notebook's working directory.
DATA_PATH = "../input/khulna-weather-dataset/khulna.csv"
df = pd.read_csv(DATA_PATH)

df.shape

## Exploratory Data Analysis

In [None]:
df.head()

In [None]:
df.info()

The columns `datetime`, `sunrise` and `sunset` are stored with the object data type, so we need to convert them to datetime.

In [None]:
# Parse the date-like columns (stored as object dtype) into datetime values.
DATE_COLUMNS = ['datetime', 'sunrise', 'sunset']

for date_col in DATE_COLUMNS:
    df[date_col] = pd.to_datetime(df[date_col])

In [None]:
df.info()

In [None]:
df.columns

In [None]:
## Drop the columns considered irrelevant for this analysis,
## and continue with an independent copy of the reduced frame.

df = df.drop(columns=['name', 'snow']).copy()

In [None]:
df.head()

In [None]:
df.describe().T

In [None]:
## Check for missing values

df.isnull().sum()

In [None]:
## Percentage of missing values per column

# Share of NaNs in each column, as a one-column frame (column label 0), largest first.
# `missing` is reused by the following cells to pick the columns to drop.
missing = (df.isna().sum() / df.shape[0] * 100).to_frame().sort_values(by=0, ascending=False)

# One bar per column; the x tick labels are hidden.
fig, ax = plt.subplots(figsize=(15, 5))
ax.bar(missing.index, missing[0].to_numpy())
ax.set_xticks([])
ax.set_ylabel("Percentage missing")
plt.show();

Here we can see that some features have more than 80% missing values. Features this sparse are of little use, so we drop every column with more than 70% missing values.

In [None]:
## Select (not yet drop) the columns with more than 70% missing values

dropcols = missing.loc[missing[0] > 70]
dropcols

In [None]:
## Remove the columns listed in `dropcols` (more than 70% missing values)
## and continue with an independent copy of the reduced frame.

df = df.drop(columns=list(dropcols.index)).copy()

In [None]:
df.shape

In [None]:
df.isnull().sum()

We still have some missing values; we will handle them later, directly in the preprocessing Pipeline.

In [None]:
# Defining a function to check the summary of the data
def summary(df):
    """Build a per-column overview table for ``df``.

    The result has one row per column of ``df`` with its dtype, non-null
    count, number of unique values, number of missing values and their
    percentage, followed by the ``describe()`` statistics (without the
    redundant ``count``). The ``duplicate`` column repeats the number of
    fully duplicated rows of the whole frame on every row.
    """
    n_missing = df.isna().sum()
    overview = pd.DataFrame(
        {
            'dtypes': df.dtypes,
            'count': df.count(),
            '#unique': df.nunique(),
            # Frame-level scalar, broadcast to every row of the overview.
            'duplicate': df.duplicated().sum(),
            '#missing': n_missing,
            'missing%': n_missing / len(df) * 100,
        },
        index=df.columns,
    )
    statistics = df.describe().T.drop('count', axis=1)
    return pd.concat([overview, statistics], axis=1)

In [None]:
summary(df).style.background_gradient(cmap='YlGnBu')

The summary shows that the dataset also contains duplicate rows, so let's remove them.

In [None]:
# Remove duplicate rows and keep working on an independent copy of the result.
# (The copy has to be assigned back: a bare `df.copy()` is discarded and only
# dumps the whole frame as the cell output.)
df = df.drop_duplicates().copy()

In [None]:
summary(df).style.background_gradient(cmap='YlGnBu')

Now there are no duplicates, let's proceed. 

In [None]:
df.head()

In [None]:
# Number of rows per value of the `conditions` column.
fig, ax = plt.subplots(figsize=(10, 4))
sns.countplot(data=df, x="conditions", ax=ax)
plt.show();

In [None]:
# Collapse the raw condition labels into four coarser weather classes.
RAINY_LABELS = ['Rain, Partially cloudy', 'Rain', 'Rain, Overcast', 'Rain, Fog']

weather_mapping = dict.fromkeys(RAINY_LABELS, 'Rainy Weather')
weather_mapping.update({
    'Partially cloudy': 'Partially Cloudy',
    'Clear': 'Clear Weather',
    'Overcast': 'Overcast',
})

# Note: any label that is not a key of the mapping becomes NaN after .map().
df['conditions'] = df['conditions'].map(weather_mapping)

In [None]:
df.head()

In [None]:
# Number of rows per value of the `conditions` column.
fig, ax = plt.subplots(figsize=(10, 4))
sns.countplot(data=df, x="conditions", ax=ax)
plt.show();

In [None]:
## Encode each condition label as an integer code (0-3)
CLASS_ORDER = ['Partially Cloudy', 'Rainy Weather', 'Clear Weather', 'Overcast']
risk_mapping = {label: code for code, label in enumerate(CLASS_ORDER)}

# Note: values that are not keys of the mapping become NaN after .map().
df["conditions"] = df["conditions"].map(risk_mapping)


In [None]:
## Drop further columns that are not used as model inputs,
## and continue with an independent copy of the reduced frame.

df = df.drop(columns=["datetime", "preciptype", "sunrise", "sunset"]).copy()


## Defining Numerical & Categorical features

In [None]:
# Separate the target column from the predictor columns.
y = df["conditions"]

X = df.drop(columns=["conditions"])

In [None]:
# Split the predictor columns by dtype: object columns are treated as
# categorical, every other dtype as numeric.
numeric_features = []
categorical_features = []

for feature in X.columns:
    if df[feature].dtype == 'O':
        categorical_features.append(feature)
    else:
        numeric_features.append(feature)

In [None]:
# Show both feature lists, separated by a horizontal rule.
print("Numerical Features are: ", numeric_features)

print("------------------------------------------------------------------------------")

print("Categorical Features are: ", categorical_features)

In [None]:
## Checking the shape of the data

df.shape

In [None]:
## Impute missing feature values: mode for categorical (object) columns,
## median for numeric columns.

# Rows without a label cannot be used for supervised training, and filling the
# target with a mode/median would fabricate labels, so such rows are removed.
df = df.dropna(subset=["conditions"]).copy()

# Only columns that actually contain missing values need any work.
cols_with_na = df.columns[df.isna().any()]

## Impute the categorical columns
# Restrict this step to object columns; previously every column with NaNs was
# mode-filled here, which left nothing for the median step below.
impute_cat_cols = df[cols_with_na].select_dtypes(include="object").columns.tolist()
for feat in impute_cat_cols:
    modes = df[feat].mode()
    if not modes.empty:  # an all-NaN column has no mode to fill with
        # Assign back instead of chained `fillna(inplace=True)`, which does not
        # update the frame under pandas Copy-on-Write.
        df[feat] = df[feat].fillna(modes.iloc[0])


## Impute for numerical columns
impute_num_cols = df[cols_with_na].select_dtypes(include="number").columns.tolist()
for feat in impute_num_cols:
    df[feat] = df[feat].fillna(df[feat].median())

In [None]:
# Distribution of every numeric feature: density histogram with a KDE overlay.
# `sns.distplot` is deprecated; `histplot(..., kde=True, stat="density")` is the
# replacement recommended by seaborn.
N_PLOT_COLS = 3
# Ceil division so the grid has just enough rows for all features (at least one).
n_plot_rows = max(1, -(-len(numeric_features) // N_PLOT_COLS))

fig, axes = plt.subplots(n_plot_rows, N_PLOT_COLS, figsize=(15, 4 * n_plot_rows), squeeze=False)
flat_axes = axes.ravel()

for ax, col in zip(flat_axes, numeric_features):
    sns.histplot(x=df[col], kde=True, stat="density", color='indianred', ax=ax)
    ax.set_xlabel(col, weight='bold')

# Hide the grid cells that have no feature to show.
for ax in flat_axes[len(numeric_features):]:
    ax.set_visible(False)

# Lay out once, after all panels are drawn, instead of once per panel.
fig.tight_layout()
plt.show()

In [None]:
# Class balance of the target: count per class (left) and share per class (right).
class_counts = df["conditions"].value_counts()

plt.figure(figsize=(12,4))

# barplot
ax1 = plt.subplot(1,2,1)
sns.countplot(x=df["conditions"], ax=ax1)
ax1.set_xlabel(" ")
ax1.set_ylabel(" ")
ax1.tick_params(axis="both", labelsize=14)
sns.despine(ax=ax1, top=True, right=True)

# pieplot
ax2 = plt.subplot(1,2,2)
# Wedge sizes and labels must come from the same Series: value_counts() is
# ordered by frequency while unique() is ordered by first appearance, so
# combining the two can attach a label to the wrong wedge.
ax2.pie(class_counts,
        labels=list(class_counts.index),
        autopct='%1.2f%%',
        pctdistance=0.8,
        shadow=True,
        radius=1.3,
        textprops={'fontsize':14}
       )
ax2.set_xlabel('Composition of "conditions "', fontsize=15, labelpad=20)
plt.subplots_adjust(wspace=0.4)
plt.show()

## Checking Multicollinearity

In [None]:
def remove_multicollinearity_and_save(data, threshold=0.8, save_path='data.csv', exclude=None):
    """Drop redundant, highly correlated numeric columns, save the result and plot the correlations.

    Columns are visited in their order in ``data``. A column is dropped when
    its absolute Pearson correlation with an already kept column is greater
    than ``threshold``; otherwise it is kept. One representative of every
    correlated group therefore survives (dropping both members of a pair would
    throw the shared information away entirely).

    Parameters
    ----------
    data : pandas.DataFrame
        Input frame. Non-numeric columns are ignored for the correlation
        analysis and are always kept.
    threshold : float, default 0.8
        Absolute correlation above which two columns count as collinear.
    save_path : str, default 'data.csv'
        CSV file the reduced frame is written to (without the index).
    exclude : iterable of str, optional
        Columns that take no part in the analysis and are never dropped.
        Pass the target column here; otherwise a target that is strongly
        correlated with a feature can be removed like any other column.

    Returns
    -------
    pandas.DataFrame
        ``data`` without the dropped columns (the content written to ``save_path``).
    """
    protected = set(exclude or ())
    candidates = [
        col for col in data.select_dtypes(include="number").columns
        if col not in protected
    ]

    # Correlations are only defined for numeric columns; selecting them
    # explicitly avoids the error DataFrame.corr raises on object columns.
    correlation_matrix = data[candidates].corr()
    abs_corr = correlation_matrix.abs()

    # Greedy pass: keep a column unless it is collinear with one kept earlier.
    # NaN correlations (e.g. constant columns) compare as False, so such
    # columns are kept.
    kept_cols = []
    cols_to_drop = []
    for col in abs_corr.columns:
        partners = [kept for kept in kept_cols if abs_corr.loc[kept, col] > threshold]
        if not partners:
            kept_cols.append(col)
            continue
        cols_to_drop.append(col)
        for partner in partners:
            print(f'Multicollinear Features: {partner} and {col} (Correlation: {correlation_matrix.loc[partner, col]:.2f})')

    # Remove the redundant columns and persist the reduced frame.
    new_data = data.drop(columns=cols_to_drop)
    new_data.to_csv(save_path, index=False)
    print(f'New DataFrame saved to {save_path}')

    # Plot the heatmap. Only the redundant upper triangle is hidden, so the
    # strongly correlated pairs stay visible.
    if not correlation_matrix.empty:
        upper_triangle = np.triu(np.ones(correlation_matrix.shape, dtype=bool), k=1)
        plt.figure(figsize=(25, 12))
        sns.heatmap(correlation_matrix, annot=True, linewidths=.5, mask=upper_triangle)
        plt.title('Correlation Matrix Heatmap')
        plt.show()

    return new_data

In [None]:
remove_multicollinearity_and_save(df)

In [None]:
# Reload the reduced dataset written by remove_multicollinearity_and_save().
# It was saved under the relative default path 'data.csv', so it is read back
# through the same relative path rather than a hard-coded absolute directory.
df = pd.read_csv("data.csv")

In [None]:
df.head()

In [None]:
# Separate the target column from the predictor columns of the reloaded frame.
y = df["conditions"]

X = df.drop(columns=["conditions"])

In [None]:
numeric_features = X.select_dtypes(exclude="object").columns
numeric_features

There are no categorical features

In [None]:
# Numeric preprocessing: fill missing values with the median, then standardise.
numeric_pipeline = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
])

# Apply the numeric pipeline to `numeric_features`; all other columns are dropped.
transformer = ColumnTransformer(
    transformers=[("numeric", numeric_pipeline, numeric_features)],
    remainder="drop",
)

## Train-Test Split

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [None]:
# Fit the imputer and scaler on the training split only, then apply the fitted
# transformer to the test split. Both names are rebound to the transformed data.
X_train = transformer.fit_transform(X_train)

X_test = transformer.transform(X_test)

In [None]:
X_train.shape, X_test.shape

## Baseline Modelling

In [None]:
def evaluate_model(true, predicted):
    """Score predictions against the ground-truth labels.

    Returns a ``(weighted F1-score, accuracy)`` tuple, in that order.
    """
    return (
        f1_score(true, predicted, average="weighted"),
        accuracy_score(true, predicted),
    )

In [None]:
def fit_classification_models(X, y, test_size=0.2, random_state=42):
    """Fit the baseline tree-ensemble classifiers and score them on a hold-out split.

    ``X`` and ``y`` are split once with ``train_test_split``; XGBoost,
    Random Forest and Extra Trees are each wrapped in a scaler + classifier
    pipeline, fitted on the training part and scored on the held-out part.

    Parameters
    ----------
    X, y
        Features and labels, in any form accepted by ``train_test_split``.
    test_size : float, default 0.2
        Fraction of the data held out for scoring.
    random_state : int, default 42
        Seed used for the split and for every classifier, so that repeated
        runs produce the same scores.

    Returns
    -------
    dict
        ``{model name: {'accuracy': ..., 'f1-score': ...}}`` where the
        F1-score is the weighted average over classes.
    """
    # Split the dataset into training and testing sets
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size, random_state=random_state)

    # Seed every estimator: unseeded ensembles give different scores on each run.
    classifiers = {
        'XGBoost': XGBClassifier(random_state=random_state),
        'Random Forest': RandomForestClassifier(random_state=random_state),
        'ExtraTreesClassifier': ExtraTreesClassifier(random_state=random_state),
    }

    # Fit and evaluate each classifier inside its own scaler + classifier pipeline
    results = {}
    for classifier_name, classifier in classifiers.items():
        pipeline = Pipeline([('scaler', StandardScaler()), ('classifier', classifier)])
        pipeline.fit(X_train, y_train)
        y_pred = pipeline.predict(X_test)
        results[classifier_name] = {
            'accuracy': accuracy_score(y_test, y_pred),
            'f1-score': f1_score(y_test, y_pred, average="weighted"),
        }

    return results

In [None]:
results = fit_classification_models(X_train, y_train)

In [None]:
# Tabulate the baseline scores: one row per model, one column per metric.
# A dedicated name is used so the dataset held in `df` is not overwritten.
results_df = pd.DataFrame(results).transpose()

# Sort the models by weighted F1-score in descending order
df_sorted = results_df.sort_values(by='f1-score', ascending=False)

# Display the sorted DataFrame
df_sorted

In [None]:
# Final model: XGBoost with an explicit seed for reproducibility.
model = XGBClassifier(random_state=42)

# Fit the model on the training data
model.fit(X_train, y_train)

# Make predictions on the test data
y_pred = model.predict(X_test)

# Report the hold-out performance; previously the test predictions were
# computed but never scored.
test_f1, test_accuracy = evaluate_model(y_test, y_pred)
print(f"Test weighted F1: {test_f1:.4f} | Test accuracy: {test_accuracy:.4f}")

## Save the Model

In [None]:
# Persist the fitted model with joblib. Note: joblib writes its own
# pickle-based format, not HDF5, despite the '.h5' extension of the file name.
# Only `model` is stored here; the fitted `transformer` is not saved.
filename = 'khulna_model.h5'
joblib.dump(model, filename)


# Reload the model from the file that was just written. joblib.load unpickles
# its input, so it should only be used on trusted files.
loaded_model = joblib.load(filename)

In [None]:
# Check the reloaded model by predicting on the held-out test split (X_test).
# No additional, unseen data is loaded in this cell.

y_new_pred = loaded_model.predict(X_test)

# Display or use the predictions
print("Predictions on new data:", y_new_pred)