#### 1. import libraries 

In [None]:

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.feature_selection import RFE
from sklearn.model_selection import train_test_split , cross_val_score , GridSearchCV
from sklearn.preprocessing import StandardScaler , LabelEncoder 
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
import joblib

import warnings
warnings.filterwarnings('ignore')

#### 2. load the dataset

In [None]:


df = pd.read_csv("dataset.csv")

#### 3. Explore the dataset (EDA)

In [None]:

df.head()

Unnamed: 0,Temperature,Humidity,Wind Speed,Precipitation (%),Cloud Cover,Atmospheric Pressure,UV Index,Season,Visibility (km),Location,Weather Type
0,14.0,73,9.5,82.0,partly cloudy,1010.82,2,Winter,3.5,inland,Rainy
1,39.0,96,8.5,71.0,partly cloudy,1011.43,7,Spring,10.0,inland,Cloudy
2,30.0,64,7.0,16.0,clear,1018.72,5,Spring,5.5,mountain,Sunny
3,38.0,83,1.5,82.0,clear,1026.25,7,Spring,1.0,coastal,Sunny
4,27.0,74,17.0,66.0,overcast,990.67,1,Winter,2.5,mountain,Rainy


In [81]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13200 entries, 0 to 13199
Data columns (total 11 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   Temperature           13200 non-null  float64
 1   Humidity              13200 non-null  int64  
 2   Wind Speed            13200 non-null  float64
 3   Precipitation (%)     13200 non-null  float64
 4   Cloud Cover           13200 non-null  object 
 5   Atmospheric Pressure  13200 non-null  float64
 6   UV Index              13200 non-null  int64  
 7   Season                13200 non-null  object 
 8   Visibility (km)       13200 non-null  float64
 9   Location              13200 non-null  object 
 10  Weather Type          13200 non-null  object 
dtypes: float64(5), int64(2), object(4)
memory usage: 1.1+ MB


In [82]:
df.describe()

Unnamed: 0,Temperature,Humidity,Wind Speed,Precipitation (%),Atmospheric Pressure,UV Index,Visibility (km)
count,13200.0,13200.0,13200.0,13200.0,13200.0,13200.0,13200.0
mean,19.127576,68.710833,9.832197,53.644394,1005.827896,4.005758,5.462917
std,17.386327,20.194248,6.908704,31.946541,37.199589,3.8566,3.371499
min,-25.0,20.0,0.0,0.0,800.12,0.0,0.0
25%,4.0,57.0,5.0,19.0,994.8,1.0,3.0
50%,21.0,70.0,9.0,58.0,1007.65,3.0,5.0
75%,31.0,84.0,13.5,82.0,1016.7725,7.0,7.5
max,109.0,109.0,48.5,109.0,1199.21,14.0,20.0


In [83]:
#  check unique values in each column
for col in df.columns:
    print(f"Column: {col}")
    print(df[col].unique())
    print("-" * 30)


Column: Temperature
[ 14.  39.  30.  38.  27.  32.  -2.   3.  28.  35.  12. -10.  24.  10.
  33.  43.  13.  -7.  26.   4.  17.  40.   2.  15.  29.  11.  -9.  36.
  42.  21.  22.  25.  -4.  -1.  -5.  41.  31.  16.  34.  49.  19.  23.
  20.  -3.  18.   1.   0.  46.  44. -13.  -6.  78.  63.  73.   8. -12.
 -24.  -8.  60.  48.   5.  51. -14.  50.  37.  54.  47.  70.   9.  66.
 -16. -15.  59.  80. -19.  52.  45.   6. -18. -11.  74.  76.  55. -20.
  57.  91.  82. -17.  61.   7.  53.  65.  77.  67.  64.  58.  68.  72.
  62.  71.  56. 107. -22.  75.  85.  97.  84. -21.  92. -25.  81. 109.
  98.  94.  90. -23.  88.  99.  69. 100.  89. 102.  86. 108.  87.  95.]
------------------------------
Column: Humidity
[ 73  96  64  83  74  55  97  85  45  43  59  87  21  50  27  51  46 102
  67  88  36  79  72  57  61  70  95  69  90 105  49  37  22  54  66  25
  91  98  94  41  84  63  75  52  89  47  81  62  31  68  35  78  56  93
  44  38  24  82  65  80  39  48  60  29  99  92  76  77  86  32  58  42


#### 4. check for missing values 

In [None]:

df.isnull().sum()

Temperature             0
Humidity                0
Wind Speed              0
Precipitation (%)       0
Cloud Cover             0
Atmospheric Pressure    0
UV Index                0
Season                  0
Visibility (km)         0
Location                0
Weather Type            0
dtype: int64

#### 5. Convert categorical features into numerical form

In [85]:
# check categorical columns
categorical_cols = []

for col in df.columns:
    if df[col].dtype == 'object':
        categorical_cols.append(col)

categorical_cols


['Cloud Cover', 'Season', 'Location', 'Weather Type']

In [86]:
# Dictionary to store encoders for each column
encoders = {}

for col in categorical_cols:
    le = LabelEncoder()
    df[col] = le.fit_transform(df[col])
    encoders[col] = le  # store encoder
    
    # Print mapping
    print(f"\nLabel Encoding for Column: {col}")
    mapping = dict(zip(le.classes_, le.transform(le.classes_)))
    for k, v in mapping.items():
        print(f"'{k}' -> {v}")
    print("-" * 30)


Label Encoding for Column: Cloud Cover
'clear' -> 0
'cloudy' -> 1
'overcast' -> 2
'partly cloudy' -> 3
------------------------------

Label Encoding for Column: Season
'Autumn' -> 0
'Spring' -> 1
'Summer' -> 2
'Winter' -> 3
------------------------------

Label Encoding for Column: Location
'coastal' -> 0
'inland' -> 1
'mountain' -> 2
------------------------------

Label Encoding for Column: Weather Type
'Cloudy' -> 0
'Rainy' -> 1
'Snowy' -> 2
'Sunny' -> 3
------------------------------


In [87]:
df.dtypes

Temperature             float64
Humidity                  int64
Wind Speed              float64
Precipitation (%)       float64
Cloud Cover               int64
Atmospheric Pressure    float64
UV Index                  int64
Season                    int64
Visibility (km)         float64
Location                  int64
Weather Type              int64
dtype: object

#### 6. Feature selection using Wrapper Method: RFE (Recursive Feature Elimination)

In [88]:
X = df.drop("Weather Type", axis=1)
y = df["Weather Type"]


In [89]:

cv_scores = {}

for n in range(4, 11):
    rf = RandomForestClassifier(random_state=42)
    rfe = RFE(estimator=rf, n_features_to_select=n)
    
    X_rfe = rfe.fit_transform(X, y)
    
    scores = cross_val_score(rf, X_rfe, y, cv=5, scoring='accuracy')
    cv_scores[n] = scores.mean()
    
    print(f"Number of Features: {n}, CV Accuracy: {scores.mean():.4f}")


Number of Features: 4, CV Accuracy: 0.8810
Number of Features: 5, CV Accuracy: 0.8988
Number of Features: 6, CV Accuracy: 0.9086
Number of Features: 7, CV Accuracy: 0.9111
Number of Features: 8, CV Accuracy: 0.9097
Number of Features: 9, CV Accuracy: 0.9147
Number of Features: 10, CV Accuracy: 0.9131


In [90]:
best_n_features = max(cv_scores, key=cv_scores.get)
best_n_features


9

In [91]:
rfe = RFE(
    estimator=RandomForestClassifier(random_state=42),
    n_features_to_select=best_n_features
)

rfe.fit(X, y)

selected_features = X.columns[rfe.support_]
selected_features


Index(['Temperature', 'Humidity', 'Wind Speed', 'Precipitation (%)',
       'Cloud Cover', 'Atmospheric Pressure', 'UV Index', 'Season',
       'Visibility (km)'],
      dtype='object')

In [92]:
X = X[selected_features]

#### 7. Split the data into training and testing sets

In [93]:
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.2,random_state=42,stratify=y)


#### 8. Apply normalization / standardization

In [94]:
scaler = StandardScaler()

X_train_scaled = scaler.fit_transform(X_train)   # fit on training
X_test_scaled = scaler.transform(X_test)         # transform test


#### 9. Model Selection

In [95]:
rf = RandomForestClassifier(random_state=42)

#### 10. Hyperparameter Tuning + Cross Validation

In [None]:


param_grid = {
    'n_estimators': [100, 200],
    'max_depth': [None, 10, 20],
    'min_samples_split': [2, 5],
    'min_samples_leaf': [1, 2]
}
grid = GridSearchCV(
    estimator=rf,
    param_grid=param_grid,
    cv=5,
    scoring='accuracy',
    n_jobs=-1
)

grid.fit(X_train_scaled, y_train)


0,1,2
,estimator,RandomForestC...ndom_state=42)
,param_grid,"{'max_depth': [None, 10, ...], 'min_samples_leaf': [1, 2], 'min_samples_split': [2, 5], 'n_estimators': [100, 200]}"
,scoring,'accuracy'
,n_jobs,-1
,refit,True
,cv,5
,verbose,0
,pre_dispatch,'2*n_jobs'
,error_score,
,return_train_score,False

0,1,2
,n_estimators,100
,criterion,'gini'
,max_depth,20
,min_samples_split,5
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,'sqrt'
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True


#### 11. Best Model

In [97]:
best_model = grid.best_estimator_
grid.best_params_


{'max_depth': 20,
 'min_samples_leaf': 1,
 'min_samples_split': 5,
 'n_estimators': 100}

#### 12. Model Evaluation

In [98]:
y_pred = best_model.predict(X_test_scaled)

print("Accuracy:", accuracy_score(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))
print("\nConfusion Matrix:\n", confusion_matrix(y_test, y_pred))


Accuracy: 0.9132575757575757

Classification Report:
               precision    recall  f1-score   support

           0       0.87      0.92      0.89       660
           1       0.90      0.91      0.90       660
           2       0.95      0.91      0.93       660
           3       0.95      0.92      0.93       660

    accuracy                           0.91      2640
   macro avg       0.91      0.91      0.91      2640
weighted avg       0.91      0.91      0.91      2640


Confusion Matrix:
 [[606  34  11   9]
 [ 35 601  14  10]
 [ 26  18 600  16]
 [ 30  17   9 604]]


#### 13. Save Model for Future Reuse

In [None]:


joblib.dump(best_model, "weather_rf_model.pkl")
joblib.dump(scaler, "scaler.pkl")
joblib.dump(selected_features, "selected_features.pkl")
joblib.dump(encoders, "label_encoders.pkl")
print()
print("Model, scaler, selected features, and encoders saved successfully!")


Model, scaler, selected features, and encoders saved successfully!
