In [96]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

import joblib
import json

from sklearn.preprocessing import LabelEncoder, MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier

In [97]:
# Load the raw CSV once and keep `original_data` untouched; all preprocessing
# below operates on the `data` copy.
# A forward slash is used in the path because "\w" is not a valid escape
# sequence in a normal string literal (Python warns about it) and the backslash
# form only resolves on Windows. Forward slashes work on every platform.
path = "data/weathersense_gather.csv"
original_data = pd.read_csv(path)
data = original_data.copy()

display(original_data.head())
display(data.head())

  path = "data\weathersense_gather.csv"


Unnamed: 0,id,ts,temp_sensor,humidity_sensor,temp_api,humidity_api,pressure,wind_speed,cloudiness,weather
0,1,2024-04-18 17:04:21,20,50,36.88,57,1004,5.66,20,Clouds
1,2,2024-04-18 17:14:22,20,47,36.38,58,1004,5.66,20,Clouds
2,3,2024-04-18 17:24:22,20,50,35.8,58,1003,5.66,20,Clouds
3,4,2024-04-18 17:34:22,20,49,35.8,58,1003,5.66,20,Clouds
4,5,2024-04-18 17:44:23,20,49,35.49,59,1003,5.66,20,Clouds


Unnamed: 0,id,ts,temp_sensor,humidity_sensor,temp_api,humidity_api,pressure,wind_speed,cloudiness,weather
0,1,2024-04-18 17:04:21,20,50,36.88,57,1004,5.66,20,Clouds
1,2,2024-04-18 17:14:22,20,47,36.38,58,1004,5.66,20,Clouds
2,3,2024-04-18 17:24:22,20,50,35.8,58,1003,5.66,20,Clouds
3,4,2024-04-18 17:34:22,20,49,35.8,58,1003,5.66,20,Clouds
4,5,2024-04-18 17:44:23,20,49,35.49,59,1003,5.66,20,Clouds


# Preprocessing

In [98]:
# Inspect the dtype pandas inferred for each column before any conversion is applied.
data.dtypes

id                   int64
ts                  object
temp_sensor          int64
humidity_sensor      int64
temp_api           float64
humidity_api         int64
pressure             int64
wind_speed         float64
cloudiness           int64
weather             object
dtype: object

In [99]:
# Convert the `ts` column to pandas datetimes; every other column is left as-is.
data = data.assign(ts=pd.to_datetime(data["ts"]))
display(data.head())

Unnamed: 0,id,ts,temp_sensor,humidity_sensor,temp_api,humidity_api,pressure,wind_speed,cloudiness,weather
0,1,2024-04-18 17:04:21,20,50,36.88,57,1004,5.66,20,Clouds
1,2,2024-04-18 17:14:22,20,47,36.38,58,1004,5.66,20,Clouds
2,3,2024-04-18 17:24:22,20,50,35.8,58,1003,5.66,20,Clouds
3,4,2024-04-18 17:34:22,20,49,35.8,58,1003,5.66,20,Clouds
4,5,2024-04-18 17:44:23,20,49,35.49,59,1003,5.66,20,Clouds


In [100]:
# Derive calendar features (day of week, month, hour) from `ts`, then discard
# the raw timestamp column.
# NOTE: `ts` is consumed here, so running this cell a second time without
# re-running the cells above raises KeyError.
data = (
    data
    .assign(
        day_of_week=lambda frame: frame["ts"].dt.dayofweek,
        month=lambda frame: frame["ts"].dt.month,
        hour=lambda frame: frame["ts"].dt.hour,
    )
    .drop(columns="ts")
)
display(data.head())

Unnamed: 0,id,temp_sensor,humidity_sensor,temp_api,humidity_api,pressure,wind_speed,cloudiness,weather,day_of_week,month,hour
0,1,20,50,36.88,57,1004,5.66,20,Clouds,3,4,17
1,2,20,47,36.38,58,1004,5.66,20,Clouds,3,4,17
2,3,20,50,35.8,58,1003,5.66,20,Clouds,3,4,17
3,4,20,49,35.8,58,1003,5.66,20,Clouds,3,4,17
4,5,20,49,35.49,59,1003,5.66,20,Clouds,3,4,17


In [101]:
# Separate the `weather` label (target) from every remaining column (features).
# NOTE(review): `id` stays in X as a feature — presumably just a row identifier;
# confirm it is really meant to be a predictor.
y = data["weather"]
X = data.drop(columns=["weather"])

In [102]:
# Show the feature frame followed by the target series.
display(X, y)

Unnamed: 0,id,temp_sensor,humidity_sensor,temp_api,humidity_api,pressure,wind_speed,cloudiness,day_of_week,month,hour
0,1,20,50,36.88,57,1004,5.66,20,3,4,17
1,2,20,47,36.38,58,1004,5.66,20,3,4,17
2,3,20,50,35.80,58,1003,5.66,20,3,4,17
3,4,20,49,35.80,58,1003,5.66,20,3,4,17
4,5,20,49,35.49,59,1003,5.66,20,3,4,17
...,...,...,...,...,...,...,...,...,...,...,...
153,154,19,55,38.36,32,1007,2.57,20,5,4,12
154,155,19,55,38.36,32,1007,2.57,20,5,4,12
155,156,18,55,38.36,32,1007,2.57,20,5,4,13
156,157,18,55,39.27,29,1006,3.09,20,5,4,13


0      Clouds
1      Clouds
2      Clouds
3      Clouds
4      Clouds
        ...  
153    Clouds
154    Clouds
155    Clouds
156    Clouds
157    Clouds
Name: weather, Length: 158, dtype: object

In [103]:
# Hold out 20% of the rows as a test set; the fixed random_state keeps the
# split identical across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=1,
)

In [104]:
# Learn the label -> integer mapping from the training labels only, then
# encode those same labels with it.
label_encoder = LabelEncoder().fit(y_train)
y_train_encoded = label_encoder.transform(y_train)
display(y_train_encoded)

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0,
       1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0,
       0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1])

In [105]:
# Fit the min-max scaler on the training rows only and rebind `X_train` to the
# scaled array, which is what the cells below consume.
# The rows are re-selected from `X` through the split's index instead of being
# read from `X_train`. This cell overwrites `X_train` with its scaled version,
# so re-running it would otherwise fit the scaler on already-scaled data and
# leave `min_max_scaler` as an effectively identity transform that is later
# saved to disk. `train_test_split` keeps the original row index on `y_train`,
# so `X.loc[y_train.index]` yields the same training rows in the same order.
min_max_scaler = MinMaxScaler()
X_train = min_max_scaler.fit_transform(X.loc[y_train.index])

In [106]:
# Show the scaled training features followed by the encoded training labels.
display(X_train, y_train_encoded)

array([[0.85987261, 0.92307692, 0.77358491, ..., 1.        , 0.        ,
        0.39130435],
       [0.28025478, 0.92307692, 1.        , ..., 0.5       , 0.        ,
        0.39130435],
       [0.8089172 , 0.69230769, 0.26415094, ..., 1.        , 0.        ,
        0.08695652],
       ...,
       [0.45859873, 0.53846154, 0.52830189, ..., 0.5       , 0.        ,
        0.65217391],
       [0.89171975, 0.23076923, 0.45283019, ..., 1.        , 0.        ,
        0.43478261],
       [0.23566879, 0.76923077, 0.41509434, ..., 0.5       , 0.        ,
        0.04347826]])

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0,
       1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0,
       0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1])

In [107]:
# Train a 100-tree random forest on the scaled features and encoded labels;
# the fixed random_state makes the fitted forest reproducible.
rf_classifier = RandomForestClassifier(random_state=1, n_estimators=100)
rf_classifier.fit(X=X_train, y=y_train_encoded)

In [108]:
# The forest was trained on min-max-scaled features, so the held-out rows must
# pass through the same fitted scaler before prediction. Use transform only,
# never re-fit on test data. Previously the raw, unscaled `X_test` was fed to
# the model.
X_test_scaled = min_max_scaler.transform(X_test)
predictions_encoded = rf_classifier.predict(X_test_scaled)
# Map the integer class ids back to the original weather labels.
predictions = label_encoder.inverse_transform(predictions_encoded)
display(predictions)



array(['Clouds', 'Clouds', 'Clouds', 'Clouds', 'Clouds', 'Clouds',
       'Clouds', 'Clouds', 'Clouds', 'Clouds', 'Clouds', 'Clouds',
       'Clouds', 'Clouds', 'Clouds', 'Clouds', 'Clouds', 'Clouds',
       'Clouds', 'Clouds', 'Clouds', 'Clouds', 'Clouds', 'Clouds',
       'Clouds', 'Clouds', 'Clouds', 'Clouds', 'Clouds', 'Clouds',
       'Clouds', 'Clouds'], dtype=object)

In [109]:
# Persist the three fitted objects needed at inference time: the classifier,
# the label encoder (to decode predictions) and the feature scaler.
joblib.dump(value=rf_classifier, filename='random_foresting_model.pkl')
joblib.dump(value=label_encoder, filename='label_encoder_model.pkl')
joblib.dump(value=min_max_scaler, filename='min_max_scaler_model.pkl')

['min_max_scaler_model.pkl']