In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split


In [2]:
# Load the semicolon-delimited weather observations and show the frame.
data = pd.read_csv("weather_data.csv", sep=";")
data

Unnamed: 0,time,month,temperature,feelslike,wind,direction,gust,cloud,humidity,precipitation,pressure,weather
0,0:00,1,24.0,28.0,8.0,ENE,12.0,4.0,86.0,0.0,1012.0,Clear
1,3:00,1,23.0,27.0,8.0,NE,10.0,4.0,88.0,0.0,1011.0,Clear
2,6:00,1,23.0,26.0,8.0,NNE,11.0,7.0,85.0,0.0,1012.0,Sunny
3,9:00,1,28.0,33.0,11.0,NNE,13.0,6.0,64.0,0.0,1012.0,Sunny
4,12:00,1,31.0,35.0,10.0,ENE,12.0,62.0,53.0,0.0,1010.0,Partly cloudy
...,...,...,...,...,...,...,...,...,...,...,...,...
8507,9:00,11,29.0,32.0,4.0,ENE,5.0,17.0,68.0,0.0,1012.0,Partly cloudy
8508,12:00,11,33.0,37.0,5.0,ENE,5.0,15.0,52.0,0.0,1011.0,Partly cloudy
8509,15:00,11,34.0,38.0,3.0,ENE,4.0,19.0,50.0,0.0,1009.0,Partly cloudy
8510,18:00,11,31.0,34.0,6.0,SE,8.0,19.0,65.0,0.0,1009.0,Partly cloudy


In [3]:
# Label every row with the weather of the row that follows it. The final row
# has no successor, so its target is NaN.
data["target"] = data["weather"].shift(-1)

# List the distinct wind directions present in the data.
print(data["direction"].unique())

['ENE' 'NE' 'NNE' 'ESE' 'E' 'SE' 'SSE' 'NNW' 'WSW' 'S' 'WNW' 'NW' 'W' 'SW'
 'SSW' 'N']


In [4]:
# Drop the columns that are not used as features:
#   - "weather": its next-row value already lives in "target"
#   - "precipitation": almost always 0, which the author found led to overfitting
data = data.drop(columns=["weather", "precipitation"])

# Collapse the 16-point compass to 8 points. N, E, S, W and the four
# intercardinal names are already in the 8-point set and stay as they are.
data['direction'] = data['direction'].replace({
    'ENE': 'NE', 'NNE': 'NE',
    'ESE': 'SE', 'SSE': 'SE',
    'NNW': 'NW', 'WNW': 'NW',
    'WSW': 'SW', 'SSW': 'SW',
})

# Bucket the eight observation times into two sessions per day.
data['time'] = data['time'].replace({
    '0:00': 'earlier', '3:00': 'earlier', '6:00': 'earlier', '9:00': 'earlier',
    '12:00': 'later', '15:00': 'later', '18:00': 'later', '21:00': 'later',
})

# Bucket the month number into a season name (three months per season).
data['month'] = data['month'].replace({
    1: 'spring', 2: 'spring', 3: 'spring',
    4: 'summer', 5: 'summer', 6: 'summer',
    7: 'autumn', 8: 'autumn', 9: 'autumn',
    10: 'winter', 11: 'winter', 12: 'winter',
})

In [5]:
# The target is the next row's weather, so the final row has no label (NaN).
# Drop it by condition instead of by the hard-coded label 8511: this does not
# depend on the exact row count and the cell can be re-run without a KeyError.
data = data.dropna(subset=["target"])

In [6]:
# The "month" column now holds season names, so give it a matching name.
data = data.rename({'month': 'season'}, axis='columns')

In [7]:
# Raw weather descriptions grouped into the three classes to predict.
list_norain = ['Clear', 'Cloudy', 'Mist', 'Sunny', 'Partly cloudy', 'Thundery outbreaks possible', 'Overcast']
list_hvrain = ['Patchy rain possible', 'Heavy rain', 'Heavy rain at times', 'Moderate or heavy rain shower', 'Torrential rain shower']
list_lrain = ['Light drizzle', 'Light rain', 'Light rain shower', 'Patchy light drizzle', 'Patchy light rain',
              'Patchy light rain with thunder', 'Moderate rain', 'Moderate rain at times']

# Raw description -> class name. Groups are visited in the same priority order
# as the original if/elif chain, and setdefault keeps the first match.
label_to_class = {}
for labels, rain_class in (
    (list_norain, 'There is no rain'),
    (list_hvrain, 'There is heavy rain'),
    (list_lrain, 'There is light or little rain'),
):
    for label in labels:
        label_to_class.setdefault(label, rain_class)

# Map every row. A target that is missing or not listed above maps to NaN.
mapped = data["target"].map(label_to_class)

# Fail loudly instead of silently skipping such rows: a skipped row would make
# y shorter than the feature matrix and shift every later label onto the
# wrong sample.
unmapped = data.loc[mapped.isna(), "target"]
if not unmapped.empty:
    raise ValueError(
        f"{len(unmapped)} row(s) have a target with no rain class: "
        f"{unmapped.unique().tolist()}"
    )

# One class name per row of `data`, in row order.
y = mapped.tolist()

In [8]:
# Feature matrix: the ten predictor columns as an object array. The column
# order matters, because the preprocessing step addresses columns by position.
# ("month" was already renamed to "season" in an earlier cell, so the rename
# is not repeated here.)
x = data[['time', 'season', 'temperature', 'feelslike', 'wind', 'direction',
          'gust', 'cloud', 'humidity', 'pressure']].values
x[:10]

array([['earlier', 'spring', 24.0, 28.0, 8.0, 'NE', 12.0, 4.0, 86.0,
        1012.0],
       ['earlier', 'spring', 23.0, 27.0, 8.0, 'NE', 10.0, 4.0, 88.0,
        1011.0],
       ['earlier', 'spring', 23.0, 26.0, 8.0, 'NE', 11.0, 7.0, 85.0,
        1012.0],
       ['earlier', 'spring', 28.0, 33.0, 11.0, 'NE', 13.0, 6.0, 64.0,
        1012.0],
       ['later', 'spring', 31.0, 35.0, 10.0, 'NE', 12.0, 62.0, 53.0,
        1010.0],
       ['later', 'spring', 31.0, 36.0, 1.0, 'NE', 1.0, 79.0, 57.0,
        1009.0],
       ['later', 'spring', 26.0, 29.0, 5.0, 'SE', 10.0, 40.0, 77.0,
        1010.0],
       ['later', 'spring', 25.0, 28.0, 8.0, 'NE', 15.0, 29.0, 79.0,
        1011.0],
       ['earlier', 'spring', 24.0, 26.0, 10.0, 'NE', 17.0, 12.0, 84.0,
        1011.0],
       ['earlier', 'spring', 23.0, 26.0, 8.0, 'NE', 14.0, 17.0, 85.0,
        1010.0]], dtype=object)

In [9]:
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler, LabelEncoder
from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.metrics import accuracy_score

# Positional column indices into the feature matrix `x`, whose columns are
# time(0), season(1), temperature(2), feelslike(3), wind(4), direction(5),
# gust(6), cloud(7), humidity(8), pressure(9).
cat_orders = [0, 1, 5]              # time, season, direction
num_orders = [2, 3, 4, 6, 7, 8, 9]  # temperature, feelslike, wind, gust, cloud, humidity, pressure

In [10]:
# Encode the three rain classes as integer labels. The encoder is fitted on
# the explicit class list, not on y.
le = LabelEncoder().fit([
    'There is no rain',
    'There is heavy rain',
    'There is light or little rain',
])
y = le.transform(y)

In [11]:
# Hold out 20% of the samples for testing, stratified on the class label and
# with a fixed seed so the split is reproducible.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=1,
    stratify=y,
)

In [12]:
# Pipeline for categorical features
cat_pl = Pipeline(
    steps=[
        # Fill missing values with the column's most frequent value
        # (the author notes this dataset has no missing values).
        ('imputer', SimpleImputer(strategy='most_frequent')),
        # Turn each category into a one-hot vector; handle_unknown='ignore'
        # means a category not seen during fit does not raise an error.
        ('onehot', OneHotEncoder(handle_unknown='ignore')),
    ]
)

# Pipeline for numerical features
num_pl = Pipeline(
    steps=[
        # Fill missing values with the column mean
        # (the author notes this dataset has no missing values).
        ('imputer', SimpleImputer(strategy='mean')),
        # Normalise each column with min-max scaling.
        ('mmscaler', MinMaxScaler()),
    ]
)

In [13]:
from sklearn.compose import ColumnTransformer

# Route each group of columns (addressed by position) through its pipeline.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', num_pl, num_orders),    # apply num_pl to the numerical columns
        ('cat', cat_pl, cat_orders),    # apply cat_pl to the categorical columns
    ]
)

In [14]:
# End-to-end model: column preprocessing followed by a random forest
# (100 trees, depth capped at 5, fixed seed).
completed_pl = Pipeline(
    steps=[
        ("preprocessor", preprocessor),
        (
            "classifier",
            RandomForestClassifier(
                n_estimators=100,
                max_depth=5,
                oob_score=True,
                n_jobs=-1,
                random_state=42,
            ),
        ),
    ]
)

# Fit on the training split.
completed_pl.fit(x_train, y_train)

# Accuracy on the data the model was fitted on.
y_train_pred = completed_pl.predict(x_train)
print(f"Accuracy on train: {accuracy_score(y_train, y_train_pred):.2f}")

# Accuracy on the held-out split.
y_pred = completed_pl.predict(x_test)
print(f"Accuracy on test: {accuracy_score(y_test, y_pred):.2f}")

Accuracy on train: 0.81
Accuracy on test: 0.80
