# Weather Prediction

In [17]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
from sklearn.preprocessing import StandardScaler

In [18]:
# use GPU for training.
# I run this on Google Colab

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
device

device(type='cuda')

## Brief Data Analysis of Training Set

In [105]:
df = pd.read_csv('drive/MyDrive/weather_deep_learning/weather_train.csv')
df.head()

Unnamed: 0,ID,Temperature,Humidity,Wind Speed,Precipitation (%),Cloud Cover,Atmospheric Pressure,UV Index,Season,Visibility (km),Location,Weather Type
0,W9972,23.0,69,9.5,10.0,clear,1012.91,9,Spring,8.5,inland,Sunny
1,W501,29.0,82,18.0,78.0,overcast,991.54,3,Spring,4.0,inland,Rainy
2,W9187,-2.0,74,3.0,74.0,overcast,982.42,0,Winter,2.5,inland,Snowy
3,W6418,36.0,48,3.5,11.0,partly cloudy,1024.92,10,Autumn,5.5,mountain,Sunny
4,W5603,32.0,35,5.5,9.0,partly cloudy,1010.24,6,Summer,9.0,coastal,Sunny


In [20]:
df.shape

(11220, 12)

In [21]:
df.dtypes

Unnamed: 0,0
ID,object
Temperature,float64
Humidity,int64
Wind Speed,float64
Precipitation (%),float64
Cloud Cover,object
Atmospheric Pressure,float64
UV Index,int64
Season,object
Visibility (km),float64


In [22]:
df.isnull().sum()  # very clean

Unnamed: 0,0
ID,0
Temperature,0
Humidity,0
Wind Speed,0
Precipitation (%),0
Cloud Cover,0
Atmospheric Pressure,0
UV Index,0
Season,0
Visibility (km),0


In [106]:
df['Weather Type'].value_counts()  # perfectly balanced

Unnamed: 0_level_0,count
Weather Type,Unnamed: 1_level_1
Sunny,2805
Rainy,2805
Snowy,2805
Cloudy,2805


In [23]:
# see Categorical data
cloud_cover = len(df['Cloud Cover'].unique())
season = len(df['Season'].unique())
location = len(df['Location'].unique())
target = len(df['Weather Type'].unique())

print(f'types count of cloud cover: {cloud_cover}')
print(f'types count of season: {season}')
print(f'types count of location: {location}')
print()
print(f'types count of target (season type): {target}')

types count of cloud cover: 4
types count of season: 4
types count of location: 3

types count of target (season type): 4


## One-hot Encoding & Normalization of the Dataframe

In [24]:
one_hot_df = pd.get_dummies(df[['Cloud Cover', 'Season', 'Location']], dtype=int)

id = df['ID']
target = df['Weather Type']

trunk = df.drop(['Cloud Cover', 'Season', 'Location', 'Weather Type', 'ID'], axis=1)
normalized_df = (trunk-trunk.min())/(trunk.max()-trunk.min())
X = pd.concat([normalized_df, one_hot_df], axis=1)
X = pd.concat([id, X], axis=1)

Y = target.replace({'Sunny': 0, 'Cloudy': 1, 'Rainy': 2, 'Snowy': 3})

X.head()

  Y = target.replace({'Sunny': 0, 'Cloudy': 1, 'Rainy': 2, 'Snowy': 3})


Unnamed: 0,ID,Temperature,Humidity,Wind Speed,Precipitation (%),Atmospheric Pressure,UV Index,Visibility (km),Cloud Cover_clear,Cloud Cover_cloudy,Cloud Cover_overcast,Cloud Cover_partly cloudy,Season_Autumn,Season_Spring,Season_Summer,Season_Winter,Location_coastal,Location_inland,Location_mountain
0,W9972,0.358209,0.550562,0.2,0.091743,0.533188,0.642857,0.425,1,0,0,0,0,1,0,0,0,1,0
1,W501,0.402985,0.696629,0.378947,0.715596,0.479641,0.214286,0.2,0,0,1,0,0,1,0,0,0,1,0
2,W9187,0.171642,0.606742,0.063158,0.678899,0.456789,0.0,0.125,0,0,1,0,0,0,0,1,0,1,0
3,W6418,0.455224,0.314607,0.073684,0.100917,0.563281,0.714286,0.275,0,0,0,1,1,0,0,0,0,0,1
4,W5603,0.425373,0.168539,0.115789,0.082569,0.526498,0.428571,0.45,0,0,0,1,0,0,1,0,1,0,0


In [25]:
Y.head()

Unnamed: 0,Weather Type
0,0
1,2
2,3
3,0
4,0


## Train-Validation Set Split

In [26]:
from sklearn.model_selection import train_test_split

X_train, X_val, y_train, y_val = train_test_split(X, Y, test_size=0.2, random_state=42)
X_train.drop('ID', axis=1, inplace=True)
X_val.drop('ID', axis=1, inplace=True)

## Training with Neural Network

In [27]:
_, input = X_train.shape
input

18

In [99]:
# build nn
class simpleNN(nn.Module):
  def __init__(self):
    super(simpleNN, self).__init__()

    self.linear = nn.Linear(input, 64)
    self.relu = nn.ReLU()
    self.linear2 = nn.Linear(64, 32)
    # self.tanh = nn.Tanh()
    self.linear3 = nn.Linear(32, 4)
    self.dropout = nn.Dropout(0.1)
    self.softmax = nn.Softmax(dim=1)
    self.logsoftmax = nn.LogSoftmax(dim=1)

  def forward(self, x):
    x = self.linear(x)
    x = self.relu(x)
    x = self.linear2(x)
    x = self.softmax(x)
    x = self.linear3(x)
    # x = self.dropout(x)
    x = self.logsoftmax(x)
    return x

model = simpleNN()
model = model.to('cuda')

In [61]:
# build dataset

class Dataset(torch.utils.data.Dataset):
  def __init__(self, X, y):
    self.X = X
    self.y = y

  def __len__(self):
    return len(self.X)

  def __getitem__(self, idx):
    return self.X[idx], self.y[idx]

dataset = Dataset(torch.tensor(X_train.values, dtype=torch.float32), torch.tensor(y_train.values, dtype=int))

In [100]:
# how to load data
batch_size = 32

DataLoaders = DataLoader(dataset, batch_size = batch_size, shuffle = False)

# loss function: class
loss = nn.NLLLoss()

# optimizer
optimizer = torch.optim.Adam(model.parameters(), lr=5e-4)

In [101]:
epochs = 50

for i in range(epochs):
  for batch, (data, target) in enumerate(DataLoaders):
    data, target = data.to(device), target.to(device)
    optimizer.zero_grad()

    output = model(data)
    # print(type(target))
    train_loss_value = loss(output, target)
    train_loss_value.backward()
    optimizer.step()

    if batch % 100 == 0:
      val_output = model(torch.tensor(X_val.values, dtype=torch.float32).to(device))
      val_loss_value = loss(val_output, torch.tensor(y_val.values, dtype=int).to(device))

      print(f'Epoch: {i+1}, Batch: {batch+1}, Train Loss: {train_loss_value.item()}')
      print(f'Epoch: {i+1}, Batch: {batch+1}, Validation Loss: {val_loss_value.item()}')
      print()

Epoch: 1, Batch: 1, Train Loss: 1.384965419769287
Epoch: 1, Batch: 1, Validation Loss: 1.3866589069366455

Epoch: 1, Batch: 101, Train Loss: 1.3680691719055176
Epoch: 1, Batch: 101, Validation Loss: 1.3751416206359863

Epoch: 1, Batch: 201, Train Loss: 1.3077558279037476
Epoch: 1, Batch: 201, Validation Loss: 1.3020418882369995

Epoch: 2, Batch: 1, Train Loss: 1.1879719495773315
Epoch: 2, Batch: 1, Validation Loss: 1.2142298221588135

Epoch: 2, Batch: 101, Train Loss: 1.0922119617462158
Epoch: 2, Batch: 101, Validation Loss: 1.1286453008651733

Epoch: 2, Batch: 201, Train Loss: 1.0689966678619385
Epoch: 2, Batch: 201, Validation Loss: 1.0798614025115967

Epoch: 3, Batch: 1, Train Loss: 1.0077331066131592
Epoch: 3, Batch: 1, Validation Loss: 1.047979474067688

Epoch: 3, Batch: 101, Train Loss: 0.9726382493972778
Epoch: 3, Batch: 101, Validation Loss: 1.0118319988250732

Epoch: 3, Batch: 201, Train Loss: 0.9689931273460388
Epoch: 3, Batch: 201, Validation Loss: 0.9794407486915588

Epoch:

## Save and Load Model for Test Set

In [102]:
# save
torch.save(model.state_dict(), 'model.pth')

In [103]:
# load
model.load_state_dict(torch.load('model.pth'))

# predict test set
test = pd.read_csv('drive/MyDrive/weather_deep_learning/weather_test.csv')
id = test['ID']

one_hot_test = pd.get_dummies(test[['Cloud Cover', 'Season', 'Location']], dtype=int)
trunk = test.drop(['Cloud Cover', 'Season', 'Location', 'ID'], axis=1)
normalized_test = (trunk-trunk.min())/(trunk.max()-trunk.min())
X_test = pd.concat([normalized_test, one_hot_test], axis=1)

preds = model(torch.tensor(X_test.values, dtype=torch.float32).to(device))
preds = torch.argmax(preds, dim=1)
preds = preds.cpu().detach().numpy()

preds = pd.DataFrame(preds, columns=['Weather Type'])
preds.shape

  model.load_state_dict(torch.load('model.pth'))


(1980, 1)

In [107]:
preds.replace({0: 'Sunny', 1: 'Cloudy', 2: 'Rainy', 3: 'Snowy'}, inplace=True)
pd.concat([id, preds], axis=1).to_csv('submission.csv', index=False)

# test accuracy (Kaggle Public Score): 0.8958