In [12]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [13]:
train = pd.read_csv('data/train.csv')
print(train.shape)
train.head()

(2190, 13)


Unnamed: 0,id,day,pressure,maxtemp,temparature,mintemp,dewpoint,humidity,cloud,sunshine,winddirection,windspeed,rainfall
0,0,1,1017.4,21.2,20.6,19.9,19.4,87.0,88.0,1.1,60.0,17.2,1
1,1,2,1019.5,16.2,16.9,15.8,15.4,95.0,91.0,0.0,50.0,21.9,1
2,2,3,1024.1,19.4,16.1,14.6,9.3,75.0,47.0,8.3,70.0,18.1,1
3,3,4,1013.4,18.1,17.8,16.9,16.8,95.0,95.0,0.0,60.0,35.6,1
4,4,5,1021.8,21.3,18.4,15.2,9.6,52.0,45.0,3.6,40.0,24.8,0


In [14]:
test = pd.read_csv('data/test.csv')
print(test.shape)
test.head()

(730, 12)


Unnamed: 0,id,day,pressure,maxtemp,temparature,mintemp,dewpoint,humidity,cloud,sunshine,winddirection,windspeed
0,2190,1,1019.5,17.5,15.8,12.7,14.9,96.0,99.0,0.0,50.0,24.3
1,2191,2,1016.5,17.5,16.5,15.8,15.1,97.0,99.0,0.0,50.0,35.3
2,2192,3,1023.9,11.2,10.4,9.4,8.9,86.0,96.0,0.0,40.0,16.9
3,2193,4,1022.9,20.6,17.3,15.2,9.5,75.0,45.0,7.1,20.0,50.6
4,2194,5,1022.2,16.1,13.8,6.4,4.3,68.0,49.0,9.2,20.0,19.4


In [15]:
RMV_TRAIN = ['rainfall', 'id']
RMV_TEST = ['id']
FEATURES = [col for col in train.columns if col not in RMV_TRAIN]
TRAGET = 'rainfall'
TEST = [col for col in test.columns if col not in RMV_TEST]
print(FEATURES)

['day', 'pressure', 'maxtemp', 'temparature', 'mintemp', 'dewpoint', 'humidity', 'cloud', 'sunshine', 'winddirection', 'windspeed']


In [16]:
train.drop_duplicates(inplace=True)
train.isnull().sum()

id               0
day              0
pressure         0
maxtemp          0
temparature      0
mintemp          0
dewpoint         0
humidity         0
cloud            0
sunshine         0
winddirection    0
windspeed        0
rainfall         0
dtype: int64

In [17]:
train.fillna(0, inplace=True)
test.fillna(0, inplace=True)

In [None]:
from sklearn.preprocessing import StandardScaler
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau
from sklearn.model_selection import train_test_split
from tensorflow.keras.metrics import AUC

In [19]:
scaler = StandardScaler()
X_scaled = scaler.fit_transform(train[FEATURES])
X_test_scaled = scaler.transform(test[TEST])
X_train, X_val, y_train, y_val = train_test_split(X_scaled, train[TRAGET], test_size=0.2, random_state=42)
X_val = X_val.reshape(X_val.shape[0], X_val.shape[1], 1)
X_train = X_train.reshape(X_train.shape[0], X_train.shape[1], 1)
X_test_scaled = X_test_scaled.reshape(X_test_scaled.shape[0], X_test_scaled.shape[1], 1)

In [20]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Conv1D, Flatten, Dense, Dropout, MaxPool1D, BatchNormalization, LeakyReLU
from tensorflow.keras.optimizers import Adam
model = Sequential([
    Conv1D(filters=64, kernel_size=3, activation='relu', input_shape=(X_train.shape[1], X_train.shape[2])),
    Conv1D(filters=32, kernel_size=3, activation='relu'),
    Flatten(),
    Dense(64, activation='relu'),
    Dropout(0.3),
    Dense(32, activation='relu'),
    Dense(1, activation='sigmoid')
])
from tensorflow.keras.optimizers import SGD
optimizer = SGD(
    learning_rate=0.001,
    momentum=0.9,
    decay=1e-6,
)
model.compile(optimizer=optimizer, loss='binary_crossentropy', metrics=[AUC(name='auc')])
early_stopping = EarlyStopping(monitor='val_loss', patience=20, restore_best_weights=True, verbose=1)
reduce_lr = ReduceLROnPlateau(monitor='val_loss', factor=0.1, patience=10, min_lr=1e-6, verbose=1)
history = model.fit(
    X_train, y_train,
    epochs=100,
    batch_size=32,
    validation_data=(X_val, y_val),
    callbacks=[early_stopping, reduce_lr],
    verbose=1
)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 25: ReduceLROnPlateau reducing learning rate to 0.00010000000474974513.
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 35: ReduceLROnPlateau reducing learning rate to 1.0000000474974514e-05.
Epoch 36/100
Epoch 36: early stopping


In [22]:
test_preds = model.predict(X_test_scaled).flatten()
if np.isnan(test_preds).sum() > 0:
    print(f"Found {np.isnan(test_preds).sum()} NaN values in predictions. Fixing them...")
    test_preds = np.nan_to_num(test_preds)
submission = pd.DataFrame({'id': test['id'], 'rainfall': test_preds})
submission.to_csv('data/submission.csv', index=False)



In [23]:
submission.head()

Unnamed: 0,id,rainfall
0,2190,0.977889
1,2191,0.978699
2,2192,0.946014
3,2193,0.163925
4,2194,0.086036
