In [1]:
import pandas as pd
import numpy as np

In [2]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder

In [3]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Dropout


In [4]:
import seaborn as sns
import matplotlib.pyplot as plt

In [5]:
df = pd.read_csv("weatherAUS.csv") 

In [6]:
df.head()

Unnamed: 0,Date,Location,MinTemp,MaxTemp,Rainfall,Evaporation,Sunshine,WindGustDir,WindGustSpeed,WindDir9am,...,Humidity9am,Humidity3pm,Pressure9am,Pressure3pm,Cloud9am,Cloud3pm,Temp9am,Temp3pm,RainToday,RainTomorrow
0,2008-12-01,Albury,13.4,22.9,0.6,,,W,44.0,W,...,71.0,22.0,1007.7,1007.1,8.0,,16.9,21.8,No,No
1,2008-12-02,Albury,7.4,25.1,0.0,,,WNW,44.0,NNW,...,44.0,25.0,1010.6,1007.8,,,17.2,24.3,No,No
2,2008-12-03,Albury,12.9,25.7,0.0,,,WSW,46.0,W,...,38.0,30.0,1007.6,1008.7,,2.0,21.0,23.2,No,No
3,2008-12-04,Albury,9.2,28.0,0.0,,,NE,24.0,SE,...,45.0,16.0,1017.6,1012.8,,,18.1,26.5,No,No
4,2008-12-05,Albury,17.5,32.3,1.0,,,W,41.0,ENE,...,82.0,33.0,1010.8,1006.0,7.0,8.0,17.8,29.7,No,No


In [7]:
df.dtypes

Date              object
Location          object
MinTemp          float64
MaxTemp          float64
Rainfall         float64
Evaporation      float64
Sunshine         float64
WindGustDir       object
WindGustSpeed    float64
WindDir9am        object
WindDir3pm        object
WindSpeed9am     float64
WindSpeed3pm     float64
Humidity9am      float64
Humidity3pm      float64
Pressure9am      float64
Pressure3pm      float64
Cloud9am         float64
Cloud3pm         float64
Temp9am          float64
Temp3pm          float64
RainToday         object
RainTomorrow      object
dtype: object

In [8]:
df=df.dropna(subset = ['RainTomorrow'])

In [9]:
X = df[['MinTemp', 'MaxTemp', 'Rainfall', 'Evaporation', 'Sunshine', 'WindGustSpeed', 'WindSpeed9am',
        'WindSpeed3pm', 'Humidity9am', 'Humidity3pm', 'Pressure9am', 'Pressure3pm', 'Cloud9am', 'Cloud3pm',
        'Temp9am', 'Temp3pm','RainToday']]
y = df['RainTomorrow']

In [10]:
print(df.columns)

Index(['Date', 'Location', 'MinTemp', 'MaxTemp', 'Rainfall', 'Evaporation',
       'Sunshine', 'WindGustDir', 'WindGustSpeed', 'WindDir9am', 'WindDir3pm',
       'WindSpeed9am', 'WindSpeed3pm', 'Humidity9am', 'Humidity3pm',
       'Pressure9am', 'Pressure3pm', 'Cloud9am', 'Cloud3pm', 'Temp9am',
       'Temp3pm', 'RainToday', 'RainTomorrow'],
      dtype='object')


In [11]:
df.isnull().sum()

Date                 0
Location             0
MinTemp            637
MaxTemp            322
Rainfall          1406
Evaporation      60843
Sunshine         67816
WindGustDir       9330
WindGustSpeed     9270
WindDir9am       10013
WindDir3pm        3778
WindSpeed9am      1348
WindSpeed3pm      2630
Humidity9am       1774
Humidity3pm       3610
Pressure9am      14014
Pressure3pm      13981
Cloud9am         53657
Cloud3pm         57094
Temp9am            904
Temp3pm           2726
RainToday         1406
RainTomorrow         0
dtype: int64

In [12]:
# Handle missing values and encode categorical variables
X['RainToday'].fillna('No', inplace=True)
X['RainToday'] = LabelEncoder().fit_transform(X['RainToday'])

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X['RainToday'].fillna('No', inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X['RainToday'] = LabelEncoder().fit_transform(X['RainToday'])


In [13]:
# Convert target variable to numeric format
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(y)

In [14]:
scaler = StandardScaler()
X = scaler.fit_transform(X)


In [15]:
sequence_length = 10  
sequences = []
labels = []

for i in range(len(X) - sequence_length):
    sequence = X[i:i + sequence_length]
    target = y[i + sequence_length]
    sequences.append(sequence)
    labels.append(target)

X = np.array(sequences)
y = np.array(labels)

In [16]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [17]:
X_train = X_train.reshape((X_train.shape[0], sequence_length, X_train.shape[2]))
X_test = X_test.reshape((X_test.shape[0], sequence_length, X_test.shape[2]))


In [18]:
model_lstm = Sequential()
model_lstm.add(LSTM(50, input_shape=(X_train.shape[1], X_train.shape[2]), return_sequences=True))

model_lstm.add(LSTM(50, return_sequences=True))

model_lstm.add(LSTM(50))
model_lstm.add(Dense(1, activation='sigmoid'))

In [19]:
model_lstm.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])



In [20]:
model_lstm.fit(X_train, y_train, epochs=10, batch_size=32, validation_split=0.2)



Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.src.callbacks.History at 0x2b1c834fad0>

In [21]:
accuracy = model_lstm.evaluate(X_test, y_test)[1]
print(f'Accuracy on Test Set (LSTM): {accuracy:.4%}')

Accuracy on Test Set (LSTM): 77.5926%


In [22]:
from sklearn.metrics import precision_score, matthews_corrcoef


In [23]:
y_prob_lstm = model_lstm.predict(X_test)
y_pred_lstm = (y_prob_lstm > 0.5).astype(int)



In [26]:
threshold = 0.5
y_pred_lstm = (y_prob_lstm > threshold).astype(int)


In [27]:
from sklearn.metrics import confusion_matrix

# Assuming y_test is the true labels and y_pred_lstm is the predicted labels
y_prob_lstm = model_lstm.predict(X_test)
y_pred_lstm = (y_prob_lstm > 0.5).astype(int)

# Compute confusion matrix
conf_matrix = confusion_matrix(y_test, y_pred_lstm)

print("Confusion Matrix:")
print(conf_matrix)


Confusion Matrix:
[[22065     0]
 [ 6372     0]]


In [29]:
from sklearn.metrics import confusion_matrix, precision_score, matthews_corrcoef

# Assuming you have predictions from your LSTM model stored in y_prob_lstm
y_prob_lstm = model_lstm.predict(X_test)
y_pred_lstm = (y_prob_lstm > 0.5).astype(int)

# Calculate confusion matrix
conf_matrix_lstm = confusion_matrix(y_test, y_pred_lstm)
print("Confusion Matrix:")
print(conf_matrix_lstm)

# Calculate precision
precision_lstm = precision_score(y_test, y_pred_lstm)
print(f'Precision: {precision_lstm:.4f}')

# Calculate MCC
mcc_lstm = matthews_corrcoef(y_test, y_pred_lstm)
print(f'MCC: {mcc_lstm:.4f}')


Confusion Matrix:
[[22065     0]
 [ 6372     0]]
Precision: 0.0000
MCC: 0.0000


  _warn_prf(average, modifier, msg_start, len(result))
