In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import mean_squared_error, r2_score
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Dropout

In [2]:
np.random.seed(42)
tf.random.set_seed(42)

In [3]:
data = pd.read_excel('/content/IndiaDataset.xlsx')

In [4]:
print(data.head())

          From Date           To Date  Ozone    CO    SO2   NO2   PM10  PM2.5  \
0  01-08-2021 00:00  02-08-2021 00:00  21.85  0.35  10.97  6.65  20.35   5.46   
1  02-08-2021 00:00  03-08-2021 00:00  23.57  0.39  11.88  7.20  24.83   7.07   
2  03-08-2021 00:00  04-08-2021 00:00  22.40  0.35  11.86  8.41  28.16   6.96   
3  04-08-2021 00:00  05-08-2021 00:00  20.71  0.41  11.49  7.60  20.86   5.09   
4  05-08-2021 00:00  06-08-2021 00:00  20.84  0.31  15.08  7.09  26.44   5.91   

            State       City                         Station  
0  Andhra Pradesh  Amaravati  Secretariat, Amaravati - APPCB  
1  Andhra Pradesh  Amaravati  Secretariat, Amaravati - APPCB  
2  Andhra Pradesh  Amaravati  Secretariat, Amaravati - APPCB  
3  Andhra Pradesh  Amaravati  Secretariat, Amaravati - APPCB  
4  Andhra Pradesh  Amaravati  Secretariat, Amaravati - APPCB  


In [5]:
data = data.drop(['City', 'Station', 'To Date'], axis=1)

In [6]:
data['From Date'] = pd.to_datetime(data['From Date'],format='%d-%m-%Y %H:%M')

In [7]:
data.set_index('From Date', inplace=True)

In [8]:
print("\nMissing values in each column:")
print(data.isnull().sum())

# Fill missing values with appropriate method (forward fill in this case)
data = data.fillna(method='ffill')


Missing values in each column:
Ozone    913
CO       989
SO2      367
NO2       99
PM10     128
PM2.5    963
State      0
dtype: int64


  data = data.fillna(method='ffill')


In [9]:
print(data.describe())

               Ozone             CO            NO2           PM10  \
count  176091.000000  176091.000000  176091.000000  176091.000000   
mean       29.637705       0.895521      26.074840     126.715266   
std        21.098632       0.643234      24.258886      97.488740   
min         0.010000       0.000000       0.010000       0.170000   
25%        14.520000       0.480000      10.680000      56.260000   
50%        24.700000       0.740000      19.530000      98.120000   
75%        40.090000       1.130000      33.910000     168.470000   
max       196.950000      11.400000     494.700000    1000.000000   

               PM2.5  
count  176091.000000  
mean       58.384106  
std        54.382946  
min         0.050000  
25%        23.460000  
50%        41.510000  
75%        73.680000  
max       833.800000  


In [10]:
plt.figure(figsize=(15, 10))
for i, column in enumerate(data.columns):
    plt.subplot(len(data.columns), 1, i+1)
    # Convert the column data to numeric, handling errors
    try:
        numeric_data = pd.to_numeric(data[column], errors='coerce')
    except (TypeError, ValueError):
        print(f"Skipping column '{column}' due to non-numeric data.")
        continue  # Skip to the next column
    plt.plot(data.index, numeric_data)
    plt.title(column)
    plt.tight_layout()
plt.savefig('time_series_plot.png')
plt.close()

In [11]:
# Convert 'From Date' to numerical representation before scaling
# This is necessary if 'From Date' is still in the DataFrame
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler

# Assuming 'data' is your DataFrame
# 'From Date' is now the index, access it using data.index
data.index = pd.to_datetime(data.index)  # Ensure 'From Date' (index) is datetime
data['From Date_Timestamp'] = data.index.astype(np.int64) // 10**9  # Convert to Unix timestamp and store in a new column

# Alternatively, drop or exclude non-numeric columns
# Assuming you want to scale only the numeric columns:
numeric_data = data.select_dtypes(include=np.number)  # Select only numeric columns

# Now scale the numeric data:
scaler = MinMaxScaler()
scaled_data = scaler.fit_transform(numeric_data)

In [12]:
look_back = 24  # Number of previous time steps to use as input variables

X = []
y = []
for i in range(look_back, len(scaled_data)):
    X.append(scaled_data[i-look_back:i, :])
    y.append(scaled_data[i, :])  # Predict all features

X = np.array(X)
y = np.array(y)

In [13]:
train_size = int(len(X) * 0.8)
X_train, X_test = X[:train_size], X[train_size:]
y_train, y_test = y[:train_size], y[train_size:]

print(f"\nTraining data shape: X_train={X_train.shape}, y_train={y_train.shape}")
print(f"Testing data shape: X_test={X_test.shape}, y_test={y_test.shape}")


Training data shape: X_train=(140853, 24, 6), y_train=(140853, 6)
Testing data shape: X_test=(35214, 24, 6), y_test=(35214, 6)


In [14]:
model = Sequential()
model.add(LSTM(50, return_sequences=True, input_shape=(look_back, X.shape[2])))
model.add(Dropout(0.2))
model.add(LSTM(50))
model.add(Dropout(0.2))
model.add(Dense(X.shape[2]))  # Output layer with the same number of features

# Compile the model
model.compile(optimizer='adam', loss='mse')

  super().__init__(**kwargs)


In [15]:
print("\nModel summary:")
model.summary()


Model summary:


In [16]:
history = model.fit(
    X_train,
    y_train,
    epochs=20,
    batch_size=32,
    validation_data=(X_test, y_test),
    verbose=1
)

Epoch 1/20
[1m4402/4402[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m116s[0m 25ms/step - loss: 0.0043 - val_loss: 0.0012
Epoch 2/20
[1m4402/4402[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m143s[0m 25ms/step - loss: 0.0018 - val_loss: 0.0012
Epoch 3/20
[1m4402/4402[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m142s[0m 25ms/step - loss: 0.0017 - val_loss: 0.0011
Epoch 4/20
[1m4402/4402[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m140s[0m 25ms/step - loss: 0.0017 - val_loss: 0.0012
Epoch 5/20
[1m4402/4402[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m140s[0m 25ms/step - loss: 0.0017 - val_loss: 0.0011
Epoch 6/20
[1m4402/4402[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m111s[0m 25ms/step - loss: 0.0017 - val_loss: 0.0011
Epoch 7/20
[1m4402/4402[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m141s[0m 25ms/step - loss: 0.0017 - val_loss: 0.0011
Epoch 8/20
[1m4402/4402[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m142s[0m 25ms/step - loss: 0.0017 - val_loss: 0.0011


In [17]:
plt.figure(figsize=(10, 6))
plt.plot(history.history['loss'], label='Training Loss')
plt.plot(history.history['val_loss'], label='Validation Loss')
plt.title('Model Loss')
plt.xlabel('Epochs')
plt.ylabel('Loss')
plt.legend()
plt.savefig('training_history.png')
plt.close()

In [18]:
y_pred = model.predict(X_test)

[1m1101/1101[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 8ms/step


In [19]:
pred_inverse = np.zeros((len(y_pred), X.shape[2]))
test_inverse = np.zeros((len(y_test), X.shape[2]))

In [20]:
pred_inverse[:, :] = y_pred
test_inverse[:, :] = y_test

In [21]:
pred_inverse = scaler.inverse_transform(pred_inverse)
test_inverse = scaler.inverse_transform(test_inverse)

In [22]:
for i in range(pred_inverse.shape[1]): # Iterate over the columns of pred_inverse
    rmse = np.sqrt(mean_squared_error(test_inverse[:, i], pred_inverse[:, i]))
    r2 = r2_score(test_inverse[:, i], pred_inverse[:, i])
    print(f"{data.columns[i]} - RMSE: {rmse:.2f}, R²: {r2:.2f}") # Access column name from data.columns using the loop index i

Ozone - RMSE: 9.14, R²: 0.77
CO - RMSE: 0.32, R²: 0.71
SO2 - RMSE: 9.75, R²: 0.81
NO2 - RMSE: 37.24, R²: 0.81
PM10 - RMSE: 20.29, R²: 0.81
PM2.5 - RMSE: 2365551.87, R²: 0.98


In [23]:
plt.figure(figsize=(15, 10))
for i in range(pred_inverse.shape[1]):  # Iterate over the columns of pred_inverse (which should match test_inverse)
    plt.subplot(pred_inverse.shape[1], 1, i + 1)
    plt.plot(test_inverse[:, i], label='Actual')
    plt.plot(pred_inverse[:, i], label='Predicted')
    # Assuming data.columns has the correct column names, use i to index it
    plt.title(f'{data.columns[i]} - Actual vs Predicted')
    plt.legend()
    plt.tight_layout()
plt.savefig('prediction_results.png')
plt.close()

In [24]:
last_sequence = X_test[-1].reshape(1, look_back, X.shape[2])

In [25]:
next_pred = model.predict(last_sequence)

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 124ms/step


In [26]:
next_pred_inverse = scaler.inverse_transform(next_pred)

In [45]:
number_of_future_steps = 5  # Example: Predict for 10 steps ahead

forecasted_values = []
current_sequence = last_sequence  # Start with the last sequence from testing data

for _ in range(number_of_future_steps):
    next_pred = model.predict(current_sequence)
    forecasted_values.append(next_pred)

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 105ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 40ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 44ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 42ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 43ms/step


In [46]:
print("\nForecasted values for the next time step:")
# Get the columns that were actually scaled (numeric columns)
scaled_columns = data.select_dtypes(include=np.number).columns

# Iterate through the scaled columns and their corresponding predictions
for i, column in enumerate(scaled_columns):
    print(f"{column}: {next_pred_inverse[0, i]:.2f}")


Forecasted values for the next time step:
Ozone: 27.50
CO: 0.36
NO2: 14.56
PM10: 35.13
PM2.5: 13.50
From Date_Timestamp: 1689564544.00


In [None]:
model.save('lstm_aqi_model.h5')
print("\nModel saved as 'lstm_aqi_model.h5'")