In [6]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from keras.models import Sequential
from keras.layers import LSTM, Dense, Dropout

# Load dataset
data = pd.read_csv('AirQuality.csv', delimiter=';', skipinitialspace=True)

# Data cleaning
data.columns = data.columns.str.strip()  # Remove extra spaces
data = data.replace('NaN', pd.NA)  # Treat 'NaN' strings as actual missing values

# Fill missing values with the column mean
data = data.fillna(data.mean())

# Convert Date and Time into datetime format
data['DateTime'] = pd.to_datetime(data['Date'] + ' ' + data['Time'], dayfirst=True)

# Select features and target
features = ['CO(GT)', 'PT08.S1(CO)', 'NMHC(GT)', 'C6H6(GT)', 'PT08.S2(NMHC)', 'NOx(GT)', 'PT08.S3(NOx)', 'T', 'RH', 'AH']
target = 'NO2(GT)'  # Example: predicting NO2 levels

# Ensure the dataset has no missing values after imputation
data = data.dropna(subset=features + [target])

# Feature scaling
scaler = MinMaxScaler(feature_range=(0, 1))
scaled_data = scaler.fit_transform(data[features + [target]])

# Split into features (X) and target (y)
X = scaled_data[:, :-1]  # All columns except the target
y = scaled_data[:, -1]  # Target column

# Reshape the data to 3D for LSTM (samples, timesteps, features)
sequence_length = 10
X_lstm = []
y_lstm = []

for i in range(sequence_length, len(X)):
    X_lstm.append(X[i-sequence_length:i])  # Sequence of features for each prediction
    y_lstm.append(y[i])  # The corresponding target

X_lstm, y_lstm = np.array(X_lstm), np.array(y_lstm)

# Split into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_lstm, y_lstm, test_size=0.2, random_state=42)

# Build the LSTM model
model = Sequential()
model.add(LSTM(units=50, return_sequences=True, input_shape=(X_train.shape[1], X_train.shape[2])))
model.add(Dropout(0.2))
model.add(LSTM(units=50, return_sequences=False))
model.add(Dropout(0.2))
model.add(Dense(units=1))  # Output layer

# Compile the model
model.compile(optimizer='adam', loss='mean_squared_error')

# Train the model and capture training history
history = model.fit(X_train, y_train, epochs=20, batch_size=32, validation_data=(X_test, y_test))

# Make predictions
predictions = model.predict(X_test)

# Inverse scaling to get original values
predictions_rescaled = scaler.inverse_transform(np.concatenate((X_test[:, -1], predictions), axis=1))[:, -1]

# Inverse transform y_test for comparison
y_test_rescaled = scaler.inverse_transform(np.concatenate((X_test[:, -1], y_test.reshape(-1, 1)), axis=1))[:, -1]

# Evaluation: Mean Squared Error
mse = np.mean((predictions_rescaled - y_test_rescaled) ** 2)
print(f'Mean Squared Error: {mse}')

# Plot Training and Validation Loss over Epochs
plt.figure(figsize=(10, 6))
plt.plot(history.history['loss'], label='Training Loss')
plt.plot(history.history['val_loss'], label='Validation Loss')
plt.title('Model Loss Over Epochs')
plt.ylabel('Loss')
plt.xlabel('Epoch')
plt.legend(loc='upper right')
plt.show()

# Plot Predictions vs Actual Values
plt.figure(figsize=(10, 6))
plt.plot(y_test_rescaled, label='Actual Values')
plt.plot(predictions_rescaled, label='Predicted Values')
plt.title('Predictions vs Actual Values')
plt.ylabel(target)
plt.xlabel('Time Step')
plt.legend(loc='upper right')
plt.show()


  data = data.fillna(data.mean())


ParserError: Unknown string format: 10/03/2004 18.00.00