# LINK TO GITHUB: [Ran+Shany Repo](https://github.com/RyanWri/Afeka_DL_course_labs/tree/main/src/task_2)

**All our code is organized in the repository; the code for Task 2 is in the `src/task_2` directory.**

In [1]:
import asyncio
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from statsmodels.tsa.seasonal import seasonal_decompose
import nest_asyncio

# Allow nested use of asyncio.run()
nest_asyncio.apply()

# 1 Use the dataset from UCI Machine Learning Repository
**We read the data and split it into 10 chunks for faster reading; the chunk files are located in the `src/task_2/data` directory.**
<br>*`asyncio` is Python's asynchronous I/O library; we use it here to load the chunks and reduce processing time.*

# 2. Perform Exploratory Data Analysis (EDA) of the dataset

In [2]:
from eda.eda import load_and_process_chunk

In [3]:
async def process_data_parallel(chunk_files=None):
    """Load every CSV chunk via ``load_and_process_chunk`` and combine them.

    Args:
        chunk_files: Optional iterable of paths to the chunk CSV files. When
            omitted, the two chunk paths originally hard-coded in this
            notebook are used, so existing ``process_data_parallel()`` calls
            behave exactly as before.

    Returns:
        The result of ``pd.concat`` over the processed chunks, in the order
        the paths were given.

    Raises:
        ValueError: If ``chunk_files`` is empty (there is nothing to
            concatenate).
    """
    if chunk_files is None:
        chunk_files = [
            "C:/Afeka/Afeka_DL_course_labs/src/task_2/data/household_power_consumption_0.csv",
            "C:/Afeka/Afeka_DL_course_labs/src/task_2/data/household_power_consumption_207526.csv",
        ]  # Add paths to all chunk files

    # Materialise once so generators are supported and emptiness can be checked.
    chunk_files = list(chunk_files)
    if not chunk_files:
        raise ValueError("chunk_files must contain at least one path")

    # gather() awaits all chunk loaders and returns their results in the same
    # order as the awaitables it was given.
    results = await asyncio.gather(
        *(load_and_process_chunk(path) for path in chunk_files)
    )

    # Concatenate all chunks into a single DataFrame
    return pd.concat(results)

# Run the asynchronous loader to completion and keep the combined result as
# `full_df`, which the later cells of this notebook consume.
# nest_asyncio.apply() in the first cell is what allows asyncio.run() to be
# nested here.
full_df = asyncio.run(process_data_parallel())

**Print basic stats on the data**

In [None]:
def plot_dataframe_stats(df: pd.DataFrame):
    """Print a structural summary and the first rows of ``df``.

    Despite the name, nothing is plotted; all output is text on stdout.

    Args:
        df: DataFrame to summarise.

    Returns:
        None.
    """
    # DataFrame.info() writes its report itself and returns None, so it must
    # not be wrapped in print() -- doing so emits a stray "None" line.
    df.info()
    print(df.head())

# Print the summary and the first rows of the combined dataset.
plot_dataframe_stats(full_df)

## 2.1 Visualize Time Series Trends

In [None]:
def visualize_time_series_trends(df: pd.DataFrame):
    """Draw the ``Global_active_power`` column of ``df`` as a line chart.

    The series is plotted on a single 12x6 figure with axis labels, a title
    and a legend, and the figure is shown immediately.

    Args:
        df: DataFrame containing a ``Global_active_power`` column.
    """
    power_series = df["Global_active_power"]

    _, axis = plt.subplots(figsize=(12, 6))
    axis.plot(power_series, label="Global Active Power")
    axis.set_title("Global Active Power over Time")
    axis.set_xlabel("Time")
    axis.set_ylabel("Global Active Power (kilowatts)")
    axis.legend()
    plt.show()


# Plot Global_active_power for the full dataset.
visualize_time_series_trends(full_df)

## 2.2 Check for Seasonality and Cyclical Patterns

In [None]:
def check_seasonality_and_cyclical_patterns(
    df: pd.DataFrame,
    column: str = "Global_active_power",
    period: int = 24 * 60,
):
    """Decompose one column additively and plot the four components.

    The observed, trend, seasonal and residual series returned by
    ``seasonal_decompose`` are drawn as four stacked panels on one figure.

    Args:
        df: DataFrame containing ``column``.
        column: Name of the column to decompose. Defaults to
            ``"Global_active_power"``, the column the original code used.
        period: Number of observations per seasonal cycle. The default of
            ``24 * 60`` (1440) equals one day when there is one row per
            minute; pass a different value for data at another resolution
            (e.g. 720 for 2-minute rows).
    """
    # NOTE(review): dropna() removes rows, so if values are missing the
    # remaining rows may no longer be evenly spaced and `period` rows may not
    # span exactly one cycle -- confirm this is acceptable for the analysis.
    series = df[column].dropna()
    decomposition = seasonal_decompose(series, model="additive", period=period)

    # One (legend label, component) pair per panel, top to bottom.
    panels = (
        ("Observed", decomposition.observed),
        ("Trend", decomposition.trend),
        ("Seasonal", decomposition.seasonal),
        ("Residual", decomposition.resid),
    )

    plt.figure(figsize=(12, 8))
    for row, (label, component) in enumerate(panels, start=1):
        plt.subplot(len(panels), 1, row)
        plt.plot(component, label=label)
        plt.legend(loc="upper right")
    plt.show()


# Decompose and plot Global_active_power for the full dataset.
check_seasonality_and_cyclical_patterns(full_df)

## 2.3 Analyze Distribution of Power Consumption

In [None]:
def analyze_distribution_of_power_consumption(df: pd.DataFrame):
    """Show the distribution of ``Global_active_power`` as two figures.

    The first figure is a 50-bin histogram and the second a boxplot; each is
    12x6 and is shown before the next one is created.

    Args:
        df: DataFrame containing a ``Global_active_power`` column.
    """
    power_label = "Global Active Power (kilowatts)"

    # Figure 1: histogram of the values.
    _, hist_axis = plt.subplots(figsize=(12, 6))
    df["Global_active_power"].hist(bins=50, ax=hist_axis)
    hist_axis.set_xlabel(power_label)
    hist_axis.set_ylabel("Frequency")
    hist_axis.set_title("Distribution of Global Active Power")
    plt.show()

    # Figure 2: boxplot of the same column.
    _, box_axis = plt.subplots(figsize=(12, 6))
    df.boxplot(column="Global_active_power", ax=box_axis)
    box_axis.set_ylabel(power_label)
    box_axis.set_title("Boxplot of Global Active Power")
    plt.show()

# Plot the histogram and boxplot of Global_active_power for the full dataset.
analyze_distribution_of_power_consumption(full_df)

# 3. Implement a linear regression model to predict power consumption for the last three time periods

In [None]:
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from models.linear_regression import split_data_append_lagged_features, run_linear_regression
from evaluation.model_evaluation import run_model_evaluation

In [None]:
# Build the train/test splits for the linear model from the full dataset.
# NOTE(review): the helper name suggests lagged features are appended to the
# inputs -- confirm in models.linear_regression.
X_train, X_test, y_train, y_test = split_data_append_lagged_features(full_df)
# Obtain the regression model from the project helper; the returned object is
# used below through its predict() method.
linear_reg = run_linear_regression(X_train, y_train, X_test, y_test)

# Make predictions on the testing data
y_pred_lr = linear_reg.predict(X_test)

# Evaluate the model by comparing the test targets with the predictions.
# The results are printed as JSON in the next cell.
lr_results = run_model_evaluation(y_test, y_pred_lr)

# 4. Evaluate the linear regression model using appropriate metrics

In [None]:
import json
# Print the linear-regression evaluation results as indented JSON.
# NOTE(review): assumes lr_results is JSON-serializable -- confirm against
# run_model_evaluation.
print(json.dumps(lr_results, indent=1))

# 5. Implement a Recurrent Neural Network (RNN) for power consumption prediction

In [None]:
from models.rnn import build_rnn_model
from preprocessing.rnn import preprocess as rnn_preprocess
from preprocessing.rnn import train_test_split_sequence as rnn_split
from evaluation.model_evaluation import run_model_evaluation
import keras

In [None]:
# Sequence length passed to both the RNN preprocessing and the model builder.
# NOTE(review): presumably 60 consecutive one-minute samples (one hour) per
# input sequence -- confirm against preprocessing.rnn.
sequence_length = 60
X, y = rnn_preprocess(full_df, sequence_length)

In [None]:
# Split the sequences into train and test sets, then build the RNN.
# These names overwrite the X_train/X_test/y_train/y_test created for the
# linear regression earlier in the notebook.
X_train, X_test, y_train, y_test = rnn_split(X, y)
rnn_model = build_rnn_model(sequence_length, X_train)

In [None]:
# Stop training once val_loss has failed to improve for 2 consecutive epochs.
# NOTE(review): validation_data below is the test split, so early stopping is
# driven by the same data that is later used to report test metrics --
# consider holding out a separate validation set.
early_stopping = keras.callbacks.EarlyStopping(monitor="val_loss", patience=2)
history = rnn_model.fit(
    X_train,
    y_train,
    epochs=10,
    batch_size=64,
    validation_data=(X_test, y_test),
    callbacks=[early_stopping],
)

In [None]:
# Evaluate the trained RNN on the test split and report the result
loss = rnn_model.evaluate(X_test, y_test)
print(f"Test loss: {loss}")
# Make predictions on the test split; they are scored in the next cell
predictions = rnn_model.predict(X_test)

In [None]:
# Score the RNN predictions with the same evaluation helper used for the
# linear regression, then print the results as indented JSON.
# `json` was imported in the linear-regression results cell above.
rnn_results = run_model_evaluation(y_test, predictions)
print(json.dumps(rnn_results, indent=1))

# 6. Implement Long Short-Term Memory (LSTM) for power consumption prediction

In [None]:
from models.long_short_term_memory import run_lstm_model_e2e

In [None]:
# Run the LSTM helper from models.long_short_term_memory on the full dataset.
# NOTE(review): "e2e" presumably means preprocessing, training and evaluation
# all happen inside the helper -- confirm; its return value is not kept here.
run_lstm_model_e2e(full_df)

# 7. Implement an LSTM model with an Attention layer for power consumption prediction

In [None]:
from models.lstm_with_attention import run_lstm_with_attention_e2e

In [None]:
# Run the LSTM-with-attention helper from models.lstm_with_attention on the
# full dataset; its return value is not kept here.
run_lstm_with_attention_e2e(full_df)

# 8. Data augmentation experiment

In [None]:
# NOTE(review): this import is rooted at `task_2.`, while the other project
# imports in this notebook (e.g. `data_modification.resolution`, `models.rnn`)
# are not -- confirm that both import roots are on sys.path.
from task_2.data_modification.augmentation import add_noise, scale_data, shift_data, window_slicing

In [None]:
# Stack the original sequences with four augmented variants along axis 0 and
# repeat the targets once per variant.
# NOTE(review): this assumes every augmentation returns an array with the
# same shape as X, so that concatenation works and y stays aligned -- confirm,
# especially for window_slicing and shift_data.
X_combined = np.concatenate([X, add_noise(X), scale_data(X), shift_data(X), window_slicing(X)], axis=0)
y_combined = np.concatenate([y, y, y, y, y], axis=0)

In [None]:
# Split the augmented data with the same helper used for the RNN split.
# NOTE(review): augmentation is applied before the split, so augmented copies
# of a sample may end up on both sides depending on how rnn_split partitions
# the data -- verify that the test set is not contaminated.
X_train_augment, X_test_augment, y_train_augment, y_test_augment = rnn_split(X_combined, y_combined)

# 9. Data reduction experiment

In [None]:
# NOTE(review): this import is rooted at `task_2.`, while the other project
# imports in this notebook (e.g. `data_modification.resolution`, `models.rnn`)
# are not -- confirm that both import roots are on sys.path.
from task_2.data_modification.augmentation import reduce_data_randomly

In [None]:
# Apply random reduction to the RNN sequences and their targets.
# NOTE(review): whether reduction_factor=0.1 means "keep 10%" or "drop 10%"
# is defined by reduce_data_randomly -- confirm before interpreting results.
X_reduced, y_reduced = reduce_data_randomly(X, y, reduction_factor=0.1)

# 10. Data resolution experiment

In [None]:
from data_modification.resolution import resample_data_from_1min_to_2min

In [None]:
# Resample the full dataset with the project helper; per its name it converts
# 1-minute data to 2-minute resolution.
resampled_df = resample_data_from_1min_to_2min(full_df)

# 11. Conclusion and insights