In [17]:
import pandas as pd
import numpy as np
import logging
import os
import pandas as pd
from torch.utils.data import Dataset
import torch
import matplotlib.pyplot as plt
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
from sklearn.model_selection import train_test_split

In [18]:
# Configure the root logger at INFO level so the progress messages logged
# by the classes below are emitted.
logging.basicConfig(level=logging.INFO)

In [19]:
class CustomDataLoader:
    """Reads several CSV files from one directory and stacks them row-wise.

    :param feature_path: Directory that contains the CSV files.
    :param files_path: Iterable of CSV file names located inside ``feature_path``.
    """

    def __init__(self, feature_path, files_path):
        self.feature_path = feature_path
        self.files_path = files_path

    def load_and_split_data(self):
        """Load every configured CSV and concatenate them into one DataFrame.

        Despite its name, this method performs no splitting; it only loads.

        :return: A single DataFrame with a fresh 0..n-1 index.
        :raises Exception: Any error raised while reading or concatenating is
            logged and then re-raised unchanged.
        """
        try:
            # Use the paths given to the constructor rather than whatever
            # happens to be defined at notebook/module level.
            frames = [
                pd.read_csv(os.path.join(self.feature_path, file_name))
                for file_name in self.files_path
            ]
            data = pd.concat(frames, ignore_index=True)
        except Exception as e:
            logging.error(f"Error loading data: {e}")
            raise
        logging.info("Data loaded successfully.")
        return data

In [37]:
class DataPreprocessor:
    """Cleans the raw feature table and aggregates it to one row per grid point and day.

    :param data: DataFrame holding at least the columns ``longitude``,
        ``latitude`` and ``time``.
    """

    def __init__(self, data):
        self.data = data

    def preprocess(self):
        """Run the cleaning pipeline and return the processed frame.

        Steps, in order:
          1. drop the columns ``'g'`` and ``'sst'`` when they exist,
          2. drop every row that contains a NaN,
          3. reduce ``time`` to a calendar date,
          4. average all remaining columns per (longitude, latitude, time),
          5. sort the result by ``time``.

        The frame passed to the constructor is left untouched, so re-running
        the calling cell always starts from the same input; the result is
        stored on ``self.data`` and returned.

        :return: The processed DataFrame.
        :raises KeyError: If the ``time`` column (or a grouping column) is missing.
        """
        frame = self.data

        for column in ('g', 'sst'):
            if column in frame.columns:
                logging.info(f"Drop column '{column}'.")
                # Non-inplace drop: returns a new frame, the caller's stays intact.
                frame = frame.drop(columns=column)

        logging.info("Drop NaN rows.")
        frame = frame.dropna()

        logging.info("Changing time to datetime format.")
        # Keep only the date part so that all readings of one day share a key.
        frame = frame.assign(time=pd.to_datetime(frame['time']).dt.date)

        # One row per grid point and day: mean over that day's readings.
        frame = frame.groupby(['longitude', 'latitude', 'time']).mean().reset_index()
        frame = frame.sort_values(by='time')

        self.data = frame
        logging.info("Data preprocessing completed.")
        return self.data

    def build_sequences(self):
        """Placeholder; not implemented yet (returns ``None``)."""
        pass

In [21]:
class CustomDataset(Dataset):
    """Feature/target pairs held as float32 tensors, usable with a PyTorch DataLoader."""

    def __init__(self, features, targets):
        """
        Custom Dataset compatible with PyTorch DataLoader.
        :param features: Pandas DataFrame or NumPy array containing the features.
        :param targets: Pandas Series or NumPy array containing the targets.
        """
        self.features = torch.tensor(self._to_array(features), dtype=torch.float32)
        self.targets = torch.tensor(self._to_array(targets), dtype=torch.float32)

    @staticmethod
    def _to_array(values):
        """Return the NumPy view of a pandas object, or ``np.asarray`` of anything else."""
        # Only pandas containers carry ``.values``; a plain ndarray does not,
        # which made the documented NumPy input fail before.
        if isinstance(values, (pd.DataFrame, pd.Series)):
            return values.values
        return np.asarray(values)

    def __len__(self):
        """Number of samples, taken from the feature tensor's first dimension."""
        return len(self.features)

    def __getitem__(self, idx):
        """Return the ``(features, target)`` pair stored at position ``idx``."""
        return self.features[idx], self.targets[idx]

In [50]:
class DataSplitter:
    """Splits a feature table into ordered train / validation / test frames.

    :param data: DataFrame containing a ``time`` column and the target column.
    :param target_name: Name of the target column removed from the returned frames.
    :param test_size: Stored on the instance; not read by ``split_data``.
    :param val_size: Stored on the instance; not read by ``split_data``.
    :param batch_size: Stored on the instance; not read by ``split_data``.
    """

    def __init__(self, data, target_name='2t', test_size=0.2, val_size=0.1, batch_size=32):
        self.data = data
        self.target_name = target_name
        self.test_size = test_size
        self.val_size = val_size
        self.batch_size = batch_size

    def split_data(self):
        """Split the features, without shuffling, into 60% / 20% / 20% parts.

        The target column is dropped and is not returned. After the split,
        rows carrying an extreme ``time`` value of a part are removed: the
        last value for train, the first and last for validation, and the last
        for test.

        :return: Tuple ``(X_train, X_val, X_test)`` of DataFrames. They are not
            wrapped in ``CustomDataset`` / ``DataLoader`` objects.
        """
        X = self.data.drop(self.target_name, axis=1)

        # NOTE(review): the fractions are fixed here; self.test_size and
        # self.val_size are ignored. Honouring them would change the default
        # split (0.2 / 0.1), so confirm the intended proportions first.
        X_train, X_holdout = train_test_split(X, test_size=.4, shuffle=False)
        X_val, X_test = train_test_split(X_holdout, test_size=.5, shuffle=False)

        # Presumably the rows arrive sorted by time, so a cut can fall inside
        # one time value and leave it shared by two neighbouring parts.
        # Series.min()/max() finds the extremes without sorting the column.
        X_train = X_train[~X_train.time.isin([X_train.time.max()])]
        X_val = X_val[~X_val.time.isin([X_val.time.min(), X_val.time.max()])]
        # NOTE(review): test drops its last time value, whereas the value it
        # may share with validation would be its first - confirm this is intended.
        X_test = X_test[~X_test.time.isin([X_test.time.max()])]

        return X_train, X_val, X_test

    def sequence_maker(self, data, ts_p=5, ts_f=2, step=1):
        """Placeholder; not implemented yet. Ignores its arguments and returns ``None``."""
        return

In [23]:
def log_combined_data_loader_info(train_loader, val_loader, test_loader):
    """
    Logs combined information about the Train, Validation, and Test DataLoaders in a tabular format,
    including the percentage of each dataset.

    Each loader must support ``len()`` (number of batches) and expose
    ``.dataset`` (supporting ``len()``) and ``.batch_size``.

    :param train_loader: PyTorch DataLoader for the training set.
    :param val_loader: PyTorch DataLoader for the validation set.
    :param test_loader: PyTorch DataLoader for the test set.
    """
    set_names = ("Train", "Validation", "Test")
    loaders = (train_loader, val_loader, test_loader)

    sample_counts = [len(loader.dataset) for loader in loaders]
    total_samples = sum(sample_counts)
    # All-empty datasets would otherwise divide by zero; report 0% instead.
    percentages = [
        (count / total_samples) * 100 if total_samples else 0.0
        for count in sample_counts
    ]

    table_header = f"{'Set':<12}{'Total Batches':<15}{'Batch Size':<12}{'Total Samples':<15}{'Percentage':<10}"
    # str() keeps the layout for integers and also lets a batch_size of None
    # be printed; None does not accept a "<12" format specification.
    table_rows = [
        f"{set_name:<12}{len(loader):<15}{str(loader.batch_size):<12}{count:<15}{percent:.2f}%"
        for set_name, loader, count, percent in zip(set_names, loaders, sample_counts, percentages)
    ]

    table = "\n".join([table_header] + table_rows)

    # Leading newline puts the table on its own lines below the log prefix.
    logging.info("\n" + table)

In [24]:
class ModelBuilder:
    """Collection of model-construction entry points; every one is still a stub."""

    def __init__(self):
        """Create the builder; no state is stored."""
        return None

    def hybrid_conv_lstm(self):
        """Stub - not implemented; returns ``None``."""
        return None

    def SAVP(self):
        """Stub - not implemented; returns ``None``."""
        return None

    def Attention(self):
        """Stub - not implemented; returns ``None``."""
        return None

    def informer(self):
        """Stub - not implemented; returns ``None``."""
        return None

In [25]:
class PredictionVisualizer:
    """Plots model predictions and saves the figures as PNG files.

    :param model: Object exposing a ``predict(features)`` method.
    :param data: Stored on the instance; not read by the plotting methods.
    """

    def __init__(self, model, data):
        self.model = model
        self.data = data

    def visualize_future_forecast(self, future_data, output_path="future_forecast.png"):
        """Plot the model's predictions for ``future_data``.

        :param future_data: Features passed unchanged to ``model.predict``.
        :param output_path: Where the figure is written (defaults to the
            previously hard-coded file name).
        """
        predictions = self.model.predict(future_data)
        self._plot_lines(
            [(predictions, None)],
            title="Future Temperature Forecast",
            output_path=output_path,
        )

    def compare_actual_vs_predicted(self, test_data, target_column='target',
                                    output_path="actual_vs_predicted.png"):
        """Plot the actual target values against the model's predictions.

        :param test_data: DataFrame holding the features plus the target column.
        :param target_column: Name of the target column in ``test_data``. The
            default keeps the original ``'target'``; pass e.g. ``'2t'`` for
            frames that use a different target name.
        :param output_path: Where the figure is written (defaults to the
            previously hard-coded file name).
        :raises KeyError: If ``target_column`` is not a column of ``test_data``.
        """
        actual = test_data[target_column]
        predicted = self.model.predict(test_data.drop(target_column, axis=1))
        self._plot_lines(
            [(actual, 'Actual'), (predicted, 'Predicted')],
            title="Comparison of Actual vs Predicted Temperatures",
            output_path=output_path,
        )

    @staticmethod
    def _plot_lines(lines, title, output_path):
        """Draw each ``(values, label)`` pair on one 10x6 figure, save it, then show it."""
        fig, ax = plt.subplots(figsize=(10, 6))
        for values, label in lines:
            ax.plot(values, label=label)
        ax.set_title(title)
        ax.set_xlabel("Time")
        ax.set_ylabel("Temperature")
        # A legend is only meaningful when at least one line is labelled.
        if any(label is not None for _, label in lines):
            ax.legend()
        # Save before show(): some backends clear the figure once it is shown.
        fig.savefig(output_path)
        plt.show()

In [26]:
class ReportGenerator:
    """Builds a per-sample error report for predictions and writes it as CSV.

    :param actual: Observed values (pandas Series, NumPy array or other array-like).
    :param predicted: Predicted values, same number of elements as ``actual``;
        extra singleton dimensions such as ``(n, 1)`` are flattened.
    """

    def __init__(self, actual, predicted):
        self.actual = actual
        self.predicted = predicted

    def generate_report(self, output_path='report.csv'):
        """Create the report, save it and return it.

        Per-row columns: ``Actual``, ``Predicted``, ``Difference``
        (actual - predicted), ``Error`` (absolute difference) and
        ``Squared Error``. The mean and standard deviation of both inputs are
        repeated on every row.

        :param output_path: Destination CSV path (defaults to ``report.csv``).
        :return: The report DataFrame that was written.
        :raises ValueError: If the inputs hold a different number of elements.
        """
        # Flatten both inputs to plain 1-D arrays so they are paired by
        # position; subtracting an array or Series from a Series with a
        # non-default index could otherwise misalign on index labels.
        actual = np.asarray(self.actual).reshape(-1)
        predicted = np.asarray(self.predicted).reshape(-1)
        if actual.shape != predicted.shape:
            raise ValueError(
                f"actual has {actual.size} values but predicted has {predicted.size}"
            )

        report_df = pd.DataFrame({
            'Actual': actual,
            'Predicted': predicted,
            'Difference': actual - predicted,
        })
        report_df['Error'] = report_df['Difference'].abs()
        report_df['Squared Error'] = report_df['Error'] ** 2

        # Additional statistics. Both are computed through pandas so they use
        # the same definition (sample standard deviation, ddof=1); before, a
        # Series input used ddof=1 while a NumPy input used ddof=0.
        report_df['Mean Actual'] = report_df['Actual'].mean()
        report_df['Mean Predicted'] = report_df['Predicted'].mean()
        report_df['Standard Deviation Actual'] = report_df['Actual'].std()
        report_df['Standard Deviation Predicted'] = report_df['Predicted'].std()

        report_df.to_csv(output_path, index=False)
        logging.info(f"Report generated and saved as {output_path}.")
        return report_df

In [27]:
########################################################################
#            Pipeline execution: load, preprocess, split data            #
########################################################################

In [28]:
# Directory with the exported feature tables, and the parts to read from it.
feature_path = 'features_selection/data'
file_paths = [
    'features_0_19402023.csv',
    'features_1_19402023.csv',
    'features_2_19402023.csv',
    'features_3_19402023.csv',
    'features_4_19402023.csv',
]

# Read all parts and stack them into one DataFrame.
data_loader = CustomDataLoader(feature_path, file_paths)
data = data_loader.load_and_split_data()

INFO:root:Data loaded successfully.


In [38]:
# Preprocess data
# Note: `data` is rebound to the processed frame, so the raw concatenated
# table is no longer reachable under this name after the cell has run.
preprocessor = DataPreprocessor(data)
data = preprocessor.preprocess()

INFO:root:Changing time to datetime format.
INFO:root:Drop NaN rows.
INFO:root:Data preprocessing completed.


In [39]:
# Display the preprocessed frame (the notebook rendering elides the middle rows).
data

Unnamed: 0,longitude,latitude,time,10u,10v,100u,100v,msl,swvl1,geo500,2t
0,125.00,34.00,1940-01-01,0.958458,-4.724854,1.056137,-4.963158,102534.2825,0.000002,54348.3280,4.846588
1303806,128.75,35.00,1940-01-01,-1.111855,-2.787842,-1.359879,-2.984154,102310.1575,0.009356,54162.0780,3.586822
1011834,127.75,37.50,1940-01-01,0.568809,0.364502,1.010239,0.389870,102475.0325,0.408617,53828.4530,-5.895599
785304,127.25,34.75,1940-01-01,1.417442,-2.135498,2.296860,-3.574974,102433.6575,0.255899,54229.3280,-1.397552
558774,126.50,36.25,1940-01-01,0.546349,-2.204346,0.964829,-2.806419,102515.4075,0.167559,54060.9530,-0.891693
...,...,...,...,...,...,...,...,...,...,...,...
1409519,129.00,35.75,2023-11-19,2.160927,0.891113,4.760422,1.317146,101808.1875,0.328074,55056.3665,6.446319
382583,126.00,35.75,2023-11-19,5.438270,-2.354004,6.004074,-2.719963,101930.0625,0.000002,55247.3665,12.827179
1414553,129.00,36.00,2023-11-19,2.747841,0.166504,5.965500,0.071053,101785.8125,0.334719,54967.3665,7.259796
1384349,129.00,34.50,2023-11-19,9.151161,-1.266602,10.230636,-1.435783,101907.3125,0.000002,55382.2415,13.353546


In [51]:
# Unshuffled split of the preprocessed features into three DataFrames.
splitter = DataSplitter(data, batch_size=32)
train, validation, test = splitter.split_data()
# Display the training frame.
train
# NOTE(review): split_data returns DataFrames, which have no `.dataset` /
# `.batch_size`, so the DataLoader summary below cannot be called on them as-is.
#log_combined_data_loader_info(train, validation, test)

Unnamed: 0,longitude,latitude,time,10u,10v,100u,100v,msl,swvl1,geo500
0,125.00,34.00,1940-01-01,0.958458,-4.724854,1.056137,-4.963158,102534.2825,0.000002,54348.3280
1303806,128.75,35.00,1940-01-01,-1.111855,-2.787842,-1.359879,-2.984154,102310.1575,0.009356,54162.0780
1011834,127.75,37.50,1940-01-01,0.568809,0.364502,1.010239,0.389870,102475.0325,0.408617,53828.4530
785304,127.25,34.75,1940-01-01,1.417442,-2.135498,2.296860,-3.574974,102433.6575,0.255899,54229.3280
558774,126.50,36.25,1940-01-01,0.546349,-2.204346,0.964829,-2.806419,102515.4075,0.167559,54060.9530
...,...,...,...,...,...,...,...,...,...,...
924241,127.50,37.25,1990-04-25,1.943413,3.971336,3.450066,6.411087,101481.5300,0.443538,54246.0625
1694443,129.75,37.25,1990-04-25,4.099663,8.636864,5.260124,10.263626,101328.6550,-0.000004,54350.5625
657439,126.75,36.75,1990-04-25,2.404839,5.795555,4.023796,9.251907,101487.5300,0.439220,54382.0625
1135669,128.25,35.00,1990-04-25,2.674370,2.579735,4.304069,4.209915,101803.7800,0.374164,55000.6875


In [32]:
# Instantiate the model builder. No model is built or trained here:
# every ModelBuilder method is currently an empty stub.
builder = ModelBuilder()


# Visualization and report generation (optional)
# These steps would require additional data or modifications
# depending on your specific use case and available data

# Example usage (modify as needed):
# visualizer = PredictionVisualizer(model, processed_data)
# future_data = ... # Load or create your future data for prediction
# visualizer.visualize_future_forecast(future_data)
# test_data = ... # Subset of processed_data or separate test data
# visualizer.compare_actual_vs_predicted(test_data)

# Generate report (modify as needed):
# actual_values = ... # Actual values from your dataset
# predicted_values = model.predict(...) # Predictions from your model
# report_generator = ReportGenerator(actual_values, predicted_values)
# report_generator.generate_report()