In [2]:
import pandas as pd
import numpy as np
import logging
import os
import pandas as pd
from torch.utils.data import Dataset
import torch
import matplotlib.pyplot as plt
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
from sklearn.model_selection import train_test_split

In [3]:
# Emit INFO-level records so the progress messages logged by the classes below
# appear in the cell output.
logging.basicConfig(level=logging.INFO)

In [4]:
class CustomDataLoader:
    """Load several CSV files from one directory into a single DataFrame."""

    def __init__(self, feature_path, files_path):
        """
        :param feature_path: Directory that contains the CSV files.
        :param files_path: Iterable of CSV file names, relative to ``feature_path``.
        """
        self.feature_path = feature_path
        self.files_path = files_path

    def load_and_split_data(self):
        """
        Read every configured CSV file and concatenate the rows.

        Despite its name this method performs no splitting; it only loads.

        :return: One DataFrame holding the rows of all files, re-indexed 0..n-1.
        :raises Exception: Any error raised while reading or concatenating is
            logged and then re-raised unchanged.
        """
        try:
            # Use the paths stored on the instance. Reading same-named notebook
            # globals here would ignore the constructor arguments and fail with
            # NameError wherever those globals do not exist.
            frames = [
                pd.read_csv(os.path.join(self.feature_path, file_name))
                for file_name in self.files_path
            ]
            data = pd.concat(frames, ignore_index=True)
            logging.info("Data loaded successfully.")
            return data
        except Exception as e:
            logging.error(f"Error loading data: {e}")
            raise

In [5]:
class DataPreprocessor:
    """Clean the loaded feature table before it is split for training."""

    # Columns that are removed outright whenever they are present.
    _COLUMNS_TO_DROP = ('g', 'sst')

    def __init__(self, data):
        """
        :param data: Pandas DataFrame to clean. It is never modified in place.
        """
        self.data = data

    def preprocess(self):
        """
        Drop unused columns, convert ``time`` to integers and remove NaN rows.

        The work is done on new DataFrame objects, so the frame passed to the
        constructor keeps its original content. ``self.data`` is rebound to the
        processed frame, which is also the return value.

        NOTE(review): call this once per raw frame. A second call would pass the
        already-converted integer ``time`` column through ``pd.to_datetime``
        again and divide it by 10**9 a second time.

        :return: The processed DataFrame.
        """
        frame = self.data

        for column in self._COLUMNS_TO_DROP:
            if column in frame.columns:
                logging.info(f"Drop column '{column}'.")
                # drop() without inplace returns a new frame and leaves the input alone.
                frame = frame.drop(columns=column)

        if 'time' in frame.columns:
            # Only announce the conversion when it actually happens.
            logging.info("Changing time to datetime format.")
            # NOTE(review): the division by 10**9 assumes the parsed datetimes are
            # stored with nanosecond resolution - confirm for the pandas version in use.
            epoch_seconds = pd.to_datetime(frame['time']).astype('int64') // 10**9
            frame = frame.assign(time=epoch_seconds)

        # Drop rows with any NaN values
        logging.info("Drop NaN rows.")
        frame = frame.dropna()

        self.data = frame
        logging.info("Data preprocessing completed.")
        return self.data

In [6]:
class CustomDataset(Dataset):
    def __init__(self, features, targets):
        """
        Custom Dataset compatible with PyTorch DataLoader.

        Both inputs are copied into float32 tensors when the dataset is built.

        :param features: Pandas DataFrame or NumPy array containing the features.
        :param targets: Pandas Series or NumPy array containing the targets.
        """
        # np.asarray handles pandas objects and plain ndarrays alike; `.values`
        # exists only on pandas objects, so NumPy input used to raise AttributeError.
        self.features = torch.tensor(np.asarray(features), dtype=torch.float32)
        self.targets = torch.tensor(np.asarray(targets), dtype=torch.float32)

    def __len__(self):
        """Return the number of samples (rows of the feature tensor)."""
        return len(self.features)

    def __getitem__(self, idx):
        """Return the ``(features, target)`` pair stored at position ``idx``."""
        return self.features[idx], self.targets[idx]

In [7]:
class DataSplitter:
    """Split a feature table into train / validation / test DataLoaders."""

    def __init__(self, data, target_name='2t', test_size=0.2, val_size=0.1, batch_size=32, random_state=42):
        """
        :param data: Pandas DataFrame holding the features and the target column.
        :param target_name: Name of the target column.
        :param test_size: Share of all rows assigned to the test set.
        :param val_size: Share of all rows assigned to the validation set.
        :param batch_size: Batch size used for all three DataLoaders.
        :param random_state: Seed passed to both train_test_split calls.
        """
        self.data = data
        self.target_name = target_name
        self.test_size = test_size
        self.val_size = val_size
        self.batch_size = batch_size
        self.random_state = random_state

    def split_data(self):
        """
        Split the data and wrap each part in a DataLoader.

        :return: Tuple ``(train_loader, val_loader, test_loader)``. Only the
            training loader shuffles; validation and test keep a fixed order.
        """
        features = self.data.drop(columns=self.target_name)
        targets = self.data[self.target_name]

        # First carve off the test set, then split the remainder into train/validation.
        X_train_val, X_test, y_train_val, y_test = train_test_split(
            features, targets, test_size=self.test_size, random_state=self.random_state
        )
        # val_size is a share of the full data, so rescale it to the rows that remain.
        val_size_adjusted = self.val_size / (1 - self.test_size)
        X_train, X_val, y_train, y_val = train_test_split(
            X_train_val, y_train_val, test_size=val_size_adjusted, random_state=self.random_state
        )

        # Creating Dataset objects
        train_dataset = CustomDataset(X_train, y_train)
        val_dataset = CustomDataset(X_val, y_val)
        test_dataset = CustomDataset(X_test, y_test)

        # Creating DataLoader objects. Shuffling only matters for gradient updates,
        # so the two evaluation loaders iterate in a fixed order.
        train_loader = DataLoader(train_dataset, batch_size=self.batch_size, shuffle=True)
        val_loader = DataLoader(val_dataset, batch_size=self.batch_size, shuffle=False)
        test_loader = DataLoader(test_dataset, batch_size=self.batch_size, shuffle=False)

        return train_loader, val_loader, test_loader

In [None]:
def log_combined_data_loader_info(train_loader, val_loader, test_loader):
    """
    Log one table summarising the Train, Validation and Test DataLoaders.

    Each row shows the number of batches, the batch size, the number of samples
    and the share of all samples that the set represents.
    :param train_loader: PyTorch DataLoader for the training set.
    :param val_loader: PyTorch DataLoader for the validation set.
    :param test_loader: PyTorch DataLoader for the test set.
    """
    labelled_loaders = (
        ("Train", train_loader),
        ("Validation", val_loader),
        ("Test", test_loader),
    )

    # Sample counts are needed up front because every percentage uses the grand total.
    sample_counts = [len(loader.dataset) for _, loader in labelled_loaders]
    grand_total = sum(sample_counts)

    lines = [f"{'Set':<12}{'Total Batches':<15}{'Batch Size':<12}{'Total Samples':<15}{'Percentage':<10}"]
    for (label, loader), count in zip(labelled_loaders, sample_counts):
        share = (count / grand_total) * 100
        lines.append(f"{label:<12}{len(loader):<15}{loader.batch_size:<12}{count:<15}{share:.2f}%")

    # A leading newline keeps the table header off the logger's prefix line.
    logging.info("\n" + "\n".join(lines))

In [8]:
class PyTorchModel(nn.Module):
    """Feed-forward regressor: one hidden ReLU layer followed by a single output unit."""

    def __init__(self, input_size, hidden_size=64):
        """
        :param input_size: Number of input features per sample.
        :param hidden_size: Width of the hidden layer (64 by default).
        """
        super().__init__()
        self.fc1 = nn.Linear(input_size, hidden_size)
        self.fc2 = nn.Linear(hidden_size, 1)

    def forward(self, x):
        """
        :param x: Tensor whose last dimension has ``input_size`` entries.
        :return: Tensor with the same leading dimensions and a last dimension of 1.
        """
        hidden = torch.relu(self.fc1(x))
        return self.fc2(hidden)

In [9]:
class ModelBuilder:
    """Create the model together with its optimizer and loss function."""

    def build_model(self, input_size, lr=0.001):
        """
        Build a PyTorchModel with an Adam optimizer and a mean-squared-error loss.

        :param input_size: Number of input features the model expects.
        :param lr: Learning rate handed to Adam (0.001 by default).
        :return: Tuple ``(model, optimizer, criterion)``.
        """
        model = PyTorchModel(input_size)
        optimizer = optim.Adam(model.parameters(), lr=lr)
        criterion = nn.MSELoss()
        logging.info(f"PyTorch model built successfully. Optimizer: {optimizer}, Loss: {criterion}")
        return model, optimizer, criterion

In [10]:
class Trainer:
    """Run the training and evaluation loops for a regression model."""

    def __init__(self, model, optimizer, criterion, train_csv, test_csv, batch_size=32, epochs=10):
        """
        :param model: Callable PyTorch module that maps a feature batch to predictions.
        :param optimizer: Optimizer bound to the model's parameters.
        :param criterion: Loss called as ``criterion(predictions, labels)``.
        :param train_csv: Either a ready-made DataLoader (used as is) or a CSV
            path that is handed to ``CustomWeatherDataset``.
        :param test_csv: Same as ``train_csv``, for the evaluation data.
        :param batch_size: Batch size for loaders built here; ignored for
            DataLoaders that are passed in.
        :param epochs: Number of passes over the training loader.
        """
        self.model = model
        self.optimizer = optimizer
        self.criterion = criterion
        self.epochs = epochs
        self.batch_size = batch_size

        # Resolve datasets and loaders
        self.train_data, self.train_loader = self._resolve_loader(train_csv, shuffle=True)
        self.test_data, self.test_loader = self._resolve_loader(test_csv, shuffle=False)

        logging.info("Data loaders created successfully.")

    def _resolve_loader(self, source, shuffle):
        """Return ``(dataset, loader)`` for an existing DataLoader or a CSV path."""
        if isinstance(source, DataLoader):
            return source.dataset, source
        # NOTE(review): CustomWeatherDataset is not defined in the visible part of
        # this notebook; this branch needs it to be provided elsewhere.
        dataset = CustomWeatherDataset(csv_file=source)
        return dataset, DataLoader(dataset, batch_size=self.batch_size, shuffle=shuffle)

    def _batch_loss(self, features, labels):
        """Run the model on one batch and return the loss tensor."""
        outputs = self.model(features)
        # Remove only the trailing dimension. A bare squeeze() would also drop the
        # batch dimension of a one-sample batch and make the loss broadcast
        # against the labels.
        return self.criterion(outputs.squeeze(-1), labels)

    def train(self):
        """Train for ``self.epochs`` epochs and log the mean batch loss of each epoch."""
        self.model.train()
        for epoch in range(self.epochs):
            running_loss = 0.0
            for features, labels in self.train_loader:
                self.optimizer.zero_grad()
                loss = self._batch_loss(features, labels)
                loss.backward()
                self.optimizer.step()
                running_loss += loss.item()
            average_loss = running_loss / len(self.train_loader)
            logging.info(f"Epoch {epoch+1}/{self.epochs}, Loss: {average_loss:.4f}")

    def evaluate(self):
        """
        Compute the mean batch loss on the test loader without tracking gradients.

        The model is left in evaluation mode afterwards.

        :return: The mean of the per-batch losses as a float.
        """
        self.model.eval()
        total_loss = 0.0
        with torch.no_grad():
            for features, labels in self.test_loader:
                total_loss += self._batch_loss(features, labels).item()
        average_loss = total_loss / len(self.test_loader)
        logging.info(f"Test Loss: {average_loss:.4f}")
        return average_loss

    def visualize_batch(self):
        """Log the shapes of one training batch and plot the features of its first sample."""
        train_features, train_labels = next(iter(self.train_loader))
        logging.info(f"Feature batch shape: {train_features.size()}")
        logging.info(f"Labels batch shape: {train_labels.size()}")
        example_features = train_features[0]
        example_label = train_labels[0]
        plt.plot(example_features.numpy(), label='Features')
        plt.title(f'Example Weather Data Features with Label: {example_label.item()}')
        plt.xlabel('Feature Index')
        plt.ylabel('Feature Value')
        plt.legend()
        plt.show()

In [11]:
class PredictionVisualizer:
    """Plot model predictions, alone or next to the actual values."""

    def __init__(self, model, data):
        """
        :param model: Object with a ``predict`` method, or a PyTorch module that
            is called directly on a float32 tensor.
        :param data: Stored as ``self.data``; not used by the plotting methods.
        """
        self.model = model
        self.data = data

    def _predict(self, features):
        """Return predictions, whether or not the model has a ``predict`` method."""
        if hasattr(self.model, "predict"):
            return self.model.predict(features)

        # PyTorch modules (such as the network built in this notebook) have no
        # predict(); run a gradient-free forward pass instead.
        inputs = torch.tensor(np.asarray(features), dtype=torch.float32)
        was_training = self.model.training
        self.model.eval()
        try:
            with torch.no_grad():
                outputs = self.model(inputs)
        finally:
            # Put the module back into the mode the caller left it in.
            self.model.train(was_training)
        return outputs.squeeze(-1).detach().cpu().numpy()

    def visualize_future_forecast(self, future_data):
        """
        Plot the predictions for ``future_data`` and save them as future_forecast.png.

        :param future_data: Feature table or array passed to the model.
        """
        predictions = self._predict(future_data)
        plt.figure(figsize=(10, 6))
        plt.plot(predictions)
        plt.title("Future Temperature Forecast")
        plt.xlabel("Time")
        plt.ylabel("Temperature")
        plt.savefig("future_forecast.png")
        plt.show()

    def compare_actual_vs_predicted(self, test_data, target_name='target'):
        """
        Plot actual against predicted values and save them as actual_vs_predicted.png.

        :param test_data: DataFrame holding the feature columns and the target column.
        :param target_name: Name of the target column ('target' by default).
        """
        actual = test_data[target_name]
        predicted = self._predict(test_data.drop(target_name, axis=1))
        plt.figure(figsize=(10, 6))
        plt.plot(actual, label='Actual')
        plt.plot(predicted, label='Predicted')
        plt.title("Comparison of Actual vs Predicted Temperatures")
        plt.xlabel("Time")
        plt.ylabel("Temperature")
        plt.legend()
        plt.savefig("actual_vs_predicted.png")
        plt.show()

In [12]:
class ReportGenerator:
    """Build a per-sample error report from actual and predicted values."""

    def __init__(self, actual, predicted):
        """
        :param actual: Actual target values; must support ``-``, ``.mean()`` and ``.std()``.
        :param predicted: Predicted values; must support ``.squeeze()``,
            ``.mean()`` and ``.std()``.
        """
        self.actual = actual
        self.predicted = predicted

    def generate_report(self, output_path='report.csv'):
        """
        Assemble the report table, write it as CSV and return it.

        :param output_path: Destination of the CSV file ('report.csv' by default).
        :return: The report DataFrame that was written.
        """
        predicted_flat = self.predicted.squeeze()
        report_df = pd.DataFrame({
            'Actual': self.actual,
            'Predicted': predicted_flat,
            'Difference': self.actual - predicted_flat
        })
        report_df['Error'] = report_df['Difference'].abs()
        report_df['Squared Error'] = report_df['Error'] ** 2

        # Additional statistics: each is a single value repeated on every row.
        # NOTE(review): pandas .std() defaults to ddof=1 and NumPy .std() to ddof=0,
        # so the two standard deviations differ in definition when `actual` and
        # `predicted` are different container types - confirm this is intended.
        report_df['Mean Actual'] = self.actual.mean()
        report_df['Mean Predicted'] = self.predicted.mean()
        report_df['Standard Deviation Actual'] = self.actual.std()
        report_df['Standard Deviation Predicted'] = self.predicted.std()

        report_df.to_csv(output_path, index=False)
        logging.info(f"Report generated and saved as {output_path}.")
        return report_df

In [13]:
########################################################################
#       Run the pipeline (load -> preprocess -> split -> train)        #
########################################################################

In [14]:
# Directory with the feature CSVs, and the names of the files to read from it.
feature_path = 'features_selection/data'
file_paths = [
    'features_0_19402023.csv',
    'features_1_19402023.csv',
    'features_2_19402023.csv',
    'features_3_19402023.csv',
    'features_4_19402023.csv',
]

# Read all files into one DataFrame (load_and_split_data only loads and
# concatenates; it does not split anything).
data_loader = CustomDataLoader(feature_path, file_paths)
data = data_loader.load_and_split_data()

INFO:root:Data loaded successfully.


In [15]:
# Preprocess data
# NOTE(review): `data` is rebound to the processed frame here, so running this
# cell a second time passes the already-converted integer `time` column through
# DataPreprocessor.preprocess() again. Re-run the loading cell first.
preprocessor = DataPreprocessor(data)
data = preprocessor.preprocess()

INFO:root:Drop column 'g'.
INFO:root:Drop column 'sst'.
INFO:root:Changing time to datetime format.
INFO:root:Drop NaN rows.
INFO:root:Data preprocessing completed.


In [16]:
# Inspect the preprocessed frame (the rendered output shows only the first and last rows).
data

Unnamed: 0,time,latitude,longitude,10u,10v,100u,100v,msl,swvl1,geo500,2t
0,-946771200,38.0,125.00,0.419251,-6.994049,0.539322,-7.767197,102701.690,0.000002,54024.832,-2.504486
1,-946771200,38.0,125.25,0.267883,-5.567291,0.401627,-6.972275,102700.440,0.182467,54007.832,-4.463470
2,-946771200,38.0,125.50,0.320618,-4.622955,0.482681,-6.185165,102706.690,0.229876,53989.582,-6.065033
3,-946771200,38.0,125.75,0.107727,-4.240143,0.275650,-5.673447,102706.190,0.208132,53971.082,-6.861908
4,-946771200,38.0,126.00,0.156555,-3.693268,0.290298,-5.080673,102703.690,0.293429,53953.582,-7.449799
...,...,...,...,...,...,...,...,...,...,...,...
3594271,1700395200,34.0,129.00,10.564636,-3.080109,11.930634,-3.466751,101845.875,0.000005,55508.690,16.389801
3594272,1700395200,34.0,129.25,10.969910,-2.972687,12.307587,-3.304642,101832.625,0.000005,55488.190,16.299957
3594273,1700395200,34.0,129.50,10.767761,-2.702179,12.258759,-3.010696,101818.625,0.000005,55467.940,16.333160
3594274,1700395200,34.0,129.75,11.442566,-2.366242,12.838837,-2.653275,101801.625,0.000005,55448.690,16.514801


In [25]:
# Split into train / validation / test DataLoaders using DataSplitter's default
# shares (20% test, 10% validation), then log a size summary of the three sets.
splitter = DataSplitter(data, batch_size=32)
train, validation, test = splitter.split_data()
log_combined_data_loader_info(train, validation, test)

INFO:root:
Set         Total Batches  Batch Size  Total Samples  Percentage
Train       78625          32          2515992        70.00%
Validation  11233          32          359428         10.00%
Test        22465          32          718856         20.00%


In [None]:
# Build and train the model
# NOTE(review): this cell cannot run on a fresh kernel as written:
#   - `input_size`, `train_data`, `val_data` and `test_data` are not assigned in
#     any visible cell (the split cell produces `train`, `validation`, `test`).
#   - Trainer.__init__ takes (model, optimizer, criterion, train_csv, test_csv,
#     batch_size, epochs); the call below passes six positional arguments plus
#     batch_size=32, which raises TypeError (multiple values for `batch_size`).
builder = ModelBuilder()
model, optimizer, criterion = builder.build_model(input_size)
trainer = Trainer(model, optimizer, criterion, train_data, val_data, test_data, batch_size=32, epochs=10)
trainer.train()
trainer.evaluate()

# Visualization and report generation (optional)
# These steps would require additional data or modifications
# depending on your specific use case and available data

# Example usage (modify as needed):
# visualizer = PredictionVisualizer(model, processed_data)
# future_data = ... # Load or create your future data for prediction
# visualizer.visualize_future_forecast(future_data)
# test_data = ... # Subset of processed_data or separate test data
# visualizer.compare_actual_vs_predicted(test_data)

# Generate report (modify as needed):
# actual_values = ... # Actual values from your dataset
# predicted_values = model.predict(...) # Predictions from your model
# report_generator = ReportGenerator(actual_values, predicted_values)
# report_generator.generate_report()