# EDA 
Organizing the station anomaly data and the satellite maps into a single master data sheet.

In [34]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import os
import xarray as xr
import numpy as np
import netCDF4

In [22]:
# Strip the coordinate columns from every station CSV.
# NOTE: each file is written back to its own path, so the files on disk
# are modified in place.
folder_path = 'dataset/Training_Anomalies_Station Data'

for file_name in os.listdir(folder_path):
    if not file_name.endswith('.csv'):
        continue  # only CSV files are processed

    file_path = os.path.join(folder_path, file_name)

    # errors='ignore' turns the drop into a no-op for files that do not
    # have these columns
    df = pd.read_csv(file_path).drop(
        columns=['longitude', 'latitude'], errors='ignore'
    )

    df.to_csv(file_path, index=False)

In [51]:
# Build the station anomaly table: one row per 't' value, one column per
# 'location' value, cells holding the 'anomaly' value.
folder_path = 'dataset/Training_Anomalies_Station Data'

# Load every CSV in the folder, keeping only the three columns used below
# (a file missing any of them raises a KeyError).
all_data = [
    pd.read_csv(os.path.join(folder_path, csv_name))[['t', 'anomaly', 'location']]
    for csv_name in os.listdir(folder_path)
    if csv_name.endswith('.csv')
]

# Stack the per-file frames into one long table
combined_df = pd.concat(all_data)

# Average repeated (t, location) rows so each pair is unique before pivoting
pair_means = combined_df.groupby(['t', 'location']).mean()
combined_df = pair_means.reset_index()

# Wide layout: index is 't', one column per location
result = combined_df.pivot(index='t', columns='location', values='anomaly')

# Name the index "Date" so it is written under that header
result = result.rename_axis(index='Date')

result.to_csv('Station_Anomaly.csv')


# Read the table back from disk; "Date" comes back as a regular column
Station_Anomaly = pd.read_csv('Station_Anomaly.csv')
Station_Anomaly.head()

Unnamed: 0,Date,Atlantic City,Baltimore,Eastport,Fort Pulaski,Lewes,New London,Newport,Portland,Sandy Hook,Sewells Point,The Battery,Washington
0,1993-01-01,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,1993-01-02,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,1993-01-03,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,1993-01-04,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,1993-01-05,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [46]:
# Build Map_df: one row per .nc file, holding the date parsed from the file
# name and the first entry of that file's 'sla' variable.
directory = "dataset/Copernicus_ENA_Satelite_Maps_Training_Data"
data = []

# os.listdir returns names in arbitrary order; sort so the row order of
# Map_df is the same on every run.
for filename in sorted(os.listdir(directory)):
    if not filename.endswith(".nc"):
        continue

    # The date is the third underscore-separated token of the file name and
    # must be exactly 8 characters long; files that do not match are skipped
    # (including names with fewer than three tokens, which previously raised
    # an IndexError).
    name_parts = filename.split("_")
    if len(name_parts) < 3 or len(name_parts[2]) != 8:
        continue
    date_str = name_parts[2]
    formatted_date = f"{date_str[:4]}-{date_str[4:6]}-{date_str[6:8]}"

    file_path = os.path.join(directory, filename)

    # Open the file in a context manager so its handle is released as soon
    # as the data has been read, instead of leaking one handle per file.
    with netCDF4.Dataset(file_path, mode="r") as dataset:
        # Read the whole 'sla' variable into memory and keep its first entry
        sla = dataset.variables["sla"][:][0]

    data.append({'Date': formatted_date, 'Map': sla})

# Explicit columns keep 'Date' and 'Map' present even when no file matched
Map_df = pd.DataFrame(data, columns=['Date', 'Map'])
Map_df.head()

Unnamed: 0,Date,Map
0,1993-01-01,"[[--, --, --, --, --, --, --, --, 0.0713, 0.07..."
1,1993-01-02,"[[--, --, --, --, --, --, --, --, 0.0727, 0.07..."
2,1993-01-03,"[[--, --, --, --, --, --, --, --, 0.0743, 0.07..."
3,1993-01-04,"[[--, --, --, --, --, --, --, --, 0.0759000000..."
4,1993-01-05,"[[--, --, --, --, --, --, --, --, 0.0763, 0.07..."


In [54]:
# Join the station table with the maps on their shared 'Date' column
# (default merge, so only dates present in both frames are kept).
map_columns = Map_df[['Date', 'Map']]
Raw_Master_df = pd.merge(Station_Anomaly, map_columns, on='Date')
print(Raw_Master_df.shape)
Raw_Master_df.head()

(7302, 14)


Unnamed: 0,Date,Atlantic City,Baltimore,Eastport,Fort Pulaski,Lewes,New London,Newport,Portland,Sandy Hook,Sewells Point,The Battery,Washington,Map
0,1993-01-01,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,"[[--, --, --, --, --, --, --, --, 0.0713, 0.07..."
1,1993-01-02,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,"[[--, --, --, --, --, --, --, --, 0.0727, 0.07..."
2,1993-01-03,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,"[[--, --, --, --, --, --, --, --, 0.0743, 0.07..."
3,1993-01-04,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,"[[--, --, --, --, --, --, --, --, 0.0759000000..."
4,1993-01-05,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,"[[--, --, --, --, --, --, --, --, 0.0763, 0.07..."


In [60]:
# Single-station subset: date, that station's anomaly column, and the map.
# NOTE(review): the variable is named Atlantic_City but the selected station
# column is 'Eastport' -- confirm which station is intended.
Atlantic_City = Raw_Master_df[['Date', 'Eastport', 'Map']]

# dropna() returns a new frame rather than modifying in place; the result
# must be assigned, otherwise rows with missing values are never removed.
Atlantic_City = Atlantic_City.dropna()
print(Atlantic_City.shape)
Atlantic_City.head()

(7302, 3)


Unnamed: 0,Date,Eastport,Map
0,1993-01-01,0.0,"[[--, --, --, --, --, --, --, --, 0.0713, 0.07..."
1,1993-01-02,0.0,"[[--, --, --, --, --, --, --, --, 0.0727, 0.07..."
2,1993-01-03,0.0,"[[--, --, --, --, --, --, --, --, 0.0743, 0.07..."
3,1993-01-04,0.0,"[[--, --, --, --, --, --, --, --, 0.0759000000..."
4,1993-01-05,0.0,"[[--, --, --, --, --, --, --, --, 0.0763, 0.07..."
