## Initial Data Exploration

In [2]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from scipy import stats
import glob
import os

In [2]:
def load_data(data_dir='../data/raw/'):
    """Load every CSV file in a directory into a dictionary of DataFrames.

    Each key is the part of the file name that precedes the first
    underscore, with the extension removed
    (``wti_prices.csv`` -> ``wti``, ``gdp.csv`` -> ``gdp``).

    Args:
        data_dir: Path to the directory containing CSV files.
            Defaults to '../data/raw/'.

    Returns:
        A dictionary mapping each key to the DataFrame read from that file.
        Files are processed in sorted path order, so when two files share
        a prefix the one that sorts last is the one kept.
    """
    dfs = {}
    # glob returns paths in arbitrary order; sort so the dictionary order
    # (and the winner of any prefix collision) is reproducible.
    for file in sorted(glob.glob(os.path.join(data_dir, '*.csv'))):
        # Drop the extension first so names without an underscore
        # ('gdp.csv') do not end up keyed as 'gdp.csv'.
        stem = os.path.splitext(os.path.basename(file))[0]
        filename = stem.split('_', 1)[0]
        dfs[filename] = pd.read_csv(file)
    return dfs

# Human-readable feature name for each data-file key.
FEATURE_NAMES = {
    'production': 'Crude Production',
    'inventory': 'Crude Inventories',
    'refinery': 'Refinery Capacity',
    'rigs': 'Rigs Count',
    'wti': 'WTI Prices',
    'currency': 'EURO-USD exchange_currency',
    'gdp': 'GDP',
    'inflation': 'Inflation rate',
}


def describe_data(dfs):
    """Print a quick overview of each DataFrame in the input dictionary.

    For every entry this prints the shape, the per-column count of missing
    values, the first three rows and, when the frame has numeric columns,
    their descriptive statistics.

    Args:
        dfs: A dictionary mapping a name to a DataFrame.

    Returns:
        None
    """
    for name, df in dfs.items():
        print(f"\n=== {name} ===")
        print(f"Shape: {df.shape}")
        print("\nMissing Values:")
        print(df.isnull().sum())
        print("\nSample Data:")
        print(df.head(3))

        # describe() is only printed when at least one numeric column exists.
        numeric_cols = df.select_dtypes(include=[np.number]).columns
        if len(numeric_cols) > 0:
            print("\nNumeric Columns Statistics:")
            print(df[numeric_cols].describe())

if __name__ == "__main__":
    # Load every CSV from the default raw-data directory, then print a
    # summary of each resulting DataFrame.
    data = load_data()
    describe_data(data)


=== currency ===
Shape: (6504, 2)

Missing Values:
date     0
value    0
dtype: int64

Sample Data:
         date   value
0  1999-01-04  1.1812
1  1999-01-05  1.1760
2  1999-01-06  1.1636

Numeric Columns Statistics:
             value
count  6504.000000
mean      1.184996
std       0.155897
min       0.827000
25%       1.084800
50%       1.175100
75%       1.303600
max       1.601000

=== gdp ===
Shape: (311, 2)

Missing Values:
date     0
value    0
dtype: int64

Sample Data:
         date    value
0  1947-01-01  243.164
1  1947-04-01  245.968
2  1947-07-01  249.585

Numeric Columns Statistics:
              value
count    311.000000
mean    7379.822006
std     7685.677351
min      243.164000
25%      812.309500
50%     4444.094000
75%    12844.971000
max    29354.321000

=== inflation ===
Shape: (64, 2)

Missing Values:
date     0
value    0
dtype: int64

Sample Data:
         date     value
0  1960-01-01  1.457976
1  1961-01-01  1.070724
2  1962-01-01  1.198773

Numeric Columns St

### Comments

#### Crude Oil production data 

| # | Metric               | Time Frame    | Frequency | Records/Range                 | Average        | Notes                                     |
|---|----------------------|---------------|-----------|-------------------------------|----------------|-------------------------------------------|
| 1 | Crude Production     | 1920-present  | Monthly   | 1,097 - 13,361 units          | ~6,450 units   | No missing values                     |
| 2 | Crude Inventory      | 1982-present  | Weekly    | 247,323 - 540,722 units       | ~345,600 units | Complete records                          |
| 3 | Refinery Capacity    | 1985-present  | Monthly   | 70% - 99.9% (477 rec)         | ~89%           | Clean data                                |
| 4 | Rigs Count           | 1973-present  | Monthly   | 250 - 4,521 rigs (621 rec)    | ~1,350 rigs    | High variability, no missing values       |
| 5 | WTI Prices           | 2005-present  | Daily     | -$37 - $145/barrel (5,000 rec)| ~$72/barrel    | Captures negative prices during COVID-19  |

#### Economic data

| # | Metric         | Time Frame    | Frequency | Records/Range               | Average        | Notes                                                                |
|---|----------------|---------------|-----------|-----------------------------|----------------|----------------------------------------------------------------------|
| 1 | EUR/USD ExRate | 1999-present  | Daily     | 0.83 - 1.60 (6,500+ rec)    | ~1.18          | Clean data, no missing values, stable long-term average              |
| 2 | GDP            | 1947-present  | Quarterly | 243 - 29,354 units          | N/A            | No missing values, consistent data quality, reflects economic growth |
| 3 | Inflation      | 1960-present  | Annual    | -0.36% - 13.55% (64 rec)    | N/A            | No quality issues, captures deflation/high-inflation periods         |