## Data Preprocessing and Integration

In [60]:
import pandas as pd

Anomaly detection 

In [61]:
def validate_data(df, df_name):
    """Validate and clean data, keeping first of duplicate pairs"""
    print(f"\nValidating {df_name}...")
    
    # Define duplicate criteria
    if df_name == "Ghana data":
        dup_cols = ['station', 'name', 'date']
    elif df_name == "ICCO data":
        dup_cols = ['date', 'price_usd_per_tonne']
    else:
        dup_cols = ['date']
    
    # Find duplicates
    duplicates = df[df.duplicated(dup_cols, keep=False)]
    
    if not duplicates.empty:
        print("Duplicate pairs:", duplicates.sort_values(dup_cols))
        
        # Keep first of each pair
        df = df.drop_duplicates(dup_cols, keep='first')
        print(f"Now duplicated rows: {len(df[df.duplicated(dup_cols)])}")
        # print(f"Num of rows with 2023-12-15 and price 4272.15: {len(df[(df['date'] == '2023-12-15') & (df['price_usd_per_tonne' == 4272.15)])}")
        # print(f"Num of rows with 2024-01-09 and price 4171.24: {len(df[(df['date'] == '2024-01-09') & (df['price_usd_per_tonne'] == 4171.24)])}")

In [62]:
def check_anomalies(df):
    """Check for data anomalies"""
    print("\nRunning Data Quality Checks:")
    
    # Temperature logical consistency
    try:
        assert (df['max_temp'] >= df['avg_temp']).all(), "Max temp < Avg temp"
        assert (df['avg_temp'] >= df['min_temp']).all(), "Avg temp < Min temp"
        print("✓ Temperature consistency checks passed")
    except AssertionError as e:
        print(f"Temperature anomaly: {e}")
    
    # Precipitation non-negative
    try:
        assert (df['precipitation'] >= 0).all(), "Negative precipitation"
        print("✓ Precipitation non-negativity check passed")
    except AssertionError as e:
        print(f"Precipitation anomaly: {e}")
    
    # Price positive
    try:
        assert (df['price_usd_per_tonne'] > 0).all(), "Non-positive prices"
        print("✓ Price positivity check passed")
    except AssertionError as e:
        print(f"Price anomaly: {e}")

Load and clean Ghana weather data

In [63]:
ghana = pd.read_csv('Ghana_data.csv')
ghana = ghana.rename(columns={
    'STATION': 'station',
    'NAME': 'name',
    'DATE': 'date',
    'PRCP': 'precipitation',
    'TMAX': 'max_temp',
    'TMIN': 'min_temp',
    'TAVG': 'avg_temp',
})
ghana['date'] = pd.to_datetime(ghana['date'])

Load and clean ICCO price data

In [64]:
icco_prices = pd.read_csv(
    'Daily Prices_ICCO.csv',
    thousands=',',
    decimal='.',
    parse_dates=['Date'],
    dayfirst=True
)
icco_prices = icco_prices.rename(columns={
    'Date': 'date',
    'ICCO daily price (US$/tonne)': 'price_usd_per_tonne',
})

Validate data quality

In [65]:
validate_data(ghana, "Ghana data")
validate_data(icco_prices, "ICCO data")


Validating Ghana data...

Validating ICCO data...
Duplicate pairs:           date  price_usd_per_tonne
310 2023-12-15              4272.15
311 2023-12-15              4272.15
295 2024-01-09              4171.24
296 2024-01-09              4171.24
Now duplicated rows: 0


Date coverage analysis

In [66]:
date_coverage = pd.DataFrame({
    'Data Source': ['Ghana', 'ICCO'],
    'Start Date': [ghana['date'].min(), icco_prices['date'].min()],
    'End Date': [ghana['date'].max(), icco_prices['date'].max()],
    'Unique Dates': [ghana['date'].nunique(), icco_prices['date'].nunique()]
})
print("\nDate Coverage Analysis:")
print(f"\n{date_coverage.to_string()}")


Date Coverage Analysis:

  Data Source Start Date   End Date  Unique Dates
0       Ghana 1990-01-01 2024-11-28         10944
1        ICCO 1994-10-03 2025-02-27          7808


Merge the datasets on date

In [67]:
pre_merge_counts = {
    'Ghana': len(ghana),
    'ICCO': len(icco_prices)
}

merged_data = pd.merge(
    ghana,
    icco_prices,
    on='date',
    how='inner',  # Only keep dates with both weather and price data
)

merged_data = merged_data.sort_values('date')
print(f"Pre merge counts: {pre_merge_counts}")
print(f"Merged successfully. Kept {len(merged_data)} possible records")

Pre merge counts: {'Ghana': 53231, 'ICCO': 7812}
Merged successfully. Kept 35318 possible records


Clean station names

In [68]:
all_end_with_GH = merged_data['name'].str.endswith("GH").all()
if all_end_with_GH:
    print("All names end with 'GH' - removing suffix")
    merged_data['name'] = merged_data['name'].str[:-4]
else:
    print("Not all names end with 'GH' - check naming consistency")

All names end with 'GH' - removing suffix


Add temporal features

In [69]:
merged_data['year'] = merged_data['date'].dt.year
merged_data['month'] = merged_data['date'].dt.month
merged_data['day'] = merged_data['date'].dt.day
merged_data['day_of_year'] = merged_data['date'].dt.dayofyear

Reorder columns

In [70]:
merged_data = merged_data[[
    'station', 'name', 'date', 'year', 'month', 'day', 'day_of_year',
    'precipitation', 'max_temp', 'min_temp', 'avg_temp',
    'price_usd_per_tonne'
]]

Handle missing values

In [71]:
def handle_missing(df):
    """Impute missing values with appropriate methods"""
    print("\nHandling missing values...")
    
    # Show only columns with missing values
    missing_before = df.isnull().sum()
    missing_before = missing_before[missing_before > 0]
    print(f"Missing values before handling:\n{missing_before}")

    # Precipitation: 0 for missing (assuming no rain)
    df['precipitation'] = df['precipitation'].fillna(0)
    
    # Temperatures: linear interpolation for missing
    for col in ['max_temp', 'min_temp', 'avg_temp']:
        df[col] = df[col].interpolate(method='linear', limit_direction='both')
        
    # Show missing values after handling
    missing_after = df.isnull().sum()
    missing_after = missing_after[missing_after > 0]
    print(f"Missing values after handling:\n{missing_after}")
    
    return df

In [72]:
ghana_missing = ghana.isnull().sum()
ghana_missing = ghana_missing[ghana_missing > 0]
icco_missing = icco_prices.isnull().sum()
icco_missing = icco_missing[icco_missing > 0]
print(f"\nGhana missing values:\n{ghana_missing}")
print(f"\nICCO missing values:\n{icco_missing}")

merged_data = handle_missing(merged_data)
check_anomalies(merged_data)


Ghana missing values:
precipitation    35442
max_temp         18368
min_temp         19018
dtype: int64

ICCO missing values:
Series([], dtype: int64)

Handling missing values...
Missing values before handling:
precipitation    23752
max_temp         11869
min_temp         12679
dtype: int64
Missing values after handling:
Series([], dtype: int64)

Running Data Quality Checks:
Temperature anomaly: Max temp < Avg temp
✓ Precipitation non-negativity check passed
✓ Price positivity check passed


Save daily data

In [73]:
print(f"\nSaving daily merged data to daily_merged.csv")
merged_data.to_csv('daily_merged.csv', index=False)


Saving daily merged data to daily_merged.csv


## Monthly Aggregation

In [79]:
simplified_cols = [
    'station', 'name', 'date', 'year', 'month',
    'precipitation_sum',
    'max_temp_mean',
    'min_temp_mean',
    'avg_temp_mean',
    'monthly_price_mean',
    'monthly_price_last',
]

# Step 1: Aggregate PRICES by month only (global prices)
monthly_prices = (
    merged_data.groupby(pd.Grouper(key='date', freq='M'))
    ['price_usd_per_tonne'].agg(['mean', 'last', 'std'])
    .reset_index()
    .rename(columns={
        'mean': 'monthly_price_mean',
        'last': 'monthly_price_last',
        'std': 'monthly_price_std'
    })
)

# Step 2: Aggregate WEATHER by month+station
agg_weather = (
    merged_data.groupby([pd.Grouper(key='date', freq='M'), 'station', 'name'])
    .agg({
        'precipitation': ['sum', 'max', 'count'],
        'max_temp': ['mean', 'max'],
        'min_temp': ['mean', 'min'],
        'avg_temp': ['mean', 'std']
    })
    .pipe(lambda df: df.set_axis(
        ['_'.join(col).strip() for col in df.columns], 
        axis=1
    ))
    .reset_index()
)

# Step 3: Merge the monthly prices back into weather data
agg_data = pd.merge(
    agg_weather,
    monthly_prices,
    on='date',
    how='left'
)

# Add year/month columns
agg_data = agg_data.assign(
    year=lambda x: x['date'].dt.year,
    month=lambda x: x['date'].dt.month
)

# [['station', 'name', 'date', 'year', 'month',
#   'precipitation_sum', 'precipitation_max', 'precipitation_count',
#   'max_temp_mean', 'max_temp_max',
#   'min_temp_mean', 'min_temp_min',
#   'avg_temp_mean', 'avg_temp_std',
#   'price_usd_per_tonne_mean', 'price_usd_per_tonne_last', 'price_usd_per_tonne_std']]

# Handle NaN std values (now only for temperature)
std_columns = [col for col in agg_data.columns if '_std' in col]
for col in std_columns:
    agg_data[col] = agg_data[col].fillna(0)

# Verify prices are consistent per month
print("Price consistency check:")
print(agg_data.groupby('date')['monthly_price_mean'].nunique().value_counts())
# Should show all 1's (only 1 unique price per month)

agg_data.to_csv('monthly_merged.csv', index=False)

# ====== TIME-BASED SPLIT ======
def time_series_split(df, test_ratio=0.2):
    """Split DataFrame chronologically into train and test sets"""
    df = df.sort_values('date')  # Ensure chronological order
    split_idx = int(len(df) * (1 - test_ratio))
    return df.iloc[:split_idx], df.iloc[split_idx:]

# Split only the full dataset
train_full, test_full = time_series_split(agg_data)

# Create simplified versions FROM THE SPLIT DATA
train_simple = train_full[simplified_cols]
test_simple = test_full[simplified_cols]

# ====== VERIFICATION ======
def print_date_ranges(df, name):
    min_date = f"{df['date'].min().year}-{df['date'].min().month:02d}"
    max_date = f"{df['date'].max().year}-{df['date'].max().month:02d}"
    print(f"{name}: {min_date} to {max_date} ({len(df)} months)")

print("\nDate Range Verification:")
print_date_ranges(train_full, "Train Full")
print_date_ranges(test_full, "Test Full") 

# ====== SAVING ======
datasets = {
    'train_full.csv': train_full,
    'test_full.csv': test_full,
    'train_simple.csv': train_simple,
    'test_simple.csv': test_simple
}

for filename, data in datasets.items():
    data = data.drop(columns=['date'])
    data.to_csv(filename, index=False)
    print(f"Saved {filename}")

print("\nAll data processed and split successfully.")

Price consistency check:
1    359
Name: monthly_price_mean, dtype: int64

Date Range Verification:
Train Full: 1994-10 to 2020-02 (2296 months)
Test Full: 2020-02 to 2024-11 (575 months)
Saved train_full.csv
Saved test_full.csv
Saved train_simple.csv
Saved test_simple.csv

All data processed and split successfully.
