In [None]:
# --- Step 1: Import libraries
import pandas as pd
from google.colab import files

# --- Step 2: Manually upload the files
# files.upload() returns a mapping whose keys are used below as the names of
# the uploaded files.
print("📂 Please upload the GDELT tone CSV file")
uploaded_gdelt = files.upload()

print("📂 Please upload the Merged Dataset Excel file")
uploaded_merged = files.upload()

# Fail with a clear message when nothing was uploaded, instead of the opaque
# IndexError that indexing an empty key list would raise.
if not uploaded_gdelt:
    raise ValueError("No GDELT CSV file was uploaded.")
if not uploaded_merged:
    raise ValueError("No Merged Dataset Excel file was uploaded.")

# --- Step 3: Read the uploaded files
# Use the first uploaded name from each prompt (one file is expected per prompt).
gdelt_filename = next(iter(uploaded_gdelt))
merged_filename = next(iter(uploaded_merged))

# Load datasets
gdelt_df = pd.read_csv(gdelt_filename)
merged_df = pd.read_excel(merged_filename)

print("✅ Files loaded successfully!")
print("GDELT shape:", gdelt_df.shape)
print("Merged dataset shape:", merged_df.shape)

# --- Step 4: Inspect columns
print("\n🔹 GDELT columns:", gdelt_df.columns.tolist())
print("🔹 Merged dataset columns:", merged_df.columns.tolist())

# --- Step 5: Convert GDELT 'Month' column to a month-start 'Date' column
# Handles 202501, 2025-01 and full dates such as 2025-01-01.
# The earlier approach glued the first four and last two characters together.
# For a value that carries a day part the last two characters are the day, so
# every month of a year collapsed onto January and the merge below multiplied
# rows (the recorded run grew from 99,630 to 190,755 rows).
month_text = (
    gdelt_df['Month']
    .astype(str)
    .str.strip()
    .str.replace(r"\.0$", "", regex=True)  # 202501.0 -> 202501 after a float upcast
)
is_yyyymm = month_text.str.fullmatch(r"\d{6}")

# Compact YYYYMM needs an explicit format; everything else goes through the
# general parser. Values outside each group are masked to NaN, which parses to NaT.
compact_dates = pd.to_datetime(month_text.where(is_yyyymm), format="%Y%m", errors="coerce")
other_dates = pd.to_datetime(month_text.where(~is_yyyymm), errors="coerce")

# Snap to the first day of the month so the key lines up with monthly data.
gdelt_df['Date'] = compact_dates.fillna(other_dates).dt.to_period("M").dt.to_timestamp()

# Rows whose month could not be parsed have no usable key; NaT keys would also
# match NaT keys on the other side of the merge, so drop them explicitly.
unparsed_mask = gdelt_df['Date'].isna()
if unparsed_mask.any():
    print(f"Warning: dropping {int(unparsed_mask.sum())} GDELT row(s) with an unparseable 'Month'.")
    gdelt_df = gdelt_df.loc[~unparsed_mask]

# Drop the old 'Month' column to avoid confusion
gdelt_df = gdelt_df.drop(columns=['Month'])

# --- Step 6: Convert 'Date' column in merged dataset
# NOTE(review): the uploaded workbook's name mentions "ddmmyyyy". If 'Date'
# arrives as day-first text rather than real Excel dates, dayfirst=True is
# needed here - confirm against the file.
merged_df['Date'] = pd.to_datetime(merged_df['Date'], errors='coerce')

# --- Step 7: Merge datasets on 'Date'
# validate="many_to_one" raises a MergeError if GDELT has more than one row
# per Date, instead of silently duplicating rows of the merged dataset.
final_df = pd.merge(merged_df, gdelt_df, on='Date', how='left', validate="many_to_one")
assert len(final_df) == len(merged_df), "Left merge must not change the row count."

# --- Step 8: Save merged output
output_path = "Merged_with_GDELT.csv"
final_df.to_csv(output_path, index=False)

print(f"\n✅ Merge complete! File saved as: {output_path}")
print("Final merged shape:", final_df.shape)

# --- Step 9: Preview result
final_df.head()


📂 Please upload the GDELT tone CSV file


Saving gdelt_tone_2005_2025_stitched_ECONxGKG_UPDATED.csv to gdelt_tone_2005_2025_stitched_ECONxGKG_UPDATED.csv
📂 Please upload the Merged Dataset Excel file


Saving HPI_regional_merged_2005_2025_ddmmyyyy_MODIFIED.xlsx to HPI_regional_merged_2005_2025_ddmmyyyy_MODIFIED.xlsx
✅ Files loaded successfully!
GDELT shape: (246, 4)
Merged dataset shape: (99630, 20)

🔹 GDELT columns: ['Month', 'AvgTone_Stitched', 'Docs_Stitched', 'Source']
🔹 Merged dataset columns: ['Date', 'RegionName', 'AreaCode', 'AveragePrice', 'Index', 'SalesVolume', 'AWE_Total', 'AWE_Regular', 'UnemploymentRate', 'CPI', 'MortgageApprovals', 'MortgageRate_2YFix', 'BankRate', 'ConsumerConfidence', 'BM_New Housing', 'gt_trend_buying_demand', 'gt_trend_economic_policy', 'gt_trend_market_awareness', 'gt_trend_mortgage_financing', 'gt_trend_renting_affordability']

✅ Merge complete! File saved as: Merged_with_GDELT.csv
Final merged shape: (190755, 23)


Unnamed: 0,Date,RegionName,AreaCode,AveragePrice,Index,SalesVolume,AWE_Total,AWE_Regular,UnemploymentRate,CPI,...,ConsumerConfidence,BM_New Housing,gt_trend_buying_demand,gt_trend_economic_policy,gt_trend_market_awareness,gt_trend_mortgage_financing,gt_trend_renting_affordability,AvgTone_Stitched,Docs_Stitched,Source
0,2005-01-01,Aberdeenshire,S12000034,105489,51.3,400,379.071339,353.799926,4.8,1.7,...,0.043352,70.1,-0.85231,-0.148716,-0.980873,-0.918624,-1.761248,5.659953,3443.0,Events_v1_0_ECON
1,2005-01-01,Aberdeenshire,S12000034,105489,51.3,400,379.071339,353.799926,4.8,1.7,...,0.043352,70.1,-0.85231,-0.148716,-0.980873,-0.918624,-1.761248,5.84982,4481.0,Events_v1_0_ECON
2,2005-01-01,Aberdeenshire,S12000034,105489,51.3,400,379.071339,353.799926,4.8,1.7,...,0.043352,70.1,-0.85231,-0.148716,-0.980873,-0.918624,-1.761248,6.084276,3601.0,Events_v1_0_ECON
3,2005-01-01,Aberdeenshire,S12000034,105489,51.3,400,379.071339,353.799926,4.8,1.7,...,0.043352,70.1,-0.85231,-0.148716,-0.980873,-0.918624,-1.761248,6.143649,2613.0,Events_v1_0_ECON
4,2005-01-01,Aberdeenshire,S12000034,105489,51.3,400,379.071339,353.799926,4.8,1.7,...,0.043352,70.1,-0.85231,-0.148716,-0.980873,-0.918624,-1.761248,5.747168,2777.0,Events_v1_0_ECON


In [None]:
# ==== Robust Colab merge: GDELT Month -> Date (month start), merge left on Date ====
import pandas as pd
from google.colab import files

# Ask for both input files. files.upload() returns a mapping whose keys are
# used below as the names of the uploaded files.
print("📂 Upload GDELT tone CSV (e.g., gdelt_tone_2005_2025_stitched_ECONxGKG_UPDATED.csv)")
up_gdelt = files.upload()
print("📂 Upload Merged Dataset Excel (e.g., Merged dataset.xlsx)")
up_merged = files.upload()

# Fail with a clear message when nothing was uploaded, instead of the opaque
# IndexError that indexing an empty key list would raise.
if not up_gdelt:
    raise ValueError("No GDELT tone CSV was uploaded.")
if not up_merged:
    raise ValueError("No Merged Dataset Excel file was uploaded.")

# Use the first uploaded name from each prompt (one file is expected per prompt).
gdelt_file = next(iter(up_gdelt))
merged_file = next(iter(up_merged))

gdelt = pd.read_csv(gdelt_file)
merged = pd.read_excel(merged_file)

def to_month_start(series):
    """Convert a column of month-like values to month-start timestamps.

    Compact ``YYYYMM`` values (``202501``, ``"202501"``, or ``202501.0`` after
    a float upcast) are recognised first and parsed with an explicit format.
    Everything else is handed to pandas' general date parser.

    Parameters
    ----------
    series : pandas.Series
        Values to convert (numbers, strings, or datetimes).

    Returns
    -------
    pandas.Series
        Datetime values on the first day of their month, with the same index
        as ``series``. Entries that cannot be parsed are ``NaT``.
    """
    # Text view used only to detect the YYYYMM pattern.
    text = series.astype(str).str.strip().str.replace(r"\.0$", "", regex=True)
    is_yyyymm = text.str.fullmatch(r"\d{6}")

    # YYYYMM must be matched BEFORE the general parser runs: to_datetime reads
    # a bare integer as an epoch offset (a 1970 date, not NaT), and a six-digit
    # string can be read as YYMMDD, so a NaT-only fallback would never fire.
    compact = pd.to_datetime(text.where(is_yyyymm), format="%Y%m", errors="coerce")

    if pd.api.types.is_numeric_dtype(series):
        # Remaining numbers (e.g. 20250131) are parsed from their text form so
        # they are not interpreted as epoch offsets either.
        remainder = text.where(~is_yyyymm)
    else:
        # Keep original objects (strings, datetimes) for the general parser.
        remainder = series.where(~is_yyyymm)
    general = pd.to_datetime(remainder, errors="coerce")

    parsed = compact.fillna(general)
    return parsed.dt.to_period("M").dt.to_timestamp()  # month start

# Expect: GDELT has 'Month', merged has 'Date'
# Column labels are passed through str() so the message can be built even when
# a label is not a string.
if "Month" not in gdelt.columns:
    raise ValueError("GDELT CSV must contain a 'Month' column. Found: " + ", ".join(map(str, gdelt.columns)))
if "Date" not in merged.columns:
    raise ValueError("Merged dataset must contain a 'Date' column. Found: " + ", ".join(map(str, merged.columns)))

# Put both join keys on the same footing: first day of the month.
gdelt['Date'] = to_month_start(gdelt['Month'])
merged['Date'] = to_month_start(merged['Date'])

# GDELT rows without a usable date cannot be matched meaningfully; left in,
# their NaT key would pair up with NaT keys in the merged dataset.
gdelt_unparsed = gdelt['Date'].isna()
if gdelt_unparsed.any():
    print(f"Warning: dropping {int(gdelt_unparsed.sum())} GDELT row(s) with an unparseable 'Month'.")
    gdelt = gdelt.loc[~gdelt_unparsed]

# Remove any accidental duplicates in GDELT by Date.
# A stable sort keeps rows with the same Date in file order, so keep='first'
# deterministically means "first occurrence in the CSV".
rows_before_dedup = len(gdelt)
gdelt = gdelt.sort_values('Date', kind='mergesort').drop_duplicates(subset=['Date'], keep='first')
dropped_duplicates = rows_before_dedup - len(gdelt)
if dropped_duplicates:
    # Surface the data loss instead of hiding it.
    print(f"Warning: dropped {dropped_duplicates} GDELT row(s) sharing a Date with an earlier row.")

# Keep only value columns from GDELT (avoid key clashes)
gdelt_value_cols = [c for c in gdelt.columns if c not in ['Month', 'Date']]
gdelt_for_merge = gdelt[['Date'] + gdelt_value_cols].copy()

# Left merge; validate makes pandas itself reject a non-unique right-hand key.
final = merged.merge(gdelt_for_merge, on='Date', how='left', validate='many_to_one')

# Sanity checks
assert len(final) == len(merged), "Row count changed — check keys/duplicates."
assert merged['Date'].value_counts().sort_index().equals(final['Date'].value_counts().sort_index()), \
       "Per-date row counts changed — check Date normalization."

out = "Master file.csv"
final.to_csv(out, index=False)
print("✅ Merge complete. Saved:", out)

# Optional: quick null-rate check for GDELT columns
for c in gdelt_value_cols:
    print(f"{c}: null rate = {final[c].isna().mean():.2%}")


📂 Upload GDELT tone CSV (e.g., gdelt_tone_2005_2025_stitched_ECONxGKG_UPDATED.csv)


Saving gdelt_tone_2005_2025_stitched_ECONxGKG_UPDATED.csv to gdelt_tone_2005_2025_stitched_ECONxGKG_UPDATED.csv
📂 Upload Merged Dataset Excel (e.g., Merged dataset.xlsx)


Saving HPI_regional_merged_2005_2025_ddmmyyyy_MODIFIED.xlsx to HPI_regional_merged_2005_2025_ddmmyyyy_MODIFIED.xlsx


  dt = pd.to_datetime(s, errors='coerce', infer_datetime_format=True)
  dt = pd.to_datetime(s, errors='coerce', infer_datetime_format=True)


✅ Merge complete. Saved: Master file.csv
AvgTone_Stitched: null rate = 0.00%
Docs_Stitched: null rate = 0.00%
Source: null rate = 0.00%
