In [4]:
import pandas as pd

# 1. Load the Data
ons_raw = pd.read_csv('ons_retail_data.csv')

# Look at the first 50 unique values in that column to find the pattern
print("Unique date patterns in your file:")
print(ons_raw['years-quarters-months'].unique()[:50])

Unique date patterns in your file:
['2025' '2025-q4' '2025-q3' '2025-q2' '2025-q1' '2025-dec' '2025-nov'
 '2025-oct' '2025-sep' '2025-aug' '2025-jul' '2025-jun' '2025-may'
 '2025-apr' '2025-mar' '2025-feb' '2025-jan' '2024' '2024-q4' '2024-q3'
 '2024-q2' '2024-q1' '2024-dec' '2024-nov' '2024-oct' '2024-sep'
 '2024-aug' '2024-jul' '2024-jun' '2024-may' '2024-apr' '2024-mar'
 '2024-feb' '2024-jan' '2023' '2023-q4' '2023-q3' '2023-q2' '2023-q1'
 '2023-dec' '2023-nov' '2023-oct' '2023-sep' '2023-aug' '2023-jul'
 '2023-jun' '2023-may' '2023-apr' '2023-mar' '2023-feb']


In [5]:
# 2. Setup the columns we identified
date_col = 'years-quarters-months'
value_col = 'v4_1'

# 3. Create a working copy and force numeric values
df = ons_raw.copy()
df['market_index'] = pd.to_numeric(df[value_col], errors='coerce')

# 4. FILTER FOR MONTHLY DATA (e.g., '2010-jan')
# This regex looks for 4 digits, a hyphen, and 3 letters
# We also filter for your specific years (2009, 2010, 2011)
mask = df[date_col].str.contains(r'(2009|2010|2011)-(jan|feb|mar|apr|may|jun|jul|aug|sep|oct|nov|dec)', case=False, na=False)
ons_monthly = df[mask].copy()

# 5. CONVERT TO DATETIME
# Pandas is smart enough to handle '2010-jan' once we point to it
ons_monthly['report_date'] = pd.to_datetime(ons_monthly[date_col], format='%Y-%b', errors='coerce')

# 6. DEDUPLICATE & CLEAN
# The ONS file has 42,000 rows because it breaks data down by region/category.
# We take the mean to get the 'Overall UK Market Index' per month.
ons_final = ons_monthly.dropna(subset=['report_date', 'market_index'])
ons_final = ons_final.groupby('report_date')['market_index'].mean().reset_index()

# 7. Final Check
print("\n--- ONS CLEANING SUCCESS ---")
print(f"Total months found: {len(ons_final)}")
if len(ons_final) > 0:
    print(ons_final.sort_values('report_date').head())
    
# 8. Save for SQL
    ons_final.to_csv('cleaned_ons_market.csv', index=False)
    print("\nFile 'cleaned_ons_market.csv' is ready for MySQL!")
else:
    print("Zero rows found. Check if the years 2009-2011 exist in your specific CSV file.")


--- ONS CLEANING SUCCESS ---
Total months found: 36
  report_date  market_index
0  2009-01-01     88.775758
1  2009-02-01     86.581818
2  2009-03-01     86.210606
3  2009-04-01     86.650000
4  2009-05-01     86.871212

File 'cleaned_ons_market.csv' is ready for MySQL!


  mask = df[date_col].str.contains(r'(2009|2010|2011)-(jan|feb|mar|apr|may|jun|jul|aug|sep|oct|nov|dec)', case=False, na=False)
