# Data Separation

Separate the prepared data into training data (2023) and testing data (2024-2026).

## Read 2023 Data

In [13]:
import pandas as pd
import geopandas as gpd

In [14]:
# Load the four 2023 feature/target tables plus the SA2 <-> postcode lookup.
# The paths are listed in the same order as the names they are unpacked into.
# NOTE(review): income and population for 2023 are read from the `predict/`
# tree while rental price and crime come from `train/` — confirm this is the
# intended location of the 2023 files.
rental_2023, income_2023, population_2023, crime_2023, sa2_postcode = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../data/curated/train/rental_price/rental_price_2023.csv',
        '../data/curated/predict/income_data/income_data_2023.csv',
        '../data/curated/predict/population_data/population_data_2023.csv',
        '../data/curated/train/crime_data/crime_data_2023.csv',
        '../data/raw/Processed_SA2_Postcode.csv',
    )
)

### Merge Training Data into One Dataframe

In [15]:
# Build the 2023 training table in a single chain so the cell is idempotent
# and never mutates a frame in place.
merged_df = (
    sa2_postcode
    # Left joins keep every SA2/postcode lookup row; rows with no match in a
    # joined table get NaN in that table's columns.
    .merge(rental_2023, on='SA2', how='left')
    .merge(income_2023, on='SA2', how='left')
    .merge(population_2023, on='SA2', how='left')
    .merge(crime_2023, on='SA2', how='left')
    .drop_duplicates()
    # A training row is only kept when price, income and population are all
    # present.
    .dropna(subset=['price', 'avg_income', 'avg_population'])
    # Missing crime values are filled with 0. `.assign` is used instead of
    # `merged_df['avg_crime'].fillna(0, inplace=True)`: an in-place fill on a
    # column selection does not update the parent frame under pandas
    # Copy-on-Write, which would leave the NaNs in the saved file.
    .assign(avg_crime=lambda frame: frame['avg_crime'].fillna(0))
    # De-duplicate again after the fill, as the original cell did.
    .drop_duplicates()
)

merged_df.to_csv('../data/curated/train/train_data_2023.csv', index=False)

merged_df

Unnamed: 0,postcode,SA2,price,avg_income,avg_population,avg_crime
4,3002,206041119,616.000000,53760.677768,11515.772832,15.622642
5,3003,206041127,580.410959,53760.676860,11515.772507,23.313725
8,3005,206041118,624.276316,53760.677881,11515.772873,46.016129
12,3008,206041118,624.276316,53760.677881,11515.772873,46.016129
14,3010,206041124,583.392857,53760.677200,11515.772629,11.370370
...,...,...,...,...,...,...
3070,3990,205031093,455.727273,53875.261563,11556.790845,5.596859
3077,3991,205031093,455.727273,53875.261563,11556.790845,5.596859
3084,3992,205031093,455.727273,53875.261563,11556.790845,5.596859
3091,3995,205031093,455.727273,53875.261563,11556.790845,5.596859


## Merge Testing Data into One for Each Year

In [16]:
def test_data_merge(year, sa2_postcode):

    """
    Merges income, population, and crime data for a specific year with SA2 postal 
    code information to create test data.

    Parameters:
        year : The specific year for which test data is required.
        sa2_postcode : The DataFrame containing SA2 postal code information.

    Returns:
        DataFrame: A merged DataFrame containing test data for the given year.
    """

    income = pd.read_csv(f'../data/curated/predict/income_data/income_data_{year}.csv')
    population = pd.read_csv(f'../data/curated/predict/population_data/population_data_{year}.csv')
    crime = pd.read_csv(f'../data/curated/predict/crime_data/crime_data_{year}.csv')

    merged_df = sa2_postcode.merge(income, on='SA2', how='left')\
        .merge(population, on='SA2', how='left')\
        .merge(crime, on='SA2', how='left')

    mean_income = merged_df['avg_income'].mean()
    mean_population = merged_df['avg_population'].mean()

    merged_df['avg_income'].fillna(mean_income, inplace=True)
    merged_df['avg_population'].fillna(mean_population, inplace=True)
    merged_df['avg_crime'].fillna(0, inplace=True)
        
    merged_df.to_csv(f'../data/curated/predict/rental_price/test_data_{year}.csv', index=False)

    return

In [17]:
# Years for which a test (prediction) feature table is generated.
YEARS = [2024, 2025, 2026]

# Each call writes ../data/curated/predict/rental_price/test_data_{year}.csv.
for year in YEARS:
    test_data_merge(year, sa2_postcode)

In [18]:
# Read the saved 2024 test table back from disk to check what was written.
pd.read_csv('../data/curated/predict/rental_price/test_data_2024.csv')

Unnamed: 0,postcode,SA2,avg_income,avg_population,avg_crime
0,3000,206041122,54922.716263,11391.369215,0.000000
1,3001,206041122,54922.716263,11391.369215,0.000000
2,3002,206041119,55297.693910,11525.601442,10.772384
3,3003,206041127,55297.693002,11525.601117,10.772384
4,3004,206041126,54922.716263,11391.369215,0.000000
...,...,...,...,...,...
716,3990,205031093,55412.277705,11566.619455,10.728726
717,3991,205031093,55412.277705,11566.619455,10.728726
718,3992,205031093,55412.277705,11566.619455,10.728726
719,3995,205031093,55412.277705,11566.619455,10.728726
