### This was created by Julia Drygalska

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import os
import boto3


# Scraping
### For this part, I created a bucket in my AWS environment and uploaded the dataset to it for later use. Then, to avoid memory issues, I created a subset containing 30% of the full dataset and saved it locally. The next step is explained in the "cleaning" notebook, since I have to clean the subset before I can use it.

In [None]:
# Step 1: Load CSV from S3 and keep a local 30% subset
#
# The CSV is read from the bucket in a single call, the first 70% of its rows
# are dropped, and the remaining rows are written to ./datasets/ as a new CSV.

# Define the S3 URI for the original CSV dataset
s3_csv_uri = 's3://bucketjdrygalska/price_paid_records.csv'

# Load the CSV file from S3
print("Loading CSV from S3...")
df = pd.read_csv(s3_csv_uri)
print("CSV loaded successfully!")

# Remove the first 70% of rows.
# int() truncates, so a fractional row count is rounded down.
percent_to_remove = 70
rows_to_remove = int(len(df) * (percent_to_remove / 100))
df_subset = df.iloc[rows_to_remove:]
print(f"Removed the first {percent_to_remove}% of rows. Remaining rows: {len(df_subset)}")

# Define the local directory in which the subset is saved
local_path = './datasets/'

# Ensure the local directory exists. exist_ok=True makes this a no-op when the
# directory is already there, so no separate (race-prone) existence check is
# needed beforehand.
os.makedirs(local_path, exist_ok=True)

# Save the subset DataFrame to CSV format (locally); index=False keeps the
# DataFrame index out of the file.
local_csv_path = os.path.join(local_path, 'price_paid_records_subset.csv')
print("Saving subset dataset as CSV locally...")
df_subset.to_csv(local_csv_path, index=False)
print("Subset dataset saved as CSV locally successfully!")


### Once my subset has been cleaned, I split it into two separate CSV files: one for training and one for testing.

In [3]:
# Chronological train/test split of the cleaned dataset:
# transfers up to and including 2015 go to training, later ones to testing.

# Read the cleaned records
cleaned_csv = './datasets/price_paid_records_cleaned.csv'
df = pd.read_csv(cleaned_csv)

# Parse the transfer dates; values that cannot be parsed become NaT
# (errors='coerce') and the corresponding rows are discarded.
df['date_of_transfer'] = pd.to_datetime(df['date_of_transfer'], errors='coerce')
df = df.dropna(subset=['date_of_transfer'])

# Extract the transfer year once and reuse it for both halves of the split
transfer_year = df['date_of_transfer'].dt.year
train_data = df[transfer_year <= 2015].copy()
test_data = df[transfer_year > 2015].copy()

# Output locations for the two splits
train_data_path = 'datasets/train_data.csv'
test_data_path = 'datasets/test_data.csv'

# Write both splits without the DataFrame index
train_data.to_csv(train_data_path, index=False)
test_data.to_csv(test_data_path, index=False)
