In [1]:
import pandas as pd
import re
import joblib

# --- Load the raw dataset ---
# The relative path means the CSV is resolved against the current working
# directory of the process running this script.
print("Loading appartments.csv...")
try:
    df = pd.read_csv('appartments.csv')
except FileNotFoundError:
    print("Error: 'appartments.csv' not found. Make sure it's in the same directory.")
    # Abort with a non-zero status so shells and pipelines can detect the
    # failure. The bare `exit()` helper is injected by the `site` module for
    # interactive sessions and, called without arguments, reports success.
    raise SystemExit(1)

# --- Basic Cleaning (as done in your notebook) ---
# Remove the row labelled 22 when it is present (errors="ignore" turns the
# drop into a no-op otherwise), then renumber the remaining rows so the
# index is continuous again.
# NOTE(review): why row 22 is problematic is not visible here — confirm the
# label still points at the intended record if the CSV is regenerated.
df = df.drop(index=22, errors="ignore").reset_index(drop=True)
print("Initial data cleaning complete.")

# --- Feature Engineering: Create the 'sector' column ---
print("Extracting sector information...")

# "Sector" in any letter case, optionally separated from its number by
# whitespace and/or hyphens; the digits are captured as group 1.
_SECTOR_PATTERN = re.compile(r'sector[\s-]*(\d+)', re.IGNORECASE)


def get_sector(sub_name: object) -> str:
    """
    Extract a normalised sector label (e.g., 'Sector 113') from a
    PropertySubName value.

    The first occurrence of the word "sector" followed by digits is used.
    Matching ignores letter case and tolerates spaces or hyphens between the
    word and the number, so 'sector 45', 'Sector-102' and 'SECTOR  9' are all
    recognised. The result is always formatted as 'Sector <digits>'.

    Args:
        sub_name: The raw PropertySubName value. Non-string values (such as
            None or NaN for missing data) are accepted.

    Returns:
        The label 'Sector <digits>', or 'Unknown' when the input is not a
        string or contains no sector number.
    """
    # Missing values reach this function as non-strings (e.g. float NaN).
    if not isinstance(sub_name, str):
        return 'Unknown'

    match = _SECTOR_PATTERN.search(sub_name)
    if match is None:
        return 'Unknown'

    # NOTE(review): only the digits are kept, so a lettered sub-sector such
    # as 'Sector 37D' is reported as 'Sector 37' — confirm this grouping is
    # what the downstream app expects.
    return f"Sector {match.group(1)}"

# Derive the 'sector' column by running get_sector over every
# PropertySubName value.
sub_names = df['PropertySubName']
df['sector'] = sub_names.apply(get_sector)

sector_count = df['sector'].nunique()
print(f"Successfully identified sectors. Found {sector_count} unique sectors.")

# --- Select and Save the Final DataFrame ---
# Per the original author's note, the Streamlit app only needs these columns;
# everything else is left out to keep the saved file small.
app_columns = ['PropertyName', 'PropertySubName', 'Link', 'sector']
df_final_for_app = df.loc[:, app_columns].copy()

# Serialise the trimmed DataFrame to disk with joblib.
output_filename = 'df_processed.pkl'
joblib.dump(df_final_for_app, output_filename)

print(f"\nSuccessfully created '{output_filename}'.")
print("You can now proceed with the other preprocessing steps or run the Streamlit app.")



Loading appartments.csv...
Initial data cleaning complete.
Extracting sector information...
Successfully identified sectors. Found 56 unique sectors.

Successfully created 'df_processed.pkl'.
You can now proceed with the other preprocessing steps or run the Streamlit app.
