In [70]:
import pandas as pd
import numpy as np
import os
from datetime import datetime, timedelta
import re
import ast

In [71]:
# Read the CSV files
listings = pd.read_csv("listings.csv")
listings_scrap = pd.read_csv("listings_scrapy.csv")

# Merge the two dataframes on the 'href' column
merged_df = pd.merge(listings, listings_scrap, on='href', how='inner')

In [72]:
merged_df['main_features'] = merged_df['main_features'].apply(ast.literal_eval)

# Extracting numeric values and converting to integers
merged_df['Floor Area'] = merged_df['main_features'].apply(lambda x: int(re.sub(r'\D', '', x.get('Floor Area', '0'))))
merged_df['Land Area'] = merged_df['main_features'].apply(lambda x: int(re.sub(r'\D', '', x.get('Land Area', '0'))))
merged_df['Rates'] = merged_df['main_features'].apply(lambda x: int(re.sub(r'\D', '', x.get('Rates', '0'))))
merged_df['Levy'] = merged_df['main_features'].apply(lambda x: int(re.sub(r'\D', '', x.get('Levy', '0'))))

In [73]:
merged_df['key_main_features'] = merged_df['key_main_features'].apply(ast.literal_eval)

# Extracting numeric values for Bedrooms and Bathrooms and converting to float
merged_df['Bedrooms'] = merged_df['key_main_features'].apply(lambda x: float(re.search(r'\d+(\.\d+)?', x.get('Bedrooms', '0')).group()))
merged_df['Bathrooms'] = merged_df['key_main_features'].apply(lambda x: float(re.search(r'\d+(\.\d+)?', x.get('Bathrooms', '0')).group()))

In [74]:
merged_df['Province'] = merged_df['href'].apply(lambda x: x.split('/')[4])
merged_df['City'] = merged_df['href'].apply(lambda x: x.split('/')[5])
merged_df['Suburb'] = merged_df['href'].apply(lambda x: x.split('/')[6])
merged_df['Area'] = merged_df['href'].apply(lambda x: x.split('/')[7])
# Extracting the listing number from the URL
merged_df['Listing Number'] = merged_df['href'].apply(lambda x: x.split('/')[-1])

In [75]:
merged_df.head()

Unnamed: 0,title,href,price,property_type,main_features,key_main_features,Floor Area,Land Area,Rates,Levy,Bedrooms,Bathrooms,Province,City,Suburb,Area,Listing Number
0,1 Bed Apartment in Ferndale,https://www.privateproperty.co.za/for-sale/gau...,R 280 000,Apartment,"{'Floor Area': '47 m²', 'Rates': 'R 197', 'Lev...","{'Bedrooms': '1', 'Bathrooms': '1', 'Lounges':...",47,0,197,1677,1.0,1.0,gauteng,johannesburg,randburg-and-ferndale,ferndale,T4178562
1,1 Bed Apartment in Ferndale,https://www.privateproperty.co.za/for-sale/gau...,R 280 000,Apartment,"{'Floor Area': '47 m²', 'Rates': 'R 265', 'Lev...","{'Bedrooms': '1', 'Bathrooms': '1', 'Lounges':...",47,0,265,1296,1.0,1.0,gauteng,johannesburg,randburg-and-ferndale,ferndale,T4441362
2,1 Bed Apartment in Windsor West,https://www.privateproperty.co.za/for-sale/gau...,R 300 000,Apartment,"{'Floor Area': '61 m²', 'Land Area': '3975 m²'...","{'Bedrooms': '1', 'Bathrooms': '1', 'Dining Ar...",61,3975,170,515,1.0,1.0,gauteng,johannesburg,randburg-and-ferndale,windsor-west,T4260403
3,2 Bed Apartment in Windsor East,https://www.privateproperty.co.za/for-sale/gau...,R 310 000,Apartment,"{'Floor Area': '69 m²', 'Land Area': '3970 m²'...","{'Bedrooms': '2', 'Bathrooms': '1', 'Dining Ar...",69,3970,300,500,2.0,1.0,gauteng,johannesburg,randburg-and-ferndale,windsor-east,T4482605
4,2.5 Bed Apartment in Windsor East,https://www.privateproperty.co.za/for-sale/gau...,R 315 000,Apartment,"{'Floor Area': '79 m²', 'Levy': 'R 1000'}","{'Bedrooms': '2.5', 'Bathrooms': '1', 'Dining ...",79,0,0,1000,2.5,1.0,gauteng,johannesburg,randburg-and-ferndale,windsor-east,T3962829


In [76]:
sold_properties = merged_df[merged_df['price']=='Sold']
# Adding Sold Date column with today's date
sold_properties['Sold Date'] = datetime.today().date()

# Define the file path
file_path = r'C:\Users\lakha\OneDrive\Documents\House Flipping - Real Life\Local\Sold Properties'

# Generate the file name with today's date
file_name = "Sold Properties " + datetime.today().strftime('%Y-%m-%d') + ".csv"

# Combine the file path and file name
full_file_path = file_path + "\\" + file_name

# Save the DataFrame to CSV
sold_properties.to_csv(full_file_path, index=False)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  sold_properties['Sold Date'] = datetime.today().date()


In [77]:
listed_properties = merged_df[(merged_df['price'] != 'Sold') & (merged_df['price'] != 'On Auction')]
# Remove non-numeric characters and convert to numeric
listed_properties['price'] = listed_properties['price'].str.replace('[^\d.]', '', regex=True)

# Convert the column to numeric
listed_properties['price'] = pd.to_numeric(listed_properties['price'], errors='coerce')
# Define the directory path

listed_properties['Price per sqm'] = listed_properties['price']/listed_properties['Floor Area']
directory_path = r'C:\Users\lakha\OneDrive\Documents\House Flipping - Real Life\Local\Listed Properties'

# Save as "Current Listed Properties.csv"
listed_properties.to_csv(directory_path + "\\Current Listed Properties.csv", index=False)

# Save with today's date
today_date = datetime.today().strftime('%Y-%m-%d')
file_name = f"Listed Properties {today_date}.csv"
listed_properties.to_csv(directory_path + "\\" + file_name, index=False)

  listed_properties['price'] = listed_properties['price'].str.replace('[^\d.]', '', regex=True)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  listed_properties['price'] = listed_properties['price'].str.replace('[^\d.]', '', regex=True)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  listed_properties['price'] = pd.to_numeric(listed_properties['price'], errors='coerce')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pand

In [78]:
listed_properties

Unnamed: 0,title,href,price,property_type,main_features,key_main_features,Floor Area,Land Area,Rates,Levy,Bedrooms,Bathrooms,Province,City,Suburb,Area,Listing Number,Price per sqm
0,1 Bed Apartment in Ferndale,https://www.privateproperty.co.za/for-sale/gau...,280000.0,Apartment,"{'Floor Area': '47 m²', 'Rates': 'R 197', 'Lev...","{'Bedrooms': '1', 'Bathrooms': '1', 'Lounges':...",47,0,197,1677,1.0,1.0,gauteng,johannesburg,randburg-and-ferndale,ferndale,T4178562,5.957447e+03
1,1 Bed Apartment in Ferndale,https://www.privateproperty.co.za/for-sale/gau...,280000.0,Apartment,"{'Floor Area': '47 m²', 'Rates': 'R 265', 'Lev...","{'Bedrooms': '1', 'Bathrooms': '1', 'Lounges':...",47,0,265,1296,1.0,1.0,gauteng,johannesburg,randburg-and-ferndale,ferndale,T4441362,5.957447e+03
2,1 Bed Apartment in Windsor West,https://www.privateproperty.co.za/for-sale/gau...,300000.0,Apartment,"{'Floor Area': '61 m²', 'Land Area': '3975 m²'...","{'Bedrooms': '1', 'Bathrooms': '1', 'Dining Ar...",61,3975,170,515,1.0,1.0,gauteng,johannesburg,randburg-and-ferndale,windsor-west,T4260403,4.918033e+03
3,2 Bed Apartment in Windsor East,https://www.privateproperty.co.za/for-sale/gau...,310000.0,Apartment,"{'Floor Area': '69 m²', 'Land Area': '3970 m²'...","{'Bedrooms': '2', 'Bathrooms': '1', 'Dining Ar...",69,3970,300,500,2.0,1.0,gauteng,johannesburg,randburg-and-ferndale,windsor-east,T4482605,4.492754e+03
4,2.5 Bed Apartment in Windsor East,https://www.privateproperty.co.za/for-sale/gau...,315000.0,Apartment,"{'Floor Area': '79 m²', 'Levy': 'R 1000'}","{'Bedrooms': '2.5', 'Bathrooms': '1', 'Dining ...",79,0,0,1000,2.5,1.0,gauteng,johannesburg,randburg-and-ferndale,windsor-east,T3962829,3.987342e+03
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
16887,2 Bed Apartment in Randpark Ridge,https://www.privateproperty.co.za/for-sale/gau...,875000.0,Apartment,"{'Floor Area': '82 m²', 'Land Area': '2871 m²'...","{'Bedrooms': '2', 'Bathrooms': '1.5', 'Dining ...",82,2871,727,2190,2.0,1.5,gauteng,johannesburg,randburg-and-ferndale,randpark-ridge,T4497070,1.067073e+04
16888,2 Bed House in Robindale,https://www.privateproperty.co.za/for-sale/gau...,875000.0,House,"{'Floor Area': '70 m²', 'Rates': 'R 403', 'Lev...","{'Bedrooms': '2', 'Bathrooms': '1', 'Covered P...",70,0,403,2145,2.0,1.0,gauteng,johannesburg,randburg-and-ferndale,robindale,T4468178,1.250000e+04
16889,2 Bed Apartment in Robindale,https://www.privateproperty.co.za/for-sale/gau...,875000.0,Apartment,"{'Floor Area': '75 m²', 'Rates': 'R 762', 'Lev...","{'Bedrooms': '2', 'Bathrooms': '2', 'Dining Ar...",75,0,762,2953,2.0,2.0,gauteng,johannesburg,randburg-and-ferndale,robindale,T4324108,1.166667e+04
16890,2 Bed Apartment in Ferndale,https://www.privateproperty.co.za/for-sale/gau...,875000.0,Apartment,"{'Rates': 'R 1680', 'Levy': 'R 630'}","{'Bedrooms': '2', 'Bathrooms': '2', 'Covered P...",0,0,1680,630,2.0,2.0,gauteng,johannesburg,randburg-and-ferndale,ferndale,T4452901,inf


## Market Research

In [79]:
file_path = "C:/Users/lakha/OneDrive/Documents/House Flipping - Real Life/Local/Listed Properties/Current Listed Properties.csv"
listed_properties = pd.read_csv(file_path)

In [80]:
# Function to process each CSV file
def process_csv(file):
    # Read the CSV file into a DataFrame
    df = pd.read_csv(file)
    
    # Remove duplicates based on "List ID" column, keeping the oldest "Sold Date"
    df['Sold Date'] = pd.to_datetime(df['Sold Date'])  # Convert 'Sold Date' to datetime
    df.sort_values(by='Sold Date', inplace=True)  # Sort by 'Sold Date' to get the oldest first
    df.drop_duplicates(subset='Listing Number', keep='first', inplace=True)
    
    return df

# Define the directory where the CSV files are located
directory = 'C:/Users/lakha/OneDrive/Documents/House Flipping - Real Life/Local/Sold Properties'

# List to store processed DataFrames
dfs = []

# Iterate through files in the directory
for file in os.listdir(directory):
    # Check if the file is a CSV file
    if file.endswith('.csv') and file.startswith('Sold Properties'):
        # Process the CSV file and append the DataFrame to the list
        file_path = os.path.join(directory, file)
        df = process_csv(file_path)
        dfs.append(df)

# Concatenate all DataFrames into a single DataFrame
sold_properties = pd.concat(dfs, ignore_index=True)

In [81]:
# Get the date 30 days ago from today
thirty_days_ago = datetime.now() - timedelta(days=30)

# Filter the DataFrame to include only rows where sale date is within the last 30 days
sold_properties = sold_properties[sold_properties['Sold Date'] >= thirty_days_ago]

In [82]:
listed_property_count = listed_properties.groupby(['Province', 'City', 'Suburb', 'Area']).size().reset_index(name='Listed')
sold_property_count = sold_properties.groupby(['Province', 'City', 'Suburb', 'Area']).size().reset_index(name='Sold')

In [83]:
# Merging two dataframes on the 'Area' column and excluding common columns
# Merging two dataframes on the 'Area' column
merged_df = pd.merge(listed_property_count, sold_property_count, on='Area', how='left', suffixes=('_listed', '_sold'))

# Drop unwanted columns (those with '_y' suffix)
merged_df.drop(merged_df.filter(regex='_sold$').columns.tolist(), axis=1, inplace=True)

# Rename columns with '_x' suffix to remove the suffix
merged_df.rename(columns=lambda x: x.replace('_listed', ''), inplace=True)
merged_df['Inventory'] = merged_df['Listed']/merged_df['Sold']
merged_df.to_csv('Property Market 30 days.csv', index=False)

# Creating a file for all sold properties

In [84]:
sold_properties

Unnamed: 0,title,href,price,property_type,main_features,key_main_features,Floor Area,Land Area,Rates,Levy,Bedrooms,Bathrooms,Province,City,Suburb,Area,Listing Number,Sold Date
0,1 Bed Apartment in Houghton Estate,https://www.privateproperty.co.za/for-sale/gau...,Sold,Apartment,,,59,,R 195,,1.0,1.0,gauteng,johannesburg,rosebank-and-parktown,houghton-estate,T3978315,2024-02-23
1,3 Bed Cluster in Beverley,https://www.privateproperty.co.za/for-sale/gau...,Sold,Cluster,,,,500 m²,R 2300,,3.0,2.0,gauteng,johannesburg,fourways-sunninghill-and-lonehill,beverley,T4281585,2024-02-23
2,4 Bed House in Paulshof,https://www.privateproperty.co.za/for-sale/gau...,Sold,House,,,279,1530 m²,,,4.0,3.0,gauteng,johannesburg,fourways-sunninghill-and-lonehill,paulshof,T4445807,2024-02-23
3,3 Bed House in Fourways,https://www.privateproperty.co.za/for-sale/gau...,Sold,House,,,217,443 m²,,,3.0,2.0,gauteng,johannesburg,fourways-sunninghill-and-lonehill,fourways,T4193395,2024-02-23
4,2 Bed House in Paulshof,https://www.privateproperty.co.za/for-sale/gau...,Sold,House,,,,482 m²,R 1545,,2.0,2.0,gauteng,johannesburg,fourways-sunninghill-and-lonehill,paulshof,T4345631,2024-02-23
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1560,5 Bed House in Montgomery Park,https://www.privateproperty.co.za/for-sale/gau...,Sold,House,"{'Land Area': '744 m²', 'Rates': 'R 1070', 'Le...","{'Bedrooms': '5', 'Bathrooms': '3', 'Dining Ar...",0,744,1070,1,5.0,3.0,gauteng,johannesburg,northcliff,montgomery-park,T4490762,2024-03-04
1561,7 Bed House in Sophiatown,https://www.privateproperty.co.za/for-sale/gau...,Sold,House,{'Floor Area': '495 m²'},"{'Bedrooms': '7', 'Bathrooms': '3', 'Dining Ar...",495,0,0,0,7.0,3.0,gauteng,johannesburg,northcliff,sophiatown,T4192682,2024-03-04
1562,3 Bed House in Newlands,https://www.privateproperty.co.za/for-sale/gau...,Sold,House,"{'Land Area': '496 m²', 'Rates': 'R 670'}","{'Bedrooms': '3', 'Bathrooms': '3', 'Dining Ar...",0,496,670,0,3.0,3.0,gauteng,johannesburg,northcliff,newlands,T4101674,2024-03-04
1563,4 Bed House in Westdene,https://www.privateproperty.co.za/for-sale/gau...,Sold,House,"{'Land Area': '495 m²', 'Rates': 'R 865', 'Lev...","{'Bedrooms': '4', 'Bathrooms': '3', 'Dining Ar...",0,495,865,1,4.0,3.0,gauteng,johannesburg,northcliff,westdene,T4312187,2024-03-04


# All past Listed Properties

In [85]:
# Define the directory where the CSV files are located
directory = 'C:/Users/lakha/OneDrive/Documents/House Flipping - Real Life/Local/Listed Properties'  # Update this with the appropriate directory path

# Function to process each CSV file
def process_csv(file):
    # Read the CSV file into a DataFrame
    df = pd.read_csv(file)
    
    df.drop_duplicates(subset='Listing Number', keep='first', inplace=True)
    
    return df

# List to store processed DataFrames
dfs = []

# Iterate through files in the directory
for file in os.listdir(directory):
    # Check if the file is a CSV file and starts with "Sold Properties"
    if file.endswith('.csv') and file.startswith('Listed Properties'):
        # Process the CSV file and append the DataFrame to the list
        file_path = os.path.join(directory, file)
        df = process_csv(file_path)
        dfs.append(df)

# Concatenate all DataFrames into a single DataFrame
all_listed = pd.concat(dfs, ignore_index=True)

In [86]:
# Merge the dataframes on the 'Listing Number' column
all_sold_df = pd.merge(sold_properties, all_listed[['Listing Number', 'price']], on='Listing Number', how='left')

# Rename the 'price' column to 'previously listed price'
all_sold_df.rename(columns={'price_y': 'previously listed price'}, inplace=True)
all_sold_df.rename(columns={'price_x': 'Status'}, inplace=True)

In [87]:
directory_path = r'C:\Users\lakha\OneDrive\Documents\House Flipping - Real Life\Local\Sold Properties'

# Save as "Current Listed Properties.csv"
all_sold_df.to_csv(directory_path + "\\All Sold Properties.csv", index=False)