In [1]:
import json
import pandas as pd
import re

In [2]:
# Load the raw apartment listings and wrap them in a DataFrame.
# The encoding is pinned to UTF-8 (the standard JSON encoding) so the file is
# decoded the same way regardless of the platform's default locale encoding.
with open('../data/raw/apartments_raw.json', 'r', encoding='utf-8') as file:
    apartments_raw = json.load(file)

df_apartments = pd.DataFrame(apartments_raw)

# Quick sanity check: record count, frame shape and the first few rows.
print(len(apartments_raw), "apt listings")
print(df_apartments.shape)
print(df_apartments.head())

1264 apt listings
(1264, 6)
                                                name  \
0  Free Cable and Internet, 2/BD 2.5/BA, in Los A...   
1  Arwyn Manor Apts...NY Style...Gorgeous Newly R...   
2  St. Andrews Apts.Charming Studio.1920's Classi...   
3                                    Studio for Rent   
4  Located in Los Angeles, 2/bd 2/ba, Dressing Va...   

                                  address zip_code price_raw bedrooms_raw  \
0  1239 West 30th Street, Los Angeles, CA             $3,395                
1              Mid-Wilshire and Koreatown    90006    $1,475       studio   
2              Mid-Wilshire and Koreatown    90006    $1,425       studio   
3                             Los Angeles             $1,000       studio   
4        1026 S Broadway, Los Angeles, CA             $2,649                

                                                 url  
0  https://losangeles.craigslist.org/lac/apa/d/lo...  
1  https://losangeles.craigslist.org/lac/apa/d/lo...  
2  http

In [3]:
def clean_price(price_text):
    """Convert a raw price value into a float, or None when it cannot be parsed.

    Handles the decorations seen in listing prices: a leading ``$``, thousands
    separators (``,``) and a trailing ``+``. A range written as ``low-high``
    is reduced to the midpoint of the two values.

    Args:
        price_text: Raw price. Normally a string such as ``"$1,475"`` or
            ``"$1,000 - $2,000"``; non-string values (e.g. a number, or the
            float NaN pandas uses for missing data) are accepted as well.

    Returns:
        The price as a float, or None if the value is empty, missing,
        not numeric, or not a finite number.
    """
    # Falsy values (None, '', 0) carry no usable price.
    if not price_text:
        return None

    # Non-string input (NaN, ints, floats) would crash on .replace();
    # normalise it to text so it goes through the same parsing path.
    if not isinstance(price_text, str):
        price_text = str(price_text)

    cleaned = price_text.replace('$', '').replace(',', '').replace('+', '').strip()

    try:
        parts = cleaned.split('-')
        if len(parts) == 2:
            # "low-high" range: use the midpoint.
            value = (float(parts[0].strip()) + float(parts[1].strip())) / 2
        else:
            # Single value; anything with more than one dash fails here.
            value = float(cleaned)
    except ValueError:
        # Only parsing failures are expected; other errors should surface.
        return None

    # float() accepts "nan"/"inf"; those are not meaningful prices.
    if not (float('-inf') < value < float('inf')):
        return None
    return value
        
# Derive the numeric price column from the raw price strings.
cleaned_prices = df_apartments['price_raw'].apply(clean_price)
df_apartments['price'] = cleaned_prices

# Report how many prices were parsed, followed by summary statistics.
price_column = df_apartments['price']
print("num pices used:", price_column.notna().sum())
print(price_column.describe())

num pices used: 1264
count     1264.000000
mean      2387.316456
std       1505.358133
min        750.000000
25%       1520.000000
50%       2059.000000
75%       2652.250000
max      21081.000000
Name: price, dtype: float64


In [4]:
def clean_bedrooms(bed_text):
    """Extract a bedroom count from raw listing text.

    Args:
        bed_text: Raw bedroom description, e.g. ``"studio"`` or ``"2br"``.
            Non-string values (e.g. an integer count, or the float NaN pandas
            uses for missing data) are accepted as well.

    Returns:
        0 when the text mentions "studio" (case-insensitive), otherwise the
        first run of digits found in the text as an int, or None when the
        value is empty/missing or contains no digits.
    """
    # Falsy values (None, '', 0) are treated as "no information".
    if not bed_text:
        return None

    # Non-string input would crash on .lower(); normalise it to text first.
    # NaN becomes "nan", which has no digits and therefore yields None.
    if not isinstance(bed_text, str):
        bed_text = str(bed_text)

    lowered = bed_text.lower()
    if 'studio' in lowered:
        return 0

    # Only the first number is used, e.g. "2br 1ba" -> 2.
    first_number = re.search(r'\d+', lowered)
    if first_number is None:
        return None
    return int(first_number.group())
    
# Add the parsed bedroom count as its own column.
bedroom_counts = df_apartments['bedrooms_raw'].apply(clean_bedrooms)
df_apartments['bedrooms'] = bedroom_counts

# Report how many listings produced a bedroom count, then the distribution.
bedroom_column = df_apartments['bedrooms']
print("valid bedrmss:", bedroom_column.notna().sum())
print(bedroom_column.value_counts().sort_index())

valid bedrmss: 696
bedrooms
0.0    296
1.0    188
2.0    144
3.0     48
4.0      4
5.0     16
Name: count, dtype: int64


In [5]:
def calculate_rent_per_bedroom(row):
    """Return the rent per bedroom for one listing row.

    Reads ``row['price']`` and ``row['bedrooms']``. If either is missing the
    result is None. A listing with zero bedrooms is given its full price
    unchanged; otherwise the price is divided by the bedroom count.
    """
    rent, bedroom_count = row['price'], row['bedrooms']

    has_both_values = not pd.isna(rent) and not pd.isna(bedroom_count)
    if not has_both_values:
        return None

    # Zero bedrooms: return the price itself rather than dividing by zero.
    return rent if bedroom_count == 0 else rent / bedroom_count

# Compute rent per bedroom row by row and store it as a new column.
per_bedroom_rent = df_apartments.apply(calculate_rent_per_bedroom, axis=1)
df_apartments['rent_per_bedroom'] = per_bedroom_rent

# Report how many rows received a value, followed by summary statistics.
rent_column = df_apartments['rent_per_bedroom']
print("num of rent per beds:", rent_column.notna().sum())
print(rent_column.describe())

num of rent per beds: 696
count     696.000000
mean     1573.246169
std       490.076589
min       750.000000
25%      1269.000000
50%      1433.000000
75%      1867.000000
max      3795.000000
Name: rent_per_bedroom, dtype: float64


In [6]:
# Zip codes (as strings) that count as the USC area for this analysis.
usc_zip_codes = [
    '90007', '90089', '90015', '90017', '90006', '90012',
    '90013', '90014', '90021', '90037', '90018', '90019',
    '90026', '90057', '90020', '90005', '90004', '90010',
]

# Keep only the listings whose zip code is in that list.
in_usc_area = df_apartments['zip_code'].isin(usc_zip_codes)
df_apartments = df_apartments.loc[in_usc_area]

print("num listings in usc are a:", df_apartments.shape[0])

num listings in usc are a: 464


In [7]:
# Final cleanup: drop rows that are unusable for the analysis, i.e. rows with
# a missing price, a missing zip code, or a non-positive price.
has_price = df_apartments['price'].notna()
has_zip_code = df_apartments['zip_code'].notna()
positive_price = df_apartments['price'] > 0
df_apartments = df_apartments[has_price & has_zip_code & positive_price]

# Persist the cleaned listings (without the index column).
output_file = '../data/processed/apartments_clean.csv'
df_apartments.to_csv(output_file, index=False)

print("final num of apts:", len(df_apartments))
print(output_file)

final num of apts: 464
../data/processed/apartments_clean.csv


In [8]:
# Read the raw crime CSV in one pass (low_memory=False) so column dtypes are
# inferred from the whole file rather than chunk by chunk.
crime_csv_path = '../data/raw/Crime_Data_from_2020_to_Present.csv'
df_crime = pd.read_csv(crime_csv_path, low_memory=False)

# Show the record count and the available column names.
print("total crime recs:", df_crime.shape[0])
print(list(df_crime.columns))

total crime recs: 816028
['DR_NO', 'Date Rptd', 'DATE OCC', 'TIME OCC', 'AREA', 'AREA NAME', 'Rpt Dist No', 'Part 1-2', 'Crm Cd', 'Crm Cd Desc', 'Mocodes', 'Vict Age', 'Vict Sex', 'Vict Descent', 'Premis Cd', 'Premis Desc', 'Weapon Used Cd', 'Weapon Desc', 'Status', 'Status Desc', 'Crm Cd 1', 'Crm Cd 2', 'Crm Cd 3', 'Crm Cd 4', 'LOCATION', 'Cross Street', 'LAT', 'LON']


In [9]:
# Parse the report date with an explicit format. Without `format`, pandas
# cannot infer one and falls back to slow per-element parsing, emitting a
# UserWarning.
# NOTE(review): assumes 'Date Rptd' is stored as "MM/DD/YYYY hh:mm:ss AM/PM"
# (e.g. "01/08/2020 12:00:00 AM") - confirm against the raw CSV.
df_crime['Date Rptd'] = pd.to_datetime(
    df_crime['Date Rptd'],
    format='%m/%d/%Y %I:%M:%S %p',
    errors='coerce',
)

# errors='coerce' turns unparseable values into NaT, which the filter below
# would drop silently; fail loudly if the format matched nothing at all.
assert df_crime.empty or df_crime['Date Rptd'].notna().any(), \
    "no 'Date Rptd' value matched the expected date format"

# Keep records reported on or after 2022-01-01. There is no upper bound, so
# the "2022-2024" label below only holds if the file contains nothing later.
df_crime = df_crime[df_crime['Date Rptd'] >= '2022-01-01']

print("crime 2022-2024:", len(df_crime))

crime 2022-2024: 415031


  df_crime['Date Rptd'] = pd.to_datetime(df_crime['Date Rptd'], errors = 'coerce')


In [10]:
# Area names (values of the 'AREA NAME' column) treated as the USC area.
usc_areas = ['Central', 'Southwest', 'Olympic', 'Rampart']

# Keep only the crime records that fall in one of those areas.
in_usc_areas = df_crime['AREA NAME'].isin(usc_areas)
df_crime = df_crime.loc[in_usc_areas]

print("crime recs now:", df_crime.shape[0])

crime recs now: 95440


In [11]:
# Hand-written mapping from crime 'AREA NAME' to the zip codes assigned to it.
area_to_zip_mapping = {
    'Central': ['90007', '90015', '90017', '90012', '90013', '90014', '90021'],
    'Southwest': ['90037', '90018'],
    'Olympic': ['90006', '90019', '90020'],
    'Rampart': ['90026', '90057', '90005', '90004', '90010'],
}

# Build one output row per (crime record, zip code of that record's area).
# Every record is repeated for each zip code in its area, so per-zip counts
# derived from this table are area-level counts copied to each zip.
# This is vectorized (map + explode) instead of a Python-level iterrows() loop;
# the resulting rows, their order and the columns are the same.
df_crime_clean = (
    df_crime[['AREA NAME', 'Crm Cd Desc', 'Date Rptd']]
    .rename(columns={
        'AREA NAME': 'area_name',
        'Crm Cd Desc': 'crime_type',
        'Date Rptd': 'date',
    })
    # Areas without a mapping get an empty list and thus contribute no rows.
    .assign(zip_code=lambda frame: frame['area_name'].map(
        lambda area: area_to_zip_mapping.get(area, [])
    ))
    .explode('zip_code')
    # explode() turns an empty list into a single NaN row; drop those.
    .dropna(subset=['zip_code'])
    .loc[:, ['area_name', 'zip_code', 'crime_type', 'date']]
    .reset_index(drop=True)
)
print("crime records ledt:", len(df_crime_clean))


crime records ledt: 425466


In [12]:
# Write the expanded crime table to the processed-data folder (no index column).
output_file = '../data/processed/crime_clean.csv'
df_crime_clean.to_csv(output_file, index=False)

# Report how many rows were written and where.
crime_row_count = len(df_crime_clean)
print("crime data:", crime_row_count, output_file)

crime data: 425466 ../data/processed/crime_clean.csv
