In [25]:
import pandas as pd
import os

# Define the directory containing the CSV files
directory = 'resources'

# Prepare an empty list to collect dataframes
dataframes = []

# Loop through each file in the directory
for filename in os.listdir(directory):
    if filename.endswith('.csv'):
        # Extract the zip code and type of home from the filename
        zip_code = filename[:5]
        if 'SF' in filename:
            home_type = 'Single Family'
        elif 'CONDO' in filename:
            home_type = 'Condo'

        # Load the CSV file into a DataFrame
        file_path = os.path.join(directory, filename)
        df = pd.read_csv(file_path)

        # Add the zip code and home type as new columns
        df['Zip Code'] = zip_code
        df['Type of Home'] = home_type

        # Append the DataFrame to the list
        dataframes.append(df)

# Concatenate all dataframes into one
combined_df = pd.concat(dataframes, ignore_index=True)

# Save the combined dataframe to a new CSV file
combined_df.to_csv('data.csv', index=False)

print("All files have been successfully merged and saved to 'data.csv'.")

combined_df.head()


All files have been successfully merged and saved to 'data.csv'.


Unnamed: 0,#,Listing ID,St,MLS Area,Address,# Beds,# Full Baths,# Half Baths,# Garage Spaces,Levels,...,SqFt,List Price,LP$/SqFt,Close$/SqFt,Close Price,Close Date,DOM,CDOM,Zip Code,Type of Home
0,1,6996515,C,HU,21608 Wilmiller Ln,4,2,1,2.0,2,...,2581,"$435,000",$168.54,$169.31,"$437,000",03/05/2024,4,4,78660,Single Family
1,1,1659174,C,PF,808 Whitehall Dr,3,2,0,2.0,1,...,1299,"$275,000",$211.70,$184.76,"$240,000",12/29/2023,8,7,78660,Single Family
2,1,5349770,C,PF,16131 Stoneham Cir,3,2,0,1.0,1,...,1033,"$265,000",$256.53,$242.01,"$250,000",11/14/2023,20,20,78660,Single Family
3,1,7256633,C,PF,17228 Sandwick Dr,3,2,0,2.0,1,...,2046,"$235,000",$114.86,$123.66,"$253,000",01/30/2024,4,4,78660,Single Family
4,1,8113439,C,PF,15520 Sutton Leighs Ln,3,2,0,2.0,1,...,1616,"$398,000",$246.29,$160.89,"$260,000",10/30/2023,48,48,78660,Single Family


In [19]:
combined_df[combined_df['Type of Home'] == 'Single Family'].shape

(2074, 22)

In [20]:
combined_df[combined_df['Type of Home'] == 'Condo'].shape

(100, 22)

In [22]:
most_common_zip = combined_df['Zip Code'].value_counts().idxmax()
most_common_zip


'78660'

In [28]:
zip_code_counts = combined_df['Zip Code'].value_counts().reset_index()
zip_code_counts.columns = ['Zip Code', 'Count']
zip_code_counts.head(100)


Unnamed: 0,Zip Code,Count
0,78660,517
1,78642,455
2,78666,408
3,78620,232
4,78744,206
5,78664,166
6,78746,99
7,78701,61
8,78758,30


In [27]:
unique_zip_codes = combined_df['Zip Code'].unique()
print(unique_zip_codes)


['78660' '78744' '78746' '78701' '78642' '78666' '78664' '78758' '78620']


In [29]:
pd.read_csv('data.csv').columns

Index(['#', 'Listing ID', 'St', 'MLS Area', 'Address', '# Beds',
       '# Full Baths', '# Half Baths', '# Garage Spaces', 'Levels',
       'Year Built', 'Acres', 'SqFt', 'List Price', 'LP$/SqFt', 'Close$/SqFt',
       'Close Price', 'Close Date', 'DOM', 'CDOM', 'Zip Code', 'Type of Home'],
      dtype='object')