In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Load the data (path is relative to the notebook's working directory)
df = pd.read_csv('../data/processed/processed_synthetic.csv')

# Display basic information.
# DataFrame.info() writes its report to stdout itself and returns None, so it
# must not be wrapped in print() -- that would append a stray "None" line.
df.info()
print(df.describe())

# Check for missing values (count of nulls per column)
print(df.isnull().sum())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 250000 entries, 0 to 249999
Data columns (total 28 columns):
 #   Column                        Non-Null Count   Dtype  
---  ------                        --------------   -----  
 0   City                          250000 non-null  object 
 1   Zipcode                       250000 non-null  int64  
 2   Marital_Status                250000 non-null  object 
 3   Family_Size                   250000 non-null  int64  
 4   Credit_Score                  250000 non-null  int64  
 5   Current_Residence             250000 non-null  object 
 6   Annual_Income                 250000 non-null  float64
 7   DTI                           250000 non-null  float64
 8   Location                      250000 non-null  object 
 9   Square_Footage                250000 non-null  float64
 10  Upgrade_Score                 250000 non-null  int64  
 11  Bedroom_Count                 250000 non-null  int64  
 12  Bathroom_Count                250000 non-nul

In [13]:
# Normalise the column labels in two steps: lower-case every name, then turn
# any spaces into underscores so later cells can reference columns without errors.
lowercased_labels = df.columns.str.lower()
df.columns = lowercased_labels.str.replace(' ', '_')
df.columns

Index(['city', 'zipcode', 'marital_status', 'family_size', 'credit_score',
       'current_residence', 'annual_income', 'dti', 'location',
       'square_footage', 'upgrade_score', 'bedroom_count', 'bathroom_count',
       'master_location', 'lot_size', 'closing_date', 'appraised_value',
       'price', 'closing_interest_rate', 'monthly_payment', 'loan_term',
       'feedback', 'inflation_rate', 'unemployment_rate',
       'current_market_interest_rate', 'year_closed',
       'median_price_of_nearby_homes', 'final_sentiment_score'],
      dtype='object')

In [15]:
# Identify numerical and categorical features by dtype
numerical_features = df.select_dtypes(include=[np.number]).columns
categorical_features = df.select_dtypes(include=['object']).columns

# Histogram for numerical features (one subplot per column, saved as a single image)
df[numerical_features].hist(figsize=(20, 15), bins=50)
plt.tight_layout()
plt.savefig('numerical_distributions.png')
plt.close()

# Bar plots for categorical features.
# Object-dtype columns can hold a very large number of distinct values
# (NOTE(review): presumably the case for 'closing_date' and 'feedback' -- confirm).
# Drawing one bar per distinct value is slow and unreadable, so each plot is
# capped at the most frequent values.
MAX_BAR_CATEGORIES = 20
for feature in categorical_features:
    counts = df[feature].value_counts()  # sorted by frequency, descending
    shown = counts.head(MAX_BAR_CATEGORIES)

    fig, ax = plt.subplots(figsize=(10, 6))
    shown.plot(kind='bar', ax=ax)

    title = f'Distribution of {feature}'
    if len(counts) > len(shown):
        # Make the truncation visible so the figure is not mistaken for the full distribution
        title += f' (top {len(shown)} of {len(counts)} values)'
    ax.set_title(title)

    fig.tight_layout()  # keep long tick labels inside the saved image
    fig.savefig(f'{feature}_distribution.png')
    plt.close(fig)  # release the figure; many are created in this loop

print("Numerical features:", list(numerical_features))
print("Categorical features:", list(categorical_features))

Numerical features: ['zipcode', 'family_size', 'credit_score', 'annual_income', 'dti', 'square_footage', 'upgrade_score', 'bedroom_count', 'bathroom_count', 'lot_size', 'appraised_value', 'price', 'closing_interest_rate', 'monthly_payment', 'loan_term', 'inflation_rate', 'unemployment_rate', 'current_market_interest_rate', 'year_closed', 'median_price_of_nearby_homes', 'final_sentiment_score']
Categorical features: ['city', 'marital_status', 'current_residence', 'location', 'master_location', 'closing_date', 'feedback']


In [18]:
# Distribution of the target variable
plt.figure(figsize=(10, 6))
sns.histplot(df['price'], kde=True)
plt.title('Distribution of House Prices')
plt.savefig('price_distribution.png')
plt.close()

# Summary statistics of Price
print(df['price'].describe())

# Box plot of Price vs. categorical features.
# A box per distinct value is unusable for high-cardinality object columns
# (NOTE(review): presumably 'closing_date' and 'feedback' -- confirm), so each
# plot is restricted to the most frequent categories of that column.
MAX_BOXPLOT_CATEGORIES = 20
for feature in categorical_features:
    counts = df[feature].value_counts()  # sorted by frequency, descending
    top_categories = counts.index[:MAX_BOXPLOT_CATEGORIES]
    # Row order is preserved, so the kept categories appear in the same
    # relative order seaborn would have used on the full frame.
    subset = df[df[feature].isin(top_categories)]

    plt.figure(figsize=(12, 6))
    sns.boxplot(x=feature, y='price', data=subset)
    plt.xticks(rotation=45)

    title = f'Price vs {feature}'
    if len(counts) > len(top_categories):
        # Make the truncation visible in the saved figure
        title += f' (top {len(top_categories)} of {len(counts)} values)'
    plt.title(title)

    plt.tight_layout()  # rotated tick labels are clipped otherwise
    plt.savefig(f'price_vs_{feature}.png')
    plt.close()

count     250000.000000
mean      358325.313486
std       174938.172860
min       100000.000000
25%       211435.901269
50%       350140.597993
75%       491728.199324
max      1000000.000000
Name: price, dtype: float64


In [19]:
# Pairwise correlations between the numerical columns, saved as an annotated heatmap
correlation_matrix = df[numerical_features].corr()

plt.figure(figsize=(15, 12))
heatmap_ax = sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', linewidths=0.5)
heatmap_ax.set_title('Correlation Matrix of Numerical Features')
heatmap_ax.figure.savefig('correlation_matrix.png')
plt.close(heatmap_ax.figure)

# Collect strongly correlated pairs. Only the strict upper triangle is scanned,
# so the diagonal is skipped and each unordered pair is reported once.
strong_mask = np.triu(correlation_matrix.abs().to_numpy() > 0.7, k=1)
row_positions, col_positions = np.nonzero(strong_mask)  # row-major order
high_correlations = [
    (
        correlation_matrix.index[row],
        correlation_matrix.columns[col],
        correlation_matrix.iloc[row, col],
    )
    for row, col in zip(row_positions, col_positions)
]

print("High correlations (>0.7):")
for first, second, value in high_correlations:
    print(f"{first} - {second}: {value:.2f}")

High correlations (>0.7):
annual_income - square_footage: 0.94
annual_income - price: 0.86
square_footage - price: 0.91


In [21]:
# One scatter plot of price against each hand-picked numerical feature
key_features = [
    'square_footage',
    'upgrade_score',
    'credit_score',
    'annual_income',
    'dti',
    'appraised_value',
]
for column in key_features:
    fig, ax = plt.subplots(figsize=(10, 6))
    sns.scatterplot(data=df, x=column, y='price', ax=ax)
    ax.set_title(f'{column} vs Price')
    fig.savefig(f'{column}_vs_price.png')
    plt.close(fig)

# Rank every numerical column by its correlation with price, strongest positive first.
# Relies on `correlation_matrix` computed in an earlier cell.
price_correlations = correlation_matrix.loc[:, 'price'].sort_values(ascending=False)
print("Correlations with Price:", price_correlations, sep="\n")

Correlations with Price:
price                           1.000000
square_footage                  0.914480
annual_income                   0.856267
bedroom_count                   0.005077
inflation_rate                  0.002423
monthly_payment                 0.001912
zipcode                         0.001452
upgrade_score                   0.001426
credit_score                    0.001281
family_size                     0.000881
final_sentiment_score           0.000590
median_price_of_nearby_homes    0.000512
year_closed                     0.000198
current_market_interest_rate   -0.000117
loan_term                      -0.000244
bathroom_count                 -0.000460
appraised_value                -0.002365
lot_size                       -0.002473
dti                            -0.003162
unemployment_rate              -0.003395
closing_interest_rate          -0.004068
Name: price, dtype: float64


In [22]:
# Write the current state of `df` to CSV; index=False leaves the row index out of the file.
cleaned_csv_path = '../data/processed/processed_synthetic_cleaned.csv'
df.to_csv(cleaned_csv_path, index=False)