In [None]:
# Texas Real Estate Trends

In [None]:
## Importing Libraries

In [None]:
import numpy as np
import pandas as pd

In [None]:
## Loading Data

In [None]:
# Read the listings CSV; the path is relative to the notebook's working directory.
df = pd.read_csv(filepath_or_buffer="real_estate_texas_500_2024.csv")

# Preview the first five rows to sanity-check the load.
df.head(n=5)

In [None]:
# How many rows and columns the dataset has, shown as a (rows, columns) tuple.
df.shape

In [None]:
# Column names, non-null counts and dtypes: a first look at where data is missing.
df.info()

In [None]:
# Number of missing entries in each column.
df.isna().sum()

In [None]:
!python3 -m pip install seaborn

In [None]:
!pip show seaborn

In [None]:
# Print the path of the Python interpreter running this kernel
# (presumably to confirm it is the environment seaborn was installed into).
import sys
print(sys.executable)

In [None]:
# The null counts show sub_type is almost entirely missing and year_built is missing for many rows.
# sub_type can simply be dropped; before deciding about year_built, check whether it relates to price.
import matplotlib.pyplot as plt
import seaborn as sns

# 10x5 inches: a rough size that keeps the plot wide enough to read.
fig, ax = plt.subplots(figsize=(10, 5))

# One box per build year: shows the spread and outliers of list price for each year side by side.
sns.boxplot(data=df, x="year_built", y="listPrice", ax=ax)

# Year labels sit close together, so turn them vertical.
ax.tick_params(axis="x", labelrotation=90)
ax.set_title("Relationship Between Year Built and Price")
plt.show()

In [None]:
# The boxplot suggests year_built does influence price, yet 212 of its values are missing.
# One option is to fill the gaps with the most common build year, so look at the mode first.
df["year_built"].mode(dropna=True)

In [None]:
# Put both candidate fill values next to each other before choosing one.
year_built = df["year_built"]
fill_candidates = {
    "Mode:": year_built.mode()[0],
    "Median:": year_built.median(),
}
for label, value in fill_candidates.items():
    print(label, value)

In [None]:
# The mode turns out to be the current year, which makes it an untrustworthy fill value,
# so the median will be used instead. Count how many year_built entries need filling.
df["year_built"].isna().sum()

In [None]:
# Fill missing numerical values with the median of each column.
# The filled Series is assigned back to the frame: calling fillna(inplace=True) on df[col]
# operates on a temporary object, which pandas 2.x warns about (chained assignment) and which
# no longer updates df at all under Copy-on-Write (the default from pandas 3.0).
for col in ["beds", "baths_full", "sqft", "stories"]:
    df[col] = df[col].fillna(df[col].median())

In [None]:
# Re-count missing values per column to see what is still unfilled.
df.isna().sum()

In [None]:
# Print the column Index to see every column name currently in df.
print(df.columns)

In [None]:
# sub_type is missing too many values to be useful, so drop it.
# errors="ignore" keeps the cell re-runnable once the column is already gone.
df = df.drop(columns=["sub_type"], errors="ignore")

# Fill the remaining numeric gaps with each column's median. The result is assigned back
# instead of calling fillna(inplace=True) on df[col]: that chained form acts on a temporary
# Series, raises a FutureWarning in pandas 2.x and stops updating df once Copy-on-Write is
# the default (pandas 3.0).
for col in ["year_built", "listPrice"]:
    df[col] = df[col].fillna(df[col].median())

# Where baths_full_calc is missing, fall back to the baths_full value of the same row
# (fillna with a Series aligns on the index).
df["baths_full_calc"] = df["baths_full_calc"].fillna(df["baths_full"])

# handle text
df["text"] = df["text"].fillna("No description available")

# Confirm what, if anything, is still missing.
df.isnull().sum()

In [None]:
# Flag unusually cheap or expensive listings with the 1.5 * IQR rule.
price = df["listPrice"]
Q1, Q3 = price.quantile(0.25), price.quantile(0.75)
IQR = Q3 - Q1

# The cut-offs sit 1.5 IQRs outside the quartiles.
lower_bound, upper_bound = Q1 - 1.5 * IQR, Q3 + 1.5 * IQR

# Keep only the rows whose price falls outside either cut-off.
too_low = price < lower_bound
too_high = price > upper_bound
outliers = df.loc[too_low | too_high]
print("Number of outliers detected:", len(outliers))

In [None]:
# Log Transformations
#
# Store a log-scaled copy of the list price as df["listPrice_log"]; the histogram comparison
# reads that column. log1p (log(1 + x)) is used so a price of 0 maps to 0 instead of -inf.
df["listPrice_log"] = np.log1p(df["listPrice"])

# Summary statistics of the price on both scales, side by side.
df[["listPrice", "listPrice_log"]].describe()

In [None]:
# Visualize
import matplotlib.pyplot as plt
import seaborn as sns

# The right-hand panel needs a log-scaled price column. Create it here when it is absent so
# the cell does not fail with a KeyError on a fresh "Restart & Run All".
if "listPrice_log" not in df.columns:
    df["listPrice_log"] = np.log1p(df["listPrice"])

# Two panels side by side on one figure, each drawn on its own explicit Axes.
fig, (ax_raw, ax_log) = plt.subplots(1, 2, figsize=(12, 5))

# Before transformation
sns.histplot(df["listPrice"], bins=50, kde=True, ax=ax_raw)
ax_raw.set_title("Original Price Distribution")

# After transformation
sns.histplot(df["listPrice_log"], bins=50, kde=True, ax=ax_log)
ax_log.set_title("Log-Transformed Price Distribution")

plt.show()