In [None]:
# Read the three raw Mexico real-estate extracts, one DataFrame per file
df1, df2, df3 = (
    pd.read_csv(csv_path)
    for csv_path in (
        "data/mexico-real-estate-1.csv",
        "data/mexico-real-estate-2.csv",
        "data/mexico-real-estate-3.csv",
    )
)

# Report the object type and shape of each frame, separated by a blank line
for position, (label, frame) in enumerate([("df1", df1), ("df2", df2), ("df3", df3)]):
    if position:
        print()
    print(f"{label} type:", type(frame))
    print(f"{label} shape:", frame.shape)

In [None]:
# Drop rows with any null values from df1
df1.dropna(inplace=True)

# Clean "price_usd" in df1: strip the "$" sign and the "," separators, then
# convert the remaining text to float.
# regex=False makes both replacements literal. Read as a regular expression,
# "$" is the end-of-string anchor, so with regex=True pandas >= 2.0 removes
# nothing and the float conversion would fail on any leftover "$".
df1["price_usd"] = (
    df1["price_usd"]
    .str.replace("$", "", regex=False)
    .str.replace(",", "", regex=False)
    .astype(float)
)

df1.head()


In [None]:
# Build the cleaned df2 in a single chain:
#   1. discard rows containing null values
#   2. add "price_usd" = "price_mxn" / 19, rounded to 2 decimals
#      (19 pesos to the dollar in 2014)
#   3. discard the "price_mxn" column
df2 = (
    df2.dropna()
    .assign(price_usd=lambda frame: (frame["price_mxn"] / 19).round(2))
    .drop(columns=["price_mxn"])
)

df2.head()

In [None]:
# Drop rows with any null values from df3
df3.dropna(inplace=True)

# Create "lat" and "lon" columns for df3 by splitting "lat-lon" on the comma.
# str.split produces strings, so cast to float to store the coordinates as
# numbers rather than text.
df3[["lat", "lon"]] = df3["lat-lon"].str.split(",", expand=True).astype(float)

df3.head()

In [None]:
# Split "place_with_parent_names" on "|" into one column per segment;
# the segment at position 2 becomes the "state" column of df3
place_segments = df3["place_with_parent_names"].str.split("|", expand=True)
df3["state"] = place_segments[2]

# Remove the "place_with_parent_names" and "lat-lon" columns from df3
df3 = df3.drop(columns=["place_with_parent_names", "lat-lon"])

df3.head()

In [None]:
# Combine the three cleaned frames into one DataFrame and report its shape
cleaned_frames = [df1, df2, df3]
df = pd.concat(cleaned_frames)
print(df.shape)

df.head()

In [None]:
# Write the combined DataFrame to CSV without the index column
clean_csv_path = "./data/mexico-real-estate-clean.csv"
df.to_csv(clean_csv_path, index=False)

In [None]:
# Import "data/mexico-real-estate-clean.csv".
# The file name is spelled in lowercase, exactly as it is written when the
# cleaned data is saved; a capitalised "Mexico" is not found on case-sensitive
# file systems.
df = pd.read_csv("data/mexico-real-estate-clean.csv")

df.head()

In [None]:
# Map will be centered on Mexico City
mexico_city_center = {"lat": 19.43, "lon": -99.13}

# Use plotly express to place every row of `df` on a 600 x 600 map at its
# "lat"/"lon" position; hovering over a point also shows "price_usd" and "state"
fig = px.scatter_mapbox(
    data_frame=df,
    lat="lat",
    lon="lon",
    hover_data=["price_usd", "state"],
    center=mexico_city_center,
    width=600,
    height=600,
)

# Add mapbox_style to figure layout
fig.update_layout(mapbox_style="open-street-map")

# Show figure
fig.show()

In [None]:
# Count the rows per "state" value and display the first ten counts
state_counts = df["state"].value_counts()
state_counts.head(10)

In [None]:
# Summary statistics for the "area_m2" and "price_usd" columns
summary_columns = ["area_m2", "price_usd"]
df[summary_columns].describe()

In [None]:
# Draw the "area_m2" histogram on the current Matplotlib Axes
ax = plt.gca()
ax.hist(df["area_m2"])

# Label both axes and add the title
ax.set_xlabel("Area [sq meters]")
ax.set_ylabel("Frequency")
ax.set_title("Distribution of Home Sizes")

In [None]:
# Use Matplotlib to create a horizontal boxplot of "area_m2"
plt.boxplot(df["area_m2"], vert=False)

# Add x-axis label (same wording as the "area_m2" histogram so both plots read alike)
plt.xlabel("Area [sq meters]")

# Add title
plt.title("Distribution of Home Sizes")

In [None]:
# Draw the "price_usd" histogram on the current Matplotlib Axes
ax = plt.gca()
ax.hist(df["price_usd"])

# Label both axes and add the title
ax.set_xlabel("Price [USD]")
ax.set_ylabel("Frequency")
ax.set_title("Distribution of home prices")

In [None]:
# Use Matplotlib to create a horizontal boxplot of "price_usd"
plt.boxplot(df["price_usd"], vert=False)

# Add x-axis label
plt.xlabel("Price [USD]")

# Add title so the figure is self-describing when the notebook is skimmed
plt.title("Distribution of home prices")

In [None]:
# Declare variable `mean_price_by_state`: mean "price_usd" per state,
# sorted from highest to lowest and rounded to two decimals
price_by_state = df.groupby("state")["price_usd"]
mean_price_by_state = price_by_state.mean().sort_values(ascending=False).round(2)

# Print object type and shape, then display the first rows
print("mean_price_by_state type:", type(mean_price_by_state))
print("mean_price_by_state shape:", mean_price_by_state.shape)
mean_price_by_state.head()

In [None]:
# Bar chart of `mean_price_by_state`, drawn through the pandas plotting accessor
mean_price_by_state.plot.bar(
    title="Mean house price by state",
    xlabel="State",
    ylabel="Price [USD]",
)


In [None]:
# Add "price_per_m2": "price_usd" divided element-wise by "area_m2"
df["price_per_m2"] = df["price_usd"].div(df["area_m2"])

# Print object type and shape, then display the first rows
print("df type:", type(df))
print("df shape:", df.shape)
df.head()

In [None]:
# Group `df` by "state", take the mean "price_per_m2" of each group, sort from
# highest to lowest, and draw the result as a bar chart
(
    df.groupby("state")["price_per_m2"]
    .mean()
    .sort_values(ascending=False)
    .plot(
        kind="bar",
        xlabel="State",
        ylabel="Mean Price per M^2 [USD]",
        title="Mean house price per M^2 by state",
    )
)

In [None]:
# Create scatter plot of "price_usd" vs "area_m2"
plt.scatter(x=df["area_m2"], y=df["price_usd"])

# The labels below are set by CALLING the pyplot functions. Assigning to them
# (`plt.xlabel = ...`) labels nothing and replaces the function on the `plt`
# module, so every later `plt.xlabel(...)` call raises TypeError until the
# kernel is restarted.

# Add x-axis label
plt.xlabel("Area [sq meters]")

# Add y-axis label
plt.ylabel("Price [USD]")

# Add title
plt.title("Price vs Size")


In [None]:
# Calculate correlation of "price_usd" and "area_m2"
p_correlation = df["area_m2"].corr(df["price_usd"])

# Print correlation coefficient with a label saying which data it covers
print("Correlation of 'area_m2' and 'price_usd' (all Mexico):", p_correlation)

In [None]:
# Declare variable `df_morelos`: the rows of `df` whose "state" equals "Morelos"
is_morelos = df["state"].eq("Morelos")
df_morelos = df.loc[is_morelos]

# Print object type and shape, then display the first rows
print("df_morelos type:", type(df_morelos))
print("df_morelos shape:", df_morelos.shape)
df_morelos.head()

In [None]:
# Create scatter plot of "price_usd" vs "area_m2" in Morelos on the current Axes
ax = plt.gca()
ax.scatter(x=df_morelos["area_m2"], y=df_morelos["price_usd"])

# Add axis labels and title through the Axes methods
ax.set_xlabel("Area [sq meters]")
ax.set_ylabel("Price [USD]")
ax.set_title("Morelos: Price vs. Area")


In [None]:
# Correlation between "area_m2" and "price_usd" within `df_morelos`
morelos_area = df_morelos["area_m2"]
morelos_price = df_morelos["price_usd"]
p_correlation = morelos_area.corr(morelos_price)

# Print correlation coefficient
print("Correlation of 'area_m2' and 'price_usd' (Morelos):", p_correlation)

In [None]:
# Declare variable `df_mexico_city` by subsetting `df` to Distrito Federal.
# NOTE(review): assumes the "state" value is spelled exactly "Distrito Federal"
# in `df` — confirm against df["state"].value_counts().
df_mexico_city = df[df["state"] == "Distrito Federal"]

# Print object type and shape
print("df_mexico_city type:", type(df_mexico_city))
print("df_mexico_city shape:", df_mexico_city.shape)

# Create a scatter plot "price_usd" vs "area_m2" in Distrito Federal
ax = plt.gca()
ax.scatter(df_mexico_city["area_m2"], df_mexico_city["price_usd"])

# Add x-axis label
ax.set_xlabel("Area [sq meters]")

# Add y-axis label
ax.set_ylabel("Price [USD]")

# Add title
ax.set_title("Mexico City: Price vs. Area")

# Calculate correlation of "price_usd" and "area_m2" in `df_mexico_city`
p_correlation = df_mexico_city["area_m2"].corr(df_mexico_city["price_usd"])

# Print correlation coefficient
print("Correlation of 'area_m2' and 'price_usd' (Mexico City):", p_correlation)

In [None]:
Looking at the scatter plot and correlation coefficient, we see a weak relationship between size and price. How should we interpret this?

One interpretation is that the relationship we see between size and price in many states doesn't hold true in the country's biggest and most economically powerful urban center because there are other factors that have a larger influence on price.