In [None]:
# Imports used throughout the notebook. They live in the first cell so that
# Restart Kernel -> Run All works on a fresh kernel.
import matplotlib.pyplot as plt
import pandas as pd
import plotly.express as px

# Load the three raw CSV files into DataFrames
df1 = pd.read_csv("data/mexico-real-estate-1.csv")
df2 = pd.read_csv("data/mexico-real-estate-2.csv")
df3 = pd.read_csv("data/mexico-real-estate-3.csv")

# Print the object type and shape of each frame for a quick overview
for frame_name, frame in (("df1", df1), ("df2", df2), ("df3", df3)):
    print(f"{frame_name} type:", type(frame))
    print(f"{frame_name} shape:", frame.shape)

In [None]:
# Drop rows with missing values from df1
df1.dropna(inplace=True)

# Clean the "price_usd" column in df1: strip "$" and "," and convert to float.
# regex=False is required here: interpreted as a regular expression, "$" is the
# end-of-string anchor, so the dollar sign is left in place and astype(float)
# raises. astype(str) keeps the cell re-runnable once the column is numeric.
df1["price_usd"] = (
    df1["price_usd"]
    .astype(str)
    .str.replace("$", "", regex=False)
    .str.replace(",", "", regex=False)
    .astype(float)
)

df1.head()


In [None]:
# Drop rows with missing values from df2
df2.dropna(inplace=True)

# Swap the peso price for a dollar price (19 pesos to the dollar in 2014).
# pop() removes "price_mxn" from df2 and hands back its values in one step,
# so the new "price_usd" column ends up as the last column.
df2["price_usd"] = df2.pop("price_mxn").div(19).round(2)

df2.head()

In [None]:
# Drop rows with missing values from df3
df3.dropna(inplace=True)

# Split "lat-lon" on the comma into separate "lat" and "lon" columns.
# str.split produces strings, so cast to float to store the coordinates
# as numbers rather than text.
df3[["lat", "lon"]] = df3["lat-lon"].str.split(",", expand=True).astype(float)

df3.head()

In [None]:
# Break "place_with_parent_names" on "|" and keep the piece at position 2 as "state"
place_parts = df3["place_with_parent_names"].str.split("|", expand=True)
df3["state"] = place_parts[2]

# The two source columns have been unpacked and are no longer needed
df3.drop(["place_with_parent_names", "lat-lon"], axis=1, inplace=True)

df3.head()

In [None]:
# Stack the three cleaned frames (df1, df2, df3) into a single DataFrame
cleaned_frames = [df1, df2, df3]
df = pd.concat(cleaned_frames)
print(df.shape)

df.head()

In [None]:
# Write the combined DataFrame to disk, leaving the index out of the file
clean_csv_path = "./data/mexico-real-estate-clean.csv"
df.to_csv(clean_csv_path, index=False)

In [None]:
# Re-load the cleaned dataset from disk.
# The file name is all lower-case ("mexico-..."), matching the name used when
# the file was saved; a capital "M" fails on case-sensitive file systems.
df = pd.read_csv("data/mexico-real-estate-clean.csv")

df.head()

In [None]:
# Interactive map of the listings, built with plotly express
fig = px.scatter_mapbox(
    data_frame=df,
    lat="lat",  # column with each listing's latitude
    lon="lon",  # column with each listing's longitude
    hover_data=["price_usd", "state"],  # extra columns shown in the hover tooltip
    center=dict(lat=19.43, lon=-99.13),  # center the map on Mexico City
    width=600,
    height=600,
)

# Draw the points on top of OpenStreetMap tiles
fig.update_layout(mapbox_style="open-street-map")

# Render the figure
fig.show()

In [None]:
# Number of listings per state; show the ten most frequent states
state_counts = df["state"].value_counts()
state_counts.iloc[:10]

In [None]:
# Summary statistics for the "area_m2" and "price_usd" columns
summary_columns = ["area_m2", "price_usd"]
df[summary_columns].describe()

In [None]:
# Histogram of "area_m2" drawn with Matplotlib
plt.hist(x=df["area_m2"])
plt.ylabel("Frequency")
plt.xlabel("Area [sq meters]")
plt.title("Distribution of Home Sizes")

In [None]:
# Horizontal boxplot of "area_m2" drawn with Matplotlib.
# Axis label and title use the same wording as the histogram of this column.
plt.boxplot(df["area_m2"], vert=False)
plt.xlabel("Area [sq meters]")
plt.title("Distribution of Home Sizes")

In [None]:
# Histogram of "price_usd" drawn with Matplotlib
plt.hist(x=df["price_usd"])
plt.ylabel("Frequency")
plt.xlabel("Price [USD]")
plt.title("Distribution of home prices")

In [None]:
# Horizontal boxplot of "price_usd" drawn with Matplotlib.
# A title is set so the figure can be read on its own, like the other plots.
plt.boxplot(df["price_usd"], vert=False)
plt.xlabel("Price [USD]")
plt.title("Distribution of home prices")


In [None]:
# Average "price_usd" per state, highest first, rounded to 2 decimals
mean_price_by_state = (
    df.groupby("state")["price_usd"]
    .mean()
    .sort_values(ascending=False)
    .round(2)
)
mean_price_by_state.head()

In [None]:
# Bar chart of `mean_price_by_state`, drawn through the pandas plotting accessor
mean_price_by_state.plot.bar(
    title="Mean house price by state",
    xlabel="State",
    ylabel="Price [USD]",
)


In [None]:
# Add a "price_per_m2" column: "price_usd" divided element-wise by "area_m2"
df["price_per_m2"] = df["price_usd"].div(df["area_m2"])
df.head()

In [None]:
# Mean "price_per_m2" per state, highest first, shown as a bar chart.
# Label and title typos ("Pric", "M^2by") are corrected so the figure reads cleanly.
(
    df.groupby("state")["price_per_m2"]
    .mean()
    .sort_values(ascending=False)
    .plot(
        kind="bar",
        xlabel="State",
        ylabel="Mean Price per M^2 [USD]",
        title="Mean house price per M^2 by state",
    )
)

In [None]:
# Scatter plot of "price_usd" vs "area_m2" for the whole dataset.
# Labels are set with method calls on the Axes: assigning to plt.xlabel /
# plt.ylabel replaces the pyplot functions instead of labelling the plot
# (and breaks every later plt.xlabel(...) call), and "plt.tilte" is a
# misspelling that never sets a title.
ax = plt.gca()
ax.scatter(x=df["area_m2"], y=df["price_usd"])
ax.set_xlabel("Area [sq meters]")
ax.set_ylabel("price [USD]")
ax.set_title("Price vs Size");  # trailing ";" hides the Text repr in the cell output


In [None]:
# Correlation between "area_m2" and "price_usd" across the whole dataset
# (Series.corr uses the Pearson method by default).
p_correlation = df["area_m2"].corr(df["price_usd"])
# Print the value once, with a label; the extra unlabelled print was redundant.
print("Correlation of 'area_m2' and 'price_usd' (all Mexico):", p_correlation)


In [None]:
# Keep only the rows of `df` whose "state" equals "Morelos"
is_morelos = df["state"].eq("Morelos")
df_morelos = df.loc[is_morelos]
df_morelos.head()

In [None]:
# Scatter plot of "price_usd" vs "area_m2" for the Morelos subset.
# Axis labels and a title are added so the figure can be read on its own;
# they are set through the Axes object rather than the pyplot functions.
ax = plt.gca()
ax.scatter(x=df_morelos["area_m2"], y=df_morelos["price_usd"])
ax.set_xlabel("Area [sq meters]")
ax.set_ylabel("Price [USD]")
ax.set_title("Morelos: Price vs. Area");  # trailing ";" hides the Text repr


In [None]:
# Correlation between home size and price within `df_morelos`
morelos_area = df_morelos["area_m2"]
morelos_price = df_morelos["price_usd"]
p_correlation = morelos_area.corr(morelos_price)
print("Correlation of 'area_m2' and 'price_usd' (Morelos):", p_correlation)

In [None]:
# Keep only the rows of `df` whose "state" equals "Distrito Federal"
in_mexico_city = df["state"] == "Distrito Federal"
df_mexico_city = df.loc[in_mexico_city]

# Scatter plot of "price_usd" against "area_m2" for that subset
plt.scatter(x=df_mexico_city["area_m2"], y=df_mexico_city["price_usd"])
plt.gca().set(
    xlabel="Area [sq meters]",
    ylabel="Price [USD]",
    title="Mexico City: Price vs. Area",
)

# Correlation between home size and price within `df_mexico_city`
mexico_city_area = df_mexico_city["area_m2"]
mexico_city_price = df_mexico_city["price_usd"]
p_correlation = mexico_city_area.corr(mexico_city_price)

print("Correlation of 'area_m2' and 'price_usd' (Mexico City):", p_correlation)

In [None]:
# Looking at the scatter plot and the correlation coefficient of 0.41, there is
# only a weak relationship between size and price in Mexico City. One
# interpretation is that the relationship we see between size and price in many
# states doesn't hold true in the country's biggest and most economically
# powerful urban center, because other factors have a larger influence on price.