In [None]:
!pip install matplotlib pandas plotly

In [None]:
# Import Matplotlib, pandas, and plotly
import matplotlib.pyplot as plt
import pandas as pd
import plotly.express as px

In [None]:
# Read the first listings CSV (path is relative to the working directory)
# and preview its first rows.
df1 = pd.read_csv("brazil-real-estate-1.csv")
df1.head()

In [None]:
# Print df1's column names, dtypes and non-null counts to spot missing values
# and columns stored with an unexpected type.
df1.info()

In [None]:
# Remove every row of df1 that has a missing value in any column (modifies df1 in place).
df1.dropna(inplace=True)

In [None]:
# Break the combined "lat-lon" text into two pieces at the comma, cast both
# pieces to float, and store them as separate "lat" and "lon" columns.
lat_lon_parts = df1["lat-lon"].str.split(",", expand=True)
df1[["lat", "lon"]] = lat_lon_parts.astype("float")

df1.head()

In [None]:
# Split the place hierarchy string on "|" and keep the piece at position 2 as the state.
# NOTE(review): presumably the string starts with "|" followed by the country, which
# would make position 2 the state — confirm against the raw data.
place_parts = df1["place_with_parent_names"].str.split("|", expand=True)
df1["state"] = place_parts[2]

df1.head()

In [None]:
# Turn the price text into numbers: remove the "$" sign and the "," separators
# as literal characters (regex=False), then cast the cleaned strings to float.
df1["price_usd"] = (
    df1["price_usd"]
    .str.replace("$", "", regex=False)
    .str.replace(",", "", regex=False)
    .astype("float")
)

# Only the last expression of a cell is rendered, so print the dtype summary
# first and end the cell with the preview; a preview placed before info()
# would be evaluated and silently discarded.
df1.info()
df1.head()

In [None]:
# Discard the two raw combined-text columns now that they are no longer needed.
raw_columns = ["lat-lon", "place_with_parent_names"]
df1 = df1.drop(columns=raw_columns)


In [None]:
# Read the second listings CSV (path is relative to the working directory)
# and preview its first rows.
df2 = pd.read_csv("brazil-real-estate-2.csv")

df2.head()

In [None]:
# Print df2's column names, dtypes and non-null counts to spot missing values
# and columns stored with an unexpected type.
df2.info()

In [None]:
# Derive a USD price from the BRL price using a fixed rate, rounded to two decimals.
# NOTE(review): 3.19 is presumably the BRL-per-USD exchange rate for the period
# the data covers — confirm the source of this figure.
brl_per_usd = 3.19
df2["price_usd"] = df2["price_brl"].div(brl_per_usd).round(2)

df2.head()

In [None]:
# First discard rows with a missing value in any column, then remove the
# BRL price column from what remains.
df2 = df2.dropna().drop(columns=["price_brl"])

In [None]:
# Stack the two cleaned frames row-wise into a single dataset and report its size.
# Row labels are carried over unchanged, so labels from the two frames can repeat.
frames = [df1, df2]
df = pd.concat(frames)
print("df shape:", df.shape)

In [None]:
# Plot every listing on an interactive map; hovering over a point shows its price.
brazil_center = dict(lat=-14.2, lon=-51.9)  # Centers the map on Brazil
fig = px.scatter_mapbox(
    data_frame=df,
    lat="lat",
    lon="lon",
    hover_data=["price_usd"],
    center=brazil_center,
    width=600,
    height=600,
)

# Select the "open-street-map" base-map style, then render the figure.
fig.update_layout(mapbox_style="open-street-map")
fig.show()

In [None]:
# Summary statistics for the area and price columns only.
stat_columns = ["area_m2", "price_usd"]
summary_stats = df.loc[:, stat_columns].describe()
summary_stats

In [None]:
# Draw the price histogram on an explicit Axes rather than through pyplot's
# implicit current-figure state.
fig_price, ax_price = plt.subplots()
ax_price.hist(df["price_usd"])

# Axis labels and title in one call
ax_price.set(
    xlabel="Price [USD]",
    ylabel="Frequency",
    title="Distribution of Home Prices",
)

# Don't change the code below 👇
plt.savefig("images/1-5-12.png", dpi=150)


In [None]:
# Draw a horizontal box plot of home sizes on an explicit Axes.
fig_area, ax_area = plt.subplots()
ax_area.boxplot(df["area_m2"], vert=False)

# X-axis label and title in one call
ax_area.set(
    xlabel="Area [sq meters]",
    title="Distribution of Home Sizes",
)

# Don't change the code below 👇
plt.savefig("images/1-5-13.png", dpi=150)


In [None]:
# Average USD price per region, ordered from the lowest mean to the highest.
mean_price_by_region = (
    df.groupby("region")["price_usd"]
    .mean()
    .sort_values()
)
mean_price_by_region

In [None]:
# Draw the regional means as bars, then attach axis labels and title to the
# Axes that pandas returns.
ax_region = mean_price_by_region.plot.bar()
ax_region.set(
    xlabel="Region",
    ylabel="Mean Price [USD]",
    title="Mean Home Price by Region",
)

# Don't change the code below 👇
plt.savefig("images/1-5-15.png", dpi=150)


In [None]:
# Keep only the listings whose region is "South" and preview them.
is_south = df["region"] == "South"
df_south = df.loc[is_south]
df_south.head()

In [None]:
# Count the listings per state within the South subset
# (value_counts orders the result from most to least frequent).
homes_by_state = df_south["state"].value_counts()
homes_by_state

In [None]:
# Subset data: filter the South-only frame (not the full dataset) so the
# subset matches its name and stays consistent with the other cells that
# analyse df_south.
df_south_rgs = df_south[df_south["state"] == "Rio Grande do Sul"]

# Build scatter plot of price against area for that state
plt.scatter(x=df_south_rgs["area_m2"], y=df_south_rgs["price_usd"])

# Label axes
plt.xlabel("Area [sq meters]")
plt.ylabel("Price [USD]")

# Add title
plt.title("Rio Grande do Sul: Price vs. Area")

# Don't change the code below 👇
plt.savefig("images/1-5-18.png", dpi=150)


In [None]:
# Map each state in the South subset to the correlation between its listings'
# price and area, computed one state group at a time.
south_states_corr = {
    state_name: listings["price_usd"].corr(listings["area_m2"])
    for state_name, listings in df_south.groupby("state")
}

south_states_corr