### Import the relevant Libraries

In [3]:
import requests
import json

import pandas as pd
import numpy as np

import datetime

import matplotlib.pyplot as plt
import seaborn as sns

# Descriptive Statistics

In [19]:
# Load the prepared real-estate dataset from a local pickle file.
# NOTE(review): unpickling can execute arbitrary code, so only load a .pkl that this
# project produced itself (presumably the part 1 cleaning notebook — confirm).
data = pd.read_pickle('cleaned_real_estate_data.pkl')

In [21]:
# Preview the loaded frame (pandas elides the middle rows of a long frame).
# NOTE(review): the rendered output includes customer names and birth dates —
# clear or redact it before sharing the notebook.
data

Unnamed: 0,id,building,date_sale,type,property#,area,sold,customerid,price$,individual,birth_date,sex,country,state,purpose,deal_satisfaction,mortgage,source,full_name
0,1030,1,2005-11-01,apartment,30,743.09,1,C0028,246172.68,1.0,1986-06-21,1,USA,California,home,5.0,0.0,website,Madalyn Mercer
1,1029,1,2005-10-01,apartment,29,756.21,1,C0027,246331.90,1.0,1983-02-24,1,USA,California,home,5.0,0.0,website,Lara Carrillo
2,2002,2,2007-07-01,apartment,2,587.28,1,C0112,209280.91,1.0,1985-12-27,0,USA,California,home,1.0,1.0,client,Donavan Flowers
3,2031,2,2007-12-01,apartment,31,1604.75,1,C0160,452667.01,1.0,1985-12-27,0,USA,California,investment,3.0,1.0,website,Darien Dorsey
4,1049,1,2004-11-01,apartment,49,1375.45,1,C0014,467083.31,1.0,1979-05-15,1,USA,California,home,4.0,0.0,agency,Alessandra Perry
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
262,5044,5,NaT,apartment,44,1238.58,0,,322610.74,,NaT,,,,,,,,
263,5047,5,NaT,apartment,47,794.52,0,,279191.26,,NaT,,,,,,,,
264,5048,5,NaT,apartment,48,1013.27,0,,287996.53,,NaT,,,,,,,,
265,5050,5,NaT,apartment,50,1074.71,0,,365868.78,,NaT,,,,,,,,


# Breakdowns by Building

We can examine:
1. Breakdown of totals by building (frequency distribution by building)
2. Breakdown of averages by building

In [24]:
# List the distinct building identifiers present in the dataset.
# The recorded output shows them stored as strings ('1'..'5'), not integers.
data['building'].unique()

array(['1', '2', '3', '4', '5'], dtype=object)

In [26]:
# Group the rows by 'building'. On its own this only returns a DataFrameGroupBy
# object (see the repr below); an aggregation such as .sum() or .mean() must be
# applied to it to obtain per-building figures.
data.groupby('building')

<pandas.core.groupby.generic.DataFrameGroupBy object at 0x17c5c9110>

In [30]:
# Aggregating every column of the grouped data at once would return excessive information,
# and the sum of certain variables, such as 'area', does not provide meaningful insights.
# Therefore, it makes sense to select a subset of 'data' to aggregate.

### Breakdown of totals by Building

In [32]:
# Totals per building: how many properties were sold and how many of those sales
# involved a mortgage. Both columns are 0/1 flags, so their sums are counts.
# (Averages of area, price and deal satisfaction are computed in a separate cell.)
#
# 'columns_of_interest' lists the grouping key first, followed by the columns to aggregate.
columns_of_interest = ['building', 'sold', 'mortgage']

# Note: 'mortgage' is stored as float, which the author attributes to the merge done in
# part 1 (cleaning and preprocessing). The sums are still whole numbers, so the analysis
# is unaffected.
totals_by_building = (
    data.loc[:, columns_of_interest]
    .groupby(by="building")
    .sum()
)
totals_by_building

Unnamed: 0_level_0,sold,mortgage
building,Unnamed: 1_level_1,Unnamed: 2_level_1
1,46,14.0
2,54,18.0
3,53,15.0
4,23,9.0
5,19,6.0


### Breakdown of averages by building

In [36]:
# Averages per building for area, price and deal satisfaction.
# 'columns_of_interest' is deliberately overwritten: it is only a temporary holder for
# the grouping key plus the columns being aggregated in the current step.
columns_of_interest = ['building', 'area', 'price$', 'deal_satisfaction']

# Mean of each selected column within every building.
averages_by_building = (
    data.loc[:, columns_of_interest]
    .groupby(by="building")
    .mean()
)
averages_by_building

Unnamed: 0_level_0,area,price$,deal_satisfaction
building,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,928.038846,275143.2425,3.630435
2,943.89193,286661.848246,3.518519
3,927.852381,280451.255556,3.566038
4,974.72093,290239.515581,3.869565
5,914.298654,274557.604615,3.526316


### Breakdowns by country and state

Country
1. Breakdown of totals by country (frequency distribution by country)
2. Breakdown of averages by country

State
3. Frequency distribution by state
4. Relative frequency by state
5. Cumulative frequency by state

#### Breakdown of totals by Country

In [54]:
# Columns for the totals by country: the grouping key 'country' plus the same two
# count columns ('sold', 'mortgage') used for the totals by building.
# NOTE(review): the original comment here reported duplicated 'USA' entries in this
# breakdown (presumably caused by stray whitespace, which a later cell strips); the
# output recorded below shows a single USA row — confirm whether the note still applies.

columns_of_interest = ['country', 'sold','mortgage']

In [56]:
# Sum the 'sold' and 'mortgage' flags within each country (rows without a country are
# left out by groupby's default handling of missing keys).
totals_by_country = (
    data.loc[:, columns_of_interest]
    .groupby(by="country")
    .sum()
)

In [58]:
# Display the totals per country.
totals_by_country

Unnamed: 0_level_0,sold,mortgage
country,Unnamed: 1_level_1,Unnamed: 2_level_1
Belgium,2,0.0
Canada,7,0.0
Denmark,1,0.0
Germany,1,0.0
Mexico,1,0.0
Russia,4,1.0
UK,2,0.0
USA,177,61.0


In [60]:
# Examine the unique values in the 'country' column; each country should appear once.
# The nan entry in the output comes from rows with no country recorded (presumably the
# unsold properties, whose customer fields are empty — confirm).
data['country'].unique()

array(['USA', 'UK', 'Belgium', 'Russia', 'Denmark', 'Germany', 'Mexico',
       'Canada', nan], dtype=object)

In [62]:
# Collect the labels of all object-dtype (text-like) columns so that the whitespace
# cleanup can be applied to every one of them in a single step.
object_columns = data.select_dtypes(include=['object']).columns


In [64]:
# Display the selected column labels to verify which columns will be cleaned.
object_columns

Index(['id', 'building', 'type', 'property#', 'customerid', 'sex', 'country',
       'state', 'purpose', 'source', 'full_name'],
      dtype='object')

In [66]:
# Strip leading/trailing whitespace from every object-dtype column in one pass.
#
# Only genuine `str` values are touched. `Series.str.strip()` silently replaces any
# non-string element of an object column with NaN (and raises if a column contains no
# strings at all), which would destroy non-text values stored in these columns.
# Missing values pass through unchanged, exactly as before.
data[object_columns] = data[object_columns].apply(
    lambda column: column.map(
        lambda value: value.strip() if isinstance(value, str) else value
    )
)

In [68]:
# Recompute the totals by country on the whitespace-stripped data, so that each
# country is represented by exactly one group.
totals_by_country = (
    data.loc[:, columns_of_interest]
    .groupby(by="country")
    .sum()
)


In [70]:
# Display the recomputed totals per country.
totals_by_country

Unnamed: 0_level_0,sold,mortgage
country,Unnamed: 1_level_1,Unnamed: 2_level_1
Belgium,2,0.0
Canada,7,0.0
Denmark,1,0.0
Germany,1,0.0
Mexico,1,0.0
Russia,4,1.0
UK,2,0.0
USA,177,61.0


#### Breakdown of averages by country

Based on what you have seen before for the breakdowns by building and by state, please find the breakdown by country of the columns 'area', 'deal_satisfaction', and 'price$'.

In [74]:
# Columns for the averages by country: the grouping key 'country' followed by the
# three columns to average.
columns_of_interest = ['country', 'area', 'deal_satisfaction','price$']


In [76]:

# Mean area, deal satisfaction and price within each country.
averages_by_country = (
    data.loc[:, columns_of_interest]
    .groupby(by="country")
    .mean()
)
averages_by_country

Unnamed: 0_level_0,area,deal_satisfaction,price$
country,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Belgium,852.73,3.0,229075.47
Canada,917.382857,5.0,274069.384286
Denmark,785.48,1.0,257183.48
Germany,743.41,5.0,205098.21
Mexico,1283.45,3.0,338181.18
Russia,903.7575,2.5,278828.835
UK,739.48,4.0,220142.68
USA,900.794463,3.581921,270096.266554


#### Frequency distribution by State

In [79]:
# Columns for the frequency distribution by state: the grouping key 'state' plus the
# 'sold' and 'mortgage' flags whose sums give the counts.
columns_of_interest = ['state', 'sold','mortgage']


In [81]:
# Number of sales and of mortgage-financed sales recorded for each state.
totals_by_state = (
    data.loc[:, columns_of_interest]
    .groupby(by="state")
    .sum()
)
totals_by_state

Unnamed: 0_level_0,sold,mortgage
state,Unnamed: 1_level_1,Unnamed: 2_level_1
Arizona,11,3.0
California,120,41.0
Colorado,11,5.0
Kansas,1,0.0
Nevada,17,8.0
Oregon,11,1.0
Utah,5,1.0
Virginia,4,2.0
Wyoming,1,0.0


In [83]:
# Sanity check: total number of sales across all states. Since 'state' is meant to be a
# US state, this should equal the number of sales recorded for the USA.
totals_by_state['sold'].sum()

181

In [85]:
# Re-display the country totals for comparison: 'USA' shows 177 sales, while the
# per-state 'sold' column sums to 181, so some rows that carry a state are not USA rows.
totals_by_country

Unnamed: 0_level_0,sold,mortgage
country,Unnamed: 1_level_1,Unnamed: 2_level_1
Belgium,2,0.0
Canada,7,0.0
Denmark,1,0.0
Germany,1,0.0
Mexico,1,0.0
Russia,4,1.0
UK,2,0.0
USA,177,61.0


In [87]:
# Reconcile 'state' with 'country'. The per-state totals summed to 181 sales although
# only 177 sales belong to the USA, i.e. a few non-USA rows carried a state value.
#
# Step 1: treat empty-string states as missing.
data['state'] = np.where(data['state'].eq(''), pd.NA, data['state'])
# Step 2: blank out the state on every row whose country is not 'USA'. Rows with a
# missing country also compare as "not USA", so their state is cleared as well.
data['state'] = np.where(data['country'].ne('USA'), pd.NA, data['state'])

In [89]:
# Recompute the totals by state now that 'state' is populated only on USA rows.
totals_by_state = (
    data.loc[:, columns_of_interest]
    .groupby(by="state")
    .sum()
)
totals_by_state

Unnamed: 0_level_0,sold,mortgage
state,Unnamed: 1_level_1,Unnamed: 2_level_1
Arizona,9,3.0
California,118,41.0
Colorado,11,5.0
Kansas,1,0.0
Nevada,17,8.0
Oregon,11,1.0
Utah,5,1.0
Virginia,4,2.0
Wyoming,1,0.0


In [91]:
# Re-run the sanity check: the state totals should now add up to the USA sales count.
totals_by_state['sold'].sum()

177

In [93]:
# To study the relative and cumulative frequency of sales by state, narrow the table
# down to the grouping key 'state' and the 'sold' flag; 'mortgage' is no longer needed.
columns_of_interest = ['state', 'sold']



In [95]:
# Number of sales per state, kept in its own table ('sold_by_state') so that the
# frequency columns can be added to it.
sold_by_state = (
    data.loc[:, columns_of_interest]
    .groupby(by="state")
    .sum()
)
sold_by_state

Unnamed: 0_level_0,sold
state,Unnamed: 1_level_1
Arizona,9
California,118
Colorado,11
Kansas,1
Nevada,17
Oregon,11
Utah,5
Virginia,4
Wyoming,1


In [97]:
# Sort in descending order so the states with the most sales come first.
#
# Several states tie (e.g. Colorado/Oregon and Kansas/Wyoming). The default sort
# algorithm ('quicksort') does not guarantee the relative order of equal values, so a
# stable sort is requested explicitly: tied states keep their existing (alphabetical)
# order and the table — and the cumulative column derived from it later — is
# reproducible from run to run.
sold_by_state = sold_by_state.sort_values('sold', ascending=False, kind='stable')
sold_by_state

Unnamed: 0_level_0,sold
state,Unnamed: 1_level_1
California,118
Nevada,17
Colorado,11
Oregon,11
Arizona,9
Utah,5
Virginia,4
Kansas,1
Wyoming,1


In [99]:
# 'sold' is not a natural label for a frequency table, so relabel the column as
# 'frequency' (the values themselves are unchanged).
sold_by_state = sold_by_state.rename(
    columns={'sold': 'frequency'},
)
sold_by_state

Unnamed: 0_level_0,frequency
state,Unnamed: 1_level_1
California,118
Nevada,17
Colorado,11
Oregon,11
Arizona,9
Utah,5
Virginia,4
Kansas,1
Wyoming,1


### Relative frequency distribution by State

In [102]:
# Relative frequency: each state's share of all sales (frequency / total frequency).
# The shares add up to 1.
sold_by_state['relative_frequency'] = sold_by_state['frequency'].div(
    sold_by_state['frequency'].sum()
)
sold_by_state

Unnamed: 0_level_0,frequency,relative_frequency
state,Unnamed: 1_level_1,Unnamed: 2_level_1
California,118,0.666667
Nevada,17,0.096045
Colorado,11,0.062147
Oregon,11,0.062147
Arizona,9,0.050847
Utah,5,0.028249
Virginia,4,0.022599
Kansas,1,0.00565
Wyoming,1,0.00565


In [104]:
# Cumulative frequency can be obtained using the 'cumsum()' function in pandas.
# This function calculates the cumulative sum of values in a Series.
# Applying this on our relative frequency column provides us with the cumulative frequency.
sold_by_state['cumulative_frequency'] = sold_by_state['relative_frequency'].cumsum()
sold_by_state

Unnamed: 0_level_0,frequency,relative_frequency,cumulative_frequency
state,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
California,118,0.666667,0.666667
Nevada,17,0.096045,0.762712
Colorado,11,0.062147,0.824859
Oregon,11,0.062147,0.887006
Arizona,9,0.050847,0.937853
Utah,5,0.028249,0.966102
Virginia,4,0.022599,0.988701
Kansas,1,0.00565,0.99435
Wyoming,1,0.00565,1.0
