### Import the relevant Libraries

In [5]:
import requests
import json

import pandas as pd
import numpy as np

import datetime

import matplotlib.pyplot as plt
import seaborn as sns

In [36]:
import warnings

# Silence only FutureWarning (e.g. announcements of upcoming library default
# changes) rather than every warning, so genuine runtime or user warnings
# stay visible in the notebook output.
# NOTE(review): assumes the noise being hidden here is FutureWarning -- confirm.
warnings.filterwarnings('ignore', category=FutureWarning)


# Data Analysis

In [10]:
# Load the cleaned real-estate dataset from its pickle file.
# NOTE: unpickling can execute arbitrary code -- only load pickles from a trusted source.
CLEANED_DATA_PATH = 'cleaned_real_estate_data.pkl'
data = pd.read_pickle(CLEANED_DATA_PATH)

### Customers Age

In [13]:
# Preview the first five rows to check the loaded columns.
data.head(n=5)

Unnamed: 0,id,building,date_sale,type,property#,area,sold,customerid,price$,individual,birth_date,sex,country,state,purpose,deal_satisfaction,mortgage,source,full_name
0,1030,1,2005-11-01,apartment,30,743.09,1,C0028,246172.68,1.0,1986-06-21,1,USA,California,home,5.0,0.0,website,Madalyn Mercer
1,1029,1,2005-10-01,apartment,29,756.21,1,C0027,246331.9,1.0,1983-02-24,1,USA,California,home,5.0,0.0,website,Lara Carrillo
2,2002,2,2007-07-01,apartment,2,587.28,1,C0112,209280.91,1.0,1985-12-27,0,USA,California,home,1.0,1.0,client,Donavan Flowers
3,2031,2,2007-12-01,apartment,31,1604.75,1,C0160,452667.01,1.0,1985-12-27,0,USA,California,investment,3.0,1.0,website,Darien Dorsey
4,1049,1,2004-11-01,apartment,49,1375.45,1,C0014,467083.31,1.0,1979-05-15,1,USA,California,home,4.0,0.0,agency,Alessandra Perry


In [15]:
# Age of each buyer at the time of purchase, as a timedelta:
# sale date minus birth date (NaT where either date is missing).
data['age_at_purchase'] = data['date_sale'].sub(data['birth_date'])
data['age_at_purchase']

0     7073 days
1     8255 days
2     7856 days
3     8009 days
4     9302 days
         ...   
262         NaT
263         NaT
264         NaT
265         NaT
266         NaT
Name: age_at_purchase, Length: 267, dtype: timedelta64[ns]

In [17]:
# Sanity check: the `.days` attribute of a single timedelta value is a plain integer.
type(data['age_at_purchase'].loc[0].days)

int

In [19]:
# The difference between two dates is a timedelta, i.e. a duration.
# For numeric analysis the age is needed as a number, so the whole-day count is
# extracted with the vectorised `.dt.days` accessor (NaT becomes NaN).
#
# The value is recomputed from the two date columns rather than from the current
# contents of 'age_at_purchase', so re-running this cell is safe: applying
# `.days` a second time to the already-converted floats would raise an error.
data['age_at_purchase'] = (data['date_sale'] - data['birth_date']).dt.days
data['age_at_purchase']

0      7073.0
1      8255.0
2      7856.0
3      8009.0
4      9302.0
        ...  
262       NaN
263       NaN
264       NaN
265       NaN
266       NaN
Name: age_at_purchase, Length: 267, dtype: float64

In [21]:
# Convert the age from days to (approximate) years by dividing by 365.
# Leap years are ignored, so the ages are very slightly overstated, which is
# close enough for this analysis.
#
# The value is computed from the date columns instead of dividing
# 'age_at_purchase' in place, so re-running this cell cannot divide by 365
# a second time.
DAYS_PER_YEAR = 365
data['age_at_purchase'] = (data['date_sale'] - data['birth_date']).dt.days / DAYS_PER_YEAR
data['age_at_purchase']

0      19.378082
1      22.616438
2      21.523288
3      21.942466
4      25.484932
         ...    
262          NaN
263          NaN
264          NaN
265          NaN
266          NaN
Name: age_at_purchase, Length: 267, dtype: float64

In [23]:
# Ages are normally quoted in completed years, so floor the fractional value.
# np.floor is applied to the whole column at once; missing ages stay NaN.
data['age_at_purchase_rounded'] = np.floor(data['age_at_purchase'])
data['age_at_purchase_rounded']

0      19.0
1      22.0
2      21.0
3      21.0
4      25.0
       ... 
262     NaN
263     NaN
264     NaN
265     NaN
266     NaN
Name: age_at_purchase_rounded, Length: 267, dtype: float64

#### Create Age Intervals

In [29]:
# The un-rounded ages are kept for statistics because they are more precise.
# For grouping, pd.cut splits the age range into 10 bins; precision=0 sets the
# number of decimals used for the bin-edge labels.
data['age_interval'] = pd.cut(x=data['age_at_purchase'], bins=10, precision=0)
data['age_interval']

0      (19.0, 25.0]
1      (19.0, 25.0]
2      (19.0, 25.0]
3      (19.0, 25.0]
4      (25.0, 31.0]
           ...     
262             NaN
263             NaN
264             NaN
265             NaN
266             NaN
Name: age_interval, Length: 267, dtype: category
Categories (10, interval[float64, right]): [(19.0, 25.0] < (25.0, 31.0] < (31.0, 36.0] < (36.0, 42.0] ... (54.0, 59.0] < (59.0, 65.0] < (65.0, 71.0] < (71.0, 76.0]]

### Breakdown by Age Interval

In [32]:
# Columns needed for the age breakdown: the age bin and the sold flag.
columns_of_interest = [
    'age_interval',
    'sold',
]


In [38]:
# Number of sold properties per age interval: 'sold' is 1 for a sold property,
# so summing it counts the sales.
# observed=False is passed explicitly because 'age_interval' is categorical and
# recent pandas versions emit a FutureWarning when the default is relied upon.
# It also keeps intervals that contain no rows in the result.
sold_by_age = data[columns_of_interest].groupby("age_interval", observed=False).sum()
sold_by_age

Unnamed: 0_level_0,sold
age_interval,Unnamed: 1_level_1
"(19.0, 25.0]",4
"(25.0, 31.0]",16
"(31.0, 36.0]",26
"(36.0, 42.0]",33
"(42.0, 48.0]",22
"(48.0, 54.0]",22
"(54.0, 59.0]",22
"(59.0, 65.0]",11
"(65.0, 71.0]",16
"(71.0, 76.0]",6


## Analysis of the Price of Properties

### Price Interval

In [42]:
# Build 'price_interval' the same way as 'age_interval':
# pd.cut splits the 'price$' range into 10 bins.
data['price_interval'] = pd.cut(x=data['price$'], bins=10)
data['price_interval']

0      (243776.371, 285847.138]
1      (243776.371, 285847.138]
2      (201705.604, 243776.371]
3      (412059.439, 454130.206]
4      (454130.206, 496200.973]
                 ...           
262    (285847.138, 327917.905]
263    (243776.371, 285847.138]
264    (285847.138, 327917.905]
265    (327917.905, 369988.672]
266    (159634.837, 201705.604]
Name: price_interval, Length: 267, dtype: category
Categories (10, interval[float64, right]): [(117143.362, 159634.837] < (159634.837, 201705.604] < (201705.604, 243776.371] < (243776.371, 285847.138] ... (369988.672, 412059.439] < (412059.439, 454130.206] < (454130.206, 496200.973] < (496200.973, 538271.74]]

### Total number of Properties

In [45]:
# Count all properties (sold or not) in each price interval.
# .count() tallies the non-null 'sold' values per group.
# observed=False is passed explicitly because 'price_interval' is categorical and
# recent pandas versions emit a FutureWarning when the default is relied upon.
# It also keeps intervals that contain no rows in the result.
columns_of_interest = ['price_interval', 'sold']
all_properties_by_price = (
    data[columns_of_interest]
    .groupby("price_interval", observed=False)
    .count()
)

In [47]:
# Here the 'sold' column holds a row count rather than a sales total,
# so it is a good idea to rename it to 'count'.
all_properties_by_price = all_properties_by_price.rename({'sold': 'count'}, axis='columns')
all_properties_by_price

Unnamed: 0_level_0,count
price_interval,Unnamed: 1_level_1
"(117143.362, 159634.837]",3
"(159634.837, 201705.604]",32
"(201705.604, 243776.371]",88
"(243776.371, 285847.138]",47
"(285847.138, 327917.905]",31
"(327917.905, 369988.672]",18
"(369988.672, 412059.439]",21
"(412059.439, 454130.206]",7
"(454130.206, 496200.973]",11
"(496200.973, 538271.74]",9


### Total Number of Sold Properties

In [50]:
# Sold properties per price interval are obtained by summing 'sold' within each
# group, because 'sold' marks the properties that were sold.
# These are the two columns that calculation needs.
columns_of_interest = [
    'price_interval',
    'sold',
]


In [52]:
# Number of sold properties per price interval (sum of the 'sold' flag).
# observed=False is passed explicitly because 'price_interval' is categorical and
# recent pandas versions emit a FutureWarning when the default is relied upon.
# It also keeps intervals that contain no rows in the result.
sold_properties_by_price = (
    data[columns_of_interest]
    .groupby("price_interval", observed=False)
    .sum()
)
sold_properties_by_price

Unnamed: 0_level_0,sold
price_interval,Unnamed: 1_level_1
"(117143.362, 159634.837]",2
"(159634.837, 201705.604]",28
"(201705.604, 243776.371]",68
"(243776.371, 285847.138]",34
"(285847.138, 327917.905]",24
"(327917.905, 369988.672]",12
"(369988.672, 412059.439]",13
"(412059.439, 454130.206]",4
"(454130.206, 496200.973]",7
"(496200.973, 538271.74]",3


### Total Number of Unsold Properties

In [55]:
# Unsold properties per price interval = all properties minus sold properties.
# Both figures are stored as new columns on the per-interval summary table
# (all_properties_by_price), not on the original `data` frame.
sold_per_interval = sold_properties_by_price['sold']
all_properties_by_price['not_sold'] = all_properties_by_price['count'].sub(sold_per_interval)
all_properties_by_price['sold'] = sold_per_interval


In [57]:
# Display the per-price-interval summary table built in the preceding cells.
all_properties_by_price

Unnamed: 0_level_0,count,not_sold,sold
price_interval,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
"(117143.362, 159634.837]",3,1,2
"(159634.837, 201705.604]",32,4,28
"(201705.604, 243776.371]",88,20,68
"(243776.371, 285847.138]",47,13,34
"(285847.138, 327917.905]",31,7,24
"(327917.905, 369988.672]",18,6,12
"(369988.672, 412059.439]",21,8,13
"(412059.439, 454130.206]",7,3,4
"(454130.206, 496200.973]",11,4,7
"(496200.973, 538271.74]",9,6,3


## Relationship between Age and Price

In [60]:
# Display the frame, which now also carries the derived age and interval columns.
data

Unnamed: 0,id,building,date_sale,type,property#,area,sold,customerid,price$,individual,...,state,purpose,deal_satisfaction,mortgage,source,full_name,age_at_purchase,age_at_purchase_rounded,age_interval,price_interval
0,1030,1,2005-11-01,apartment,30,743.09,1,C0028,246172.68,1.0,...,California,home,5.0,0.0,website,Madalyn Mercer,19.378082,19.0,"(19.0, 25.0]","(243776.371, 285847.138]"
1,1029,1,2005-10-01,apartment,29,756.21,1,C0027,246331.90,1.0,...,California,home,5.0,0.0,website,Lara Carrillo,22.616438,22.0,"(19.0, 25.0]","(243776.371, 285847.138]"
2,2002,2,2007-07-01,apartment,2,587.28,1,C0112,209280.91,1.0,...,California,home,1.0,1.0,client,Donavan Flowers,21.523288,21.0,"(19.0, 25.0]","(201705.604, 243776.371]"
3,2031,2,2007-12-01,apartment,31,1604.75,1,C0160,452667.01,1.0,...,California,investment,3.0,1.0,website,Darien Dorsey,21.942466,21.0,"(19.0, 25.0]","(412059.439, 454130.206]"
4,1049,1,2004-11-01,apartment,49,1375.45,1,C0014,467083.31,1.0,...,California,home,4.0,0.0,agency,Alessandra Perry,25.484932,25.0,"(25.0, 31.0]","(454130.206, 496200.973]"
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
262,5044,5,NaT,apartment,44,1238.58,0,,322610.74,,...,,,,,,,,,,"(285847.138, 327917.905]"
263,5047,5,NaT,apartment,47,794.52,0,,279191.26,,...,,,,,,,,,,"(243776.371, 285847.138]"
264,5048,5,NaT,apartment,48,1013.27,0,,287996.53,,...,,,,,,,,,,"(285847.138, 327917.905]"
265,5050,5,NaT,apartment,50,1074.71,0,,365868.78,,...,,,,,,,,,,"(327917.905, 369988.672]"


### Keep Only the Sold Properties

In [63]:
# Keep only the properties that were actually sold (sold == 1).
# The result goes into a separate variable, so `data` itself is left unchanged.
data_sold = data.loc[data['sold'].eq(1)]

In [65]:
# Restrict further to individual buyers (individual == 1), leaving out company data.
data_sold = data_sold.loc[data_sold['individual'].eq(1)]

In [67]:
# Look at just the two variables being compared: buyer age and price.
data_sold.loc[:, ['age_at_purchase', 'price$']]

Unnamed: 0,age_at_purchase,price$
0,19.378082,246172.68
1,22.616438,246331.90
2,21.523288,209280.91
3,21.942466,452667.01
4,25.484932,467083.31
...,...,...
173,70.846575,204292.49
174,72.849315,261579.89
175,73.038356,222867.42
176,72.778082,291494.36


### Covariance of Age and Price

In [70]:
# Covariance matrix of buyer age and price.
# Included for completeness; it is not needed for the insights drawn here.
np.cov(m=data_sold['age_at_purchase'], y=data_sold['price$'])

array([[ 1.68344293e+02, -1.77726142e+05],
       [-1.77726142e+05,  6.16619957e+09]])

### Correlation of Age and Price

In [73]:
# The correlation coefficient is the better-suited measure here; compute it with NumPy.
np.corrcoef(x=data_sold['age_at_purchase'], y=data_sold['price$'])

array([[ 1.        , -0.17443889],
       [-0.17443889,  1.        ]])

In [75]:
# Drop every row that has a missing value in ANY column (not only age or price)
# and store the result in 'data_sold_no_na'. Rows with gaps in unrelated columns
# are removed as well, so the sample shrinks and the coefficient differs from
# the one computed on 'data_sold'.
data_sold_no_na = data_sold.dropna()
np.corrcoef(data_sold_no_na['age_at_purchase'], data_sold_no_na['price$'])

array([[ 1.        , -0.18945818],
       [-0.18945818,  1.        ]])

In [77]:
# Alternatively, use pandas' own correlation method (Pearson is its default),
# which gives the same result on the NA-free data.
data_sold_no_na[['age_at_purchase', 'price$']].corr(method='pearson')

Unnamed: 0,age_at_purchase,price$
age_at_purchase,1.0,-0.189458
price$,-0.189458,1.0


In [79]:
# DataFrame.corr() excludes missing values pairwise: a row is skipped only when one of
# the two columns being correlated is missing. dropna(), by contrast, removed rows with
# a missing value in any column. That is why this result matches np.corrcoef on
# 'data_sold' but differs from the figure obtained on 'data_sold_no_na'.
data_sold[['age_at_purchase','price$']].corr()

Unnamed: 0,age_at_purchase,price$
age_at_purchase,1.0,-0.174439
price$,-0.174439,1.0


In [81]:
# The full dataset gives the same coefficient as 'data_sold': corr() skips rows whose
# age is missing (e.g. unsold properties), so those extra rows contribute nothing.
# NOTE(review): this presumably also means non-individual buyers have no birth date -- confirm.
data[['age_at_purchase','price$']].corr()

Unnamed: 0,age_at_purchase,price$
age_at_purchase,1.0,-0.174439
price$,-0.174439,1.0
