# 2 - Further data exploration

In [2]:
# Importing packages

import warnings

warnings.filterwarnings("ignore")

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt


# Configuring matplotlib and pandas for plotting
from matplotlib.ticker import PercentFormatter
plt.rcParams.update({ "figure.figsize" : (8, 5),"axes.facecolor" : "white", "axes.edgecolor":  "black"})
plt.rcParams["figure.facecolor"]= "w"
pd.plotting.register_matplotlib_converters()
pd.set_option('display.float_format', lambda x: '%.3f' % x)

In [3]:
# Import csv into dataframe
df = pd.read_csv('data/eda.csv', delimiter=",")

## Univariate Analysis - checking selected attributes

### Price Analysis

#### Central Tendency: mean, median, mode, quantiles

* Assuming that the value of an object is shown by the transaction connected to it.
* If a house has multiple transaction the object's value is defined as the average of all transactions connected to this house.

In [4]:
#
price_describe = df['price'].describe()

print(price_describe)

print("This takes not into account, that there are multiple transaction per house in the data set. Therefore the median over all prices has to be calculated on the mean per house.")
# Calculate average price per house
price_avg_per_house = df.groupby('house_id')['price'].mean()

print(f'Number of houses with: {price_avg_per_house.count()}')

# This gives the average of all averages house prices
price_mean_overall = round(price_avg_per_house.mean(), 2)

print(f"The mean of all average house prices is ${price_mean_overall:,.2f}.")

# This gives the median of all average house prices
price_median_overall = round(price_avg_per_house.median(), 2)

print(f"The median of all average house prices is ${price_median_overall:,.2f}.")

# This gives the mode of all average house prices
price_mode_overall = price_avg_per_house.mode()

for elem in price_mode_overall:
    print(f"The mode of all average house prices is: ${elem:,.2f}")
print('Price can be considered unimodal. It is considerable that mode and median are the same.')

# Calculating 1st and 3rd quartiles
price_q1 = price_avg_per_house.quantile(0.25)
price_q3 = price_avg_per_house.quantile(0.75)

# Calculating min and max
price_min = price_avg_per_house.min()
price_max = price_avg_per_house.max()

print(f'Price min ${price_min:,.2f} to 1st quartile ${price_q1:,.2f}, median ${price_median_overall:,.2f}, 3rd quartile ${price_q3:,.2f} to max ${price_max:,.2f}')

count     21597.000
mean     540296.574
std      367368.140
min       78000.000
25%      322000.000
50%      450000.000
75%      645000.000
max     7700000.000
Name: price, dtype: float64
This takes not into account, that there are multiple transaction per house in the data set. Therefore the median over all prices has to be calculated on the mean per house.
Number of houses with: 21420
The mean of all average house prices is $541,300.18.
The median of all average house prices is $450,000.00.
The mode of all average house prices is: $450,000.00
Price can be considered unimodal. It is considerable that mode and median are the same.
Price min $78,000.00 to 1st quartile $324,000.00, median $450,000.00, 3rd quartile $645,000.00 to max $7,700,000.00


#### Spread: Range, interquartile range, variance and standard deviation


In [5]:
price_range = price_avg_per_house.max() - price_avg_per_house.min()
price_iqr = price_avg_per_house.quantile(0.75) - price_avg_per_house.quantile(0.25)
price_variance = price_avg_per_house.var()
price_std = price_avg_per_house.std()

print(f" range: {price_range} \n iqr: {price_iqr}\n variance: {price_variance}\n std derivation: {price_std}")

 range: 7622000.0 
 iqr: 321000.0
 variance: 135177276869.2757
 std derivation: 367664.6255342982


#### Shape

Positive skewness indicates a longer tail on the right side of the distribution, while negative skewness indicates a longer tail on the left side.

Positive kurtosis indicates a distribution with heavier tails and a sharper peak, while negative kurtosis indicates a distribution with lighter tails and a flatter peak.

In [6]:
# Calculate skewness and kurtosis
skewness = price_avg_per_house.skew()
kurt = price_avg_per_house.kurtosis()

# Print the results
print(f"Skewness: {skewness}")
print(f"Kurtosis: {kurt}")

Skewness: 4.034506305333779
Kurtosis: 34.659291913661484


#### Outliers, missing values

In [7]:
# Define upper and lower bounds for outliers
lower_b = price_q1 - 1.5 * price_iqr
upper_b = price_q3 + 1.5 * price_iqr

# Identify outliers
outliers_below = df[(df['price'] < lower_b)]
outliers_above = df[(df['price'] > upper_b)]

print("Outliers:")
print(f"# of upper outliers: {outliers_above.shape[0]}")  # Use shape[0] to get the count
print(f"# of lower outliers: {outliers_below.shape[0]}")  # Use shape[0] to get the count


Outliers:
# of upper outliers: 1158
# of lower outliers: 0


In [8]:
# Check for missing price values in the entire DataFrame
missing_values = df['price'].isnull().sum()

# Print the count of missing values for price column
print("Missing values:")
print(missing_values)


Missing values:
0


### Size; living and lot

#### Central Tendency / Overview by .describe()

In [9]:
df_sizes = df.groupby('house_id').agg({'sqft_lot': 'mean', 'sqft_living': 'mean'})

display(
    df_sizes.describe()
)


Unnamed: 0,sqft_lot,sqft_living
count,21420.0,21420.0
mean,15128.038,2083.133
std,41530.797,918.808
min,520.0,370.0
25%,5040.0,1430.0
50%,7614.0,1920.0
75%,10690.5,2550.0
max,1651359.0,13540.0


#### Spread: Range, interquartile range, variance and standard deviation


In [10]:
sqft_living_range = df_sizes['sqft_living'].max() - df_sizes['sqft_living'].min()
sqft_living_iqr = df_sizes['sqft_living'].quantile(0.75) - df_sizes['sqft_living'].quantile(0.25)
sqft_living_variance = df_sizes['sqft_living'].var()
sqft_living_std = df_sizes['sqft_living'].std()

print(f" range: {sqft_living_range} \n iqr: {sqft_living_iqr}\n variance: {sqft_living_variance}\n std derivation: {sqft_living_std}")

 range: 13170.0 
 iqr: 1120.0
 variance: 844208.8971562396
 std derivation: 918.8084115615396


In [11]:
sqft_lot_range = df_sizes['sqft_lot'].max() - df_sizes['sqft_lot'].min()
sqft_lot_iqr = df_sizes['sqft_lot'].quantile(0.75) - df_sizes['sqft_lot'].quantile(0.25)
sqft_lot_variance = df_sizes['sqft_lot'].var()
sqft_lot_std = df_sizes['sqft_lot'].std()

print(f" range: {sqft_lot_range} \n iqr: {sqft_lot_iqr}\n variance: {sqft_lot_variance}\n std derivation: {sqft_lot_std}")

 range: 1650839.0 
 iqr: 5650.5
 variance: 1724807085.9732513
 std derivation: 41530.79683768722


#### Shape

Positive skewness indicates a longer tail on the right side of the distribution, while negative skewness indicates a longer tail on the left side.

Positive kurtosis indicates a distribution with heavier tails and a sharper peak, while negative kurtosis indicates a distribution with lighter tails and a flatter peak.

In [12]:
# Calculate skewness and kurtosis for sqft lot
skewness = df_sizes['sqft_lot'].skew()
kurt = df_sizes['sqft_lot'].kurtosis()

# Print the results
print(f"Skewness: {skewness}")
print(f"Kurtosis: {kurt}")

Skewness: 13.056251852883623
Kurtosis: 284.49987477810896


In [13]:
# Calculate skewness and kurtosis for sqft living
skewness = df_sizes['sqft_living'].skew()
kurt = df_sizes['sqft_living'].kurtosis()

# Print the results
print(f"Skewness: {skewness}")
print(f"Kurtosis: {kurt}")

Skewness: 1.4727019951483347
Kurtosis: 5.258187001259873


#### Outliers, missing values

In [14]:
# Define upper and lower bounds for outliers
lower_b = df_sizes['sqft_living'].quantile(0.25) - 1.5 * sqft_living_iqr
upper_b = df_sizes['sqft_living'].quantile(0.75) + 1.5 * sqft_living_iqr

# Identify outliers
outliers_below = df[(df['sqft_living'] < lower_b)]
outliers_above = df[(df['sqft_living'] > upper_b)]

print("Outliers:")
print(f"# of upper outliers: {outliers_above.shape[0]}")  # Use shape[0] to get the count
print(f"# of lower outliers: {outliers_below.shape[0]}")  # Use shape[0] to get the count


Outliers:
# of upper outliers: 571
# of lower outliers: 0


In [15]:
# Check for missing price values in the entire DataFrame
missing_values = df['sqft_living'].isnull().sum()

# Print the count of missing values for price column
print("Missing values:")
print(missing_values)


Missing values:
0


In [16]:
# Check for missing price values in the entire DataFrame
missing_values = df['sqft_lot'].isnull().sum()

# Print the count of missing values for price column
print("Missing values:")
print(missing_values)


Missing values:
0


### yr_built / Age
 13  yr_renovated

#### Central Tendency / Overview by .describe()

In [17]:
df_age = df.groupby('house_id').agg({'yr_built': 'mean'})
df_age['age'] = df_age['yr_built'].apply(lambda x: 2015 - x)

display(
    df_age.describe()
)


Unnamed: 0,yr_built,age
count,21420.0,21420.0
mean,1971.093,43.907
std,29.387,29.387
min,1900.0,0.0
25%,1952.0,18.0
50%,1975.0,40.0
75%,1997.0,63.0
max,2015.0,115.0


#### Spread: Range, iqr, variance, std

In [18]:
age_range = df_age['age'].max() - df_age['age'].min()
age_iqr = df_age['age'].quantile(0.75) - df_age['age'].quantile(0.25)
age_variance = df_age['age'].var()
age_std = df_age['age'].std()

print(f" range: {age_range} \n iqr: {age_iqr}\n variance: {age_variance}\n std derivation: {age_std}")

 range: 115.0 
 iqr: 45.0
 variance: 863.6040314477722
 std derivation: 29.38714057964422


#### Shape

In [19]:
# Calculate skewness and kurtosis
skewness = df_age['age'].skew()
kurt = df_age['age'].kurtosis()

# Print the results
print(f"Skewness: {skewness}")
print(f"Kurtosis: {kurt}")

Skewness: 0.4742219932841571
Kurtosis: -0.6546393404166202


#### Outliers, missing values

In [20]:
# Define upper and lower bounds for outliers
lower_b = df_age['age'].quantile(0.25) - 1.5 * age_iqr
upper_b = df_age['age'].quantile(0.75) + 1.5 * age_iqr

# Identify outliers
outliers_below = df_age[(df_age['age'] < lower_b)]
outliers_above = df_age[(df_age['age'] > upper_b)]

print("Outliers:")
print(f"# of upper outliers: {outliers_above.shape[0]}")  # Use shape[0] to get the count
print(f"# of lower outliers: {outliers_below.shape[0]}")  # Use shape[0] to get the count


Outliers:
# of upper outliers: 0
# of lower outliers: 0


In [21]:
# Check for missing price values in the entire DataFrame
missing_values = df['yr_built'].isnull().sum()

# Print the count of missing values for price column
print("Missing values:")
print(missing_values)


Missing values:
0


### yr_renovated / renovation age [Stopped bc. corrupt data]

#### Central Tendency / Overview by .describe

In [22]:
print(df['yr_renovated'].unique())


df_ren_age = df.groupby('house_id').agg({'yr_renovated': 'mean'})
df_ren_age['age'] = df_ren_age['yr_renovated'].apply(lambda x: 2015 - x)

display(
    df_ren_age.describe()
)


[    0. 20130.    nan 19730. 20100. 19910. 19790. 20010. 20120. 19860.
 19900. 20030. 19620. 19920. 20060. 19400. 19550. 20070. 20140. 19890.
 19820. 20050. 20000. 19540. 19960. 20150. 19830. 19600. 19720. 19970.
 19940. 19450. 20040. 19700. 19950. 19990. 20080. 19840. 20110. 19980.
 19880. 20090. 19670. 19690. 20020. 19770. 19870. 19650. 19640. 19580.
 19680. 19850. 19630. 19800. 19740. 19810. 19500. 19560. 19570. 19930.
 19750. 19460. 19480. 19780. 19760. 19340. 19590. 19530. 19440. 19510.
 19710.]


Unnamed: 0,yr_renovated,age
count,17609.0,17609.0
mean,838.806,1176.194
std,4005.127,4005.127
min,0.0,-18135.0
25%,0.0,2015.0
50%,0.0,2015.0
75%,0.0,2015.0
max,20150.0,2015.0


### Condition (Rating)

#### Central tendency / Overview per .describe()

In [23]:
df_condition = df.groupby('house_id').agg({'condition': 'mean'})

display(
    df_condition.describe()
)


Unnamed: 0,condition
count,21420.0
mean,3.411
std,0.65
min,1.0
25%,3.0
50%,3.0
75%,4.0
max,5.0


### Grade (Rating) 

#### Overall tendency / Overview per .describe()

In [24]:
df_rating = df.groupby('house_id').agg({'grade': 'mean'})

display(
    df_rating.describe()
)


Unnamed: 0,grade
count,21420.0
mean,7.663
std,1.172
min,3.0
25%,7.0
50%,7.0
75%,8.0
max,13.0


#### Checking for missing values

In [25]:
# Check for missing price values in the entire DataFrame
missing_values = df['grade'].isnull().sum()

# Print the count of missing values for price column
print("Missing values:")
print(missing_values)

Missing values:
0


### Number of Transactions


In [26]:
df_num_of_transaction = df.groupby('house_id')['id.1'].count()

df_spread_of_transactions = df_num_of_transaction.groupby(df_num_of_transaction).count()

display(
    df_num_of_transaction.describe(),
    df_spread_of_transactions
)



count   21420.000
mean        1.008
std         0.091
min         1.000
25%         1.000
50%         1.000
75%         1.000
max         3.000
Name: id.1, dtype: float64

id.1
1    21244
2      175
3        1
Name: id.1, dtype: int64

### Objects per Zip Code

In [27]:
df_houses_per_zip = df.groupby('zipcode')['house_id'].count()

display(
    df_houses_per_zip.describe()
    
)

count    70.000
mean    308.529
std     142.224
min      50.000
25%     204.250
50%     282.500
75%     408.250
max     602.000
Name: house_id, dtype: float64