# EDA

In [1]:
# Importing packages

import warnings

warnings.filterwarnings("ignore")

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px


# Configuring matplotlib and pandas for plotting
from matplotlib.ticker import PercentFormatter
plt.rcParams.update({ "figure.figsize" : (8, 5),"axes.facecolor" : "white", "axes.edgecolor":  "black"})
plt.rcParams["figure.facecolor"]= "w"
pd.plotting.register_matplotlib_converters()
pd.set_option('display.float_format', lambda x: '%.3f' % x)

In [2]:
# Import csv into dataframe
df = pd.read_csv('data/eda.csv', delimiter=",")

## Data Understanding

### First overview of dataframe

In [47]:
# Examination of the usual suspects: head, tail, unique values etc. 
display(  
    df.info(),
    df.head(),
    df.tail(),
    df.describe()
)

# Extracting the column names as a list

column_names = df.columns.values.tolist()
print(column_names)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 21597 entries, 0 to 21596
Data columns (total 23 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   id             21597 non-null  int64  
 1   bedrooms       21597 non-null  float64
 2   bathrooms      21597 non-null  float64
 3   sqft_living    21597 non-null  float64
 4   sqft_lot       21597 non-null  float64
 5   floors         21597 non-null  float64
 6   waterfront     19206 non-null  float64
 7   view           21534 non-null  float64
 8   condition      21597 non-null  int64  
 9   grade          21597 non-null  int64  
 10  sqft_above     21597 non-null  float64
 11  sqft_basement  21145 non-null  float64
 12  yr_built       21597 non-null  int64  
 13  yr_renovated   17749 non-null  float64
 14  zipcode        21597 non-null  int64  
 15  lat            21597 non-null  float64
 16  long           21597 non-null  float64
 17  sqft_living15  21597 non-null  float64
 18  sqft_l

None

Unnamed: 0,id,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,view,condition,grade,...,yr_renovated,zipcode,lat,long,sqft_living15,sqft_lot15,date,price,house_id,id.1
0,1000102,6.0,3.0,2400.0,9373.0,2.0,,0.0,3,7,...,0.0,98002,47.326,-122.214,2060.0,7316.0,2015-04-22,300000.0,1000102,2496
1,1000102,6.0,3.0,2400.0,9373.0,2.0,,0.0,3,7,...,0.0,98002,47.326,-122.214,2060.0,7316.0,2014-09-16,280000.0,1000102,2495
2,1200019,4.0,1.75,2060.0,26036.0,1.0,,0.0,4,8,...,0.0,98166,47.444,-122.351,2590.0,21891.0,2014-05-08,647500.0,1200019,6730
3,1200021,3.0,1.0,1460.0,43000.0,1.0,0.0,0.0,3,7,...,0.0,98166,47.443,-122.347,2250.0,20023.0,2014-08-11,400000.0,1200021,8405
4,2800031,3.0,1.0,1430.0,7599.0,1.5,0.0,0.0,4,6,...,0.0,98168,47.478,-122.265,1290.0,10320.0,2015-04-01,235000.0,2800031,8801


Unnamed: 0,id,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,view,condition,grade,...,yr_renovated,zipcode,lat,long,sqft_living15,sqft_lot15,date,price,house_id,id.1
21592,9842300095,5.0,2.0,1600.0,4168.0,1.5,0.0,0.0,3,7,...,0.0,98126,47.53,-122.381,1190.0,4168.0,2014-07-25,365000.0,9842300095,16724
21593,9842300485,2.0,1.0,1040.0,7372.0,1.0,0.0,0.0,5,7,...,0.0,98126,47.529,-122.378,1930.0,5150.0,2015-03-11,380000.0,9842300485,3258
21594,9842300540,3.0,1.0,1100.0,4128.0,1.0,0.0,0.0,4,7,...,,98126,47.53,-122.379,1510.0,4538.0,2014-06-24,339000.0,9842300540,7615
21595,9895000040,2.0,1.75,1410.0,1005.0,1.5,0.0,0.0,3,9,...,0.0,98027,47.545,-122.018,1440.0,1188.0,2014-07-03,399900.0,9895000040,20964
21596,9900000190,3.0,1.0,1320.0,8100.0,1.0,0.0,0.0,3,6,...,,98166,47.47,-122.351,1000.0,8100.0,2014-10-30,268950.0,9900000190,15938


Unnamed: 0,id,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,view,condition,grade,...,yr_built,yr_renovated,zipcode,lat,long,sqft_living15,sqft_lot15,price,house_id,id.1
count,21597.0,21597.0,21597.0,21597.0,21597.0,21597.0,19206.0,21534.0,21597.0,21597.0,...,21597.0,17749.0,21597.0,21597.0,21597.0,21597.0,21597.0,21597.0,21597.0,21597.0
mean,4580474287.771,3.373,2.116,2080.322,15099.409,1.494,0.008,0.234,3.41,7.658,...,1971.0,836.651,98077.952,47.56,-122.214,1986.62,12758.284,540296.574,4580474287.771,10799.0
std,2876735715.748,0.926,0.769,918.106,41412.637,0.54,0.087,0.766,0.651,1.173,...,29.375,4000.111,53.513,0.139,0.141,685.23,27274.442,367368.14,2876735715.748,6234.661
min,1000102.0,1.0,0.5,370.0,520.0,1.0,0.0,0.0,1.0,3.0,...,1900.0,0.0,98001.0,47.156,-122.519,399.0,651.0,78000.0,1000102.0,1.0
25%,2123049175.0,3.0,1.75,1430.0,5040.0,1.0,0.0,0.0,3.0,7.0,...,1951.0,0.0,98033.0,47.471,-122.328,1490.0,5100.0,322000.0,2123049175.0,5400.0
50%,3904930410.0,3.0,2.25,1910.0,7618.0,1.5,0.0,0.0,3.0,7.0,...,1975.0,0.0,98065.0,47.572,-122.231,1840.0,7620.0,450000.0,3904930410.0,10799.0
75%,7308900490.0,4.0,2.5,2550.0,10685.0,2.0,0.0,0.0,4.0,8.0,...,1997.0,0.0,98118.0,47.678,-122.125,2360.0,10083.0,645000.0,7308900490.0,16198.0
max,9900000190.0,33.0,8.0,13540.0,1651359.0,3.5,1.0,4.0,5.0,13.0,...,2015.0,20150.0,98199.0,47.778,-121.315,6210.0,871200.0,7700000.0,9900000190.0,21597.0


['id', 'bedrooms', 'bathrooms', 'sqft_living', 'sqft_lot', 'floors', 'waterfront', 'view', 'condition', 'grade', 'sqft_above', 'sqft_basement', 'yr_built', 'yr_renovated', 'zipcode', 'lat', 'long', 'sqft_living15', 'sqft_lot15', 'date', 'price', 'house_id', 'id.1']


### Understanding the columns

In [28]:
display(
# Understanding different values of waterfront attribute
df['waterfront'].unique(),

# Understanding different values of view attribute
df['view'].unique(),

# Understanding different values of condition attribute
df['condition'].unique(),

# Understanding different values of grade attribute
np.sort(df['grade'].unique()),

)

array([nan,  0.,  1.])

array([ 0.,  2.,  1.,  3.,  4., nan])

array([3, 4, 5, 2, 1])

array([ 3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13])




| ID            | Explanation                            | Renaming? | DT       | DT trans? | Missing values? | Other remarks:                      |
|---------------|----------------------------------------|-----------|----------|-----------|-----------------|-------------------------------------|
| id            | house id                               | house_id  | int      | -         | -               | not unique due join                 |
| bedrooms      | # of bedrooms in house                 | -         | float    | int       |                 |                                     |
| bathrooms     | # of bathrooms in house                | -         | float    | int       |                 |                                     |
| sqft_living   | size of living area in squarefoot      | -         | float    | -         | no              | could be integers, seems rounded    |
| sqft_lot      | size of hole [property up to boundary](https://www.yourownarchitect.com/what-is-the-difference-between-floor-area-and-lot-area/)   | -       | float    | -         | no                |                                     |
| floors        | # of floors                            | -         | float    | int       |                 |                                     |
| waterfront    | is object located at waterfront        | -         | float    | int       | yes             | 0 = no, 1 = 1                       |
| view          | # of views by interested ppl | -         | float    | int       | yes             | 1 to 4                              |
| condition     | rating of the object's overall condition, based on based on [King County grading system](https://info.kingcounty.gov/assessor/esales/Glossary.aspx?type=r#b)                           | -         | int      | -         |                 | 1 to 5                              |
| grade         | overall grade given to the housing unit, based on [King County grading system](https://info.kingcounty.gov/assessor/esales/Glossary.aspx?type=r#b)|           | int      |           |                 | 3 to 13                             |
| sqft_above    | size of upper levels                   | -         | float    |           |                 | could be integers, seems rounded    |
| sqft_basement | size of basement                       | -         | float    |           |                 | could be integers, seems rounded    |
| yr_built      | year house was build                   | -         | int      |           | no              | 0 and Nan, additional '0.' attached |
| yr_renovated  | year house was renovated               | -         | float    | int       |                 |                                     |
| zipcode       | zipcode of objects address             | -         | int      |           | no              |                                     |
| lat           | latitude of obj geographical pos.      | -         | float    |           |                 |                                     |
| long          | longitude of obj geographical pos.     | -         | float    |           |                 |                                     |
| sqft_living15 | sqft interior housing living space for nearest 15 neighbors | - | float  ||                 | could be integers, seems rounded    |
| sqft_lot15    | sqft of land lots of  nearest 15 neighbors | -     | float    |           |                 | could be integers, seems rounded    |
| date          | date transaction was conducted         | -         | datetime |           |                 |                                     |
| price         | price o. object in $                   | -         | float    |           |                 |                                     |
| house_id      | same as 'id' above                     | -         | int      |           | no              |                                     |
| id.1          | transaction id                         | -         | int      |           | no              |                                     |

# Hypothesis

## Background

For this EDA task, I've chosen a buyer's profile at random. For her I'm supposed to identify matching objects, generating insights and corresponding advice.

**Nicole Johnson, Buyer**


>Looks for a lively, central neighborhood, middle price range, right timing (within a year).


## Further assumptions regarding my client:

These assumptions will potentially help to understand the client's need better and to personalize the 
* Living alone, not interested in kids in the future.
* Client's age 30 - 50; because she has already some money to work with, still interested in the lively areas of a city and not the calm suburbs.
* She not in a hurry, but kean following her dreams, hence the time frame of 1 year. As a person with a clear vision for her life, she enjoys actionable and precise recommendations.
* That could be her on [LinkedIn](https://www.linkedin.com/in/nicole-johnson-32b23a75/)

## Hypothesis
* Time of the year will affect prices to a great extend (e.g. prices in summer 10% higher than winter)
* Lively, central neighborhoods are high in demand and therefore will mostly be in higher price ranges. As a result central neighborhood AND middle price range could be  excluding each other
* Objects that are sold very often within are year could be an indicator for
  * being NOT a lively neighborhood, sind lively means also stable communities.
  * overpriced objects since they are part of speculations
  * other downsides (e.g. crime) that are just noticeable after some time. This would come together with a downfall of prices

## Possible connection between client and attributes / Challenges after data understanding
1. "lively central neighborhood" - not represented in data
   1. Option 1: Distance to a specific long/lat of center must be calculated
   2. Option 2: use dictionary to translate all zip code to "central", "close to central", "outskirt", "far away". Do the same for "lively"
2. middle price range has to be calculated. depends on price spread
3. make assumptions when object prices are lower within a year
4. Decide whether objects with many or few transactions are are sign of quality or problems - and vice versa
5. Decide whether objects with many or few views are are sign of quality or problems - and vice versa


# Data Exploration

## Univariate Analysis - checking selected attributes

### Price Analysis

#### Central Tendency: mean, median, mode, quantiles

* Assuming that the value of an object is shown by the transaction connected to it.
* If a house has multiple transaction the object's value is defined as the average of all transactions connected to this house.

In [100]:
#
price_describe = df['price'].describe()

print(price_describe)

print("This takes not into account, that there are multiple transaction per house in the data set. Therefore the median over all prices has to be calculated on the mean per house.")
# Calculate average price per house
price_avg_per_house = df.groupby('house_id')['price'].mean()

print(f'Number of houses with: {price_avg_per_house.count()}')

# This gives the average of all averages house prices
price_mean_overall = round(price_avg_per_house.mean(), 2)

print(f"The mean of all average house prices is ${price_mean_overall:,.2f}.")

# This gives the median of all average house prices
price_median_overall = round(price_avg_per_house.median(), 2)

print(f"The median of all average house prices is ${price_median_overall:,.2f}.")

# This gives the mode of all average house prices
price_mode_overall = price_avg_per_house.mode()

for elem in price_mode_overall:
    print(f"The mode of all average house prices is: ${elem:,.2f}")
print('Price can be considered unimodal. It is considerable that mode and median are the same.')

# Calculating 1st and 3rd quartiles
price_q1 = price_avg_per_house.quantile(0.25)
price_q3 = price_avg_per_house.quantile(0.75)

# Calculating min and max
price_min = price_avg_per_house.min()
price_max = price_avg_per_house.max()

print(f'Price min ${price_min:,.2f} to 1st quartile ${price_q1:,.2f}, median ${price_median_overall:,.2f}, 3rd quartile ${price_q3:,.2f} to max ${price_max:,.2f}')

count     21597.000
mean     540296.574
std      367368.140
min       78000.000
25%      322000.000
50%      450000.000
75%      645000.000
max     7700000.000
Name: price, dtype: float64
This takes not into account, that there are multiple transaction per house in the data set. Therefore the median over all prices has to be calculated on the mean per house.
Number of houses with: 21420
The mean of all average house prices is $541,300.18.
The median of all average house prices is $450,000.00.
The mode of all average house prices is: $450,000.00
Price can be considered unimodal. It is considerable that mode and median are the same.
Price min $78,000.00 to 1st quartile $324,000.00, median $450,000.00, 3rd quartile $645,000.00 to max $7,700,000.00


#### Spread: Range, interquartile range, variance and standard deviation


In [72]:
price_range = price_avg_per_house.max() - price_avg_per_house.min()
price_iqr = price_avg_per_house.quantile(0.75) - price_avg_per_house.quantile(0.25)
price_variance = price_avg_per_house.var()
price_std = price_avg_per_house.std()

print(f" range: {price_range} \n iqr: {price_iqr}\n variance: {price_variance}\n std derivation: {price_std}")

 range: 7622000.0 
 iqr: 321000.0
 variance: 135177276869.2757
 std derivation: 367664.6255342982


#### Shape

Positive skewness indicates a longer tail on the right side of the distribution, while negative skewness indicates a longer tail on the left side.

Positive kurtosis indicates a distribution with heavier tails and a sharper peak, while negative kurtosis indicates a distribution with lighter tails and a flatter peak.

In [73]:
# Calculate skewness and kurtosis
skewness = price_avg_per_house.skew()
kurt = price_avg_per_house.kurtosis()

# Print the results
print(f"Skewness: {skewness}")
print(f"Kurtosis: {kurt}")

Skewness: 4.034506305333779
Kurtosis: 34.659291913661484


#### Outliers, missing values

In [79]:
# Define upper and lower bounds for outliers
lower_b = price_q1 - 1.5 * price_iqr
upper_b = price_q3 + 1.5 * price_iqr

# Identify outliers
outliers_below = df[(df['price'] < lower_b)]
outliers_above = df[(df['price'] > upper_b)]

print("Outliers:")
print(f"# of upper outliers: {outliers_above.shape[0]}")  # Use shape[0] to get the count
print(f"# of lower outliers: {outliers_below.shape[0]}")  # Use shape[0] to get the count


Outliers:
# of upper outliers: 1158
# of lower outliers: 0


In [81]:
# Check for missing price values in the entire DataFrame
missing_values = df['price'].isnull().sum()

# Print the count of missing values for price column
print("Missing values:")
print(missing_values)


Missing values:
0


### Size; living and lot

In [123]:
df_sizes = df.groupby('house_id').agg({'sqft_lot': 'mean', 'sqft_living': 'mean'})

display(
    df_sizes.head(20),
    df_sizes.shape,
    df_sizes.describe()
)


Unnamed: 0_level_0,sqft_lot,sqft_living
house_id,Unnamed: 1_level_1,Unnamed: 2_level_1
1000102,9373.0,2400.0
1200019,26036.0,2060.0
1200021,43000.0,1460.0
2800031,7599.0,1430.0
3600057,3504.0,1650.0
3600072,5310.0,2220.0
3800008,18200.0,1990.0
5200087,5001.0,2540.0
6200017,21336.0,1340.0
7200080,10585.0,1980.0


(21420, 2)

Unnamed: 0,sqft_lot,sqft_living
count,21420.0,21420.0
mean,15128.038,2083.133
std,41530.797,918.808
min,520.0,370.0
25%,5040.0,1430.0
50%,7614.0,1920.0
75%,10690.5,2550.0
max,1651359.0,13540.0


#### detour: checking if there are houses with the same size than the lot or close to

In [122]:
filtered_df = df_sizes[df_sizes['sqft_living'] < df_sizes['sqft_lot']]

threshold_percent = 1

similar_house_ids = []

for house_id, group in filtered_df.groupby('house_id'):
    sqft_lot_values = group['sqft_lot'].values
    sqft_living_values = group['sqft_living'].values
    
    # Calculate the maximum allowed difference in sqft_lot and sqft_living
    max_difference = max(sqft_lot_values) * (threshold_percent / 100)
    
    # Check if the difference between sqft_lot and sqft_living is within the threshold
    if all(abs(a - b) <= max_difference for a, b in zip(sqft_lot_values, sqft_living_values)):
        similar_house_ids.append(house_id)

print(similar_house_ids)


[1332700200, 1773100430, 1931300412, 2025049192, 2254501342, 2767600985, 2767603824, 2767704649, 2770601769, 2770604081, 2770604082, 2771102144, 2902200142, 3277800729, 4447300008, 5015001452, 5695000142, 6781200013, 8562770250, 9396700024, 9528104660]


#### Spread: Range, interquartile range, variance and standard deviation


In [124]:
sqft_living_range = df_sizes['sqft_living'].max() - df_sizes['sqft_living'].min()
sqft_living_iqr = df_sizes['sqft_living'].quantile(0.75) - df_sizes['sqft_living'].quantile(0.25)
sqft_living_variance = df_sizes['sqft_living'].var()
sqft_living_std = df_sizes['sqft_living'].std()

print(f" range: {sqft_living_range} \n iqr: {sqft_living_iqr}\n variance: {sqft_living_variance}\n std derivation: {sqft_living_std}")

 range: 13170.0 
 iqr: 1120.0
 variance: 844208.8971562396
 std derivation: 918.8084115615396


In [125]:
sqft_lot_range = df_sizes['sqft_lot'].max() - df_sizes['sqft_lot'].min()
sqft_lot_iqr = df_sizes['sqft_lot'].quantile(0.75) - df_sizes['sqft_lot'].quantile(0.25)
sqft_lot_variance = df_sizes['sqft_lot'].var()
sqft_lot_std = df_sizes['sqft_lot'].std()

print(f" range: {sqft_lot_range} \n iqr: {sqft_lot_iqr}\n variance: {sqft_lot_variance}\n std derivation: {sqft_lot_std}")

 range: 1650839.0 
 iqr: 5650.5
 variance: 1724807085.9732513
 std derivation: 41530.79683768722


#### Shape

Positive skewness indicates a longer tail on the right side of the distribution, while negative skewness indicates a longer tail on the left side.

Positive kurtosis indicates a distribution with heavier tails and a sharper peak, while negative kurtosis indicates a distribution with lighter tails and a flatter peak.

In [126]:
# Calculate skewness and kurtosis for sqft lot
skewness = df_sizes['sqft_lot'].skew()
kurt = df_sizes['sqft_lot'].kurtosis()

# Print the results
print(f"Skewness: {skewness}")
print(f"Kurtosis: {kurt}")

Skewness: 13.056251852883623
Kurtosis: 284.49987477810896


In [127]:
# Calculate skewness and kurtosis for sqft living
skewness = df_sizes['sqft_living'].skew()
kurt = df_sizes['sqft_living'].kurtosis()

# Print the results
print(f"Skewness: {skewness}")
print(f"Kurtosis: {kurt}")

Skewness: 1.4727019951483347
Kurtosis: 5.258187001259873


#### Outliers & missing values

In [129]:
# Define upper and lower bounds for outliers
lower_b = df_sizes['sqft_living'].quantile(0.25) - 1.5 * sqft_living_iqr
upper_b = df_sizes['sqft_living'].quantile(0.75) + 1.5 * sqft_living_iqr

# Identify outliers
outliers_below = df[(df['sqft_living'] < lower_b)]
outliers_above = df[(df['sqft_living'] > upper_b)]

print("Outliers:")
print(f"# of upper outliers: {outliers_above.shape[0]}")  # Use shape[0] to get the count
print(f"# of lower outliers: {outliers_below.shape[0]}")  # Use shape[0] to get the count


Outliers:
# of upper outliers: 571
# of lower outliers: 0


In [130]:
# Check for missing price values in the entire DataFrame
missing_values = df['sqft_living'].isnull().sum()

# Print the count of missing values for price column
print("Missing values:")
print(missing_values)


Missing values:
0


In [131]:
# Check for missing price values in the entire DataFrame
missing_values = df['sqft_lot'].isnull().sum()

# Print the count of missing values for price column
print("Missing values:")
print(missing_values)


Missing values:
0


Age
Number of Transactions
Renovation
Objects per Zip Code
Condition Grade