# 1 - Basic data understanding and hypothesis

In [1]:
# Importing packages

import warnings

# Silence warnings before the third-party imports below so that warnings
# raised while importing them are suppressed as well.
# NOTE(review): this hides every warning for the rest of the notebook,
# including deprecation notices - consider narrowing the filter.
warnings.filterwarnings("ignore")

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from matplotlib.ticker import PercentFormatter


# Configuring matplotlib and pandas for plotting
# All figure defaults are set in a single update: 8x5 inch figures on a white
# background with black axes edges.
plt.rcParams.update(
    {
        "figure.figsize": (8, 5),
        "figure.facecolor": "w",
        "axes.facecolor": "white",
        "axes.edgecolor": "black",
    }
)
pd.plotting.register_matplotlib_converters()
# Render floats with three decimal places in dataframe output
pd.set_option("display.float_format", "{:.3f}".format)

In [2]:
# Load the comma-separated raw dataset from disk into the working dataframe
df = pd.read_csv(filepath_or_buffer="data/eda.csv", sep=",")

## Data Understanding

### First overview of dataframe

In [3]:
# Examination of the usual suspects: column dtypes, head, tail and summary statistics.

# df.info() prints its report itself and returns None, so it is called as a
# statement of its own; passing it to display() would additionally render a
# stray "None" in the cell output.
df.info()

display(
    df.head(),
    df.tail(),
    df.describe(),
)

# Extracting the column names as a list
column_names = df.columns.tolist()
print(column_names)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 21597 entries, 0 to 21596
Data columns (total 23 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   id             21597 non-null  int64  
 1   bedrooms       21597 non-null  float64
 2   bathrooms      21597 non-null  float64
 3   sqft_living    21597 non-null  float64
 4   sqft_lot       21597 non-null  float64
 5   floors         21597 non-null  float64
 6   waterfront     19206 non-null  float64
 7   view           21534 non-null  float64
 8   condition      21597 non-null  int64  
 9   grade          21597 non-null  int64  
 10  sqft_above     21597 non-null  float64
 11  sqft_basement  21145 non-null  float64
 12  yr_built       21597 non-null  int64  
 13  yr_renovated   17749 non-null  float64
 14  zipcode        21597 non-null  int64  
 15  lat            21597 non-null  float64
 16  long           21597 non-null  float64
 17  sqft_living15  21597 non-null  float64
 18  sqft_l

None

Unnamed: 0,id,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,view,condition,grade,...,yr_renovated,zipcode,lat,long,sqft_living15,sqft_lot15,date,price,house_id,id.1
0,1000102,6.0,3.0,2400.0,9373.0,2.0,,0.0,3,7,...,0.0,98002,47.326,-122.214,2060.0,7316.0,2015-04-22,300000.0,1000102,2496
1,1000102,6.0,3.0,2400.0,9373.0,2.0,,0.0,3,7,...,0.0,98002,47.326,-122.214,2060.0,7316.0,2014-09-16,280000.0,1000102,2495
2,1200019,4.0,1.75,2060.0,26036.0,1.0,,0.0,4,8,...,0.0,98166,47.444,-122.351,2590.0,21891.0,2014-05-08,647500.0,1200019,6730
3,1200021,3.0,1.0,1460.0,43000.0,1.0,0.0,0.0,3,7,...,0.0,98166,47.443,-122.347,2250.0,20023.0,2014-08-11,400000.0,1200021,8405
4,2800031,3.0,1.0,1430.0,7599.0,1.5,0.0,0.0,4,6,...,0.0,98168,47.478,-122.265,1290.0,10320.0,2015-04-01,235000.0,2800031,8801


Unnamed: 0,id,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,view,condition,grade,...,yr_renovated,zipcode,lat,long,sqft_living15,sqft_lot15,date,price,house_id,id.1
21592,9842300095,5.0,2.0,1600.0,4168.0,1.5,0.0,0.0,3,7,...,0.0,98126,47.53,-122.381,1190.0,4168.0,2014-07-25,365000.0,9842300095,16724
21593,9842300485,2.0,1.0,1040.0,7372.0,1.0,0.0,0.0,5,7,...,0.0,98126,47.529,-122.378,1930.0,5150.0,2015-03-11,380000.0,9842300485,3258
21594,9842300540,3.0,1.0,1100.0,4128.0,1.0,0.0,0.0,4,7,...,,98126,47.53,-122.379,1510.0,4538.0,2014-06-24,339000.0,9842300540,7615
21595,9895000040,2.0,1.75,1410.0,1005.0,1.5,0.0,0.0,3,9,...,0.0,98027,47.545,-122.018,1440.0,1188.0,2014-07-03,399900.0,9895000040,20964
21596,9900000190,3.0,1.0,1320.0,8100.0,1.0,0.0,0.0,3,6,...,,98166,47.47,-122.351,1000.0,8100.0,2014-10-30,268950.0,9900000190,15938


Unnamed: 0,id,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,view,condition,grade,...,yr_built,yr_renovated,zipcode,lat,long,sqft_living15,sqft_lot15,price,house_id,id.1
count,21597.0,21597.0,21597.0,21597.0,21597.0,21597.0,19206.0,21534.0,21597.0,21597.0,...,21597.0,17749.0,21597.0,21597.0,21597.0,21597.0,21597.0,21597.0,21597.0,21597.0
mean,4580474287.771,3.373,2.116,2080.322,15099.409,1.494,0.008,0.234,3.41,7.658,...,1971.0,836.651,98077.952,47.56,-122.214,1986.62,12758.284,540296.574,4580474287.771,10799.0
std,2876735715.748,0.926,0.769,918.106,41412.637,0.54,0.087,0.766,0.651,1.173,...,29.375,4000.111,53.513,0.139,0.141,685.23,27274.442,367368.14,2876735715.748,6234.661
min,1000102.0,1.0,0.5,370.0,520.0,1.0,0.0,0.0,1.0,3.0,...,1900.0,0.0,98001.0,47.156,-122.519,399.0,651.0,78000.0,1000102.0,1.0
25%,2123049175.0,3.0,1.75,1430.0,5040.0,1.0,0.0,0.0,3.0,7.0,...,1951.0,0.0,98033.0,47.471,-122.328,1490.0,5100.0,322000.0,2123049175.0,5400.0
50%,3904930410.0,3.0,2.25,1910.0,7618.0,1.5,0.0,0.0,3.0,7.0,...,1975.0,0.0,98065.0,47.572,-122.231,1840.0,7620.0,450000.0,3904930410.0,10799.0
75%,7308900490.0,4.0,2.5,2550.0,10685.0,2.0,0.0,0.0,4.0,8.0,...,1997.0,0.0,98118.0,47.678,-122.125,2360.0,10083.0,645000.0,7308900490.0,16198.0
max,9900000190.0,33.0,8.0,13540.0,1651359.0,3.5,1.0,4.0,5.0,13.0,...,2015.0,20150.0,98199.0,47.778,-121.315,6210.0,871200.0,7700000.0,9900000190.0,21597.0


['id', 'bedrooms', 'bathrooms', 'sqft_living', 'sqft_lot', 'floors', 'waterfront', 'view', 'condition', 'grade', 'sqft_above', 'sqft_basement', 'yr_built', 'yr_renovated', 'zipcode', 'lat', 'long', 'sqft_living15', 'sqft_lot15', 'date', 'price', 'house_id', 'id.1']


### Understanding the columns

In [4]:
# Show the distinct values of the rating-like attributes in one go:
# waterfront, view and condition as returned by unique() (unsorted),
# followed by the grade levels in ascending order.
display(
    *(df[column].unique() for column in ("waterfront", "view", "condition")),
    np.sort(df["grade"].unique()),
)

array([nan,  0.,  1.])

array([ 0.,  2.,  1.,  3.,  4., nan])

array([3, 4, 5, 2, 1])

array([ 3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13])




| ID            | Explanation                            | Task?     | DT       | DT trans? | Missing values? | Other remarks:                      |
|---------------|----------------------------------------|-----------|----------|-----------|-----------------|-------------------------------------|
| id            | house id                               | drop      | int      | -         | -               | not unique due to join              |
| bedrooms      | # of bedrooms in house                 | dt transf | float    | int       |                 |                                     |
| bathrooms     | # of bathrooms in house                | dt transf | float    | int       |                 |                                     |
| sqft_living   | size of living area in squarefoot      | -         | float    | -         | no              | could be integers, seems rounded    |
| sqft_lot      | size of whole [property up to boundary](https://www.yourownarchitect.com/what-is-the-difference-between-floor-area-and-lot-area/)   | -       | float    | -         | no                |                                     |
| floors        | # of floors                            | dt transf | float    | int       |                 |                                     |
| waterfront    | is object located at waterfront        | fix       | float    | int       | yes             | 0 = no, 1 = yes                     |
| view          | # of views by interested ppl           | fix       | float    | int       | yes             | 0 to 4                              |
| condition     | rating of the object's overall condition, based on [King County grading system](https://info.kingcounty.gov/assessor/esales/Glossary.aspx?type=r#b)                           | -         | int      | -         | no              | 1 to 5                              |
| grade         | overall grade given to the housing unit, based on [King County grading system](https://info.kingcounty.gov/assessor/esales/Glossary.aspx?type=r#b)|           | int      |           |                 | 3 to 13                             |
| sqft_above    | size of upper levels                   | -         | float    |           |                 | could be integers, seems rounded    |
| sqft_basement | size of basement                       | -         | float    |           |                 | could be integers, seems rounded    |
| yr_built      | year house was built                   | -         | int      |           | no              |                                     |
| yr_renovated  | year house was renovated               | fix       | float    | int       | yes             | 0 and NaN, additional '0.' attached |
| zipcode       | zipcode of objects address             | -         | int      |           | no              |                                     |
| lat           | latitude of obj geographical pos.      | -         | float    |           |                 |                                     |
| long          | longitude of obj geographical pos.     | -         | float    |           |                 |                                     |
| sqft_living15 | sqft interior housing living space for nearest 15 neighbors | - | float  ||                 | could be integers, seems rounded    |
| sqft_lot15    | sqft of land lots of  nearest 15 neighbors | -     | float    |           |                 | could be integers, seems rounded    |
| date          | date transaction was conducted         | -         | datetime |           | no              |                                     |
| price         | price of object in $                   | -         | float    |           | no              |                                     |
| house_id      | same as 'id' above                     | -         | int      |           | no              |                                     |
| id.1          | transaction id                         | Rename    | int      |           | no              |                                     |

## Assumptions and hypothesis

### Background

For this EDA task, I've chosen a buyer's profile at random. For her, I'm supposed to identify matching objects, generate insights, and give corresponding advice.

**Nicole Johnson, Buyer**


>Looks for a lively, central neighborhood, middle price range, right timing (within a year).


### Further assumptions regarding my client:

These assumptions will potentially help to understand the client's needs better and to personalize the recommendations:
* Living alone, not interested in kids in the future.
* Client's age 30 - 50; because she has already some money to work with, still interested in the lively areas of a city and not the calm suburbs.
* She is not in a hurry, but keen on following her dreams, hence the time frame of 1 year. As a person with a clear vision for her life, she enjoys actionable and precise recommendations.
* That could be her on [LinkedIn](https://www.linkedin.com/in/nicole-johnson-32b23a75/)

### Hypothesis
1. Time of the year will affect prices to a great extent (e.g. prices in summer 10% higher than in winter). As a result, buying at a specific time will save my client money.
2. Lively, central neighborhoods are close to the city-center.
3. Lively, central neighborhoods are in high demand, which will show in higher price ranges.
4. Houses where sqft lot is more than 2% higher than sqft-living - given a 1 story object - are not in the city center 
5. Objects that get sold multiple times in 5 years are probably bad in grade and/or condition 

### Possible connection between client and attributes / Challenges after data understanding
1. "lively central neighborhood" - not represented in data
   1. Option 1: Distance to a specific long/lat of center must be calculated
   2. Option 2: use dictionary to translate all zip code to "central", "close to central", "outskirt", "far away". Do the same for "lively"
2. Middle price range has to be calculated; it depends on the price spread
3. Make assumptions about when object prices are lower within a year
4. Decide whether objects with many or few transactions are a sign of quality or problems - and vice versa
5. Decide whether objects with many or few views are a sign of quality or problems - and vice versa
