## EDA
# Step - 1: Import the libraries


In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import scipy as sp
import sklearn
import matplotlib.pyplot as plt


# Step - 2: Get the data

In [None]:
# Load the housing dataset from its CSV file into a DataFrame.
# NOTE(review): this is an absolute, machine-specific Windows path.
df = pd.read_csv(
    r'C:\Users\NSharma\Documents\datascience\Housing_Project\dataset\housing.csv'
)

In [None]:
# Preview the first five rows.
df.head(5)


In [None]:
# Data type of each column.
df.dtypes


In [None]:
# Per-column dtypes and non-null counts (reveals columns with missing values).
df.info()

In [None]:
# Dimensions of the dataset as (rows, columns).
df.shape

In [None]:
# Total number of cells (rows * columns).
df.size

In [None]:
# Number of rows in each ocean_proximity category.
df['ocean_proximity'].value_counts()

In [None]:
# Summary statistics (count, mean, std, quartiles, min/max) per column.
df.describe()

In [None]:
# Draw a 50-bin histogram for each column on one large (20 x 15) figure.
df.hist(figsize=(20, 15), bins=50)
plt.show()

# Insights:
1. There is only one categorical variable: ocean_proximity
2. Total 10 features present
3. PROBLEM STATEMENT: To predict district's median housing price
4. Regression problem
5. Supervised learning approach
6. Proposed approach:
    Group data by districts and then see
7. total_bedrooms feature has missing values
8. Target attribute is median_house_value, which seems to be capped at 500,000
9. median_income is not expressed in dollars
10. Many histograms are tail-heavy; we need to transform these attributes to get more bell-shaped distributions.
    Unlike the bell curve of a normal distribution, heavy-tailed distributions approach zero at a slower rate and can have outliers with very high values. Many ML models fit better on bell-shaped distributions.
 

# Step - 3: Train Test split data

To avoid data snooping bias, let's put aside 20% of the data as a test set.

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows as the test set; the fixed random_state keeps the
# split reproducible between runs.
train_set, test_set = train_test_split(
    df,
    random_state=42,
    test_size=0.2,
)


# Step - 4: Explore the data

Make a copy of train_set.

In [None]:
# Explore a copy so that train_set itself is left untouched.
# Note: this rebinds `df`, which until now held the full dataset.
df = train_set.copy()

In [None]:
# Preview the first rows of the training copy.
df.head()

In [None]:
# Per-column dtypes and non-null counts for the training copy.
df.info()

# Insight: Check missing values
1. total_bedrooms feature might have missing values. Let's check all!
2. Datatypes are fine

In [None]:
# Boolean mask shaped like df: True wherever a value is missing.
missing_data = df.isna()
missing_data.head()

In [None]:
# For each column, print its name followed by the count of missing (True)
# versus present (False) entries, with a blank line between columns.
for column in missing_data.columns:
    print(column)
    print(missing_data[column].value_counts())
    print()

# Insight:
Confirmed missing values in total_bedrooms feature. Let's fix it.

In [None]:
# Summary statistics for the total_bedrooms column.
df['total_bedrooms'].describe()

In [None]:
# Most frequent total_bedrooms value: value_counts() tallies each distinct
# value and idxmax() returns the value with the highest tally.
df['total_bedrooms'].value_counts().idxmax()

In [None]:
# Histogram of total_bedrooms to inspect its distribution.
df['total_bedrooms'].hist()
plt.show()

# Insight:

Replacing the missing values with 280 seems fine.
We could also check whether the missing values depend on location.

In [None]:
# Fill missing total_bedrooms values with 280.
# The result is assigned back to the column rather than calling an in-place
# method on df['total_bedrooms']: that chained form operates on an
# intermediate Series, emits a FutureWarning in pandas 2.x, and leaves df
# unchanged once copy-on-write is enabled.
df['total_bedrooms'] = df['total_bedrooms'].fillna(280)


In [None]:
# Re-check the column's non-null count after filling the missing values.
df['total_bedrooms'].info()

Now, let's explore more about each feature

Geographical data

In [None]:
%matplotlib inline
fig = plt.figure()
df.plot(kind='scatter',x='longitude',y='latitude', alpha=0.4)

In [None]:
# Scatter of locations: marker size scales with population (divided by 100)
# and marker colour encodes median_house_value on the "jet" colormap.
df.plot(
    kind='scatter',
    x='longitude',
    y='latitude',
    alpha=0.5,
    s=df['population'] / 100,
    label='population',
    c='median_house_value',
    cmap=plt.get_cmap("jet"),
    colorbar=True,
)

Let's put this on a map to get some idea of which features affect the target variable.

In [None]:
# Most frequent latitude value; referenced as the map centre in the
# (currently disabled) folium cell.
lat = df['latitude'].value_counts().idxmax()

In [None]:
# Most frequent longitude value; referenced as the map centre in the
# (currently disabled) folium cell.
long = df['longitude'].value_counts().idxmax()

In [None]:
# Disabled (kept as a string literal) due to a space issue.
# NOTE(review): before re-enabling, confirm the 'Stamen Toner' tiles are still
# available in the installed folium version.
'''import folium
loc_map = folium.Map(location=[lat,long],zoom_start=4,
tiles='Stamen Toner')
loc_map'''

In [None]:
# Disabled (kept as a string literal) due to a space issue.
# When enabled, this adds one circle marker per row to loc_map, with
# median_house_value as the popup.

'''for i in range(0,len(df)):
    folium.CircleMarker(location=[df.iloc[i]['latitude'],df.iloc[i]['longitude']],
                popup=df.iloc[i]['median_house_value'],radius=0.5,color='yellow').add_to(loc_map)
    
loc_map'''

# Insights:
1. Population density at coastal areas is high.
2. Combining scatter plot and map, we can see coastal areas have high median_house_value