-
Notifications
You must be signed in to change notification settings - Fork 0
/
Workshop5_California_Housing_EDA.py
73 lines (55 loc) · 2.53 KB
/
Workshop5_California_Housing_EDA.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
# Python Workshop 5 - California Housing Dataset
# More information on this dataset can be found at this link: https://www.kaggle.com/camnugent/california-housing-prices
# author Sai Narendrula
# email sn42@rice.edu
# date April 18th, 2021
# Make sure these packages are available in your project
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
# To get the data, use pd.read_csv. The URL points to the raw CSV file hosted on GitHub
housing_data = pd.read_csv('https://raw.githubusercontent.com/mohitgupta-omg/Kaggle-California-Housing-Prices/master/Data/housing.csv')
# The data is stored in a DataFrame called housing_data
# To get the size, use len() for the number of rows or .shape for (rows, columns)
print(len(housing_data))
print(housing_data.shape)
# To get all column names, use .columns
# NOTE: .columns is an attribute (an Index object), not a method -- writing
# .columns() raises "TypeError: 'Index' object is not callable"
print(housing_data.columns)
# Use .head() to see the first five rows of the table
print(housing_data.head())
# To get random rows from the dataset, we can use .sample(n) where n is the number of samples you want
print(housing_data.sample(6))
# Use .describe() to get the statistics from all columns that have numbers
print(housing_data.describe())
# To see the unique values, or the number of unique values, that exist in a Series
# (a single column of the DataFrame), use .unique() and .nunique()
print(housing_data["ocean_proximity"].unique())
print(housing_data["ocean_proximity"].nunique())
# Visualising geographical data using seaborn
# Each plot below gets its own figure. When no `ax` is given, sns.scatterplot draws on
# the current axes, so back-to-back calls in a script would all land on the same plot
# and hide one another.
# Use sns.scatterplot() to look at where the house blocks are
fig, ax = plt.subplots()
sns.scatterplot(x="longitude", y="latitude", data=housing_data, ax=ax)
# alpha allows you to fade out data points, so densely populated areas stand out
fig, ax = plt.subplots()
sns.scatterplot(x="longitude", y="latitude", data=housing_data, alpha=0.1, ax=ax)
# Look at how median house value changes with where people live:
# colour encodes median_house_value, marker size encodes population / 100
fig, ax = plt.subplots()
sns.scatterplot(
    x="longitude",
    y="latitude",
    data=housing_data,
    alpha=0.1,
    hue="median_house_value",
    palette="viridis",
    size=housing_data["population"] / 100,
    ax=ax,
)
# place legend outside of the plotting area, to the right of the axes
ax.legend(loc="center left", bbox_to_anchor=(1.01, 0.6), ncol=1)
# Median income vs. median house value
fig, ax = plt.subplots(figsize=(12, 8))
# Draw on the axes created above instead of relying on pyplot's implicit
# "current axes", so the plot and its labels are tied to this figure explicitly
ax.scatter(housing_data['median_income'], housing_data['median_house_value'])
ax.set_xlabel('Median Income')
ax.set_ylabel('Median House Value')
# Check for correlation between each pair of numeric features.
# The frame also contains ocean_proximity, which is not numeric; DataFrame.corr()
# raises on such columns in pandas >= 2.0 (older versions silently dropped them),
# so select the numeric columns explicitly. This gives the same result on both.
housing_data_corr = housing_data.select_dtypes(include="number").corr()
print(housing_data_corr)
# Visualize the correlation with the help of a heatmap, drawn on its own figure
fig, ax = plt.subplots(figsize=(12, 10))
sns.heatmap(housing_data_corr, annot=True, ax=ax)