In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
%matplotlib inline

In [None]:
# Read train.csv (path relative to the working directory) into the `train`
# DataFrame that the following cells reference.
train = pd.read_csv('train.csv')

In [None]:
# Show the first rows of the full table as a quick check of the load.
train.head()

In [None]:
# Columns examined in this notebook, in the order the sections below discuss them.
cols = [
    'MSZoning', 'LotFrontage', 'LotArea', 'Alley',
    'OverallQual', 'YearBuilt', 'RoofStyle', 'Exterior1st',
    'ExterCond', 'CentralAir', 'HalfBath',
]

In [None]:
# Preview only the columns listed in `cols`.
train[cols].head()

In [None]:
# (rows, columns) of the selected subset.
train[cols].shape

In [None]:
# Summary statistics for the selected columns.
# NOTE(review): with describe()'s default arguments pandas reports only the numeric
# columns of a mixed-dtype frame, so text columns are presumably absent here — confirm.
train[cols].describe()

In [None]:
# Print the frequency table of every selected column, one column after another.
for col in cols:
    counts = train[col].value_counts()
    print(counts)

## Notes
- MSZoning: Categorical
- LotFrontage: Numerical Continuous
- LotArea: Numerical Continuous
- Alley: Categorical
- OverallQual: Ordinal
- YearBuilt: Numerical Discrete
- RoofStyle: Categorical
- Exterior1st: Categorical
- ExterCond: Categorical
- CentralAir: Categorical
- HalfBath: Ordinal

In [None]:
# Report, per selected column, the percentage of rows whose value is missing.
for col in cols:
    missing = train[col].isnull().sum()
    total = train[col].shape[0]
    print('{}: {}%'.format(col, missing*100/total))

## MSZoning: Identifies the general zoning classification of the sale.
		
- A  : Agriculture
- C  : Commercial
- FV : Floating Village Residential
- I	 : Industrial
- RH : Residential High Density
- RL : Residential Low Density
- RP : Residential Low Density Park 
- RM : Residential Medium Density

In [None]:
# MSZoning values with missing entries removed (original row index is kept).
mszoning = train['MSZoning'].dropna()

In [None]:
# Count plot of MSZoning; each bar is labelled with its share of all houses.
# The bar order is fixed by hand, so the percentages are looked up by zone label
# in that same order instead of relying on value_counts() happening to return
# the zones in an identical sequence.
order = ['RL', 'RM', 'FV', 'RH', 'C (all)']
ps = ((mszoning.value_counts() / mszoning.shape[0]) * 100).reindex(order, fill_value=0).values

sns.set(font_scale=1.2)
with sns.axes_style("whitegrid"):
    # Create the figure and its axes explicitly and hand the axes to seaborn.
    fig, ax = plt.subplots(figsize=(10, 5))
    sns.countplot(x=mszoning, orient="v", order=order, ax=ax)
    ax.set_title('Occurence counts of various zone types', size=20)
    ax.set_ylabel('No.of Houses', size=15)
    ax.set_xlabel('MSZoning', size=15)
    # ps[i] is the percentage for order[i], i.e. for the i-th bar from the left.
    for i, p in enumerate(ax.patches):
        ax.annotate('{0:.2f}%'.format(ps[i]), (p.get_x() + 0.20, p.get_height()+20))
    ax.set_yticks(np.arange(0, 1400, 100))

### Remarks:
- 77.7% of all the properties that have been considered for selling are residential areas with low population density.
- 15.78% of all the properties that have been considered for selling are residential areas with medium population density.
- We can infer from the above data that there is high demand for residential areas with low to medium density.

## LotFrontage: Linear feet of street connected to property

In [None]:
# LotFrontage values with missing entries removed (original row index is kept).
lotfrontage = train['LotFrontage'].dropna()

In [None]:
# Summary statistics of the non-null LotFrontage values.
lotfrontage.describe()

In [None]:
# Histogram of the non-null LotFrontage values: 70 bins, no KDE curve.
sns.set(font_scale=1.2)
with sns.axes_style("whitegrid"):
    fig = plt.figure()
    fig.set_size_inches(10, 5)
    ax = sns.distplot(a=lotfrontage, bins=70, color='purple', kde=False)
    # Text first, then the hand-picked tick positions for both axes.
    ax.set_title('Distribution of LotFrontage', size=20)
    ax.set_xlabel('LotFrontage (feet)', size=15)
    ax.set_ylabel('No.of Houses', size=15)
    ax.set_xticks(np.arange(0, 350, 20))
    ax.set_yticks(np.arange(0, 200, 20))
    fig.add_subplot(ax)

In [None]:
# The 20 largest LotFrontage values, in ascending order, to inspect the upper tail.
np.sort(lotfrontage.values)[-20:]

### Remarks:
- Most of the properties have 60 to 80 feet of street connected to them.
- The two properties with 313 feet of street connected to them are very likely outliers.

## LotArea: Lot size in square feet

In [None]:
# LotArea values with missing entries removed (original row index is kept).
lotarea = train['LotArea'].dropna()

In [None]:
# Histogram of LotArea (200 bins, no KDE curve), cropped to lots up to 60,000 sq ft.
sns.set(font_scale=1.2)
with sns.axes_style("whitegrid"):
    fig = plt.figure()
    fig.set_size_inches(10, 5)
    ax = sns.distplot(a=lotarea, bins=200, color='blue', kde=False)
    ax.set_title('Distribution of LotArea', size=20)
    ax.set_xlabel('LotArea (square feet)', size=15)
    ax.set_ylabel('No.of Houses', size=15)
    # Ticks are placed before the limits on each axis so the limits have the last
    # word on the visible range.
    ax.set_yticks(np.arange(0, 500, 50))
    ax.set_ylim(0, 250)
    ax.set_xticks(np.arange(0, 250000, 5000))
    ax.set_xlim(0, 60000)
    fig.add_subplot(ax)

### Remarks:
- Most of the properties sold have a lot area of 5,000 to 10,000 square feet.
- We could conclude that large properties are sold rarely.

## Alley: Type of alley access to property
- Grvl:	Gravel
- Pave: Paved
- NA: 	No alley access

In [None]:
# Alley column with readable labels: missing values become 'No Alley Access'
# and the two abbreviations are spelled out.
alley = (
    train['Alley']
    .fillna('No Alley Access')
    .replace({'Grvl': 'Gravel', 'Pave': 'Paved'})
)

In [None]:
# Count plot of the relabelled Alley column; each bar is labelled with its share
# of all houses. Percentages are looked up by category label in the same order
# as the bars, so a label can never end up on the wrong bar.
order = ['No Alley Access', 'Gravel', 'Paved']
ps = ((alley.value_counts() / alley.shape[0]) * 100).reindex(order, fill_value=0).values

sns.set(font_scale=1.2)
with sns.axes_style("whitegrid"):
    # Create the figure and its axes explicitly and hand the axes to seaborn.
    fig, ax = plt.subplots(figsize=(10, 5))
    sns.countplot(x=alley, order=order, palette='Set2', ax=ax)
    ax.set_title('Occurence counts of Alley types', size=20)
    ax.set_yticks(np.arange(0, 1700, 200))
    ax.set_ylim(0, 1600)
    # ps[i] is the percentage for order[i], i.e. for the i-th bar from the left.
    for i, p in enumerate(ax.patches):
        ax.annotate('{0:.2f}%'.format(ps[i]), (p.get_x() + 0.25, p.get_height() + 50))
    ax.set_ylabel('No.of Houses', size=15)
    ax.set_xlabel('Alley', size=15)

### Remarks:
- Most of the properties that have been sold do not have an alley access.
- This could mean that having alley access is undesirable in general.

## OverallQual: Rates the overall material and finish of the house
- 10:	Very Excellent
- 9:	Excellent
- 8:	Very Good
- 7:	Good
- 6:	Above Average
- 5:	Average
- 4:	Below Average
- 3:	Fair
- 2:	Poor
- 1:	Very Poor


In [None]:
# Count plot of OverallQual with each bar labelled by its share of all houses.
sns.set(font_scale=1.2)
# Counts are selected by rating label 1..10 so the percentages follow rating order.
# NOTE(review): assumes seaborn draws the bars in ascending rating order — confirm.
quality_counts = train['OverallQual'].value_counts()[list(range(1, 11))]
ps = ((quality_counts / train['OverallQual'].shape[0]) * 100).values
with sns.axes_style("darkgrid"):
    fig = plt.figure()
    fig.set_size_inches(10, 10)
    ax = sns.countplot(x="OverallQual", data=train, palette=sns.light_palette("green", 10))
    ax.set_title('Occurence counts of Quality types', size=20)
    ax.set_xlabel('OverallQual', size=15)
    ax.set_ylabel('No.of Houses', size=15)
    ax.set_yticks(np.arange(0, 500, 20))
    ax.set_ylim(0, 450)
    for i, p in enumerate(ax.patches):
        ax.annotate('{0:.2f}%'.format(ps[i]), (p.get_x() + 0.1, p.get_height() + 5))
    fig.add_subplot(ax)

### Remarks:
- The graph shows the expected behaviour: while a poor-quality property rarely gets sold, an average-quality one is sold the most. Properties of excellent quality come with a high price tag and are also sold a lot less often than average properties.


## YearBuilt: Original construction date


In [None]:
# Histogram of YearBuilt (seaborn's default binning, no KDE curve).
sns.set(font_scale=1.2)
with sns.axes_style("whitegrid"):
    fig = plt.figure()
    fig.set_size_inches(10, 5)
    ax = sns.distplot(a=train["YearBuilt"], color='red', kde=False)
    ax.set_title('Distribution of YearBuilt', size=20)
    ax.set_xlabel('YearBuilt', size=15)
    ax.set_ylabel('No.of Houses', size=15)
    # One x tick per decade; y ticks are set before the y limit so the limit
    # decides the visible range.
    ax.set_xticks(np.arange(1860, 2017, 10))
    ax.set_yticks(np.arange(0, 400, 50))
    ax.set_ylim(0, 350)
    fig.add_subplot(ax)

### Remarks:
- Houses that have been built in recent times are more likely to be sold.
- The trend does not apply to houses built from around 1980 to 1995; the reasons for this need further investigation.

## RoofStyle: Type of roof
- Flat
- Gable
- Gambrel
- Hip	
- Mansard	
- Shed	


In [None]:
# Count plot of RoofStyle; each bar is labelled with its share of all houses.
# Percentages are looked up by roof-style label in the same order as the bars,
# so the labels stay correct even if that order is not descending by count.
order = ['Gable', 'Hip', 'Flat', 'Gambrel', 'Mansard', 'Shed']
ps = ((train['RoofStyle'].value_counts() / train['RoofStyle'].shape[0]) * 100).reindex(order, fill_value=0).values

sns.set(font_scale=1.2)
with sns.axes_style("whitegrid"):
    # Create the figure and its axes explicitly and hand the axes to seaborn.
    fig, ax = plt.subplots(figsize=(10, 5))
    sns.countplot(x="RoofStyle", data=train, order=order, palette='Set1', ax=ax)
    ax.set_title('Occurence counts of Roof types', size=20)
    ax.set_yticks(np.arange(0, 1300, 100))
    ax.set_ylim(0, 1300)
    # ps[i] is the percentage for order[i], i.e. for the i-th bar from the left.
    for i, p in enumerate(ax.patches):
        ax.annotate('{0:.2f}%'.format(ps[i]), (p.get_x() + 0.1, p.get_height() + 20))
    ax.set_ylabel('No.of Houses', size=15)
    ax.set_xlabel('RoofStyle', size=15)

### Remarks:
- Most of the houses sold have Gable roofs; Gable is the most common type of roof, followed by Hip roofs (by a large margin).

## Exterior1st: Exterior covering on house
- AsbShng:	Asbestos Shingles
- AsphShn:	Asphalt Shingles
- BrkComm:	Brick Common
- BrkFace:	Brick Face
- CBlock:	Cinder Block
- CemntBd:	Cement Board
- HdBoard:	Hard Board
- ImStucc:	Imitation Stucco
- MetalSd:	Metal Siding
- Other:	Other
- Plywood:	Plywood
- PreCast:	PreCast	
- Stone:	Stone
- Stucco:	Stucco
- VinylSd:	Vinyl Siding
- Wd Sdng:	Wood Siding
- WdShing:	Wood Shingles

In [None]:
# Count plot of Exterior1st, bars sorted from most to least common material,
# each labelled with its share of all houses.
sns.set(font_scale=1.5)
with sns.axes_style("whitegrid"):
    # Create the figure and its axes explicitly and hand the axes to seaborn.
    fig, ax = plt.subplots(figsize=(20, 10))
    ps = ((train['Exterior1st'].value_counts() / train['Exterior1st'].shape[0]) * 100)
    # One ranking drives both the bar order and the label values, so vals[k] is
    # always the percentage of order[k].
    ranking = np.argsort(ps.values)[::-1]
    vals = ps.values[ranking]
    order = ps.index.values[ranking].tolist()
    sns.countplot(x="Exterior1st", order=order, data=train, palette='Set2', ax=ax)
    ax.set_title('Occurence counts of Exterior Material types', size=27)
    ax.set_yticks(np.arange(0, 600, 50))
    ax.set_ylim(0, 550)
    ax.set_xlabel(xlabel="Exterior1st", size=23)
    # Index the plain array: integer lookups on the string-labelled `ps` Series
    # would be positional fallbacks, which pandas has deprecated.
    for i, p in enumerate(ax.patches):
        ax.annotate('{0:.2f}%'.format(vals[i]), (p.get_x() + 0.07, p.get_height() + 4))
    ax.set_ylabel('No.of Houses', size=23)

### Remarks:
- While Vinyl Siding remains the popular choice for external covering of the house, hard boards, metal siding and wooden sidings are the next best choices.