In [2]:
# Basic Libraries
import numpy as np
import pandas as pd
import seaborn as sb
import matplotlib.pyplot as plt # we only need pyplot
sb.set() # set the default Seaborn style for graphics

In [3]:
traindata = pd.read_csv('train.csv')
traindata.head()

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,,,,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,0,,,,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,0,,,,0,12,2008,WD,Normal,250000


In [4]:
print("Data type : ", type(traindata))
print("Data dims : ", traindata.shape)

Data type :  <class 'pandas.core.frame.DataFrame'>
Data dims :  (1460, 81)


In [5]:
traindata.info(max_cols=None)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1460 entries, 0 to 1459
Data columns (total 81 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Id             1460 non-null   int64  
 1   MSSubClass     1460 non-null   int64  
 2   MSZoning       1460 non-null   object 
 3   LotFrontage    1201 non-null   float64
 4   LotArea        1460 non-null   int64  
 5   Street         1460 non-null   object 
 6   Alley          91 non-null     object 
 7   LotShape       1460 non-null   object 
 8   LandContour    1460 non-null   object 
 9   Utilities      1460 non-null   object 
 10  LotConfig      1460 non-null   object 
 11  LandSlope      1460 non-null   object 
 12  Neighborhood   1460 non-null   object 
 13  Condition1     1460 non-null   object 
 14  Condition2     1460 non-null   object 
 15  BldgType       1460 non-null   object 
 16  HouseStyle     1460 non-null   object 
 17  OverallQual    1460 non-null   int64  
 18  OverallC

In [4]:
traindata[traindata['GarageType'].isna()]

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
39,40,90,RL,65.0,6040,Pave,,Reg,Lvl,AllPub,...,0,,,,0,6,2008,WD,AdjLand,82000
48,49,190,RM,33.0,4456,Pave,,Reg,Lvl,AllPub,...,0,,,,0,6,2009,New,Partial,113000
78,79,90,RL,72.0,10778,Pave,,Reg,Lvl,AllPub,...,0,,,,0,4,2010,WD,Normal,136500
88,89,50,C (all),105.0,8470,Pave,,IR1,Lvl,AllPub,...,0,,MnPrv,,0,10,2009,ConLD,Abnorml,85000
89,90,20,RL,60.0,8070,Pave,,Reg,Lvl,AllPub,...,0,,,,0,8,2007,WD,Normal,123600
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1349,1350,70,RM,50.0,5250,Pave,Pave,Reg,Lvl,AllPub,...,0,,,,0,12,2008,WD,Normal,122000
1407,1408,20,RL,,8780,Pave,,IR1,Lvl,AllPub,...,0,,MnPrv,,0,3,2009,WD,Normal,112000
1449,1450,180,RM,21.0,1533,Pave,,Reg,Lvl,AllPub,...,0,,,,0,8,2006,WD,Abnorml,92000
1450,1451,90,RL,60.0,9000,Pave,,Reg,Lvl,AllPub,...,0,,,,0,9,2009,WD,Normal,136000


In [5]:
traindata[traindata['GarageType'].notnull]

TypeError: notnull() takes 1 positional argument but 2 were given

In [None]:
traindata.info()

In [None]:
# Extract only the numeric data variables
trainnumdf = pd.DataFrame(traindata[["Id", "MSSubClass", "LotArea", "MoSold", "YrSold", "SalePrice"]])
trainnumdf.head()

In [None]:
trainnumdf.drop(['MoSold', 'YrSold', 'MSSubClass'], axis = 1) 
#traindata.select_dtypes(include='int64')

In [None]:
SalePriceData = pd.DataFrame(trainnumdf['SalePrice'])
SalePriceData.describe()

In [None]:
LotAreaData = pd.DataFrame(trainnumdf['LotArea'])
LotAreaData.describe()

In [None]:
# Set up matplotlib figure with three subplots
f, axes = plt.subplots(2, 3, figsize=(20, 15))

# Plot the basic uni-variate figures for HP
sb.boxplot(data = LotAreaData, orient = "h", ax = axes[0,0], color='g')
sb.histplot(data = LotAreaData, ax = axes[0,1])
sb.violinplot(data = LotAreaData, orient = "h", ax = axes[0,2], color='r')

# Plot the basic uni-variate figures for Attack
sb.boxplot(data = SalePriceData, orient = "h", ax = axes[1,0], color='g')
sb.histplot(data = SalePriceData, ax = axes[1,1])
sb.violinplot(data = SalePriceData, orient = "h", ax = axes[1,2], color='r')

In [None]:
# Create a joint dataframe by concatenating the two variables
trainjoint = pd.concat([LotAreaData, SalePriceData], axis = 1).reindex(LotAreaData.index) #reindex is to follow that index
trainjoint.head()

In [None]:
trainjoint.corr()

In [None]:
f= plt.figure(figsize=(12, 4))
sb.jointplot(data = trainjoint, x = "LotArea", y = "SalePrice")

In [None]:
# Extract only the numeric data variables
trainnum = pd.DataFrame(traindata[["LotArea", "LotFrontage", "SalePrice"]])
trainnum.head()

In [None]:
# Draw the distributions of all variables
f, axes = plt.subplots(3, 3, figsize=(20, 20))

count = 0
for var in trainnum:
    sb.boxplot(data = trainnum[var], orient = "h", ax = axes[count,0], color='g')
    sb.histplot(data = trainnum[var], ax = axes[count,1])
    sb.violinplot(data = trainnum[var], orient = "h", ax = axes[count,2], color='r')
    count += 1

In [None]:
sb.jointplot(data=trainnum, x="LotArea", y="LotFrontage", height=5)
sb.jointplot(data=trainnum, x="LotArea", y="SalePrice", height=5)
sb.jointplot(data=trainnum, x="LotFrontage", y="SalePrice", height=5)

NameError: name 'houseData' is not defined