In [1]:
import numpy as np
import matplotlib.pyplot as plt
#to plot figures inline
%matplotlib inline
import pandas as pd
import seaborn as sns
import tensorflow as tf
import warnings

warnings.filterwarnings('ignore')

In [2]:
# ******************************** 1.Data exploration and Visualization *****************************

def get_train_set():
    df = pd.read_csv("csv-data-files/HousePrices/train.csv")
    labels = df.SalePrice
    df.drop(['SalePrice'], axis=1, inplace=True)
    return labels, df

def get_test_set():
    return None, pd.read_csv("csv-data-files/HousePrices/test.csv")

labels, df = get_test_set() #when applying for testing, change this to get_test_set

Ids = df.Id
df.drop(['Id'], axis=1, inplace=True) #drop Id column as it is not needed for training

#some basic info about size of data
print("number of records: ", len(df))
print("shape of main data frame :", df.shape)

df_numarical = df.select_dtypes(exclude=['object']) #extract only numarical features
df_numarical.isnull().sum()

('number of records: ', 1459)
('shape of main data frame :', (1459, 79))


MSSubClass         0
LotFrontage      227
LotArea            0
OverallQual        0
OverallCond        0
YearBuilt          0
YearRemodAdd       0
MasVnrArea        15
BsmtFinSF1         1
BsmtFinSF2         1
BsmtUnfSF          1
TotalBsmtSF        1
1stFlrSF           0
2ndFlrSF           0
LowQualFinSF       0
GrLivArea          0
BsmtFullBath       2
BsmtHalfBath       2
FullBath           0
HalfBath           0
BedroomAbvGr       0
KitchenAbvGr       0
TotRmsAbvGrd       0
Fireplaces         0
GarageYrBlt       78
GarageCars         1
GarageArea         1
WoodDeckSF         0
OpenPorchSF        0
EnclosedPorch      0
3SsnPorch          0
ScreenPorch        0
PoolArea           0
MiscVal            0
MoSold             0
YrSold             0
dtype: int64

In [3]:
#lets first apply mean value for null values in lot frontage and GarageYrBlt.
df_numarical.LotFrontage.fillna(df_numarical.LotFrontage.mean(), inplace=True) #fill NA (Not Available) values
df_numarical.GarageYrBlt.fillna(df_numarical.GarageYrBlt.median(), inplace=True)
df_numarical.MasVnrArea.fillna(df_numarical.MasVnrArea.mean(), inplace=True)
df_numarical.isnull().sum()

MSSubClass       0
LotFrontage      0
LotArea          0
OverallQual      0
OverallCond      0
YearBuilt        0
YearRemodAdd     0
MasVnrArea       0
BsmtFinSF1       1
BsmtFinSF2       1
BsmtUnfSF        1
TotalBsmtSF      1
1stFlrSF         0
2ndFlrSF         0
LowQualFinSF     0
GrLivArea        0
BsmtFullBath     2
BsmtHalfBath     2
FullBath         0
HalfBath         0
BedroomAbvGr     0
KitchenAbvGr     0
TotRmsAbvGrd     0
Fireplaces       0
GarageYrBlt      0
GarageCars       1
GarageArea       1
WoodDeckSF       0
OpenPorchSF      0
EnclosedPorch    0
3SsnPorch        0
ScreenPorch      0
PoolArea         0
MiscVal          0
MoSold           0
YrSold           0
dtype: int64

In [4]:
df_categorical = df.select_dtypes(include=['object']) # select object (string) dtype columns
df_categorical.isnull().sum()

MSZoning            4
Street              0
Alley            1352
LotShape            0
LandContour         0
Utilities           2
LotConfig           0
LandSlope           0
Neighborhood        0
Condition1          0
Condition2          0
BldgType            0
HouseStyle          0
RoofStyle           0
RoofMatl            0
Exterior1st         1
Exterior2nd         1
MasVnrType         16
ExterQual           0
ExterCond           0
Foundation          0
BsmtQual           44
BsmtCond           45
BsmtExposure       44
BsmtFinType1       42
BsmtFinType2       42
Heating             0
HeatingQC           0
CentralAir          0
Electrical          0
KitchenQual         1
Functional          2
FireplaceQu       730
GarageType         76
GarageFinish       78
GarageQual         78
GarageCond         78
PavedDrive          0
PoolQC           1456
Fence            1169
MiscFeature      1408
SaleType            1
SaleCondition       0
dtype: int64

In [5]:
#now we can remove columns like Alley, PoolQC, Fence, MiscFeature
df_categorical.drop(['Alley', 'PoolQC', 'Fence', 'MiscFeature', 'FireplaceQu'], axis=1, inplace=True)

In [6]:
df_categorical.isnull().sum()

MSZoning          4
Street            0
LotShape          0
LandContour       0
Utilities         2
LotConfig         0
LandSlope         0
Neighborhood      0
Condition1        0
Condition2        0
BldgType          0
HouseStyle        0
RoofStyle         0
RoofMatl          0
Exterior1st       1
Exterior2nd       1
MasVnrType       16
ExterQual         0
ExterCond         0
Foundation        0
BsmtQual         44
BsmtCond         45
BsmtExposure     44
BsmtFinType1     42
BsmtFinType2     42
Heating           0
HeatingQC         0
CentralAir        0
Electrical        0
KitchenQual       1
Functional        2
GarageType       76
GarageFinish     78
GarageQual       78
GarageCond       78
PavedDrive        0
SaleType          1
SaleCondition     0
dtype: int64

In [7]:
# lets only select few categorical data now as following
#['MSZoning', 'Alley', 'Utilities', 'BldgType', 'HouseStyle', 'Exterior1st', 'BsmtCond', 'KitchenQual', 'GarageType']
df_categorical_selected = df_categorical.loc[:,['MSZoning','Utilities','BldgType','HouseStyle','BsmtCond','KitchenQual','GarageType']]
df_categorical_selected.head()

Unnamed: 0,MSZoning,Utilities,BldgType,HouseStyle,BsmtCond,KitchenQual,GarageType
0,RH,AllPub,1Fam,1Story,TA,TA,Attchd
1,RL,AllPub,1Fam,1Story,TA,Gd,Attchd
2,RL,AllPub,1Fam,2Story,TA,TA,Attchd
3,RL,AllPub,1Fam,2Story,TA,Gd,Attchd
4,RL,AllPub,TwnhsE,1Story,TA,Gd,Attchd


In [8]:
df_categorical_selected.isnull().sum()

MSZoning        4
Utilities       2
BldgType        0
HouseStyle      0
BsmtCond       45
KitchenQual     1
GarageType     76
dtype: int64

In [9]:
print("MSZoning mode() : ", df_categorical_selected.MSZoning.mode())
print("Utilities mode() : ", df_categorical_selected.Utilities.mode())
print("KitchenQual mode() : ", df_categorical_selected.KitchenQual.mode())
print("BsmtCond mode() : ",df_categorical_selected.BsmtCond.mode())
print("GarageType mode() : ", df_categorical_selected.GarageType.mode())

('MSZoning mode() : ', 0    RL
dtype: object)
('Utilities mode() : ', 0    AllPub
dtype: object)
('KitchenQual mode() : ', 0    TA
dtype: object)
('BsmtCond mode() : ', 0    TA
dtype: object)
('GarageType mode() : ', 0    Attchd
dtype: object)


In [10]:
df_categorical_selected.MSZoning.fillna('RL', inplace=True)
df_categorical_selected.Utilities.fillna('AllPub', inplace=True)
df_categorical_selected.KitchenQual.fillna('TA', inplace=True)
df_categorical_selected.BsmtCond.fillna('TA', inplace=True)
df_categorical_selected.GarageType.fillna('Attchd', inplace=True)
df_categorical_selected.isnull().sum()

MSZoning       0
Utilities      0
BldgType       0
HouseStyle     0
BsmtCond       0
KitchenQual    0
GarageType     0
dtype: int64

In [11]:
#  *************************   changing categorical data into numbers. ************************
print(len(df_categorical_selected.MSZoning.unique()))
df_categorical_selected.MSZoning = df_categorical_selected.MSZoning.astype('category', categories=['A','C','FV','I','RH','RL', 'RP', 'RM'])
MSZoning = pd.get_dummies(df_categorical_selected.MSZoning, prefix="MSZoning")
df_categorical_selected.drop(['MSZoning'], axis =1, inplace=True)
df_categorical_selected = df_categorical_selected.join(MSZoning)


5


In [12]:
###
print(len(df_categorical_selected.Utilities.unique()))
df_categorical_selected.Utilities = df_categorical_selected.Utilities.astype('category', categories=['AllPub','NoSewr','NoSeWa','ELO'])
Utilities = pd.get_dummies(df_categorical_selected.Utilities, prefix="Utilities")
df_categorical_selected.drop(['Utilities'], axis =1, inplace=True)
df_categorical_selected = df_categorical_selected.join(Utilities)


1


In [13]:
print(len(df_categorical_selected.BldgType.unique()), len(df_categorical_selected.HouseStyle.unique()))
###
df_categorical_selected.BldgType = df_categorical_selected.BldgType.astype('category', categories=['1Fam','2FmCon','Duplx','TwnhsE','TwnhsI'])
BldgType = pd.get_dummies(df_categorical_selected.BldgType, prefix="BldgType")
df_categorical_selected.drop(['BldgType'], axis=1, inplace=True)
df_categorical_selected = df_categorical_selected.join(BldgType)

df_categorical_selected.HouseStyle = df_categorical_selected.HouseStyle.astype('category', categories=['1Story','1.5Fin','1.5Unf','2Story','2.5Fin', '2.5Unf', 'SFoyer', 'SLvl'])
HouseStyle = pd.get_dummies(df_categorical_selected.HouseStyle, prefix="HouseStyle")
df_categorical_selected.drop(['HouseStyle'], axis=1, inplace=True)
df_categorical_selected = df_categorical_selected.join(HouseStyle)


(5, 7)


In [14]:
###
print(len(df_categorical_selected.BsmtCond.unique()))
df_categorical_selected.BsmtCond = df_categorical_selected.BsmtCond.astype('category', categories=['Ex','Gd','TA','Fa','Po', 'NA'])
BsmtCond = pd.get_dummies(df_categorical_selected.BsmtCond, prefix="BsmtCond")
df_categorical_selected.drop(['BsmtCond'], axis=1, inplace=True)
df_categorical_selected = df_categorical_selected.join(BsmtCond)


4


In [15]:
print(len(df_categorical_selected.KitchenQual.unique()), len(df_categorical_selected.GarageType.unique()))

df_categorical_selected.KitchenQual = df_categorical_selected.KitchenQual.astype('category', categories=['Ex','Gd','TA','Fa','Po'])
KitchenQual = pd.get_dummies(df_categorical_selected.KitchenQual, prefix="KitchenQual")
df_categorical_selected.drop(['KitchenQual'], axis=1, inplace=True)
df_categorical_selected = df_categorical_selected.join(KitchenQual)

df_categorical_selected.GarageType = df_categorical_selected.GarageType.astype('category', categories=['2Types','Attchd','Basment','BuiltIn','CarPort','Detchd','NA'])
GarageType = pd.get_dummies(df_categorical_selected.GarageType, prefix="GarageType")
df_categorical_selected.drop(['GarageType'], axis=1, inplace=True)
df_categorical_selected = df_categorical_selected.join(GarageType)
df_categorical_selected.head()

(4, 6)


Unnamed: 0,MSZoning_A,MSZoning_C,MSZoning_FV,MSZoning_I,MSZoning_RH,MSZoning_RL,MSZoning_RP,MSZoning_RM,Utilities_AllPub,Utilities_NoSewr,...,KitchenQual_TA,KitchenQual_Fa,KitchenQual_Po,GarageType_2Types,GarageType_Attchd,GarageType_Basment,GarageType_BuiltIn,GarageType_CarPort,GarageType_Detchd,GarageType_NA
0,0,0,0,0,1,0,0,0,1,0,...,1,0,0,0,1,0,0,0,0,0
1,0,0,0,0,0,1,0,0,1,0,...,0,0,0,0,1,0,0,0,0,0
2,0,0,0,0,0,1,0,0,1,0,...,1,0,0,0,1,0,0,0,0,0
3,0,0,0,0,0,1,0,0,1,0,...,0,0,0,0,1,0,0,0,0,0
4,0,0,0,0,0,1,0,0,1,0,...,0,0,0,0,1,0,0,0,0,0


In [16]:
labels

In [17]:
# Apply Min Max scalling function to numarical columns to get their data between 0 and 1 
def min_max(x, min, max):
    x = (x - min) / (max - min)
    return x

func = lambda x: min_max(x, float(x.min()), x.max()) 

df_numarical = df_numarical.astype(float)
df_numarical = df_numarical.apply(func)
df_numarical.head()

# Comment these when doing the prediction function
# Change the labels/ ground truth to Min Max scalling too
# labels = labels.astype(float)

min = 34900.0  #labels.min() #34900.0 #update these when predicting with noted values when training.
max = 755000.0  #labels.max() #755000.0


# labels = labels.map(lambda x: min_max(x, min, max))
# print ("min is :",min, " max is :",max, " max - min is :",(max-min)) #take a note of these.

In [18]:
def get_correct_prediction(n):
#     print("n is :", n[0][0][0])
#     print("type of n[0][0][0] :", type(n[0][0][0].item()))
    real = n[0].item() * (max - min) + min
    return real

In [19]:
#Join numarical and categorical data to be a one DataFrame again
df = df_numarical.merge(df_categorical_selected, left_index = True, right_index = True)
df.head()

Unnamed: 0,MSSubClass,LotFrontage,LotArea,OverallQual,OverallCond,YearBuilt,YearRemodAdd,MasVnrArea,BsmtFinSF1,BsmtFinSF2,...,KitchenQual_TA,KitchenQual_Fa,KitchenQual_Po,GarageType_2Types,GarageType_Attchd,GarageType_Basment,GarageType_BuiltIn,GarageType_CarPort,GarageType_Detchd,GarageType_NA
0,0.0,0.329609,0.184147,0.444444,0.625,0.625954,0.183333,0.0,0.116708,0.094364,...,1,0,0,0,1,0,0,0,0,0
1,0.0,0.335196,0.232124,0.555556,0.625,0.603053,0.133333,0.083721,0.230175,0.0,...,0,0,0,0,1,0,0,0,0,0
2,0.235294,0.296089,0.224197,0.444444,0.5,0.900763,0.8,0.0,0.197257,0.0,...,1,0,0,0,1,0,0,0,0,0
3,0.235294,0.318436,0.154326,0.555556,0.625,0.908397,0.8,0.015504,0.150125,0.0,...,0,0,0,0,1,0,0,0,0,0
4,0.588235,0.122905,0.064121,0.777778,0.5,0.862595,0.7,0.0,0.065586,0.0,...,0,0,0,0,1,0,0,0,0,0


In [20]:
import math
#placeholders
x = tf.placeholder(tf.float32, shape=[None, 79])
y = tf.placeholder(tf.float32, shape=[None, 1])

#settings
batch_size = 10
layer1_nodes = 30
layer2_nodes = 10
layer3_nodes = 2
output_nodes = 1
num_epochs = 5000

#create model
def model(data):
    layer_1={
        'weights' : tf.Variable(tf.random_normal([79, layer1_nodes])),
        'bias' : tf.Variable(tf.random_normal([layer1_nodes]))
    }
    
    layer_2={
         'weights':tf.Variable(tf.random_normal([layer1_nodes, layer2_nodes])),
         'bias' : tf.Variable(tf.random_normal([layer2_nodes]))
     }
    
    layer_3={
         'weights':tf.Variable(tf.random_normal([layer2_nodes, layer3_nodes])),
         'bias' : tf.Variable(tf.random_normal([layer3_nodes]))
     }
    
    output_layer={
        'weights': tf.Variable(tf.random_normal([layer3_nodes, output_nodes])),
        'bias' : tf.Variable(tf.random_normal([output_nodes]))
    }
    
    operation_layer1 = tf.add(tf.matmul(data, layer_1['weights']), layer_1['bias'])
    operation_layer1 = tf.nn.leaky_relu(operation_layer1)
    
    operation_layer2 = tf.add(tf.matmul(operation_layer1, layer_2['weights']), layer_2['bias'])
    operation_layer2 = tf.nn.leaky_relu(operation_layer2)
    
    operation_layer3 = tf.add(tf.matmul(operation_layer2, layer_3['weights']), layer_3['bias'])
    operation_layer3 = tf.nn.leaky_relu(operation_layer3)
    
    operation_output = tf.add(tf.matmul(operation_layer3, output_layer['weights']), output_layer['bias'])
    return operation_output

In [21]:
def train(features):
    
    Y = np.array(labels.values) #convert to numpy.ndarray
    X = df.values #convert to numpy.ndarray
    print("shape of Y is " +str(Y.shape), " and shape of X is ",str(X.shape))

    #make labels in correct shape
    Y = Y.reshape(1460,1)
    print("shape of Y is after reshapeing " +str(Y.shape))
    
    #split data for training and testing
    split_size = int(1 * len(Y)) #when doing final training make int(1 * len(Y)) 
    train_x = X[:split_size]
    train_y = Y[:split_size]
    test_x = X[split_size:]
    test_y = Y[split_size:]
    print("train_y shape :",train_y.shape, "  train_x shpape :", train_x.shape)
    
    
    prediction = model(features)
    #calculate loss and define an optimizer
    loss = tf.reduce_mean(tf.square(prediction - y))
    optimizer = tf.train.GradientDescentOptimizer(0.001).minimize(loss)
    
    #save file location and Saver object
    save_file = './HousePrices_train_model.ckpt'
    saver = tf.train.Saver()
    
    loss_array = []
    #start a session
    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        
        for epoch in range(num_epochs):
            epoch_loss = 0
  
            #handle batches
            i = 0
            while i < len(X):
                start = i
                end = i + batch_size
                batch_x = np.array(train_x[start:end])  
                batch_y = np.array(train_y[start:end])
                _, l = sess.run([optimizer, loss], feed_dict={x:batch_x, y:batch_y})

                loss_array.append(l)
                if not math.isnan(l):
                    epoch_loss += l

                i += batch_size
            
            print('Epoch', epoch, ' completed out of ', num_epochs, ' loss: ', epoch_loss)
        
        testing_loss = tf.reduce_mean(tf.square(prediction - y))
        
        print('testing_loss : ', sess.run([testing_loss], feed_dict={x:test_x, y:test_y}))
        #save the model (this save process should done within the session)
        saver.save(sess, save_file)
        print("Trained model saved") 

    print(" ")
    print("loss array lenght : ",len(loss_array))
    plt.plot(loss_array)
    plt.show()


In [22]:
def find_neg(x):
    if x < 0:
        x = x * (-1)
    return x

func = lambda x: find_neg(x)

def predict(features):
    prediction_array =[]
    X = df.values #convert to numpy.ndarray
    
    prediction = model(features) # operation to get predictions using the trained model.
    
    #saved file location to retriev Saver object
    save_file = './HousePrices_train_model.ckpt'
    saver = tf.train.Saver()
    
    with tf.Session() as sess:
        saver.restore(sess, save_file)
        
        j = 1
        while j <= len(X):
            predict = sess.run([prediction], feed_dict={x: X[j-1:j]})
            prediction_array.append(get_correct_prediction(predict))
            j += 1
    
    
    
    predic_series = pd.Series(prediction_array)
    predic_series = predic_series.map(func)
    predic_series.fillna(predic_series.mean(), inplace=True)


    ids = Ids.astype(int)

    df1 = pd.DataFrame(ids)
    df1.columns = ['Id']

    df2 = pd.DataFrame(predic_series)
    df2.columns = ['SalePrice']

    df1 = df1.merge(df2, left_index = True, right_index = True)
    df1.to_csv('HousePrice_solution_file.csv', index=False)
    print(df1.head())
    print("saved the csv file")

In [23]:
# call the necessary functions here
# train(x)
predict(x)

INFO:tensorflow:Restoring parameters from ./HousePrices_train_model.ckpt
     Id      SalePrice
0  1461  125537.906331
1  1462  206789.288154
2  1463  183616.558610
3  1464  216349.085969
4  1465  209698.783906
saved the csv file
