# Imports

In [None]:
import keras 
import pandas as pd 
import numpy as np 
import io

# Data Upload

In [None]:
# Load the listings CSV from the current working directory; the frames used by
# the cells below are derived from this one.
AllSeattleData  = pd.read_csv('seattle_01.csv')

# Preprocessing

## Show Data

In [None]:
# Print every column instead of eliding the middle ones with "...".
# The full option name is used; the shortened 'display.max_column' only works
# because pandas pattern-matches option names.
pd.set_option('display.max_columns', None)
# Preview the first ten rows of the raw data.
print(AllSeattleData.head(10))

In [None]:
# DataFrame.info() writes its summary directly and returns None, so wrapping
# it in print() only appended a stray "None" line.
AllSeattleData.info()

In [None]:
# Positional indices (in the raw CSV column order) of the columns that are
# left out of the working frame.
columnIndexToRemove = [0, 1, 2, 4, 5, 11, 17, 14, 15, 16]
# Translate the positions to labels once, then drop by label.
removedColumns = AllSeattleData.columns[columnIndexToRemove]
SeattleData = AllSeattleData.drop(columns=removedColumns)
print("Removed DataFrame")
# info() prints its own report and returns None; call it directly so no
# extra "None" line is printed.
AllSeattleData.iloc[:, columnIndexToRemove].info()

In [None]:
from keras.utils import to_categorical
from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()
# Work on a copy: a plain assignment would alias SeattleData, so the label
# encoding below would also overwrite SeattleData['room_type'], which the EDA
# cells still plot.
EncodedSeattleData = SeattleData.copy()
# Replace the room_type labels with integer codes.
EncodedSeattleData['room_type'] = le.fit_transform(EncodedSeattleData['room_type'])
# One indicator column per integer code.
encoded = to_categorical(EncodedSeattleData['room_type'])
newLabels = ["room_type{}".format(x) for x in range(encoded.shape[1])]
# Reuse the frame's index so concat lines the rows up by label rather than
# relying on both sides happening to share a default index.
dfToAdd = pd.DataFrame(encoded, columns=newLabels, index=EncodedSeattleData.index)
EncodedSeattleData = pd.concat([EncodedSeattleData, dfToAdd], axis=1)

In [None]:
# info() prints its own report and returns None; calling it directly avoids
# the extra "None" line that print() would add.
SeattleData.info()

In [None]:
# The integer-coded room_type column is no longer needed in this frame.
EncodedSeattleData = EncodedSeattleData.drop(columns='room_type')

# Replace missing values with the mean of their own column. This is done as a
# single frame-level fillna: calling fillna(inplace=True) on a selected column
# is chained assignment, which pandas warns about and which does not update
# the parent frame once Copy-on-Write is active.
# numeric_only=True leaves any non-numeric column untouched instead of failing.
EncodedSeattleData = EncodedSeattleData.fillna(EncodedSeattleData.mean(numeric_only=True))

In [None]:
# info() prints its own report and returns None; calling it directly avoids
# the extra "None" line that print() would add.
EncodedSeattleData.info()

In [None]:
# Move 'price' to the LAST position: the modelling cells slice iloc[:, :-1] as
# the features and iloc[:, -1] as the target. Appending (rather than inserting
# at a hard-coded index) keeps this correct for any number of columns.
cols = list(EncodedSeattleData.columns)
cols.append(cols.pop(cols.index('price')))
EncodedSeattleData = EncodedSeattleData.loc[:, cols]

In [None]:
# Print the frame for visual inspection.
print(EncodedSeattleData)

# EDA

In [None]:
import seaborn as sns

# Density curve of price with a rug of the individual observations.
# distplot() is deprecated in seaborn; kdeplot() + rugplot() on the same axes
# is the equivalent of distplot(hist=False, rug=True).
# Missing prices are dropped first, as distplot did internally.
priceValues = SeattleData['price'].dropna()
sns.kdeplot(priceValues)
sns.rugplot(priceValues)

## Categorical Freq Plot

In [None]:
# Frequency of each room type. The series is passed as x= explicitly: newer
# seaborn versions treat the first positional argument as `data`, not `x`.
sns.countplot(x=SeattleData['room_type'])

In [None]:
# Every column except the categorical room_type, kept in DataFrame column
# order. (A set of column names iterates in an arbitrary order that can change
# between sessions, which shuffled the plots from run to run.)
myInts = [column for column in SeattleData.columns if column != 'room_type']
print(myInts)
# One joint plot of each remaining column against price.
for variable in myInts:
    sns.jointplot(x=SeattleData[variable], y=SeattleData['price'])

## Heatmap

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

# Pairwise correlations between all columns of the encoded frame.
corr = EncodedSeattleData.corr()

# Boolean mask selecting the upper triangle (diagonal included).
# The builtin bool is used because the np.bool alias was removed from NumPy.
mask = np.triu(np.ones_like(corr, dtype=bool))

# Set up the matplotlib figure
f, ax = plt.subplots(figsize=(20, 15))

# Generate a custom diverging colormap
cmap = sns.diverging_palette(220, 10, as_cmap=True)

# Draw the heatmap on the axes created above, hiding the masked upper triangle
# (the mask used to be computed but never handed to heatmap()).
sns.heatmap(corr, mask=mask, cmap=cmap, vmax=.3, center=0,
            square=True, linewidths=1, ax=ax)

In [None]:
# Original author's note: EncodedSeattleData.describe() showed that about 75%
# of the rows have a price below 125.
# Print, for every column, the number of non-null values among the rows whose
# price is at or above that threshold.
print(EncodedSeattleData[EncodedSeattleData['price'] >=125].count())

# Neural Net, forward Selection model optimization

In [None]:
from keras import Sequential, optimizers, callbacks
from keras import losses
from keras.layers import Dense
from sklearn.metrics import r2_score
import pandas as pd
import numpy as np
from sklearn.model_selection import KFold
import math
# List of feature column names.
# NOTE(review): not referenced anywhere else in the visible part of this
# file — confirm whether it is still needed.
scalaFSelect  = ['overall_satisfaction', 'accommodates', 'bedrooms', 'bathrooms']

In [None]:
def buildModel(modelName,nfeat,act_func='tanh',optimzer=None):
    """Build and compile a small fully connected regression network.

    Parameters
    ----------
    modelName : str
        Architecture to build. Hidden layer widths per name:
        'perceptron' -> (8,), 'neuralnet_3L' -> (5, 3),
        'neuralnet_4L' -> (8, 5, 2). Every architecture ends in a single
        linear output unit.
    nfeat : int
        Number of input features (input dimension of the first layer).
    act_func : str or callable
        Activation used by every hidden layer.
    optimzer : keras optimizer, optional
        Optimizer passed to ``compile``. When omitted, a new
        ``Adam(learning_rate=0.1)`` is created for this call, so models built
        with the default no longer share one optimizer instance.

    Returns
    -------
    The compiled ``Sequential`` model (mean squared error loss).

    Raises
    ------
    ValueError
        If ``modelName`` is not one of the supported architectures.
    """
    hiddenLayers = {
        'perceptron': (8,),
        'neuralnet_3L': (5, 3),
        'neuralnet_4L': (8, 5, 2),
    }
    # Fail early: an unknown name used to fall through every branch and
    # compile a network without any layers.
    if modelName not in hiddenLayers:
        raise ValueError(
            "Unknown modelName {!r}; expected one of {}".format(
                modelName, sorted(hiddenLayers)))

    # Create the default optimizer per call rather than once in the signature.
    if optimzer is None:
        optimzer = optimizers.Adam(learning_rate=0.1)

    widths = hiddenLayers[modelName]
    model = Sequential()
    # First hidden layer also declares the input dimension.
    model.add(Dense(widths[0], input_dim=nfeat, activation=act_func, use_bias=True))
    # Remaining hidden layers.
    for units in widths[1:]:
        model.add(Dense(units, activation=act_func))
    # Output layer: one unit, no activation (i.e. linear).
    model.add(Dense(1))
    model.compile(loss=losses.mean_squared_error, optimizer=optimzer)

    return model
def adj_r2(rSquare, numIndependentVar, numSamples):
    """Return the adjusted R^2 for a fit.

    Computes 1 - (1 - rSquare) * (numSamples - 1) / (numSamples - numIndependentVar - 1).
    """
    unexplainedShare = 1 - rSquare
    totalDof = numSamples - 1
    residualDof = numSamples - numIndependentVar - 1
    return 1 - unexplainedShare * totalDof / residualDof

def graphAttsVRVals(rsq, rCv, rbar, numParams, columnName):
    """Plot three score series against the number of parameters.

    ``rsq``, ``rCv`` and ``rbar`` are each drawn as a line over ``numParams``
    (labelled "R^2", "rCv" and "rAdj"); ``columnName`` is inserted into the
    title. The figure is shown and then cleared.
    """
    import matplotlib.pyplot as plt

    labelledSeries = (
        (rsq, "R^2"),
        (rCv, "rCv"),
        (rbar, "rAdj"),
    )
    for scores, legendLabel in labelledSeries:
        plt.plot(numParams, scores, label=legendLabel)

    plt.xlabel("Number params")
    plt.ylabel("RVals")
    plt.title("RVals for Y={}".format(columnName))
    plt.legend()

    # Display, then reset the current figure so the next call starts clean.
    plt.show()
    plt.clf()

In [None]:
def forwardSelect( XIndices, colsToUse, xVals, yVals, modelParams, previousRBase):
    """Run one forward-selection step: find the best single feature to add.

    For every column position in ``XIndices`` that is not already in
    ``colsToUse``, a fresh model is built on ``colsToUse + [candidate]``,
    fitted on the full data, and scored with R^2 on that same data.

    Parameters
    ----------
    XIndices : iterable of int
        Positional indices of all candidate feature columns of ``xVals``.
    colsToUse : list of int
        Positional indices of the columns already in the model.
    xVals : pandas.DataFrame
        Feature matrix; columns are selected by position.
    yVals : array-like
        Target values passed to ``fit`` and ``r2_score``.
    modelParams : dict
        Must provide 'model', 'activation', 'optimizer' and 'epochs'.
    previousRBase : float
        R^2 of the current model; a candidate is accepted only if it beats it.

    Returns
    -------
    tuple
        ``(index, rSquare)`` of the best candidate when it improves on
        ``previousRBase``; otherwise ``(-1, None)``.
    """
    featuresToUse = list(colsToUse)
    alreadyUsed = set(featuresToUse)
    # Candidates keep the order of XIndices (duplicates removed) so that ties
    # are broken the same way on every run.
    featuresToTest = [idx for idx in dict.fromkeys(XIndices) if idx not in alreadyUsed]

    # Debug output: the columns of the current base model and the target.
    xInput = xVals.iloc[:, featuresToUse]
    print(xInput)
    print(yVals)

    # Nothing left to try.
    if not featuresToTest:
        return -1, None

    # R^2 obtained when each candidate is added; float dtype so max()/idxmax()
    # operate on numbers rather than on an object column.
    analyzer = pd.Series(index=featuresToTest, dtype=float)

    for newColumn in featuresToTest:
        # Base columns plus the candidate under test.
        combinedX = featuresToUse + [newColumn]
        # Select by position once and use the same frame for fit and predict
        # (the selection used to mix .iloc and .loc).
        xSubset = xVals.iloc[:, combinedX]

        myModel = buildModel(modelParams['model'], len(combinedX), modelParams['activation'], modelParams['optimizer'])
        myModel.fit(xSubset, yVals, epochs=modelParams['epochs'], verbose=1)

        analyzer.loc[newColumn] = r2_score(yVals, myModel.predict(xSubset))
        print(analyzer)

    bestRSquare = analyzer.max()
    # Explicit NaN check instead of a truthiness test: a best score of exactly
    # 0.0 is a valid result and must still be compared against the base.
    if pd.notna(bestRSquare) and bestRSquare > previousRBase:
        return analyzer.idxmax(), bestRSquare
    # No candidate improved on the base model.
    return -1, None

def forwardSelectAll(xAtts, yAtts, modelParams):
    """Repeat forward-selection steps until no feature improves R^2.

    Each round asks ``forwardSelect`` for the best column to add. When one is
    found, it is appended to the selection, and a 5-fold shuffled
    cross-validated R^2 and the adjusted R^2 are recorded for the enlarged
    feature set.

    Returns ``(cols, numFeatures, rSqVals, rCvVals, rBarVals)``. Every list
    starts with an initial entry (column 0, 1 feature, -999, 0, 0) and gains
    one entry per accepted feature.

    NOTE(review): column 0 is always part of the selection without being
    evaluated — confirm this is intended.
    """
    totalColumns = len(xAtts.columns)
    candidateIdx = list(range(totalColumns))

    # Histories, each seeded with the entry for the starting state.
    cols = [0]
    numFeatures = [1]
    rSqVals = [-999]
    rCvVals = [0]
    rBarVals = [0]

    def _foldScore(trainRows, testRows):
        # Fit a fresh model on the training rows, score R^2 on the held-out rows.
        net = buildModel(modelParams['model'], numFeatures[-1], modelParams['activation'], modelParams['optimizer'])
        net.fit(xAtts.iloc[trainRows, cols], yAtts.iloc[trainRows], verbose=0, epochs=modelParams['epochs'])
        return r2_score(yAtts.iloc[testRows], net.predict(xAtts.iloc[testRows, cols]))

    # At most one round per column.
    for _ in range(totalColumns):
        bestIdx, bestRSq = forwardSelect(candidateIdx, cols, xAtts, yAtts, modelParams, rSqVals[-1])
        if bestIdx == -1:
            # No remaining column improves the model.
            break

        cols.append(bestIdx)
        numFeatures.append(numFeatures[-1] + 1)
        rSqVals.append(bestRSq)

        # Cross-validated R^2 for the enlarged feature set.
        splitter = KFold(n_splits=5, shuffle=True)
        foldScores = [
            _foldScore(trainRows, testRows)
            for trainRows, testRows in splitter.split(xAtts.iloc[:, cols], yAtts)
        ]
        rCvVals.append(sum(foldScores) / len(foldScores))
        rBarVals.append(adj_r2(bestRSq, numFeatures[-1], len(yAtts)))

    return cols, numFeatures, rSqVals, rCvVals, rBarVals

## NNXL

## Forward Select. Careful: this cell takes a long time to run.

In [None]:
# `preprocessing` is used below but was never imported anywhere in the file.
from sklearn import preprocessing

myOptimizer = optimizers.SGD(learning_rate=.05)

# Settings forwarded to buildModel / fit by the selection routines.
myNNXLModelParams = {'model':'neuralnet_4L', 'optimizer':myOptimizer, 'activation': 'sigmoid', 'epochs':200, 'lr':.05}

# Work on a copy whose columns are renamed to their positions.
myNNXLData = EncodedSeattleData.copy()
myNNXLData.columns = list(range(0, len(myNNXLData.columns)))
# Features are every column but the last; the last column is the target.
X = myNNXLData.iloc[:, :-1]
y = myNNXLData.iloc[:, -1]
# Standardise features and target (scale() returns arrays, hence the re-wrap;
# the target is reshaped to a single column first).
X = pd.DataFrame(preprocessing.scale(X))
y = pd.DataFrame(preprocessing.scale(y.to_numpy().reshape(-1, 1)))

myCOls, myNumFeat, rVals, rCV, Radj = forwardSelectAll(X, y, myNNXLModelParams)

In [None]:
# Graphing: overwrite the first R^2 entry (the -999 starting value used by
# forwardSelectAll) with 0 so it does not distort the plot.
rVals[0] = 0
for scoreSeries in (rVals, Radj, rCV):
    print(scoreSeries)
graphAttsVRVals(rVals, rCV, Radj, myNumFeat, 'price')


In [None]:
# Keep only the columns chosen by forward selection.
myX = X.iloc[:, myCOls]
print(myX)
myModel = buildModel(myNNXLModelParams['model'], len(myX.columns), myNNXLModelParams['activation'], optimzer=myNNXLModelParams['optimizer'] )
# Take the epoch count from the shared parameter dict so the final fit always
# matches the configuration used during selection (it was hard-coded again).
myModel.fit(x=myX, y=y, epochs=myNNXLModelParams['epochs'], verbose=0)

# Predict once and plot actual vs. predicted values by row position.
predictedY = myModel.predict(myX)
rowPositions = range(0, len(myX))
_, ax = plt.subplots()
ax.scatter(x=rowPositions, y=y, c='blue', label='actual')
ax.scatter(x=rowPositions, y=predictedY, c='red', label='predicted')
plt.legend()

In [None]:
# Run inference once and reuse the result for both the dump and the R^2 score
# (predict() used to be called twice on the same input).
finalPredictions = myModel.predict(myX)
print(finalPredictions)
print(r2_score(y, finalPredictions))

In [None]:
print("Features that were selected:")
# Map the selected column positions back to the original column names.
selectedNames = EncodedSeattleData.columns[list(myCOls)]
print(selectedNames)