In [1]:
import pandas as pd
import matplotlib.pyplot as plt
from tqdm import tqdm
import numpy as np

from keras.layers import Dense, Flatten, Input
from keras.layers.embeddings import Embedding
from keras.models import Model
from keras.preprocessing.sequence import pad_sequences
from keras.preprocessing.text import one_hot
from keras import Sequential
import keras
from sklearn.preprocessing import OneHotEncoder

from keras.layers import Dropout
import h5py  # compress and save features

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


# Loading data

In [2]:
%%time
# Read the complete 'features' dataset from the HDF5 file into memory as a
# NumPy array; the file is closed again when the with-block exits.
with h5py.File('features_train.h5', mode='r') as h5_file:
    features = np.array(h5_file['features'])

Wall time: 5min 18s


In [3]:
# Display the dimensions of the loaded feature matrix as the cell output.
features.shape

(244768, 9736)

In [None]:
%%time
# Read the complete 'feature_all' dataset from the HDF5 file into memory as a
# NumPy array; the file is closed again when the with-block exits.
with h5py.File('features_all_train.h5', mode='r') as h5_file:
    feature_combine = np.array(h5_file['feature_all'])

In [None]:
# Display the dimensions of the combined feature matrix as the cell output.
feature_combine.shape

In [4]:
%%time
# Read the complete 'salary' dataset (the regression target) from the HDF5
# file into memory as a NumPy array.
with h5py.File('features_salary.h5', mode='r') as h5_file:
    salary = np.array(h5_file['salary'])

Wall time: 414 ms


In [5]:
# The target vector must hold exactly one salary per feature row; catch a
# mismatched pair of input files here rather than deep inside model.fit().
assert salary.shape == (features.shape[0],), (
    "salary has shape %s but features has %d rows" % (salary.shape, features.shape[0])
)
salary.shape

(244768,)

# PCA analysis

In [None]:
def zeroMean(dataMat):
    """Centre every column of ``dataMat`` on zero.

    Parameters
    ----------
    dataMat : array-like
        Data whose mean is taken along axis 0 (i.e. per column).

    Returns
    -------
    tuple
        ``(centred, column_means)`` where ``centred`` is ``dataMat`` with the
        column-wise mean subtracted and ``column_means`` is that mean.
    """
    column_means = np.mean(dataMat, axis=0)
    return dataMat - column_means, column_means


def percentage2n(eigVals, percentage):
    """Count the largest eigenvalues needed to reach a share of their total.

    The eigenvalues are taken in descending order and accumulated until the
    running sum is at least ``percentage`` times the sum of all of them.

    Parameters
    ----------
    eigVals : array-like
        One-dimensional collection of eigenvalues, in any order.
    percentage : float
        Fraction of the total to cover, e.g. ``0.99``.

    Returns
    -------
    int
        The smallest number of leading eigenvalues whose sum reaches the
        threshold. Returns ``0`` for empty input, and ``len(eigVals)`` when
        the threshold cannot be reached at all (e.g. ``percentage > 1``),
        so callers always receive an integer instead of ``None``.
    """
    descending = np.sort(np.asarray(eigVals))[::-1]
    if descending.size == 0:
        return 0
    # cumsum accumulates left to right, so running_total[-1] is the grand total.
    running_total = np.cumsum(descending)
    reached = np.flatnonzero(running_total >= running_total[-1] * percentage)
    if reached.size == 0:
        # Threshold is unreachable: keep every component.
        return int(descending.size)
    return int(reached[0]) + 1


def pca(dataMat, percentage=0.99):
    """Project ``dataMat`` onto its leading principal components.

    Parameters
    ----------
    dataMat : array-like, 2-D
        Data with one observation per row and one variable per column
        (the covariance is computed with ``rowvar=False``).
    percentage : float, optional
        Share of the total eigenvalue sum the kept components must cover.
        Defaults to ``0.99``.

    Returns
    -------
    tuple
        ``(lowDDataMat, n)`` where ``lowDDataMat`` is the centred data
        projected onto the ``n`` eigenvectors with the largest eigenvalues,
        returned as ``numpy.matrix`` as before, and ``n`` is the number of
        components kept.

    Notes
    -----
    An approximation of the input can be rebuilt as
    ``lowDDataMat * n_eigVect.T`` plus the column means returned by
    ``zeroMean``.
    """
    newData, _ = zeroMean(dataMat)
    # np.cov collapses a single-column input to a 0-d value; eigh needs 2-D.
    covMat = np.atleast_2d(np.cov(newData, rowvar=False))
    # A covariance matrix is symmetric, so use eigh: it returns real
    # eigenvalues in ascending order, whereas the general-purpose eig may
    # return them unsorted or complex.
    eigVals, eigVects = np.linalg.eigh(covMat)
    n = percentage2n(eigVals, percentage)
    if n is None:
        # The threshold was never reached; keep every component.
        n = len(eigVals)
    # Reverse the ascending columns to get largest-first, then keep n of them.
    n_eigVect = eigVects[:, ::-1][:, :n]
    # np.asmatrix replaces np.mat, which no longer exists in NumPy 2.0, and
    # keeps `*` meaning matrix multiplication.
    lowDDataMat = np.asmatrix(newData) * np.asmatrix(n_eigVect)
    return lowDDataMat, n

**The dimensionality is too large to use this PCA function.**

# Create train and test datasets

In [None]:
# Training inputs: the first 122384 rows of the feature matrix.
x_train = features[:122384]
x_train.shape

In [None]:
# Test inputs: rows 122384 up to 244768 of the feature matrix. These are
# features, so they are named x_test; y_test is reserved for the matching
# salary targets.
x_test = features[122384:244768]
x_test.shape

In [None]:
# Training targets: the salaries for the first 122384 rows.
y_train = salary[:122384]
y_train.shape

In [None]:
# Test targets: the salaries for rows 122384 up to 244768.
y_test = salary[122384:244768]
y_test.shape

In [None]:
# `features` and `salary` are deliberately kept: the training cell further
# down fits on the full arrays, so deleting the names here makes it fail with
# a NameError on Restart & Run All. Deleting them would not release memory
# anyway while the train/test slices exist, because basic NumPy slices are
# views that keep the parent buffer alive.

# Build model

In [None]:
# Discard a model left over from an earlier run before rebuilding it. On a
# fresh kernel the name does not exist yet, so a missing `model` is not an
# error.
try:
    del model
except NameError:
    pass

In [6]:
# Fully connected regressor: 9736 inputs -> 3000 -> 1500 -> 1000 -> 1 output,
# with 50% dropout after each hidden layer.
model = Sequential([
    Dense(3000, input_dim=9736, activation='relu', use_bias=True),
    Dropout(0.5),
    Dense(1500, activation='relu', use_bias=True),
    Dropout(0.5),
    Dense(1000, activation='relu', use_bias=True),
    Dropout(0.5),
    Dense(1),  # single output with no activation
])
model.compile(loss='mean_squared_error', optimizer='adam')
# summary() prints the table itself and returns None; wrapping it in print()
# would only append a stray "None" to the output.
model.summary()

Instructions for updating:
keep_dims is deprecated, use keepdims instead
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_1 (Dense)              (None, 3000)              29211000  
_________________________________________________________________
dropout_1 (Dropout)          (None, 3000)              0         
_________________________________________________________________
dense_2 (Dense)              (None, 1500)              4501500   
_________________________________________________________________
dropout_2 (Dropout)          (None, 1500)              0         
_________________________________________________________________
dense_3 (Dense)              (None, 1000)              1501000   
_________________________________________________________________
dropout_3 (Dropout)          (None, 1000)              0         
_________________________________________________________________
den

In [None]:
# Same architecture as `model`, but for 11817 input features
# (presumably the width of `feature_combine` - TODO confirm).
model_1 = Sequential([
    Dense(3000, input_dim=11817, activation='relu', use_bias=True),
    Dropout(0.5),
    Dense(1500, activation='relu', use_bias=True),
    Dropout(0.5),
    Dense(1000, activation='relu', use_bias=True),
    Dropout(0.5),
    Dense(1),  # single output with no activation
])
model_1.compile(loss='mean_squared_error', optimizer='adam')
# summary() prints the table itself and returns None; wrapping it in print()
# would only append a stray "None" to the output.
model_1.summary()

In [7]:
# Change log_dir for each different activation function, and change the
# number for every run.
# NOTE(review): the directory name says 4000-1000, but the `model` built above
# uses 3000-1500-1000 hidden units - confirm the name is still the intended one.
tbCallBack = keras.callbacks.TensorBoard(
    log_dir='./4000-1000-adam/',
    histogram_freq=0,
    write_graph=True,
    write_images=True,
)

**On a GTX 960M, `batch_size=1024` uses around 50% of memory.**

In [None]:
# Discard the training history of an earlier run. On a fresh kernel the name
# does not exist yet, so a missing `hist` is not an error.
try:
    del hist
except NameError:
    pass

In [8]:
# Stop automatically once val_loss has not improved for 5 consecutive epochs.
# Without this the 200-epoch run has to be interrupted by hand, and a
# KeyboardInterrupt inside fit() means `hist` is never assigned.
early_stop = keras.callbacks.EarlyStopping(monitor='val_loss', patience=5)

# validation_split=0.2 holds out 20% of the samples for validation.
hist = model.fit(
    features,
    salary,
    batch_size=1024,
    epochs=200,
    shuffle=True,
    verbose=2,
    validation_split=0.2,
    callbacks=[tbCallBack, early_stop],
)

Train on 195814 samples, validate on 48954 samples
Epoch 1/200
 - 93s - loss: 367800436.0561 - val_loss: 191666416.2870
Epoch 2/200
 - 92s - loss: 175479727.3515 - val_loss: 177935801.3645
Epoch 3/200
 - 97s - loss: 164581185.2571 - val_loss: 172532111.0705
Epoch 4/200
 - 94s - loss: 158888142.2047 - val_loss: 168789190.7335
Epoch 5/200
 - 97s - loss: 154390113.6989 - val_loss: 167414591.9961
Epoch 6/200
 - 96s - loss: 150005641.3007 - val_loss: 164842823.6774
Epoch 7/200
 - 98s - loss: 146553810.4757 - val_loss: 163814662.0609
Epoch 8/200
 - 95s - loss: 141936764.8917 - val_loss: 163039940.4679
Epoch 9/200
 - 88s - loss: 138266565.7395 - val_loss: 161890443.5223
Epoch 10/200
 - 97s - loss: 134370369.7301 - val_loss: 161950086.2093
Epoch 11/200
 - 94s - loss: 129977194.4560 - val_loss: 160975604.3548
Epoch 12/200
 - 94s - loss: 125808518.3338 - val_loss: 162989022.0351
Epoch 13/200
 - 95s - loss: 122125092.6884 - val_loss: 166866115.2239
Epoch 14/200


KeyboardInterrupt: 

In [None]:
# Persist the per-epoch training curves. A Keras History object cannot be
# written to HDF5 directly, so each metric in `hist.history` (a dict of
# metric name -> list of per-epoch values) is stored as its own dataset under
# the 'model_1' group.
# NOTE(review): this stores only the loss curves; if the trained weights are
# wanted as well, save them separately with model.save(<path>).
with h5py.File('train_models.h5', 'w') as f:
    history_group = f.create_group('model_1')
    for metric_name, per_epoch_values in hist.history.items():
        history_group[metric_name] = np.asarray(per_epoch_values)