In [1]:
%%writefile util.py

import os
import glob
import pandas as pd
import numpy as np
import shutil
from math import sqrt
from numpy import concatenate
from collections import namedtuple
from matplotlib import pyplot
import matplotlib.pyplot as plt
import tensorflow as tf
import keras
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import LSTM,GRU
from keras.models import model_from_json
from scipy.stats import gaussian_kde
from scipy.spatial import distance
from sklearn.cluster import KMeans
from sklearn import preprocessing
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import mean_squared_error,mean_absolute_error
import tqdm
from tqdm import tqdm



def save_model(model, name):
    """Persist a model as two files derived from ``name``.

    The architecture is written to ``<name>_architecture.json`` (the string
    returned by ``model.to_json()``) and the weights are written to
    ``<name>_weights.h5`` through ``model.save_weights``.

    Args:
        model: object exposing ``to_json()`` and ``save_weights(path)``.
        name: path prefix shared by both output files.
    """
    print('SAVING model: {0}'.format(name))
    json_string = model.to_json()
    architecture = name + '_architecture.json'
    weights = name + '_weights.h5'
    # Use a context manager so the handle is flushed and closed
    # deterministically instead of relying on garbage collection.
    with open(architecture, 'w') as architecture_file:
        architecture_file.write(json_string)
    model.save_weights(weights)


def retrieve_model(name, weights=True):
    """Rebuild a model from the files written for the prefix ``name``.

    Args:
        name: path prefix; the architecture is read from
            ``<name>_architecture.json``.
        weights: when truthy, also load ``<name>_weights.h5`` into the model.

    Returns:
        The model produced by ``model_from_json``.
    """
    print('RETRIEVING model: {0}'.format(name))
    architecture = name + '_architecture.json'
    # Close the architecture file explicitly rather than leaking the handle.
    with open(architecture) as architecture_file:
        model_saved = model_from_json(architecture_file.read())

    if weights:
        # Keep the boolean flag and the file path in separate names.
        weights_path = name + '_weights.h5'
        model_saved.load_weights(weights_path)
    return model_saved



def train_by_cluster(cluster, cluster_users, hps):
    """Fit and save one LSTM model for a single cluster of users.

    Clusters with fewer than 50 users are skipped (returns None without
    doing anything). Otherwise the training files of the cluster's users are
    loaded, a model is built with ``get_lstm`` and fitted, optionally with
    validation data when validation files exist, and the result is saved
    under ``hps.model_dir + 'model_<cluster>'``.

    Args:
        cluster: identifier used in log output and in the saved model name.
        cluster_users: sized collection of user ids belonging to the cluster.
        hps: hyper-parameter object; the attributes read here are
            ``train_file``, ``validate_file``, ``epochs``, ``dropout``,
            ``batch_size``, ``verbose_level``, ``plot_eval`` and ``model_dir``.
    """
    n_users = len(cluster_users)
    if n_users < 50:
        return

    print('Training model: {0}:: users: {1}'.format(cluster, n_users))
    train_X, train_y = get_cluster_data(get_files(cluster_users, hps.train_file), hps)
    print('TRAIN:: ', train_X.shape, train_y.shape)

    # When more than 10 epochs are configured, the epoch count is overridden
    # by cluster size: (minimum users exclusive, epochs), first match wins.
    epochs = hps.epochs
    if hps.epochs > 10:
        for min_users, cluster_epochs in ((400, 100), (200, 350), (75, 400)):
            if n_users > min_users:
                epochs = cluster_epochs
                break

    # design network
    model = get_lstm(hps, train_X, hps.dropout[0])

    validate_files = get_files(cluster_users, hps.validate_file)
    has_validation = bool(validate_files)
    fit_kwargs = {}
    if has_validation:
        validate_X, validate_y = get_cluster_data(validate_files, hps)
        print('VALIDATE:: ', validate_X.shape, validate_y.shape)
        fit_kwargs['validation_data'] = (validate_X, validate_y)
    else:
        print('VALIDATE:: NONE')

    # fit network
    history = model.fit(train_X, train_y,
                        epochs=epochs,
                        batch_size=hps.batch_size,
                        verbose=hps.verbose_level,
                        shuffle=False,
                        **fit_kwargs)

    # plot history (validation curve first, only when it was recorded)
    if hps.plot_eval:
        if has_validation:
            pyplot.plot(history.history['val_loss'], label='test')
        pyplot.plot(history.history['loss'], label='train')
        pyplot.legend()
        pyplot.show()

    save_model(model, hps.model_dir + 'model_{0}'.format(cluster))

    
def test_by_cluster(cluster, cluster_users, hps):
    """Evaluate the saved model of one cluster on that cluster's test files.

    Args:
        cluster: identifier used to locate ``hps.model_dir + 'model_<cluster>'``.
        cluster_users: sized collection of user ids belonging to the cluster.
        hps: hyper-parameter object; ``model_dir`` and ``test_file`` are read
            here and the object is forwarded to the helpers.

    Returns:
        ``(actual, predicted, n_samples)``, or ``(None, None, None)`` when the
        cluster has fewer than 50 users or no test file exists for it.
    """
    n_users = len(cluster_users)
    if n_users < 50:
        return None, None, None

    print('Testing cluster: {0}:: users: {1}'.format(cluster, n_users))
    model = retrieve_model(hps.model_dir + 'model_{0}'.format(cluster))

    test_files = get_files(cluster_users, hps.test_file)
    if not test_files:
        return None, None, None

    test_X, test_y = get_cluster_data(test_files, hps)
    print('TEST:: ', test_X.shape, test_y.shape)

    # make a prediction
    yhat = model.predict(test_X)

    # Flatten the single-timestep axis: (samples, 1, features) -> (samples, features)
    n_samples = test_X.shape[0]
    test_X = test_X.reshape((n_samples, test_X.shape[2]))
    remaining_features = test_X[:, 1:]

    # The value of interest is stacked in front of the remaining feature
    # columns and column 0 is read back. No scaler is applied in this
    # function, so the values themselves are not transformed.
    inv_yhat = concatenate((yhat, remaining_features), axis=1)[:, 0]
    actual_column = test_y.reshape((len(test_y), 1))
    inv_y = concatenate((actual_column, remaining_features), axis=1)[:, 0]

    get_performace(inv_y, inv_yhat, n_samples, hps)
    return inv_y, inv_yhat, n_samples
    

def train(hps):
    """Reset the model directory, cluster the users and train every cluster.

    Args:
        hps: hyper-parameter object; ``model_dir`` is read here and the
            object is forwarded to ``get_clusters`` and ``train_by_cluster``.

    Returns:
        The per-cluster user lists returned by ``get_clusters``; the list
        position is the cluster id used for training.
    """
    try:
        # Only remove the directory when it exists. Previously a missing
        # directory made rmtree raise, which skipped the mkdir below and left
        # no directory to save the models into.
        if os.path.isdir(hps.model_dir):
            shutil.rmtree(hps.model_dir)
        os.makedirs(hps.model_dir, exist_ok=True)
    except OSError as e:
        # Best effort: report the problem and continue, as before.
        print(e)

    users = get_clusters(hps)
    for cluster, cluster_users in enumerate(users):
        train_by_cluster(cluster, cluster_users, hps)
    return users


def test(users, hps):
    """Evaluate every cluster model and report metrics over all clusters.

    Each entry of ``users`` is passed to ``test_by_cluster`` with its list
    position as the cluster id. Clusters that return no result are skipped.
    The actual and predicted values of the remaining clusters are
    concatenated and handed to ``get_performace``.

    Args:
        users: iterable of per-cluster user collections.
        hps: hyper-parameter object forwarded to the helpers.

    Returns:
        None. Metrics are printed by ``get_performace``.
    """
    examples = 0
    actual_parts = []
    pred_parts = []
    for cluster, cluster_users in enumerate(users):
        y, yhat, samples = test_by_cluster(cluster, cluster_users, hps)
        if y is None:
            continue
        examples += samples
        actual_parts.append(y)
        pred_parts.append(yhat)

    if not actual_parts:
        # Nothing was evaluated; computing metrics on empty arrays would fail.
        print('TEST:: no cluster produced predictions, skipping metrics')
        return

    # Concatenate once at the end instead of re-copying on every iteration.
    actual = np.concatenate(actual_parts, axis=None)
    pred = np.concatenate(pred_parts, axis=None)
    get_performace(actual, pred, examples, hps)
        
        
def get_user_file(user_id, file_name):
    """Return the path of a user's file, or '' when that file does not exist.

    The user id is converted to an integer and zero-padded to six digits,
    prefixed with ``user_`` and substituted into ``file_name`` with
    ``str.format``.

    Args:
        user_id: value accepted by ``int()``.
        file_name: format string with one positional placeholder.
    """
    file_id = 'user_{0}'.format(format(int(user_id), '06d'))
    file_path = file_name.format(file_id)
    return file_path if os.path.isfile(file_path) else ''


def get_files(cluster_users, file_path):
    """Collect the existing per-user files for a group of users.

    Args:
        cluster_users: iterable of user ids.
        file_path: format string forwarded to ``get_user_file``.

    Returns:
        List of paths, in the order of ``cluster_users``, for the users whose
        file exists. Users without a file are left out.
    """
    candidates = (get_user_file(uid, file_path) for uid in cluster_users)
    # get_user_file signals "missing" with an empty string; test it by value
    # (truthiness) rather than by identity against a string literal.
    return [fname for fname in candidates if fname]

    
def get_clusters(hps):
    """Group users into ``hps.clusters`` K-Means clusters.

    All files matching the glob pattern ``hps.data_file`` are read as
    tab-separated files without a header and concatenated. K-Means is fitted
    on columns 0 and 8 of the result, which are ``user`` and
    ``avg_session_length`` in the column list below.

    Args:
        hps: hyper-parameter object; ``data_file`` and ``clusters`` are read.

    Returns:
        A list with one entry per cluster label ``0 .. hps.clusters - 1``,
        each a list of the ``user`` values assigned to that label.
    """
    columns = ['user', 'gender', 'age', 'country', 'registered',
               'artist', 'track', 'total_sessions', 'avg_session_length']
    complete_files = glob.glob(hps.data_file)
    dataset = pd.concat((pd.read_csv(f, names=columns, sep='\t') for f in complete_files))
    values = dataset.values
    # NOTE(review): the user column itself is one of the two clustering
    # features; confirm that this is intended.
    Xpair = values[:, (0, 8)]
    km = KMeans(n_clusters=hps.clusters, init='k-means++')
    km.fit(Xpair)
    user_clusters = km.predict(Xpair)

    # Iterate over the configured number of clusters; a fixed range(5) would
    # drop clusters or append empty ones whenever hps.clusters != 5.
    return [list(Xpair[user_clusters == label, 0])
            for label in range(hps.clusters)]


def experiment(hps, dropout):
    """Train one model on the full data set and report test metrics.

    Loads train, validation and test data with ``get_data``, builds the
    network selected by ``hps.model_lstm`` / ``hps.layered``, fits it, plots
    the training and validation loss, predicts on the test set and returns
    the result of ``get_performace``.

    Args:
        hps: hyper-parameter object; the attributes read here are
            ``train_file``, ``validate_file``, ``test_file``, ``model_lstm``,
            ``layered``, ``epochs``, ``batch_size`` and ``verbose_level``.
        dropout: dropout value forwarded to the model builder.

    Returns:
        The ``(rmse, mae, norm)`` tuple returned by ``get_performace``.
    """
    train_X, train_y = get_data(hps.train_file, hps)
    print('TRAIN:: ', train_X.shape, train_y.shape)

    validate_X, validate_y = get_data(hps.validate_file, hps)
    print('VALIDATE:: ', validate_X.shape, validate_y.shape)

    # design network
    if not hps.model_lstm:
        # NOTE(review): get_gru is invoked with a dropout argument here;
        # confirm that its signature accepts one.
        model = get_gru(hps, train_X, dropout)
    elif hps.layered:
        model = get_layered_lstm(hps, train_X, dropout)
    else:
        model = get_lstm(hps, train_X, dropout)

    # fit network
    history = model.fit(train_X, train_y,
                        epochs=hps.epochs,
                        batch_size=hps.batch_size,
                        validation_data=(validate_X, validate_y),
                        verbose=hps.verbose_level,
                        shuffle=False)

    # plot history
    losses = history.history
    pyplot.plot(losses['loss'], label='train')
    pyplot.plot(losses['val_loss'], label='test')
    pyplot.legend()
    pyplot.show()

    test_X, test_y = get_data(hps.test_file, hps)
    print('TEST:: ', test_X.shape, test_y.shape)

    # make a prediction
    yhat = model.predict(test_X)

    # Flatten the single-timestep axis: (samples, 1, features) -> (samples, features)
    n_samples = test_X.shape[0]
    test_X = test_X.reshape((n_samples, test_X.shape[2]))
    remaining_features = test_X[:, 1:]

    # The value of interest is stacked in front of the remaining feature
    # columns and column 0 is read back. No scaler is applied in this
    # function, so the values themselves are not transformed.
    inv_yhat = concatenate((yhat, remaining_features), axis=1)[:, 0]
    actual_column = test_y.reshape((len(test_y), 1))
    inv_y = concatenate((actual_column, remaining_features), axis=1)[:, 0]

    return get_performace(inv_y, inv_yhat, n_samples, hps)



def get_baseline_data(file_name):
    """Load every file matching a glob pattern into one DataFrame.

    Files are read as tab-separated values without a header row and are
    given the fifteen column names listed below. The shape of the combined
    frame is printed.

    Args:
        file_name: glob pattern selecting the files to read.

    Returns:
        The concatenated ``pandas.DataFrame``.
    """
    columns = ['user', 'current', 'start', 'session_id',
               'prev_session_length', 'avg_session_length',
               'gender', 'age', 'country', 'registered',
               'track_duration', 'times_played', 'artist', 'track', 'session_length']

    frames = [pd.read_csv(path, names=columns, sep='\t')
              for path in glob.glob(file_name)]
    dataset = pd.concat(frames)
    print('DATA:: ', dataset.shape)
    return dataset



def test_baseline(train, file_name):
    """Score a per-user mean baseline on the files matching ``file_name``.

    The prediction for a test row is the mean ``session_length`` of the same
    user in ``train``. Rows whose user does not appear in ``train``, or whose
    ``session_length`` cannot be converted to float, are skipped. RMSE and
    MAE over the remaining rows are printed.

    Args:
        train: DataFrame with ``user`` and ``session_length`` columns. It is
            not modified.
        file_name: glob pattern forwarded to ``get_baseline_data``.
    """
    test = get_baseline_data(file_name)
    # assign() works on a copy, so the caller's frame is left untouched.
    user_means = (train
                  .assign(session_length=train['session_length'].astype('float64'))
                  .groupby(['user'])['session_length']
                  .mean()
                  .to_dict())

    inv_yhat = []
    inv_y = []
    # make a prediction
    rows = zip(test['user'], test['session_length'])
    for user, session_length in tqdm(rows, total=test.shape[0]):
        try:
            pred = user_means[user]
            actual = float(session_length)
        except (KeyError, TypeError, ValueError):
            # Unknown user or unparsable target: skip the row entirely.
            continue
        # Append only after both values are available so the two lists can
        # never get out of step.
        inv_yhat.append(pred)
        inv_y.append(actual)

    rmse = sqrt(mean_squared_error(inv_y, inv_yhat))
    print('Test RMSE: %.3f' % rmse)

    mae = mean_absolute_error(inv_y, inv_yhat)
    print('Test MAE: %.3f' % mae)
    
    
def get_cluster_data(file_names, hps):
    """Load per-user files and split them into model inputs and targets.

    Every file is read as tab-separated values without a header row using
    the ten column names below. Rows containing missing values are dropped.
    The last column (``session_length``) is the target and all preceding
    columns are the features.

    Args:
        file_names: iterable of file paths to read.
        hps: not used by this function.

    Returns:
        ``(X, y)`` where ``X`` has shape ``(samples, 1, features)`` and ``y``
        has shape ``(samples,)``.
    """
    columns = ['start', 'user', 'session_id', 'gender', 'age', 'country', 'registered',
               'prev_session_length', 'avg_session_length', 'session_length']

    frames = [pd.read_csv(path, names=columns, sep='\t') for path in file_names]
    values = pd.concat(frames).dropna().values

    features, target = values[:, :-1], values[:, -1]

    # 3D - samples, timesteps, features (a single timestep per sample)
    n_samples, n_features = features.shape
    return features.reshape((n_samples, 1, n_features)), target


def get_data(file_name, hps):
    """Load the files matching a glob pattern as model inputs and targets.

    Files are read as tab-separated values without a header row using the
    fifteen column names below. When ``hps.filter_outliers`` is truthy, rows
    whose ``session_length`` or ``prev_session_length`` is not below the
    ``hps.upper_limit`` percentile of ``session_length`` are removed. Rows
    with missing values are dropped afterwards.

    Args:
        file_name: glob pattern selecting the files to read.
        hps: hyper-parameter object; ``filter_outliers`` is read, and
            ``upper_limit`` as well when filtering is enabled.

    Returns:
        ``(X, y)`` where ``X`` has shape ``(samples, 1, features)`` and ``y``
        holds the last column (``session_length``).
    """
    columns = ['user', 'current', 'start', 'session_id',
               'prev_session_length', 'avg_session_length',
               'gender', 'age', 'country', 'registered',
               'track_duration', 'times_played', 'artist', 'track', 'session_length']
    frames = [pd.read_csv(path, names=columns, sep='\t')
              for path in glob.glob(file_name)]
    dataset = pd.concat(frames)

    if hps.filter_outliers:
        # The cutoff is computed once, on the unfiltered session lengths, and
        # applied to both length columns.
        cutoff = np.percentile(dataset.session_length, [hps.upper_limit])[0]
        dataset = dataset[dataset.session_length < cutoff]
        dataset = dataset[dataset.prev_session_length < cutoff]

    values = dataset.dropna().values
    features, target = values[:, :-1], values[:, -1]

    # 3D - samples, timesteps, features (a single timestep per sample)
    n_samples, n_features = features.shape
    return features.reshape((n_samples, 1, n_features)), target


def get_layered_lstm(hps, train_X, dropout):
    """Build and compile a stacked LSTM regressor.

    The network is an input LSTM with ``hps.layer_dims`` units, followed by
    ``hps.no_layers`` LSTM layers with ``hps.hidden_dim`` units each and a
    single-unit Dense output.

    Args:
        hps: hyper-parameter object; ``layer_dims``, ``no_layers``,
            ``hidden_dim``, ``loss_func`` and ``optimizer`` are read.
        train_X: array whose ``shape[1]`` and ``shape[2]`` give the
            (timesteps, features) input shape.
        dropout: dropout value applied to every LSTM layer.

    Returns:
        The compiled ``Sequential`` model.
    """
    stacked_layers = hps.no_layers
    model = Sequential()

    # An LSTM must emit the full sequence only when another LSTM consumes its
    # output; the last recurrent layer emits just its final state so that
    # Dense(1) yields one value per sample.
    model.add(LSTM(hps.layer_dims,
                   input_shape=(train_X.shape[1],
                                train_X.shape[2]),
                   return_sequences=stacked_layers > 0,
                   dropout=dropout))
    for i in range(stacked_layers):
        model.add(LSTM(hps.hidden_dim,
                       return_sequences=i < stacked_layers - 1,
                       dropout=dropout))

    model.add(Dense(1))
    model.compile(loss=hps.loss_func,
                  optimizer=hps.optimizer)
    return model

def get_lstm(hps, train_X, dropout):
    """Build and compile a single-layer LSTM regressor.

    Args:
        hps: hyper-parameter object; ``hidden_dim``, ``loss_func`` and
            ``optimizer`` are read.
        train_X: array whose ``shape[1]`` and ``shape[2]`` give the
            (timesteps, features) input shape.
        dropout: dropout value passed to the LSTM layer.

    Returns:
        The compiled ``Sequential`` model: one LSTM layer followed by a
        single-unit Dense output.
    """
    timesteps, n_features = train_X.shape[1], train_X.shape[2]
    recurrent = LSTM(hps.hidden_dim,
                     input_shape=(timesteps, n_features),
                     dropout=dropout)

    model = Sequential([recurrent, Dense(1)])
    model.compile(loss=hps.loss_func, optimizer=hps.optimizer)
    return model


def get_gru(hps, train_X, dropout=0.0):
    """Build and compile a single-layer GRU regressor.

    Args:
        hps: hyper-parameter object; ``hidden_dim``, ``loss_func`` and
            ``optimizer`` are read.
        train_X: array whose ``shape[1]`` and ``shape[2]`` give the
            (timesteps, features) input shape.
        dropout: dropout value passed to the GRU layer. Optional, so that
            two-argument calls keep working while callers that pass a
            dropout value, as the LSTM builders accept, no longer fail with
            a TypeError.

    Returns:
        The compiled ``Sequential`` model: one GRU layer followed by a
        single-unit Dense output.
    """
    model = Sequential()
    model.add(GRU(hps.hidden_dim,
                  input_shape=(train_X.shape[1],
                               train_X.shape[2]),
                  dropout=dropout))

    model.add(Dense(1))
    model.compile(loss=hps.loss_func,
                  optimizer=hps.optimizer)
    return model


def get_performace(y, y_hat, samples, hps):
    """Compute and print error metrics for a set of predictions.

    Args:
        y: actual values.
        y_hat: predicted values.
        samples: not used by this function.
        hps: hyper-parameter object; ``baseline_mae`` is read.

    Returns:
        ``(rmse, mae, norm)`` where ``norm`` is the MAE divided by
        ``hps.baseline_mae``.
    """
    rmse = sqrt(mean_squared_error(y, y_hat))
    mae = mean_absolute_error(y, y_hat)
    metrics = (rmse, mae, mae / hps.baseline_mae)

    print('METRICS :: RMSE: {0} ; MAE: {1} ; Normalized MAE: {2}'.format(*metrics))
    return metrics


Overwriting util.py
