In [8]:
%%writefile data.py

import os, os.path,sys
import glob
import random
import struct
import csv
from tensorflow.core.example import example_pb2
# Raise the csv module's per-field size cap to sys.maxsize so that very long
# fields are accepted by the csv readers used in this module.
csv.field_size_limit(sys.maxsize)



# Special tokens that Vocab reserves ids for; none of them may appear in the
# vocabulary file itself.
SQUENCE_START = '<s>'
SQUENCE_END = '</s>'
START_DECODING = '[START]'
STOP_DECODING = '[STOP]'
UNKNOWN_TOKEN = '[UNK]'


class Vocab(object):
    """Two-way mapping between words and integer ids, loaded from a TSV file.

    Ids 0-4 are reserved, in this order, for UNKNOWN_TOKEN, START_DECODING,
    STOP_DECODING, SQUENCE_START and SQUENCE_END.  Words read from the file
    receive consecutive ids starting at 5, in file order.
    """

    # Reserved tokens, listed in the order their ids are assigned.
    _SPECIAL_TOKENS = (UNKNOWN_TOKEN, START_DECODING, STOP_DECODING,
                       SQUENCE_START, SQUENCE_END)

    def __init__(self, vocab_file):
        """Build the vocabulary from ``vocab_file``.

        Args:
            vocab_file: Path to a tab-delimited file.  The first column of
                each row is the word; any further columns are ignored.
                Blank lines are skipped.

        Raises:
            ValueError: If a word is one of the reserved tokens or occurs
                more than once in the file.
        """
        self._word_to_id = {}
        self._id_to_word = {}
        self._count = 0

        for token in self._SPECIAL_TOKENS:
            self._add(token)

        with open(vocab_file, 'r') as vocab_f:
            # Pre-bind `row` so the diagnostic print below cannot itself fail
            # with NameError when the reader raises before yielding a row.
            row = None
            try:
                for row in csv.reader(vocab_f, delimiter='\t'):
                    if not row:
                        # csv.reader yields [] for a blank line; there is no
                        # word to register, so skip it instead of crashing.
                        continue
                    w = row[0]
                    if w in self._SPECIAL_TOKENS:
                        raise ValueError('{0} should not be in the vocab file:'.format(w))
                    if w in self._word_to_id:
                        raise ValueError('Duplicate word in vocabulary file: {0}'.format(w))
                    self._add(w)
            except Exception:
                # Show the offending row, then re-raise with the original
                # traceback intact.
                print(row)
                raise

        print("Vocabulary complete with {0} words. Last word added: {1}".format(
            self._count, self._id_to_word[self._count - 1]))

    def _add(self, word):
        """Register ``word`` under the next free id."""
        self._word_to_id[word] = self._count
        self._id_to_word[self._count] = word
        self._count += 1

    def word2id(self, word):
        """Return the id of ``word``, or the UNKNOWN_TOKEN id if it is absent."""
        return self._word_to_id.get(word, self._word_to_id[UNKNOWN_TOKEN])

    def id2word(self, word_id):
        """Return the word stored under ``word_id``.

        Raises:
            ValueError: If ``word_id`` is not a known id.
        """
        if word_id not in self._id_to_word:
            raise ValueError('Id not found in vocab: {0}'.format(word_id))
        return self._id_to_word[word_id]

    def size(self):
        """Return the number of entries, reserved tokens included."""
        return self._count

Overwriting data.py


In [9]:
%%writefile util.py

import glob
import random
import struct
import csv
from tensorflow.core.example import example_pb2
import data

import pandas as pd
import numpy as np
import os
import glob
import random
import csv
from tqdm import tqdm
import data

from datetime import datetime as dt
from datetime import timedelta
from sklearn.model_selection import train_test_split



def get_user_id(user_id, vocab):
    """Extract the integer that follows the first underscore in ``user_id``.

    If the value cannot be split or the second piece is not an integer, the
    vocabulary id of data.UNKNOWN_TOKEN is returned instead.
    """
    try:
        pieces = user_id.split('_')
        numeric_id = int(pieces[1])
    except Exception:
        return vocab.word2id(data.UNKNOWN_TOKEN)
    return numeric_id


def get_registered_time(registered, vocab):
    """Convert a '%b %d, %Y' date string to integer epoch seconds.

    The parsed date is a naive datetime and is interpreted in the machine's
    local timezone, which is what the previous ``strftime('%s')`` call did on
    platforms that support that directive.

    Args:
        registered: Date string such as 'Aug 13, 2006'.
        vocab: Object whose ``word2id`` supplies the fallback value.

    Returns:
        Epoch seconds as an int, or ``vocab.word2id(data.UNKNOWN_TOKEN)``
        when the value cannot be parsed or converted.
    """
    import time  # function-scope: the file's import header has no `time`

    try:
        parsed = dt.strptime(registered, '%b %d, %Y')
        # '%s' is a C-library extension that is missing on some platforms;
        # there the resulting ValueError was swallowed and every date became
        # the unknown id.  time.mktime performs the same local-time conversion
        # portably.
        return int(time.mktime(parsed.timetuple()))
    except Exception:
        # Deliberately best-effort: any malformed or missing value maps to
        # the unknown-token id.
        return vocab.word2id(data.UNKNOWN_TOKEN)
        
    
def get_time(timestamp_str, vocab):
    """Convert a '%Y-%m-%dt%H:%M:%Sz' timestamp string to integer epoch seconds.

    The parsed value is a naive datetime and is interpreted in the machine's
    local timezone, which is what the previous ``strftime('%s')`` call did on
    platforms that support that directive.

    Args:
        timestamp_str: Timestamp such as '2009-05-04t23:08:57z'.
        vocab: Object whose ``word2id`` supplies the fallback value.

    Returns:
        Epoch seconds as an int, or ``vocab.word2id(data.UNKNOWN_TOKEN)``
        when the value cannot be parsed or converted.
    """
    import time  # function-scope: the file's import header has no `time`

    try:
        parsed = dt.strptime(timestamp_str, '%Y-%m-%dt%H:%M:%Sz')
        # '%s' is a C-library extension that is missing on some platforms;
        # there the resulting ValueError was swallowed and every timestamp
        # became the unknown id.  time.mktime performs the same local-time
        # conversion portably.
        return int(time.mktime(parsed.timetuple()))
    except Exception:
        # Deliberately best-effort: any malformed or missing value maps to
        # the unknown-token id.
        return vocab.word2id(data.UNKNOWN_TOKEN)


def get_gender_id(gender):
    """Map a gender label to a numeric code.

    'm' (any case) gives 1, 'f' (any case) gives 0; every other value,
    including one that has no usable ``lower()``, gives -1.
    """
    codes = {'m': 1, 'f': 0}
    try:
        return codes.get(gender.lower(), -1)
    except Exception:
        return -1
        

def get_age(age):
    """Return ``age`` converted with ``int()``, or -1 if the conversion fails."""
    try:
        parsed = int(age)
    except Exception:
        return -1
    return parsed

def get_seconds(in_ms):
    """Scale a millisecond quantity to seconds by multiplying with 0.001."""
    seconds_per_ms = 0.001
    return seconds_per_ms * in_ms


def get_times_played(time, duration):
    """Return ``time / duration``, or 0 when ``duration`` is zero."""
    return time / duration if duration != 0 else 0

    
def get_time_difference(t1, t2):
    """Return ``t2 - t1`` in seconds.

    Both arguments are strings in the '%Y-%m-%dt%H:%M:%Sz' layout; the result
    is negative when ``t2`` is earlier than ``t1``.
    """
    layout = '%Y-%m-%dt%H:%M:%Sz'
    start, finish = [dt.strptime(stamp, layout) for stamp in (t1, t2)]
    elapsed = finish - start
    return elapsed.total_seconds()


def max_session_window(minutes=30):
    """Return the session window length in seconds.

    Args:
        minutes: Window length in minutes.  Defaults to 30, the value that
            was previously hard-coded, so existing zero-argument calls are
            unaffected.

    Returns:
        The window length in seconds, as a float.
    """
    return timedelta(minutes=minutes).total_seconds()


def get_word_id(word, vocab):
    """Look up the vocabulary id of ``word`` after stripping and lower-casing.

    A value that is empty after stripping, or any failure while processing
    it, yields the id of data.UNKNOWN_TOKEN.
    """
    try:
        cleaned = word.strip()
        token = cleaned.lower() if cleaned else data.UNKNOWN_TOKEN
        return vocab.word2id(token)
    except Exception:
        return vocab.word2id(data.UNKNOWN_TOKEN)
    
    
def get_start_sequence(vocab, length):
    """Return a list holding ``length`` copies of the data.SQUENCE_START id.

    Args:
        vocab: Object whose ``word2id`` resolves the token to its id.
        length: Number of entries; zero or a negative value gives ``[]``.
    """
    # The id is the same for every position, so resolve it once rather than
    # repeating the lookup for each element.
    start_id = vocab.word2id(data.SQUENCE_START)
    return [start_id] * length
        
    
def get_end_sequence(vocab, length):
    """Return a list holding ``length`` copies of the data.SQUENCE_END id.

    Args:
        vocab: Object whose ``word2id`` resolves the token to its id.
        length: Number of entries; zero or a negative value gives ``[]``.
    """
    # The id is the same for every position, so resolve it once rather than
    # repeating the lookup for each element.
    end_id = vocab.word2id(data.SQUENCE_END)
    return [end_id] * length




Overwriting util.py
