In [1]:
import os

In [9]:
!ls /data_data/session_length/nishanth01/data/

1		   genres.csv	     userid-profile.csv
artist_vocab.csv   summary	     user_profile_cluster.csv
country_vocab.csv  tracks	     users
final		   tracks_vocab.csv  user_session_original.tsv
final.tar.gz	   unique_users.txt  user_sessions_old.csv


In [88]:
%%writefile BuildUserSessions.py


import pandas as pd
import numpy as np
import os
import glob
import random
import csv
import data
import util
from data import Vocab
from tqdm import tqdm
import shutil

from datetime import datetime as dt
from datetime import timedelta
from sklearn.model_selection import train_test_split



# Every input and output of this script lives under one data root.
_data_root = '/data_data/session_length/nishanth01/data/'

# Output tree: <final_dir>/<split>/<user_id>.csv
final_dir = _data_root + 'final/'
final_file = final_dir + '{0}/{1}.csv'

# Vocabulary files handed to data.Vocab when the script starts.
country_vocab_file = _data_root + 'country_vocab.csv'
tracks_vocab_file = _data_root + 'tracks_vocab.csv'
artist_vocab_file = _data_root + 'artist_vocab.csv'


def get_users(test=0.1,val=0.1,random_state=None):
    """Load all user ids and carve out disjoint test and validation user sets.

    Args:
        test: share of all users to put in the test set.
        val: share of all users to put in the validation set.
        random_state: optional seed forwarded to both ``train_test_split``
            calls so the same users land in the same split on every run.
            The default (``None``) keeps the split random.

    Returns:
        A tuple ``(users, test_users, val_users)`` of DataFrames with a single
        ``user_id`` column. ``users`` holds every user read from the file;
        the other two are drawn from it and do not overlap each other.
    """
    columns = ['user_id']
    users = pd.read_csv('/data_data/session_length/nishanth01/data/unique_users.txt',names=columns)

    # First set aside the combined hold-out share, then divide that hold-out
    # between test and validation in the requested proportion.
    holdout_share = test + val
    _,holdout = train_test_split(users, test_size=holdout_share,
                                 random_state=random_state)
    test_users,val_users = train_test_split(holdout, test_size=val/holdout_share,
                                            random_state=random_state)
    return users,test_users,val_users


def get_user_details(user_id):
    """Read every per-user CSV chunk and return them as one DataFrame.

    The files are tab separated and have no header row, so the column names
    are supplied here. ``pd.concat`` raises ``ValueError`` when no file
    matches the user's directory.
    """
    field_names = ['user_id','timestamp','artist_name',
                   'track_name','gender','age','country',
                   'registered','duration','genre']
    pattern = '/data_data/session_length/nishanth01/data/users/{0}/*.csv'.format(user_id)

    frames = []
    for path in glob.glob(pattern):
        frames.append(pd.read_csv(path,names=field_names,sep='\t'))
    return pd.concat(frames)

    
def process_prev_row(row,curr_time,session_start,session_id,
                     session_length,
                     country_vocab,tracks_vocab,artist_vocab,
                     prev_length,total):
    """Build the output record for the previous listening event.

    The time the event contributed to the session is measured against
    ``curr_time`` (the timestamp of the event that follows it), which also
    decides whether that following event opens a new session.

    Args:
        row: ``(index, record)`` pair as yielded by ``DataFrame.iterrows()``.
        curr_time: timestamp of the event that follows ``row``.
        session_start: timestamp of the first event of the current session.
        session_id: running number of the current session.
        session_length: length of the current session accumulated so far.
        country_vocab, tracks_vocab, artist_vocab: vocabularies passed on to
            the ``util`` lookup helpers.
        prev_length: length of the previous session.
        total: summed length of all sessions closed so far.

    Returns:
        ``(new_session, session_length, data)``. ``data`` lists, in order:
        user, event timestamp, session start, session id, previous session
        length, average session length, gender id, age, country id,
        registered time, track duration, times played, artist id, track id,
        session length. If anything fails while the record is built, the
        error is printed, ``new_session`` is forced to ``True`` and ``data``
        is an empty list so that no truncated row can reach the output file.
    """
    data = []
    new_session = False
    event = row[1]
    prev_time = event['timestamp']
    times_played = 0

    # A missing or malformed duration is treated as "duration unknown".
    try:
        track_duration = util.get_seconds(float(event['duration']))
    except Exception:
        track_duration = 0

    try:
        diff = util.get_time_difference(prev_time,curr_time)

        if(track_duration > 0):
            if(diff > track_duration and
               (diff-track_duration) > util.max_session_window()):
                # The silence after the track ended is longer than the session
                # window: count one full play and start a new session next.
                session_length = session_length + track_duration
                times_played = util.get_times_played(track_duration,track_duration)
                new_session = True
            else:
                # Still the same session; the whole gap counts as listening.
                session_length = session_length + diff
                times_played = util.get_times_played(diff,track_duration)
        elif(diff > util.max_session_window()):
            # Unknown duration: only the raw gap can end the session.
            new_session = True
        else:
            session_length = session_length + diff

        # The average is taken over the sessions that are already closed.
        if((session_id != 0) and ((session_id - 1) != 0)):
            average = float("{0:.2f}".format((total/(session_id - 1))))
        else:
            average = float(0)

        # Assigned in a single step: if any helper raises, ``data`` stays
        # empty instead of holding a partially filled record.
        data = [
            util.get_user_id(event['user_id'],country_vocab),#user
            util.get_time(event['timestamp'],country_vocab),#current timestamp
            util.get_time(session_start,country_vocab),#start timestamp
            int(session_id),#session_id
            int(prev_length),#previous session length
            average,#avg session length
            util.get_gender_id(event['gender']),#gender id
            util.get_age(event['age']),#age
            util.get_word_id(event['country'],country_vocab),#country id
            util.get_registered_time(event['registered'],country_vocab),#registered time
            track_duration,#track_duration
            float("{0:.2f}".format(times_played)),#times_played
            util.get_word_id(event['artist_name'],artist_vocab),#artist
            util.get_word_id(event['track_name'],tracks_vocab),#track
            int(session_length),#session_length
        ]

    except Exception as e:
        print(e)
        new_session = True

    return new_session,session_length,data



def create_user(user_id,test,val,country_vocab,tracks_vocab,artist_vocab):
    """Write the session records of one user to the train / hold-out files.

    The user's events are sorted by timestamp and walked pairwise: each event
    is converted by ``process_prev_row`` once the timestamp of the following
    event is known, so the final event of a user produces no record.

    Records always start in ``train/<user_id>.csv``. For a user flagged
    ``test`` (or, failing that, ``val``) the records seen after a randomly
    chosen position in the second half of the history go to
    ``test/<user_id>.csv`` (or ``validate/<user_id>.csv``) instead. Users in
    neither split get no hold-out file at all.

    Args:
        user_id: id of the user, also used as the output file name.
        test: True when the user belongs to the test split.
        val: True when the user belongs to the validation split.
        country_vocab, tracks_vocab, artist_vocab: vocabularies forwarded to
            ``process_prev_row``.

    Errors raised while handling a single event are printed and that event is
    skipped; errors raised by the iteration itself are printed and end the
    user's processing early.
    """
    user = get_user_details(user_id)
    user = user.sort_values(by=['timestamp'])
    total_count = len(user.index)
    # Position (drawn from the second half of the history) after which
    # records are diverted to the hold-out file.
    test_index = int(total_count*random.uniform(0.5, 1))

    train_file = final_dir+'train/{0}.csv'.format(user_id)
    if(test):
        holdout_file = final_dir+'test/{0}.csv'.format(user_id)
    elif(val):
        holdout_file = final_dir+'validate/{0}.csv'.format(user_id)
    else:
        holdout_file = None

    session_length = 0
    prev_length = 0
    total = 0
    session_id = 1
    session_start = ''
    prev_row = None

    with open(train_file,'w+') as train_handle:
        # Only open a hold-out file for users that actually have one.
        holdout_handle = open(holdout_file,'w+') if holdout_file else None
        try:
            train_out = csv.writer(train_handle,quoting=csv.QUOTE_NONNUMERIC,delimiter='\t')
            holdout_out = None
            if(holdout_handle is not None):
                holdout_out = csv.writer(holdout_handle,quoting=csv.QUOTE_NONNUMERIC,delimiter='\t')

            try:
                for i,row in enumerate(user.iterrows()):
                    try:
                        if(prev_row is None):#first event only opens the session
                            session_start = row[1]['timestamp']
                            prev_row = row
                            continue

                        new_session,session_length,data = process_prev_row(prev_row,
                                                                           row[1]['timestamp'],
                                                                           session_start,
                                                                           session_id,
                                                                           session_length,
                                                                           country_vocab,
                                                                           tracks_vocab,
                                                                           artist_vocab,
                                                                           prev_length,
                                                                           total)
                        prev_row = row

                        if(new_session):
                            # Close the finished session; the current event
                            # becomes the first one of the next session.
                            total = total + session_length
                            prev_length = session_length
                            session_id = session_id + 1
                            session_length = 0
                            session_start = row[1]['timestamp']

                        if(data):
                            if(holdout_out is not None and i > test_index):
                                holdout_out.writerow(data)
                            else:
                                train_out.writerow(data)

                    except Exception as e:
                        print('EXCEPTION(1): Skipping...',e)

            except Exception as e:
                print('EXCEPTION(1.1): Skipping...',e)
        finally:
            if(holdout_handle is not None):
                holdout_handle.close()

    print('COMPLETED: {0}'.format(user_id))
    
    
def process(country_vocab,tracks_vocab,artist_vocab):
    """Rebuild the output tree and generate the session files of every user.

    The ``train``, ``test`` and ``validate`` directories under ``final_dir``
    are recreated from scratch, the users are split with
    ``get_users(test=0.2,val=0.1)`` and ``create_user`` is called once per
    user. A user whose processing raises is reported in the final
    ``FAILED Users`` line and does not stop the run.

    Args:
        country_vocab, tracks_vocab, artist_vocab: vocabularies forwarded to
            ``create_user``.
    """
    print('Starting..')

    try:
        # rmtree raises when the directory is missing, which would skip the
        # directory creation below on a first run, so only clear what exists.
        if(os.path.isdir(final_dir)):
            shutil.rmtree(final_dir)
        for split in ('train','test','validate'):
            os.makedirs(final_dir+split)#also creates final_dir itself

    except OSError as e:
        print(e)

    failed = []
    try:
        users,test_users,val_users = get_users(test=0.2,val=0.1)
        # Sets give constant-time membership checks instead of filtering the
        # split DataFrames once per user.
        test_ids = set(test_users['user_id'])
        val_ids = set(val_users['user_id'])

        for user_id in users['user_id']:
            try:
                test = user_id in test_ids
                val = (not test) and (user_id in val_ids)
                create_user(user_id,test,val,country_vocab,tracks_vocab,artist_vocab)
            except Exception as e:
                print(e)
                failed.append(user_id)
    except Exception as e:
        print('EXCEPTION 0 :::',e)
    finally:
        print('FAILED Users: ',failed)

    print('COMPLETE!')


if __name__ == '__main__':
    # Load the country, track and artist vocabularies (in that order) and
    # hand them to the session builder.
    vocab_files = (country_vocab_file, tracks_vocab_file, artist_vocab_file)
    vocabularies = [Vocab(vocab_path) for vocab_path in vocab_files]
    process(*vocabularies)

Overwriting BuildUserSessions.py


In [89]:
!chmod 777 BuildUserSessions.py
!rm -rf nohup.out

In [90]:
#!python BuildUserSessions.py

In [91]:
#nohup python BuildUserSessions.py &

In [51]:
#!ls

build_complete_vocab.ipynb  BuildUserSessions.py  data.py   nohup.out  util.pyc
build_session_data.ipynb    create_utility.ipynb  data.pyc  util.py


In [96]:
#!tail nohup.out

In [1]:
!ls /data_data/session_length/nishanth01/data/final/train

user_000001.csv  user_000257.csv  user_000507.csv  user_000752.csv
user_000002.csv  user_000258.csv  user_000508.csv  user_000753.csv
user_000003.csv  user_000259.csv  user_000509.csv  user_000754.csv
user_000004.csv  user_000260.csv  user_000510.csv  user_000755.csv
user_000005.csv  user_000261.csv  user_000511.csv  user_000757.csv
user_000006.csv  user_000262.csv  user_000512.csv  user_000759.csv
user_000007.csv  user_000263.csv  user_000513.csv  user_000760.csv
user_000008.csv  user_000264.csv  user_000514.csv  user_000761.csv
user_000009.csv  user_000265.csv  user_000515.csv  user_000762.csv
user_000010.csv  user_000266.csv  user_000516.csv  user_000763.csv
user_000011.csv  user_000267.csv  user_000517.csv  user_000764.csv
user_000012.csv  user_000268.csv  user_000518.csv  user_000765.csv
user_000013.csv  user_000269.csv  user_000519.csv  user_000766.csv
user_000014.csv  user_000270.csv  user_000520.csv  user_000768.csv
user_000015.csv  user_000271.csv  user_000521.cs

In [1]:
!head -1000 /data_data/session_length/nishanth01/data/final/train/user_000001.csv

1	1155477560	1155477560	1	0	0.0	1	0	31	1155427200	0.0	0.0	33115	90088	249
1	1155477809	1155477560	1	0	0.0	1	0	31	1155427200	0.0	0.0	33115	57792	683
1	1155478243	1155477560	1	0	0.0	1	0	31	1155427200	0.0	0.0	33115	112288	1100
1	1155478660	1155477560	1	0	0.0	1	0	31	1155427200	90.0	0.96	2301	329356	1186
1	1155478746	1155477560	1	0	0.0	1	0	31	1155427200	0.0	0.0	3312	28451	1423
1	1155478983	1155477560	1	0	0.0	1	0	31	1155427200	0.0	0.0	4260	270	2703
1	1155480263	1155477560	1	0	0.0	1	0	31	1155427200	296.0	2.2	4260	918	3354
1	1155480914	1155477560	1	0	0.0	1	0	31	1155427200	294.0	0.97	207	8719	3639
1	1155481199	1155477560	1	0	0.0	1	0	31	1155427200	309.0	1.04	15	14781	3960
1	1155481520	1155477560	1	0	0.0	1	0	31	1155427200	0.0	0.0	1741	44080	4085
1	1155481645	1155477560	1	0	0.0	1	0	31	1155427200	236.0	1.22	54033	523	4372
1	1155481932	1155477560	1	0	0.0	1	0	31	1155427200	327.0	0.99	40496	1077102	4695
1	1155482255	1155477560	1	0	0.0	1	0	31	1155427200	307.0	1.08	19611	906987	5