In [4]:
%%writefile CreateUserSummary.py


import pandas as pd
import numpy as np
import os
import glob
import random
import csv
import data
import util
from data import Vocab
from tqdm import tqdm

from datetime import datetime as dt
from datetime import timedelta
from sklearn.model_selection import train_test_split



# Root directory of the session-length data set; the paths below are all
# derived from it so they stay consistent with each other.
_DATA_ROOT = '/data_data/session_length/nishanth01/data/'

# Directory holding the per-split output folders (train/, test/, validate/).
final_dir = _DATA_ROOT + 'summary/'
# Path template with two positional placeholders under the summary directory.
# NOTE(review): looks like {0} is the split name and {1} the user id -- confirm.
summary_file = final_dir + '{0}/{1}.csv'
# File handed to Vocab() when the script starts.
vocab_file = _DATA_ROOT + 'country_vocab.csv'


def get_users(test=0.1,val=0.1,random_state=None,
              users_file='/data_data/session_length/nishanth01/data/unique_users.txt'):
    """Load every user id and randomly pick the test and validation users.

    Args:
        test: fraction of all users to hold out for testing.
        val: fraction of all users to hold out for validation.
        random_state: forwarded to both ``train_test_split`` calls. The
            default ``None`` keeps the split different on every run; pass an
            int to make the split reproducible.
        users_file: headerless file with one user id per line.

    Returns:
        A tuple ``(users, test_users, val_users)`` of DataFrames with a single
        ``user_id`` column. Note that ``users`` is the complete list, not the
        training remainder; callers decide membership themselves.
    """
    columns = ['user_id']
    users = pd.read_csv(users_file,names=columns)

    # First carve out the combined hold-out share, then divide that share
    # between test and validation in the requested proportion.
    holdout_fraction = test+val
    _,holdout = train_test_split(users, test_size=holdout_fraction,
                                 random_state=random_state)
    test_users,val_users = train_test_split(holdout, test_size=(val)/holdout_fraction,
                                            random_state=random_state)
    return users,test_users,val_users


def get_user_details(user_id,users_dir='/data_data/session_length/nishanth01/data/users'):
    """Read all listening-history files of one user into a single DataFrame.

    Args:
        user_id: name of the user's sub-directory below ``users_dir``.
        users_dir: directory that contains one sub-directory per user.

    Returns:
        DataFrame with the columns user_id, timestamp, artist_name,
        track_name, gender, age, country, registered, duration and genre,
        concatenated over every ``*.csv`` file of the user.

    Raises:
        ValueError: if the user has no ``*.csv`` files. The message names the
            user and the pattern searched, instead of pandas' generic
            "No objects to concatenate".
    """
    file_name = os.path.join(users_dir,str(user_id),'*.csv')
    columns = ['user_id','timestamp','artist_name',
               'track_name','gender','age','country',
               'registered','duration','genre']

    # glob returns files in arbitrary filesystem order; sorting keeps the
    # concatenated row order the same from run to run.
    complete_files = sorted(glob.glob(file_name))
    if(not complete_files):
        raise ValueError('No data files found for user {0} ({1})'.format(user_id,file_name))

    # The files are tab separated and carry no header row.
    user_data = pd.concat((pd.read_csv(f,names=columns,sep='\t') for f in complete_files))
    return user_data

    
def process_prev_row(row,curr_time,session_start,session_id,session_length,vocab,prev_length,total):
    """Account for one played track and build the running session summary row.

    Args:
        row: ``(index, record)`` pair for the track being accounted for; the
            record is read by column name (timestamp, duration, user_id,
            gender, age, country, registered).
        curr_time: timestamp of the play that follows ``row``.
        session_start: timestamp at which the current session began.
        session_id: 1-based number of the current session.
        session_length: length accumulated for the current session so far.
        vocab: passed through to the ``util`` encoders.
        prev_length: length of the previous session.
        total: summed length of all sessions completed before this one.

    Returns:
        ``(new_session, session_length, summary)`` where ``new_session`` is
        True when the following play starts a new session, ``session_length``
        is the updated length, and ``summary`` is the row
        [start time, user, session id, gender id, age, country id,
        registered time, previous session length, average session length,
        session length].
        If anything fails while computing the row, the exception is printed,
        ``new_session`` is forced to True and ``summary`` is an empty list,
        so a half-built row can never reach the output file.
    """
    summary = []
    new_session = False
    prev_time = row[1]['timestamp']

    # A missing or unparsable duration is treated as "unknown" (0).
    try:
        track_duration = util.get_seconds(float(row[1]['duration']))
    except Exception:
        track_duration = 0

    try:
        diff = util.get_time_difference(prev_time,curr_time)

        # NOTE(review): the results of util.get_times_played are never used;
        # the calls are kept only so any exception they raise still ends the
        # session the way it did before -- confirm they can be dropped.
        if(track_duration > 0):
            if(diff <= track_duration):
                # Next play started before this track could finish: same session.
                session_length = session_length + diff
                util.get_times_played(diff,track_duration)
            elif((diff-track_duration) > util.max_session_window()):
                # Idle gap after the track exceeds the window: the session ends
                # here and only the track itself counts towards its length.
                session_length = session_length + track_duration
                util.get_times_played(track_duration,track_duration)
                new_session = True
            else:
                session_length = session_length + diff
                util.get_times_played(diff,track_duration)
        elif(diff > util.max_session_window()):
            # Unknown duration: fall back to the raw gap between plays.
            new_session = True
        else:
            session_length = session_length + diff

        # Average over the sessions completed before this one; float() keeps
        # the division fractional even where `/` on two integers truncates.
        completed = session_id - 1
        if((session_id != 0) and (completed != 0)):
            average = float("{0:.2f}".format(float(total)/completed))
        else:
            average = float(0)

        # Build the row in one go so it is either complete or absent.
        summary = [
            util.get_time(session_start,vocab),#start timestamp
            util.get_user_id(row[1]['user_id'],vocab),#user
            int(session_id),#session_id
            util.get_gender_id(row[1]['gender']),#gender id
            util.get_age(row[1]['age']),#age
            util.get_word_id(row[1]['country'],vocab),#country id
            util.get_registered_time(row[1]['registered'],vocab),#registered time
            int(prev_length),#previous session length
            average,#avg session length
            int(session_length),#session_length
        ]

    except Exception as e:
        print(e)
        new_session = True
        summary = []

    return new_session,session_length,summary



def _write_summary_row(summary,position,test_index,holdout,train_out,test_out):
    # Send one session row to the held-out writer when the user is held out and
    # the row was produced past the split point, otherwise to the train writer.
    if(holdout and position > test_index):
        test_out.writerow(summary)
    else:
        train_out.writerow(summary)


def create_user(user_id,test,val,vocab):
    """Write the per-session summary files for one user.

    The user's plays are sorted by timestamp and walked pairwise; each play is
    handed to ``process_prev_row`` together with the timestamp of the play
    that follows it. Whenever that call reports that a new session starts, the
    summary row of the session that just ended is written out. The row still
    pending when the plays run out is written after the loop.

    Args:
        user_id: user whose history is summarised.
        test: True when the user belongs to the test split.
        val: True when the user belongs to the validation split.
        vocab: passed through to ``process_prev_row``.

    Output:
        ``train/<user_id>.csv`` is always written. For test or validation
        users, rows produced after a randomly chosen position (between 50% and
        100% of the user's plays) go to ``test/<user_id>.csv`` or
        ``validate/<user_id>.csv`` instead. For all other users the second
        file is removed again at the end.
    """
    user = get_user_details(user_id)
    user = user.sort_values(by=['timestamp'])
    total_count = len(user.index)
    test_index = int(total_count*random.uniform(0.5, 1))
    holdout = bool(test or val)

    train_file = final_dir+'train/{0}.csv'.format(user_id)
    if(test):
        test_file = final_dir+'test/{0}.csv'.format(user_id)
    else:
        test_file = final_dir+'validate/{0}.csv'.format(user_id)

    session_length = 0
    prev_length = 0
    total = 0
    session_id = 1
    session_start = ''
    prev_row = None
    i = 0

    with open(train_file,'w+') as train_handle,open(test_file,'w+') as test_handle:
        train_out = csv.writer(train_handle,quoting=csv.QUOTE_NONNUMERIC,delimiter='\t')
        test_out = csv.writer(test_handle,quoting=csv.QUOTE_NONNUMERIC,delimiter='\t')
        summary = []

        try:
            for row in user.iterrows():
                try:
                    summary = []
                    if(i == 0):#first play only opens the first session
                        session_id = 1
                        session_start = row[1]['timestamp']
                        prev_row = row
                    else:
                        if(prev_row is None):
                            # The first play could not be read, so there is
                            # nothing to pair the current play with.
                            raise Exception('Unhandled error..!')

                        new_session,session_length,summary = process_prev_row(prev_row,
                                                                              row[1]['timestamp'],
                                                                              session_start,
                                                                              session_id,
                                                                              session_length,
                                                                              vocab,
                                                                              prev_length,
                                                                              total)
                        prev_row = row

                        if(new_session):
                            total = total + session_length
                            prev_length = session_length
                            session_id = session_id + 1
                            session_length = 0
                            session_start = row[1]['timestamp']

                            # Write only in the iteration that closed the
                            # session. An empty summary (the row could not be
                            # built) is skipped outright, so it cannot cause a
                            # later, unfinished session to be written early.
                            if(summary):
                                _write_summary_row(summary,i,test_index,holdout,
                                                   train_out,test_out)
                            summary = []

                except Exception as e:
                    print('EXCEPTION(1): Skipping...',e)

                i = i+1

            # Whatever is left describes the session still open at the end.
            if(summary):
                _write_summary_row(summary,i,test_index,holdout,train_out,test_out)

        except Exception as e:
            print('EXCEPTION(1.1): Skipping...',e)

    if(not holdout):
        # Train-only users never get rows in the second file; drop it.
        try:
            os.remove(test_file)
        except OSError:
            pass

    print('COMPLETED: {0}'.format(user_id))
    
    
def process(vocab):
    """Create the summary files for every known user.

    Loads the user list and the test/validation selections via ``get_users``
    and calls ``create_user`` once per user. A failure for one user is printed
    together with its cause and recorded; the remaining users are still
    processed. The list of failed users is printed at the end.

    Args:
        vocab: passed through to ``create_user``.
    """
    print('Starting..')
    failed = []
    try:
        users,test_users,val_users = get_users()
        # Sets give constant-time membership checks instead of scanning the
        # whole split frame once per user.
        test_ids = set(test_users['user_id'])
        val_ids = set(val_users['user_id'])

        for row in users.iterrows():
            # Reset per iteration so a failure is never attributed to the
            # previous user (or to an unbound name on the first iteration).
            user_id = None
            try:
                user_id = row[1]['user_id']
                test = user_id in test_ids
                val = (not test) and (user_id in val_ids)
                create_user(user_id,test,val,vocab)
            except Exception as e:
                print('FAILED: {0} ::: {1}'.format(user_id,e))
                failed.append(user_id)
    except Exception as e:
        print('EXCEPTION 0 :::',e)
    finally:
        print('FAILED Users: ',failed)

    print('COMPLETE!')


# Script entry point: runs only when the file is executed directly, not when
# it is imported. Builds a Vocab from vocab_file and processes every user.
if __name__ == '__main__':
    vocab = Vocab(vocab_file)
    process(vocab)    



Overwriting CreateUserSummary.py


In [5]:
!chmod +x CreateUserSummary.py

In [6]:
!rm -rf /data_data/session_length/nishanth01/data/summary/train/*
!rm -rf /data_data/session_length/nishanth01/data/summary/test/*
!rm -rf /data_data/session_length/nishanth01/data/summary/validate/*
!rm -rf nohup.out

In [None]:
#!python CreateUserSummary.py

In [None]:
#nohup python CreateUserSummary.py &
#ps ax | grep CreateUserSummary.py

In [33]:
#!tail nohup.out

COMPLETED: user_000993
COMPLETED: user_000994
COMPLETED: user_000995
COMPLETED: user_000996
COMPLETED: user_000997
COMPLETED: user_000998
COMPLETED: user_000999
COMPLETED: user_001000
('FAILED Users: ', ['user_000028', 'user_000029', 'user_000031', 'user_000033', 'user_000040', 'user_000041', 'user_000053', 'user_000060', 'user_000062', 'user_000069', 'user_000074', 'user_000089', 'user_000091', 'user_000107', 'user_000112', 'user_000121', 'user_000122', 'user_000125', 'user_000135', 'user_000138', 'user_000142', 'user_000158', 'user_000162', 'user_000174', 'user_000183', 'user_000210', 'user_000219', 'user_000237', 'user_000249', 'user_000274', 'user_000281', 'user_000296', 'user_000341', 'user_000359', 'user_000362', 'user_000366', 'user_000371', 'user_000397', 'user_000423', 'user_000425', 'user_000427', 'user_000439', 'user_000442', 'user_000468', 'user_000491', 'user_000504', 'user_000577', 'user_000585', 'user_000592', 'user_000595', 'user_000606', 'user_000607', 'user_00

In [31]:
!ls  /data_data/session_length/nishanth01/data/summary/train/

user_000001.csv  user_000257.csv  user_000507.csv  user_000752.csv
user_000002.csv  user_000258.csv  user_000508.csv  user_000753.csv
user_000003.csv  user_000259.csv  user_000509.csv  user_000754.csv
user_000004.csv  user_000260.csv  user_000510.csv  user_000755.csv
user_000005.csv  user_000261.csv  user_000511.csv  user_000757.csv
user_000006.csv  user_000262.csv  user_000512.csv  user_000759.csv
user_000007.csv  user_000263.csv  user_000513.csv  user_000760.csv
user_000008.csv  user_000264.csv  user_000514.csv  user_000761.csv
user_000009.csv  user_000265.csv  user_000515.csv  user_000762.csv
user_000010.csv  user_000266.csv  user_000516.csv  user_000763.csv
user_000011.csv  user_000267.csv  user_000517.csv  user_000764.csv
user_000012.csv  user_000268.csv  user_000518.csv  user_000765.csv
user_000013.csv  user_000269.csv  user_000519.csv  user_000766.csv
user_000014.csv  user_000270.csv  user_000520.csv  user_000768.csv
user_000015.csv  user_000271.csv  user_000521.cs

In [2]:
!head /data_data/session_length/nishanth01/data/summary/train/user_000018.csv

1125067377	18	1	-1	22	6	1125014400	0	0.0	6076
1125085431	18	2	-1	22	6	1125014400	6076	6076.0	9611
1125156450	18	3	-1	22	6	1125014400	9611	7843.5	4982
1125174250	18	4	-1	22	6	1125014400	4982	6889.67	14239
1125243080	18	5	-1	22	6	1125014400	14239	8727.0	10443
1125257674	18	6	-1	22	6	1125014400	10443	9070.2	5359
1125328391	18	7	-1	22	6	1125014400	5359	8451.67	11775
1125421583	18	8	-1	22	6	1125014400	11775	8926.43	4015
1125433277	18	9	-1	22	6	1125014400	4015	8312.5	14443
1125508109	18	10	-1	22	6	1125014400	14443	8993.67	4959


In [17]:
!tail /data_data/session_length/nishanth01/data/summary/test/user_000008.csv

1240530636	8	262	1	26	39	1159401600	16222	11723.88	185
1240641231	8	263	1	26	39	1159401600	185	11679.84	3657
1240693904	8	264	1	26	39	1159401600	3657	11649.33	380
1240737810	8	265	1	26	39	1159401600	380	11606.65	201
1240742314	8	266	1	26	39	1159401600	201	11563.61	311
1240764831	8	267	1	26	39	1159401600	311	11521.3	12137
1240853592	8	268	1	26	39	1159401600	12137	11523.61	11722
1240872325	8	269	1	26	39	1159401600	11722	11524.35	26116
1240915813	8	270	1	26	39	1159401600	26116	11578.59	34470
1240997587	8	271	1	26	39	1159401600	34470	11663.38	2213
