In [21]:
%%writefile BuildUserProfiles.py


import pandas as pd
import numpy as np
import os
import glob
import random
import csv
import data
import util
from data import Vocab
from tqdm import tqdm
import shutil
import collections
import statistics



from datetime import datetime as dt
from datetime import timedelta
from sklearn.model_selection import train_test_split



# Output file: process() writes one tab-separated profile row per user here.
final_file = '/data_data/session_length/nishanth01/data/user_profile_cluster.csv'

# Vocabulary files. The tracks and artist files are loaded into Vocab objects
# in the __main__ block; country_vocab_file is not referenced by the code
# visible in this script.
country_vocab_file = '/data_data/session_length/nishanth01/data/country_vocab.csv'
tracks_vocab_file = '/data_data/session_length/nishanth01/data/tracks_vocab.csv'
artist_vocab_file = '/data_data/session_length/nishanth01/data/artist_vocab.csv'
# Per-user session file template; get_user_details() fills {0} with the user id.
user_sessions = '/data_data/session_length/nishanth01/data/final/train/{0}.csv'


def get_users():
    """Read the list of unique users.

    Returns:
        The DataFrame produced by ``pd.read_csv`` for ``unique_users.txt``,
        with its single column named ``user_id``.
    """
    return pd.read_csv(
        '/data_data/session_length/nishanth01/data/unique_users.txt',
        names=['user_id']
    )


def get_user_details(user_id):
    """Load every session row recorded for one user.

    The path is built by substituting ``user_id`` into the module-level
    ``user_sessions`` template and resolved through ``glob``; each matching
    file is read as a header-less, tab-separated table.

    Args:
        user_id: value substituted into the ``user_sessions`` template.

    Returns:
        A DataFrame (concatenation of all matching files) with the columns
        listed in ``columns`` below.

    Raises:
        ValueError: if no file matches the user's path. Without this check
            ``pd.concat`` would raise the same exception type with the far
            less helpful message "No objects to concatenate".
    """
    file_name = user_sessions.format(user_id)
    columns = ['user','current','start','session_id',
               'prev_session_length','avg_session_length',
               'gender','age','country','registered',
               'track_duration','times_played','artist','track','session_length']

    complete_files = glob.glob(file_name)
    if not complete_files:
        raise ValueError(
            'No session file found for user {0}: {1}'.format(user_id, file_name))

    frames = [pd.read_csv(f,names=columns,sep='\t') for f in complete_files]
    return pd.concat(frames)

    

#user,gender,age,country,registered,top_artist,top_track,total_sessions,avg_session_length,max_session_length,median_session_length,total_count
def create_user(user_id,tracks_vocab,artist_vocab):
    """Reduce one user's session rows to a single profile row.

    Args:
        user_id: id passed to ``get_user_details`` to load the user's rows.
        tracks_vocab: vocabulary object; only ``word2id`` is called on it,
            to obtain the id of ``data.UNKNOWN_TOKEN`` for tracks.
        artist_vocab: same as ``tracks_vocab`` but for artists.

    Returns:
        A 12-element list, in this order: user, gender, age, country,
        registered, top artist, top track (as float), highest session id,
        avg session length, max session length, median session length,
        total number of rows read.
        user/gender/age/country/registered/avg session length are taken from
        the last row iterated. An empty list is returned when anything fails;
        the error is printed rather than raised, and a partially filled row
        is never returned.
    """
    try:
        user = get_user_details(user_id)
        total_count = len(user.index)

        # Loop-invariant lookups: ids that mark an unknown artist / track.
        unknown_artist = artist_vocab.word2id(data.UNKNOWN_TOKEN)
        unknown_track = tracks_vocab.word2id(data.UNKNOWN_TOKEN)

        # Kept separate from the ``user_id`` argument so the argument is
        # still available for the error message below.
        profile_user = 0
        age = 0
        gender = 0
        country = 0
        registered = 0
        total_sessions = 0
        avg_session_length = 0

        session_lengths = {}
        tracks_counter = collections.Counter()
        artist_counter = collections.Counter()

        # NOTE(review): iterrows() is kept deliberately; switching to
        # itertuples()/vectorised code could change the value types that end
        # up in the output row and in the '<track>@@@<artist>' keys — confirm
        # before optimising.
        for _, row in tqdm(user.iterrows(),total = total_count):
            profile_user = row['user']
            artist_id = row['artist']
            track_id = row['track']
            age = row['age']
            gender = row['gender']
            country = row['country']
            registered = row['registered']
            session_id = row['session_id']
            avg_session_length = row['avg_session_length']

            # One length per session; a later row of the same session wins.
            session_lengths[session_id] = row['session_length']

            if(total_sessions < session_id):
                total_sessions = session_id

            if(artist_id != unknown_artist):
                artist_counter.update([artist_id])

            if(track_id != unknown_track):
                # Track plays are counted per (track, artist) pair.
                tracks_counter.update([str(track_id) + '@@@' + str(artist_id)])

        if not artist_counter or not tracks_counter:
            raise ValueError('no rows with a known artist and a known track')

        top_artist = artist_counter.most_common(1)[0][0]
        top_track = tracks_counter.most_common(1)[0][0].split('@@@')[0]

        # .values() works on both Python 2 and 3; .itervalues() exists only
        # on Python 2 and made every user fail silently on Python 3.
        lengths = list(session_lengths.values())
        max_session_length = max(lengths)
        median_session_length = statistics.median(lengths)

        # Build the row in one step so a late failure (e.g. float()) cannot
        # leak a truncated row to the caller.
        return [
            profile_user,            # user
            gender,                  # gender id
            age,                     # age
            country,                 # country id
            registered,              # registered time
            top_artist,              # top artist
            float(top_track),        # top track
            total_sessions,          # total sessions
            avg_session_length,      # avg session length
            max_session_length,      # max session length
            median_session_length,   # median session length
            total_count,             # total session data count
        ]
    except Exception as e:
        print('FAILED user {0}: {1}'.format(user_id, e))
        return []
    
    
def process(tracks_vocab,artist_vocab):
    """Build the profile file for every user returned by ``get_users``.

    Any existing ``final_file`` is removed, then one tab-separated row per
    user (as produced by ``create_user``) is written to it. Users for which
    ``create_user`` returns an empty list or raises are skipped — no blank
    row is written for them — and their ids are printed at the end.

    Args:
        tracks_vocab: passed through unchanged to ``create_user``.
        artist_vocab: passed through unchanged to ``create_user``.

    Returns:
        None. Errors are printed, not raised (best-effort batch job).
    """
    print('Starting..')
    try:
        os.remove(final_file)
    except OSError:
        # Nothing to remove (e.g. first run); the file is created below.
        pass

    failed = []
    try:
        users = get_users()
        with open(final_file,'w+') as data_out:
            writer = csv.writer(data_out,quoting=csv.QUOTE_NONNUMERIC,delimiter='\t')
            for _, row in users.iterrows():
                # Read the id before the inner try so the except branch
                # always has the id of the user that actually failed.
                user_id = row['user_id']
                try:
                    user_data = create_user(user_id,tracks_vocab,artist_vocab)
                    if not user_data:
                        # create_user signals failure with an empty list;
                        # writing it would only add a blank line to the file.
                        failed.append(user_id)
                        continue
                    writer.writerow(user_data)
                except Exception as e:
                    print(e)
                    failed.append(user_id)
    except Exception as e:
        print('EXCEPTION 0 :::',e)

    if failed:
        print('FAILED users ({0}): {1}'.format(len(failed), failed))

    print('COMPLETE!')


if __name__ == '__main__':
    # Script entry point: load the tracks vocabulary, then the artist
    # vocabulary, and hand both to the profile builder.
    process(Vocab(tracks_vocab_file), Vocab(artist_vocab_file))

Overwriting BuildUserProfiles.py


In [None]:
#!python BuildUserProfiles.py

In [53]:
!tail -10 /data_data/session_length/nishanth01/data/user_profile_cluster.csv

991.0	0.0	0.0	11.0	1138492800.0	829.0	774987.0	255.0	4278.49	24040.0	2680.0	3304
992.0	1.0	0.0	14.0	1164153600.0	719.0	485731.0	215.0	3180.76	60649.0	1390.0	2425
993.0	1.0	0.0	11.0	1157500800.0	740.0	45195.0	2678.0	3195.15	69968.0	1866.5	31738
994.0	1.0	0.0	0.0	1145318400.0	249.0	25209.0	561.0	3330.26	35591.0	2014.0	7037
995.0	0.0	0.0	16.0	1137888000.0	250.0	3110.0	1249.0	9117.67	227854.0	3805.0	48916
996.0	0.0	0.0	5.0	1153094400.0	15.0	10519.0	108.0	10524.15	88606.0	4759.0	4939
997.0	1.0	0.0	5.0	1167955200.0	21278.0	495946.0	19.0	139375.78	1272222.0	5865.0	9505
998.0	1.0	0.0	6.0	1127865600.0	15.0	28332.0	1842.0	3561.16	32793.0	2041.5	21039
999.0	0.0	0.0	7.0	1185235200.0	20.0	640337.0	1225.0	6562.31	57662.0	4376.0	31414
1000.0	1.0	0.0	5.0	1174694400.0	141.0	914185.0	971.0	4943.69	29410.0	3626.0	18372


In [None]:
!pwd

In [51]:
!tail -10 nohup.out

  0%|          | 0/6540 [00:00<?, ?it/s]  9%|8         | 568/6540 [00:00<00:01, 5671.61it/s] 21%|##1       | 1394/6540 [00:00<00:00, 6963.56it/s] 34%|###4      | 2247/6540 [00:00<00:00, 7484.39it/s] 48%|####7     | 3108/6540 [00:00<00:00, 7764.60it/s] 61%|######    | 3972/6540 [00:00<00:00, 7939.43it/s] 74%|#######3  | 4830/6540 [00:00<00:00, 8044.99it/s] 87%|########7 | 5693/6540 [00:00<00:00, 8127.97it/s]100%|##########| 6540/6540 [00:00<00:00, 7885.32it/s]
  0%|          | 0/20760 [00:00<?, ?it/s]  3%|2         | 566/20760 [00:00<00:03, 5656.32it/s]  7%|6         | 1426/20760 [00:00<00:02, 7125.19it/s] 11%|#1        | 2291/20760 [00:00<00:02, 7633.10it/s] 15%|#5        | 3153/20760 [00:00<00:02, 7879.56it/s] 19%|#9        | 4017/20760 [00:00<00:02, 8030.40it/s] 24%|##3       | 4883/20760 [00:00<00:01, 8134.48it/s] 28%|##7       | 5746/20760 [00:00<00:01, 8204.73it/s] 32%|###1      | 6615/20760 [00:00<00:01, 8264.77it/s] 36%|###6      | 7481/20760 [00:00<00:01, 83