In [2]:
%%writefile BuildUserProfiles.py


import pandas as pd
import numpy as np
import os
import glob
import random
import csv
import data
import util
from data import Vocab
from tqdm import tqdm
import shutil
import collections


from datetime import datetime as dt
from datetime import timedelta
from sklearn.model_selection import train_test_split



# Output file: process() opens this path for writing and emits one
# tab-separated profile row per user.
final_file = '/data_data/session_length/nishanth01/data/user_profile_cluster.csv'

# Vocabulary files. The tracks and artist files are loaded through Vocab in the
# __main__ block; country_vocab_file is not referenced in the visible part of
# this script.
country_vocab_file = '/data_data/session_length/nishanth01/data/country_vocab.csv'
tracks_vocab_file = '/data_data/session_length/nishanth01/data/tracks_vocab.csv'
artist_vocab_file = '/data_data/session_length/nishanth01/data/artist_vocab.csv'
# Path template for one user's session rows; get_user_details() fills {0}
# with the user id.
user_sessions = '/data_data/session_length/nishanth01/data/final/train/{0}.csv'


def get_users():
    """Load the user ids that profiles should be built for.

    Reads the hard-coded ``unique_users.txt`` file with pandas, labelling the
    parsed data with the column name ``user_id``.

    Returns:
        Whatever ``pd.read_csv`` returns for that file (a DataFrame whose
        column is named ``user_id``).
    """
    return pd.read_csv(
        '/data_data/session_length/nishanth01/data/unique_users.txt',
        names=['user_id'],
    )


def get_user_details(user_id):
    """Read every session row stored for one user.

    The ``user_sessions`` path template is filled with ``user_id`` and expanded
    with ``glob``; each matching tab-separated file is parsed without a header
    row and the results are concatenated.

    Args:
        user_id: value substituted into the ``user_sessions`` template.

    Returns:
        A DataFrame with the fifteen columns named in ``columns`` below.

    Raises:
        ValueError: if no file matches the path built for ``user_id``. The
            exception type is the one ``pd.concat`` raised for an empty input
            before; only the message is now specific.
    """
    file_name = user_sessions.format(user_id)
    columns = ['user','current','start','session_id',
               'prev_session_length','avg_session_length',
               'gender','age','country','registered',
               'track_duration','times_played','artist','track','session_length']

    # glob returns matches in arbitrary order; sorting keeps the row order
    # (and therefore "last row" lookups by callers) deterministic.
    complete_files = sorted(glob.glob(file_name))
    if not complete_files:
        # pd.concat would fail here with "No objects to concatenate", which
        # says nothing about which user or path was missing.
        raise ValueError(
            'No session file found for user {0}: {1}'.format(user_id, file_name))

    user_data = pd.concat(
        [pd.read_csv(f,names=columns,sep='\t') for f in complete_files])
    return user_data

    

def create_user(user_id,tracks_vocab,artist_vocab):
    """Build the profile row for one user from that user's session rows.

    Output columns, in order:
        user, gender, age, country, registered, top_artist, top_track,
        total_sessions, avg_session_length

    ``user``, ``gender``, ``age``, ``country``, ``registered`` and
    ``avg_session_length`` are taken from the last session row.
    ``total_sessions`` is the largest ``session_id`` seen (never below 0).
    ``top_artist`` / ``top_track`` are the most frequent artist and
    (track, artist) pair, ignoring ids equal to the vocabulary's unknown token.

    Args:
        user_id: identifier handed to ``get_user_details`` to load the rows.
        tracks_vocab: object whose ``word2id`` gives the unknown-track id.
        artist_vocab: object whose ``word2id`` gives the unknown-artist id.

    Returns:
        A list with the nine values above, or an empty list when the profile
        could not be built (the reason is printed together with ``user_id``).
    """
    # Keep the caller's id for diagnostics; the profile's ``user`` value comes
    # from the data rows instead.
    requested_user = user_id
    user_data = []

    try:
        user = get_user_details(user_id)
        total_count = len(user.index)

        # Loop-invariant lookups: resolve the unknown-token ids once instead of
        # once per session row.
        unknown_artist = artist_vocab.word2id(data.UNKNOWN_TOKEN)
        unknown_track = tracks_vocab.word2id(data.UNKNOWN_TOKEN)

        last_row = None
        total_sessions = 0
        tracks_counter = collections.Counter()
        artist_counter = collections.Counter()

        # iterrows() is kept deliberately so the values read from each row are
        # the same objects as before and the written profile is unchanged.
        for _, row in tqdm(user.iterrows(),total = total_count):
            last_row = row
            artist_id = row['artist']
            track_id = row['track']
            session_id = row['session_id']

            if total_sessions < session_id:
                total_sessions = session_id

            if artist_id != unknown_artist:
                artist_counter[artist_id] += 1

            if track_id != unknown_track:
                # Count (track, artist) pairs so the same track id under two
                # artists is not merged into one entry.
                tracks_counter[(track_id, artist_id)] += 1

        if last_row is None:
            raise ValueError('no session rows found')
        if not artist_counter or not tracks_counter:
            raise ValueError('no known artist or track in the session rows')

        top_artist = artist_counter.most_common(1)[0][0]
        top_track = tracks_counter.most_common(1)[0][0][0]

        user_data.append(last_row['user'])#user
        user_data.append(last_row['gender'])#gender id
        user_data.append(last_row['age'])#age
        user_data.append(last_row['country'])#country id
        user_data.append(last_row['registered'])#registered time
        user_data.append(top_artist)#top artist
        user_data.append(float(top_track))#top track
        user_data.append(total_sessions)#total sessions
        user_data.append(last_row['avg_session_length'])#avg session length
    except Exception as e:
        # Best effort: a user that cannot be profiled yields an empty row, but
        # the message now says which user failed and why.
        print('Failed to build profile for user {0}: {1}'.format(requested_user, e))
        user_data = []

    return user_data
    
    
def process(tracks_vocab,artist_vocab):
    """Build a profile row for every user and write them to ``final_file``.

    Users come from ``get_users()``; each profile is produced by
    ``create_user`` and written as one tab-separated row. A user whose profile
    is empty (``create_user`` signals failure that way) or whose row cannot be
    written is skipped and reported at the end instead of leaving a blank line
    in the output file.

    Args:
        tracks_vocab: forwarded unchanged to ``create_user``.
        artist_vocab: forwarded unchanged to ``create_user``.

    Returns:
        None. Progress and failures are printed.
    """
    print('Starting..')
    try:
        os.remove(final_file)
    except OSError:
        # Best effort: typically the file simply does not exist yet.
        pass

    failed = []
    try:
        users = get_users()
        with open(final_file,'w+') as data_out:
            writer = csv.writer(data_out,quoting=csv.QUOTE_NONNUMERIC,delimiter='\t')
            # Iterating the column directly means user_id is always bound
            # before anything inside the try below can fail.
            for user_id in users['user_id']:
                try:
                    user_data = create_user(user_id,tracks_vocab,artist_vocab)
                    if not user_data:
                        # Empty profile == failed user; do not emit a blank row.
                        failed.append(user_id)
                        continue
                    writer.writerow(user_data)
                except Exception as e:
                    print(e)
                    failed.append(user_id)
    except Exception as e:
        print('EXCEPTION 0 :::',e)

    if failed:
        print('FAILED Users ({0}): {1}'.format(len(failed), failed))

    print('COMPLETE!')


if __name__ == '__main__':
    # Script entry point: load the tracks vocabulary, then the artist
    # vocabulary, and build the profile file from them.
    tracks_vocab, artist_vocab = Vocab(tracks_vocab_file), Vocab(artist_vocab_file)
    process(tracks_vocab, artist_vocab)

Overwriting BuildUserProfiles.py


In [None]:
#!python BuildUserProfiles.py

In [17]:
!tail /data_data/session_length/nishanth01/data/user_profile_cluster.csv

991.0	0.0	0.0	11.0	1138492800.0	829.0	774987.0	255.0	4278.49
992.0	1.0	0.0	14.0	1164153600.0	719.0	485731.0	215.0	3180.76
993.0	1.0	0.0	11.0	1157500800.0	740.0	45195.0	2678.0	3195.15
994.0	1.0	0.0	0.0	1145318400.0	249.0	25209.0	561.0	3330.26
995.0	0.0	0.0	16.0	1137888000.0	250.0	3110.0	1249.0	9117.67
996.0	0.0	0.0	5.0	1153094400.0	15.0	10519.0	108.0	10524.15
997.0	1.0	0.0	5.0	1167955200.0	21278.0	495946.0	19.0	139375.78
998.0	1.0	0.0	6.0	1127865600.0	15.0	28332.0	1842.0	3561.16
999.0	0.0	0.0	7.0	1185235200.0	20.0	640337.0	1225.0	6562.31
1000.0	1.0	0.0	5.0	1174694400.0	141.0	914185.0	971.0	4943.69


In [10]:
!pwd

/home/nishanth01/model2


In [16]:
!tail -1000 nohup.out

COMPLETED: user_000999
COMPLETED: user_001000
('FAILED Users: ', ['user_000028', 'user_000029', 'user_000031', 'user_000033', 'user_000040', 'user_000041', 'user_000053', 'user_000060', 'user_000062', 'user_000069', 'user_000074', 'user_000089', 'user_000091', 'user_000107', 'user_000112', 'user_000121', 'user_000122', 'user_000125', 'user_000135', 'user_000138', 'user_000142', 'user_000158', 'user_000162', 'user_000174', 'user_000183', 'user_000210', 'user_000219', 'user_000237', 'user_000249', 'user_000274', 'user_000281', 'user_000296', 'user_000341', 'user_000359', 'user_000362', 'user_000366', 'user_000371', 'user_000397', 'user_000423', 'user_000425', 'user_000427', 'user_000439', 'user_000442', 'user_000468', 'user_000491', 'user_000504', 'user_000577', 'user_000585', 'user_000592', 'user_000595', 'user_000606', 'user_000607', 'user_000611', 'user_000661', 'user_000670', 'user_000681', 'user_000685', 'user_000706', 'user_000710', 'user_000730', 'user_000740', 'user_000750', 'u

  0%|          | 0/78149 [00:00<?, ?it/s]  1%|1         | 867/78149 [00:00<00:08, 8661.17it/s]  2%|2         | 1783/78149 [00:00<00:08, 8910.06it/s]  3%|3         | 2702/78149 [00:00<00:08, 9003.21it/s]  5%|4         | 3630/78149 [00:00<00:08, 9070.83it/s]  6%|5         | 4556/78149 [00:00<00:08, 9106.92it/s]  7%|7         | 5488/78149 [00:00<00:07, 9141.77it/s]  8%|8         | 6426/78149 [00:00<00:07, 9174.92it/s]  9%|9         | 7357/78149 [00:00<00:07, 9191.20it/s] 11%|#         | 8288/78149 [00:00<00:07, 9204.05it/s] 12%|#1        | 9226/78149 [00:01<00:07, 9220.95it/s] 13%|#2        | 10159/78149 [00:01<00:07, 9230.14it/s] 14%|#4        | 11094/78149 [00:01<00:07, 9239.62it/s] 15%|#5        | 12025/78149 [00:01<00:07, 9244.57it/s] 17%|#6        | 12964/78149 [00:01<00:07, 9254.81it/s] 18%|#7        | 13901/78149 [00:01<00:06, 9261.86it/s] 19%|#8        | 14840/78149 [00:01<00:06, 9269.36it/s] 20%|##        | 15773/78149 [00:01<00:06, 9272.62it/s] 21%|##1       |

  0%|          | 0/8223 [00:00<?, ?it/s] 11%|#1        | 908/8223 [00:00<00:00, 9070.56it/s] 22%|##1       | 1793/8223 [00:00<00:00, 8959.80it/s] 33%|###3      | 2721/8223 [00:00<00:00, 9066.43it/s] 44%|####4     | 3649/8223 [00:00<00:00, 9119.24it/s] 56%|#####5    | 4581/8223 [00:00<00:00, 9159.09it/s] 67%|######7   | 5526/8223 [00:00<00:00, 9206.21it/s] 78%|#######8  | 6454/8223 [00:00<00:00, 9216.01it/s] 90%|########9 | 7383/8223 [00:00<00:00, 9225.08it/s]100%|##########| 8223/8223 [00:00<00:00, 9176.90it/s]
  0%|          | 0/11415 [00:00<?, ?it/s]  8%|7         | 909/11415 [00:00<00:01, 9085.72it/s] 16%|#6        | 1829/11415 [00:00<00:01, 9141.38it/s] 24%|##4       | 2759/11415 [00:00<00:00, 9191.64it/s] 32%|###2      | 3690/11415 [00:00<00:00, 9219.69it/s] 41%|####      | 4625/11415 [00:00<00:00, 9245.41it/s] 49%|####8     | 5564/11415 [00:00<00:00, 9268.68it/s] 57%|#####6    | 6505/11415 [00:00<00:00, 9288.49it/s] 65%|######5   | 7447/11415 [00:00<00:00, 930

  0%|          | 0/33849 [00:00<?, ?it/s]  3%|2         | 896/33849 [00:00<00:03, 8954.63it/s]  5%|5         | 1818/33849 [00:00<00:03, 9086.73it/s]  8%|8         | 2747/33849 [00:00<00:03, 9151.97it/s] 11%|#         | 3677/33849 [00:00<00:03, 9187.47it/s] 14%|#3        | 4606/33849 [00:00<00:03, 9206.09it/s] 16%|#6        | 5544/33849 [00:00<00:03, 9234.21it/s] 19%|#9        | 6478/33849 [00:00<00:02, 9249.29it/s] 22%|##1       | 7416/33849 [00:00<00:02, 9265.35it/s] 25%|##4       | 8357/33849 [00:00<00:02, 9280.51it/s] 27%|##7       | 9292/33849 [00:01<00:02, 9287.23it/s] 30%|###       | 10231/33849 [00:01<00:02, 9296.54it/s] 33%|###3      | 11174/33849 [00:01<00:02, 9307.55it/s] 36%|###5      | 12116/33849 [00:01<00:02, 9315.59it/s] 39%|###8      | 13057/33849 [00:01<00:02, 9322.19it/s] 41%|####1     | 13997/33849 [00:01<00:02, 9327.15it/s] 44%|####4     | 14945/33849 [00:01<00:02, 9336.39it/s] 47%|####6     | 15889/33849 [00:01<00:01, 9342.29it/s] 50%|####9     |