In [1]:
import pandas as pd
from pymongo import MongoClient
from math import isnan, sqrt
import copy
from dateutil.parser import parse
import time

## Connect MongoDB using pymongo; Process the data and put in database
* Connect database

In [2]:
# Open the local MongoDB server and bind one module-level handle per
# collection of the 'recommendation' database used by this notebook.
client = MongoClient('mongodb://localhost:27017')
db = client['recommendation']
(user_friends, test, event_info,
 attendance, user_info, train) = (
    db[name] for name in ('user_friends', 'test', 'event_info',
                          'event_attendees', 'user_info', 'train'))

## 1.1 Process user location info
* Process location data in user info: handle null values and outliers, then put the user location info into a dictionary

In [3]:
users = pd.read_csv('/home/ruifan/Downloads/event-recommendation-engine-challenge/users.csv')


# loc_dict: user_id -> {'tokens': [location token, ...]}
# Only users with at least one usable token get an entry, and the token list
# never contains empty strings, purely numeric tokens or 'undefined'.

loc_dict = {}
for user_id, location in zip(users['user_id'], users['location']):
    # A missing location is read by pandas as float NaN, not as a string.
    if not isinstance(location, str):
        continue

    filtered = []
    # Location parts are separated by two spaces.
    for raw in location.split('  '):
        # Strip first so padded values such as ' undefined' or ' ' are
        # recognised as junk instead of being stored.
        t = raw.strip()
        if not t or t == 'undefined':
            continue
        try:
            int(t)
        except ValueError:
            # Not a number, so treat it as a location token.
            filtered.append(t)

    if filtered:
        loc_dict[int(user_id)] = {'tokens': filtered}

## 1.2 Process event location info
* Process location data in event info; Put event id and location info into dictionary (loc_dict2); put event coordinates and location info into latlngdict

In [None]:
# loc_dict2:  event id -> dict of location fields. 'city', 'state' and
#             'country' are present only when known; 'lat' and 'lng' are always
#             present and are both None when either coordinate is missing.
#             Events with no usable field at all are skipped.
# latlngdict: (lat, lng) -> (city, country, state) for events that have both
#             coordinates and a city.
events_chunks = pd.read_csv("/home/ruifan/Downloads/event-recommendation-engine-challenge/events.csv",
                            iterator=True, chunksize=10000)

loc_dict2, latlngdict = {}, {}
row_count = count = loc_dict2_rows = 0
for events in events_chunks:
    row_count += 1  # counts chunks read, despite the name
    for _idx, e in events.iterrows():
        eid = int(e['event_id'])

        # pandas reports a missing text field as float NaN; normalise to None.
        city, state, country = (None if isinstance(v, float) else v
                                for v in (e['city'], e['state'], e['country']))

        # A coordinate pair is only usable when both halves are present.
        lat, lng = e['lat'], e['lng']
        if isnan(lat) or isnan(lng):
            lat = lng = None

        # Skip events that carry no location information whatsoever.
        if not any((city, state, country, lat, lng)):
            continue

        if country == 'Democratic Republic Congo':
            country = 'Democratic Republic of the Congo'

        d = {}
        for field, value in (('city', city), ('state', state), ('country', country)):
            if value:
                d[field] = value
        d['lat'], d['lng'] = lat, lng

        # Events with coordinates and a city feed the coordinate lookup;
        # `count` tallies events that have coordinates but no city.
        if lat and city:
            latlngdict[(lat, lng)] = (city, country, state)
        elif lat:
            count += 1
        loc_dict2[eid] = d


 

* Approximate coordinates and put to new dictionary (latlngapprox)
* Invert latlngapprox dict

In [None]:
# latlngapprox: (lat, lng) snapped to a 0.2-degree grid -> set of
#               (city, country, state) tuples seen in that grid cell.
latlngapprox = {}
for coord, loc in latlngdict.items():
    # int() truncates toward zero, so each coordinate becomes a multiple of 0.2
    # (this is a truncation to a grid, not rounding to one decimal place).
    lat, lng = (int(c * 5) / 5.0 for c in coord)
    c2 = (lat, lng)
    latlngapprox.setdefault(c2, set()).add(loc)


# locationlatlng: (city, country, state) -> grid cell. When a location occurs
# in several cells, the cell visited last wins.
locationlatlng = {
    loc: coord
    for coord, locs in latlngapprox.items()
    for loc in locs
}
       

* Impute incomplete loc info.

In [None]:
# event_location: eid -> collection of (city, country, state) tuples.
# An event can map to several tuples because one 0.2-degree grid cell may hold
# several distinct addresses. The value is a set taken from latlngapprox, or a
# one-element list holding the event's own (possibly partial) tuple when
# nothing could be matched.
event_location = {}
missed = 0
for eid, edict in loc_dict2.items():

    lmin = None

    # Coordinates available: look the event up on the 0.2-degree grid.
    if edict['lat'] and edict['lng']:
        # Work with integer grid indices. The keys of latlngapprox were built
        # as int(x * 5) / 5.0, and only index / 5.0 reproduces them exactly;
        # adding float offsets (e.g. 0.2 + 0.4) yields 0.6000000000000001 and
        # silently misses the cell stored under 0.6.
        klat = int(edict['lat'] * 5)
        klng = int(edict['lng'] * 5)
        ecoord = (klat / 5.0, klng / 5.0)

        # The event's own grid cell has known locations.
        if ecoord in latlngapprox:
            lmin = latlngapprox[ecoord]

        # Otherwise search nearby cells (every second cell, up to 2 degrees
        # away in each direction) and keep the closest populated one.
        else:
            dmin = 5
            offsets = range(-10, 11, 2)
            for di in offsets:
                for dj in offsets:
                    coords = ((klat + di) / 5.0, (klng + dj) / 5.0)
                    if coords in latlngapprox:
                        # Distance from the event's true coordinates to the cell.
                        dist = sqrt((coords[0] - edict['lat'])**2 +
                                    (coords[1] - edict['lng'])**2)
                        if dist < dmin:
                            dmin = dist
                            lmin = latlngapprox[coords]

    # No coordinates, or no populated cell nearby: fall back to the textual
    # location fields.
    if not lmin:
        l = (edict.get('city'), edict.get('country'), edict.get('state'))

        # The same (city, country, state) was seen with coordinates elsewhere:
        # reuse every location recorded for that grid cell.
        if l in locationlatlng:
            coords = locationlatlng[l]
            lmin = latlngapprox[coords]

        # Nothing to impute from: keep the event's own tuple as is.
        else:
            lmin = [l]
            missed += 1
    event_location[eid] = lmin

## 1.3 Set up a city: country dictionary

In [None]:
# countries: every non-empty country name that occurs in event_location.
countries = {
    v[1]
    for vl in event_location.values()
    for v in vl
    if v[1]
}

# cities: country name -> set of city names seen for that country in events.
cities = {c: set() for c in countries}
for vl in event_location.values():
    for city_name, country_name, _state in vl:
        if city_name and country_name:
            cities[country_name].add(city_name)
            

# Lookup tables for the three countries whose states/provinces are recognised
# in location tokens.
# us_states / canada_states map the two-letter code to the full name;
# the *_rev dicts are the inverse (full name -> code).
# australia_states is a plain list of full names (no code mapping).
us_states = {'AL':'Alabama','AK':'Alaska','AZ':'Arizona','AR':'Arkansas','CA':'California','CO':'Colorado',
             'CT':'Connecticut','DE':'Delaware','FL':'Florida','GA':'Georgia','HI':'Hawaii','ID':'Idaho',
             'IL':'Illinois','IN':'Indiana','IA':'Iowa','KS':'Kansas','KY':'Kentucky','LA':'Louisiana',
             'ME':'Maine','MD':'Maryland','MA':'Massachusetts','MI':'Michigan','MN':'Minnesota',
             'MS':'Mississippi','MO':'Missouri','MT':'Montana','NE':'Nebraska','NV':'Nevada',
             'NH':'New Hampshire','NJ':'New Jersey','NM':'New Mexico','NY':'New York',
             'NC':'North Carolina','ND':'North Dakota','OH':'Ohio','OK':'Oklahoma',
             'OR':'Oregon','PA':'Pennsylvania','RI':'Rhode Island','SC':'South Carolina',
             'SD':'South Dakota','TN':'Tennessee','TX':'Texas','UT':'Utah','VT':'Vermont',
             'VA':'Virginia','WA':'Washington','WV':'West Virginia','WI':'Wisconsin','WY':'Wyoming'}

us_states_rev = {v:k for k,v in us_states.items()}
# NOTE(review): 'NL' is spelled 'Newfoundland' here; presumably this matches
# the spelling used in the data - confirm before "correcting" it.
canada_states = {'AB':'Alberta', 'BC': 'British Columbia', 'MB': 'Manitoba', 'NB': 'New Brunswick', 
                 'NL': 'Newfoundland', 'NT': 'Northwest Territories', 'NS': 'Nova Scotia', 'NU': 'Nunavut', 
                 'ON': 'Ontario', 'PE': 'Prince Edward Island', 'QC': 'Quebec', 'SK': 'Saskatchewan', 
                 'YT': 'Yukon'}

canada_states_rev = {v:k for k,v in canada_states.items()}
australia_states = ['Australian Capital Territory', 'New South Wales', 'Northern Territory', 
                    'Queensland', 'South Australia', 'Tasmania', 'Victoria', 'Western Australia']

* Process loc info in loc_dict(users)

In [None]:
# cities_map: city name -> {country name: 1} for every country in which an
# event city of that name was seen (the inverse view of `cities`).
cities_map = {}
for country, cities_set in cities.items():
    for city in cities_set:
        per_country = cities_map.setdefault(city, {})
        per_country[country] = per_country.get(country, 0) + 1

# Resolve each user's raw location tokens into 'country', 'state' and 'city'
# entries, written back into the same loc_dict record.
# `count` is initialised but never incremented in this cell.
count = 0
for uid, udict in loc_dict.items():
    if 'tokens' not in udict:
        continue
    tokens = udict['tokens']
    country = None
    state = None
    city = None
    
    # Country: the first token that is a country name known from events.
    for t in tokens:
        if t in countries:
            country = t
            break
            
    # State: match tokens against state codes and full names. A match also
    # overrides `country`; when several tokens match, the last one wins.
    for t in tokens:
        if t in us_states:
            state = t
            country = 'United States'
        if t in us_states_rev:
            state = us_states_rev[t]
            country = 'United States'
        if t in canada_states:
            country = 'Canada'
            state = t
        if t in canada_states_rev:
            country = 'Canada'
            state = canada_states_rev[t]
        if t in australia_states:
            state = t
            country = 'Australia'
    
    # City: with a known country, take the last token that is one of its cities.
    # NOTE(review): cities[country] raises KeyError if the country was set by a
    # state match but never appeared in an event - confirm this cannot happen.
    if country:
        for t in tokens:
            if t in cities[country]:
                city = t
                
    # Nothing matched so far: try to infer the country from a city name alone.
    if not (city or state or country):
        country_set = set(countries)
        for t in tokens:
            if t in cities_map:
                
                to_remove = ['United States', 'Canada', 'Australia']
                i = 0
                
                # len(cities_map[t]) > 1 means the city name exists in several
                # countries. Drop US/Canada/Australia first (in that order),
                # stopping as soon as one candidate is left.
                # NOTE: pop() mutates cities_map itself, so the removal also
                # affects every later user with the same token.
                while len(cities_map[t]) > 1 and i < 3:
                    if to_remove[i] in cities_map[t]:
                        cities_map[t].pop(to_remove[i])
                    i += 1
                    
                if len(cities_map[t]) == 1:
                    city = t
                    country = list(cities_map[t].keys())[0]
                elif len(cities_map[t]) > 1:
                    
                    # Still ambiguous: keep only the countries that are
                    # candidates for every ambiguous token seen so far.
                    country_set.intersection_update(cities_map[t])
                
        # Few enough candidate countries remain: keep them all.
        # Here `country` becomes a list rather than a string.
        if not country and len(country_set) < 5:
            country = list(country_set)
    
    # Last resort: use the raw token list as the list of candidate countries.
    if not (city or state or country):
        country = tokens
    if state:
        udict['state'] = state
    if country:
        udict['country'] = country
    if city:
        udict['city'] = city

In [None]:
# Expand each user's resolved country (a single name or a list of candidates)
# into udict['locations']: one dict per country, each carrying the user's
# state and city when those are known.
count = 0
for uid, udict in loc_dict.items():
    state, country, city = (udict.get(k) for k in ('state', 'country', 'city'))

    # A single country name is treated as a one-element list.
    if isinstance(country, str):
        country = [country]

    shared = {}
    if state:
        shared['state'] = state
    if city:
        shared['city'] = city

    locations = [{'country': c, **shared} for c in country]
    udict['locations'] = locations
    

* Update the user_info collection in the DB

# Store every user's candidate locations under 'loc', creating the user
# document when it does not exist yet.
for uid, udict in loc_dict.items():
    selector = {'id': uid}
    change = {'$set': {'loc': udict['locations']}}
    user_info.update_one(selector, change, upsert=True)

# Store each event's start time as a numeric timestamp under 'start'.
events = pd.read_csv("/home/ruifan/Downloads/event-recommendation-engine-challenge/events.csv",
                     usecols=['event_id', 'start_time'])

event_info.create_index('eid', unique = True)
# NOTE(review): update_one is called without upsert, so it only touches
# documents that already exist. In file order the event documents are inserted
# by a later cell, so on a fresh database this loop would store nothing -
# confirm the intended execution order.
# NOTE(review): timetuple() drops any timezone information and mktime()
# interprets the result as local time - confirm that is intended.
for _idx, e in events.iterrows():
    t = time.mktime(parse(e['start_time']).timetuple())
    event_info.update_one({'eid': e['event_id']}, {'$set': {'start': t}})

* Update the event_info collection in the DB

# Write one event_info document per event: {'eid': ..., 'loc': [...]}, where
# 'loc' lists the distinct candidate locations with state names normalised
# to their two-letter code where one is known.
event = {}
count = 0
for eid, locs in event_location.items():
    filtered = set()
    for loc in locs:

        # Standardise the state: full US/Canadian names become their code,
        # anything else (including None) is kept unchanged.
        state = us_states_rev.get(loc[2], loc[2])
        state = canada_states_rev.get(state, state)
        filtered.add((loc[0], loc[1], state))
    filtered = [{'city': l[0], 'country': l[1], 'state': l[2]} for l in filtered]
    event = {'eid': eid, 'loc': filtered}

    # Upsert instead of insert: with the unique index on 'eid', a plain
    # insert_one raises DuplicateKeyError as soon as this cell is re-run or
    # the event document already exists. The upsert creates the same
    # {'eid', 'loc'} document when it is missing and otherwise only
    # refreshes 'loc', leaving other stored fields intact.
    event_info.update_one({'eid': eid}, {'$set': {'loc': filtered}}, upsert=True)


# Record the creating user of every event under 'creator'.
events = pd.read_csv("/home/ruifan/Downloads/event-recommendation-engine-challenge/events.csv",
                     usecols=['event_id', 'user_id'])

for event_id, user_id in zip(events['event_id'], events['user_id']):
    creator = int(user_id)
    event_info.update_one({'eid': int(event_id)}, {'$set': {'creator': creator}})

* insert users geoloc into mongo

''' 
using external API to convert address to geolocation

'''

from geopy.geocoders import Nominatim

locator = Nominatim(user_agent = "kkcc_getgeoloc")

def address_to_geolocation(loc):
    """Geocode an address with the module-level ``locator``.

    Parameters
    ----------
    loc : address query passed straight to ``locator.geocode``.

    Returns
    -------
    list or None
        ``[latitude, longitude]`` of the match, or ``None`` when the geocoder
        finds nothing (previously this case raised AttributeError).
    """
    getloc = locator.geocode(loc)
    if getloc is None:
        return None
    return [getloc.latitude, getloc.longitude]

In [None]:
len(event_location)

In [None]:
# All event ids that have location information.
event_ids = [*event_location]

In [None]:
len(loc_dict)

In [None]:
# All user ids that have location information.
user_ids = [*loc_dict]

In [None]:
# Collect the distinct creator ids stored in event_info.
creator_ids = set()
for e in event_info.find():
    c_id = e.get('creator')
    # Events without a creator are skipped. The former default of [] is
    # unhashable, so set.add() raised TypeError on the first such event.
    if c_id is None:
        continue
    creator_ids.add(c_id)
len(creator_ids)

In [None]:
def Intersection(lst1, lst2):
    """Return the set of elements that occur in both ``lst1`` and ``lst2``."""
    common = set(lst1)
    common &= set(lst2)
    return common

# Users that have location info and also created at least one event.
# Note that creator_ids is rebound from a set to a list here.
creator_ids = list(creator_ids)
its = Intersection(user_ids,creator_ids)
# Displayed as the cell output: size of the overlap.
len(its)

In [None]:
# Collect the ids of events with at least one "yes" response (att_eids) and
# the ids of all users who answered "yes" (att_uids, duplicates kept).
event_attendees = pd.read_csv('/home/ruifan/Downloads/event-recommendation-engine-challenge/event_attendees.csv')
att_eids, att_uids = [], []
for _idx, event in event_attendees.iterrows():
    yes_field = event['yes']
    # An empty "yes" column is read as float NaN.
    if isinstance(yes_field, float):
        continue
    users = yes_field
    uids = [int(u) for u in users.split()]
    eid = event['event']
    att_eids.append(eid)
    att_uids += uids

In [None]:
# location_cache: "city, state, country" string -> [lat, lng], filled below.
location_cache = {}
# Despite the name, event_chunks is one DataFrame (no chunked reading) that
# holds only the five location columns.
event_chunks = pd.read_csv(
    '/home/ruifan/Downloads/event-recommendation-engine-challenge/events.csv',
    usecols=['city', 'state', 'country', 'lat', 'lng'],
)

In [None]:
event_chunks.isna().sum()

* build a dict of spatial info
* location_cache: key is address, value is geolocation(coordinates)

# Fill location_cache from every event that has both an address and usable
# coordinates. When several events share an address string, the last wins.
count = 0
for _idx, e in event_chunks.iterrows():
    count += 1
    if count % 100000 == 0:
        print(count)

    # A missing text field is float NaN, and NaN is truthy. A plain
    # `city or state or country` test therefore accepted rows with no address
    # at all and stored them under the bogus key 'nan, nan, nan'.
    has_address = any(pd.notna(e[k]) and e[k] for k in ('city', 'state', 'country'))

    # Both coordinates must be present (not NaN) and non-zero.
    has_coords = bool(e['lat'] and e['lng']
                      and not isnan(e['lat']) and not isnan(e['lng']))

    if has_address and has_coords:
        location = [e['lat'], e['lng']]
        # Missing parts are deliberately rendered as 'nan' so the key format
        # matches the lookups performed elsewhere in this notebook.
        loc_string = '%s, %s, %s' % (e['city'], e['state'], e['country'])
        location_cache[loc_string] = location

In [None]:
len(location_cache)

* insert geolocation info into event_info

# For every event that can be placed on the map, store its coordinates under
# 'location' together with the row's values at positions 9..109 under 'words'.
events_chunks = pd.read_csv("/home/ruifan/Downloads/event-recommendation-engine-challenge/events.csv",
                            iterator=True, chunksize=10000)
count = 0
for chunk in events_chunks:
    for _idx, e in chunk.iterrows():
        count += 1
        if count % 100000 == 0:
            print(count)

        eid = e['event_id']
        location = None

        # Own coordinates are available (present and non-zero).
        if e['lat'] and e['lng'] and not isnan(e['lat']) and not isnan(e['lng']):
            location = [e['lat'], e['lng']]

        # Otherwise fall back to the coordinates cached for the same address.
        # NaN is truthy, so the address fields are tested with pd.notna;
        # a plain `or` chain let rows with no address at all look up
        # 'nan, nan, nan'.
        elif any(pd.notna(e[k]) and e[k] for k in ('city', 'state', 'country')):
            loc_string = '%s, %s, %s' % (e['city'], e['state'], e['country'])

            if loc_string in location_cache:
                location = location_cache[loc_string]

        if location:
            # NOTE(review): positions 9..109 are presumably the word-count
            # columns of events.csv - confirm against the file header.
            words = list(e[9:110])
            event_info.update_one({'eid': eid}, {'$set': {'location': location, 'words': words}})

* insert geoloc info into user_info

# Give every user whose first candidate location has cached coordinates a
# 'location' field holding those coordinates.
count = 0
# Missing parts are rendered as 'nan', matching the keys of location_cache.
nan = float('nan')
for user in user_info.find():
    count += 1
    if count % 10000 == 0:
        print(count)

    location = None
    locs = user.get('loc', [])
    loc = locs[0] if locs else None

    # Users without any location are skipped. Previously the cache lookup
    # ran with whatever loc_string the preceding user had left behind (or
    # raised NameError for the very first user), so such users inherited
    # somebody else's coordinates.
    if not loc:
        continue

    loc_string = '%s, %s, %s' % (loc.get('city', nan), loc.get('state', nan), loc.get('country', nan))
    if loc_string in location_cache:
        location = location_cache[loc_string]
        user_info.update_one({'id': user.get('id')}, {'$set': {'location': location}})

* insert age into user_info

# Derive each user's age (relative to 2013) from the stored birth year.
# Unparseable, missing, or pre-1940 birth years give an age of None.
for u in user_info.find():
    try:
        # .get() so that documents without a 'birth' field are handled like
        # any other unusable value instead of raising KeyError.
        a = int(u.get('birth'))
    except (TypeError, ValueError, OverflowError):
        # None, 'None', NaN, infinities and other non-numeric values.
        a = None
    else:
        a = None if a < 1940 else 2013 - a
    user_info.update_one({'id': u['id']}, {'$set': {'age': a}})

* insert mean age of users who attended this event into event_info

In [None]:
# user_dict: user id -> full user_info document, held in memory.
user_dict = {u['id']: u for u in user_info.find()}

In [None]:
import numpy as np

# For each event, average the known ages of the users who answered "yes" and
# store the result under 'mean_age'. Events with no usable age are untouched.
count = 0
mean_ages = []
for e in event_info.find():
    attends = attendance.find({'eid': e['eid']})
    ages = [
        user_dict[at['uid']]['age']
        for at in attends
        if 'yes' in at
        and at['uid'] in user_dict
        and user_dict[at['uid']]['age']
    ]

    if ages:
        mean_age = np.mean(ages)
        event_info.update_one({'eid': e['eid']}, {'$set': {'mean_age': mean_age}})

## 1.4 Process user locale and timezone info

In [None]:
# user_data: user id -> dict with optional 'locale', optional 'tz' and the
# location fields gathered so far.
# user_data is the very same object as loc_dict (no copy), so everything added
# here is visible through loc_dict too. Users that had no location string get
# a fresh entry.
user_data = loc_dict

user_info_file = pd.read_csv('/home/ruifan/Downloads/event-recommendation-engine-challenge/users.csv')
for _idx, user in user_info_file.iterrows():
    uid = int(user['user_id'])
    d = {}
    # A missing locale is read as float NaN.
    if 'locale' in user and not isinstance(user['locale'], float):
        d['locale'] = user['locale']
    if 'timezone' in user and not isnan(user['timezone']):
        d['tz'] = user['timezone']
    user_data.setdefault(uid, {}).update(d)


# Overwrite 'locations' with what is stored in the database ([] when absent).
for user in user_info.find():
    user_data[user['id']]['locations'] = user.get('loc', [])

* Insert users' birth year and gender info

# Copy birth year and gender from users.csv into user_info, creating the user
# document when it does not exist yet.
for _idx, user in user_info_file.iterrows():
    uid = int(user['user_id'])
    fields = {'birth': user['birthyear'], 'gender': user['gender']}
    user_info.update_one({'id': uid}, {'$set': fields}, upsert=True)

In [None]:
# vote_locale: locale -> {country: number of user locations in that country}.
# `countries` is rebuilt here as the set of countries seen in user locations.
vote_locale = {}
countries = set()
for user in user_data.values():
    # Only users that have both a locale and a locations list take part.
    if not ('locale' in user and 'locations' in user):
        continue
    locale = user['locale']
    tally = vote_locale.setdefault(locale, {})
    for l in user['locations']:
        country = l['country']
        countries.add(country)
        tally[country] = tally.get(country, 0) + 1

In [None]:
# locales: locale code ('language_COUNTRY') -> country name.
# Used to guess a user's country from the locale when the location is
# missing or points somewhere else.
locales = {'es_NI': 'Nicaragua', 'tr_TR': 'Turkey', 'en_SG': 'Singapore', 'th_TH': 'Thailand', 
           'es_VE': 'Venezuela', 'hu_HU': 'Hungary', 'es_AR': 'Argentina', 'ar_EG': 'Egypt', 
           'is_IS': 'Iceland', 'zh_HK': 'Hong Kong', 'de_AT': 'Austria', 'pt_BR': 'Brazil', 
           'cs_CZ': 'Czech Republic', 'sk_SK': 'Slovakia', 'mk_MK': 'Macedonia', 'ar_MA': 'Morocco', 
           'en_ZA': 'South Africa', 'sv_SE': 'Sweden', 'in_ID': 'Indonesia', 'es_PR': 'Puerto Rico', 
           'sr_ME': 'Montenegro', 'fr_FR': 'France', 'fi_FI': 'Finland', 'et_EE': 'Estonia', 'sr_RS': 
           'Serbia', 'es_PY': 'Paraguay', 'no_NO': 'Norway', 'nl_NL': 'Netherlands', 'es_PE': 'Peru', 
           'lv_LV': 'Latvia', 'es_PA': 'Panama', 'el_CY': 'Cyprus', 'ro_RO': 'Romania', 
           'iw_IL': 'Israel', 'es_CO': 'Colombia', 'es_CL': 'Chile', 'es_CR': 'Costa Rica', 
           'hr_HR': 'Croatia', 'ru_RU': 'Russia', 'da_DK': 'Denmark', 'ar_LB': 'Lebanon', 
           'sq_AL': 'Albania', 'ms_MY': 'Malaysia', 'ar_OM': 'Oman', 'es_HN': 'Honduras', 
           'pt_PT': 'Portugal', 'vi_VN': 'Vietnam', 'en_NZ': 'New Zealand', 'ar_YE': 'Yemen', 
           'ar_SD': 'Sudan', 'be_BY': 'Belarus', 'sr_CS': 'Serbia and Montenegro', 'ar_BH': 'Bahrain', 
           'ar_JO': 'Jordan', 'es_EC': 'Ecuador', 'hi_IN': 'India', 'ja_JP': 'Japan', 
           'lt_LT': 'Lithuania', 'sl_SI': 'Slovenia', 'es_ES': 'Spain', 'en_GB': 'United Kingdom', 
           'bg_BG': 'Bulgaria', 'es_SV': 'El Salvador', 'zh_TW': 'Taiwan', 'sr_BA': 'Bosnia and Herzegovina', 
           'ar_AE': 'United Arab Emirates', 'es_BO': 'Bolivia', 'zh_CN': 'China', 
           'it_CH': 'Switzerland', 'ar_IQ': 'Iraq', 'ar_QA': 'Qatar', 'ar_SA': 'Saudi Arabia', 'ar_LY': 'Libya', 
           'it_IT': 'Italy', 'uk_UA': 'Ukraine', 'el_GR': 'Greece', 'ar_SY': 'Syria', 'fr_BE': 'Belgium', 
           'ar_DZ': 'Algeria', 'ga_IE': 'Ireland', 'es_GT': 'Guatemala', 'en_AU': 'Australia', 
           'ar_TN': 'Tunisia', 'es_UY': 'Uruguay', 'en_PH': 'Philippines', 'mt_MT': 'Malta', 
           'es_US': 'United States', 'ko_KR': 'South Korea', 'de_LU': 'Luxembourg', 'de_DE': 'Germany', 
           'es_MX': 'Mexico', 'fr_CA': 'Canada', 'es_DO': 'Dominican Republic', 'pl_PL': 'Poland', 
           'ar_KW': 'Kuwait'}

# Additional locale codes merged into the same table.
locales.update({'af_ZA': 'South Africa','cy_GB': 'United Kingdom','bn_IN': 'India','ca_ES': 'Spain',
                'az_AZ': 'Azerbaijan', 'id_ID': 'Indonesia', 'ka_GE': 'Georgia', 'km_KH': 'Cambodia', 
                'pa_IN': 'India', 'ku_TR': 'Turkey', 'en_IN': 'India', 'he_IL': 'Israel', 
                'bs_BA': 'Bosnia and Herzegovina', 'fa_IR': 'Iran', 'mn_MN': 'Mongolia', 'tl_PH': 'Philippines',
                'nb_NO': 'Norway', 'jv_ID': 'Indonesia'})

In [None]:
# Add the country implied by the user's locale as an extra candidate location
# whenever none of the user's existing locations is in that country.
# Existing locations are kept; the locale country is appended, not substituted.
for uid, user in user_data.items():
    if 'locale' not in user or user['locale'] not in locales:
        continue

    locale_country = locales[user['locale']]
    locs = user.get('locations', [])
    known_countries = [loc['country'] for loc in locs]

    if locale_country not in known_countries:
        locs.append({'country': locale_country})
        user_data[uid]['locations'] = locs

In [None]:
# Number of users that still have no location at all (5032 when last run).
count = sum(
    1
    for user in user_data.values()
    if 'locations' not in user or not user['locations']
)

In [None]:
# timezones: tz value -> {country: number of user locations in that country}.
# `count` ends up as the number of users skipped for lack of tz or locations.
count = 0
timezones = {}
for user in user_data.values():
    if not ('tz' in user and 'locations' in user and user['locations']):
        count += 1
        continue
    tz = timezones.setdefault(user['tz'], {})
    for loc in user['locations']:
        where = loc['country']
        tz[where] = tz.get(where, 0) + 1

In [None]:
# Prune each timezone's country tally:
#   * fewer than 5 votes in total -> no countries at all;
#   * otherwise keep the top country, plus the following ones for as long as
#     each has more than a fifth of all votes.
for tz in timezones.keys():
    countries = sorted(timezones[tz].items(), key=lambda kv: kv[1], reverse=True)
    s = sum(votes for _name, votes in countries)
    if s < 5:
        timezones[tz] = {}
        continue

    i = 1
    for _name, votes in countries[1:]:
        if votes <= s / 5.0:
            break
        i += 1

    countries = dict(countries[:i])
    timezones[tz] = countries
    

## 1.5 Process friends info

In [None]:
# friends_dict: user id -> list of friend ids read from user_friends.csv.
# Users with an empty friends column keep an empty list.
friends_file = pd.read_csv('/home/ruifan/Downloads/event-recommendation-engine-challenge/user_friends.csv')
friends_dict = {}

for _idx, record in friends_file.iterrows():
    uid1 = record['user']
    bucket = friends_dict.setdefault(uid1, [])
    friends = record['friends']
    # A missing friends column is read as float NaN.
    if isinstance(friends, float):
        continue
    friends = [int(u) for u in friends.split()]
    bucket.extend(friends)

'''
for uid, friends in friends_dict.items():
    record = {'uid': uid,'friends': list(friends)}
    user_friends.insert_one(record)
'''

In [None]:
# friends: uid -> set of that user's friends who are themselves known users
# (i.e. present in user_data). Friend lists come from the user_friends
# collection.
user_ids = set(user_data)
friends = {}
for f in user_friends.find():
    friends[f['uid']] = user_ids & set(f['friends'])

In [None]:
# Extend every user's candidate locations with locations that are common
# among the user's friends.
# `count` is initialised but never incremented in this cell.
count = 0
for uid in user_ids:
    
    # Tally the locations of this user's friends.
    # friend_votes: 'country-state-city' key -> {'location': dict, 'vote': int}
    # e.g. [('Indonesia-None-None', {'location': {'country': 'Indonesia'}, 'vote': 1})]
    friend_votes = {}
    for fid in friends.get(uid, []):
        if 'locations' not in user_data[fid]:
            continue
        
        # Every candidate location of the friend counts as one vote.
        flocs = user_data[fid]['locations']
        for loc in flocs:
            key = '%s-%s-%s' % (loc.get('country'),loc.get('state'),loc.get('city')) 
            if key not in friend_votes:
                
                # Shallow copy, so the stored location dict is not shared with
                # the friend's own record in user_data.
                friend_votes[key] = {'location': copy.copy(loc),'vote': 1}
            else:
                friend_votes[key]['vote'] += 1
    # From here on only the tallies are needed, not the string keys.
    friend_votes = friend_votes.values()
    s = sum([x['vote'] for x in friend_votes])
    
    # Keep a location only if it has at least 2 votes and more than a fifth
    # of all votes.
    friend_votes = [f for f in friend_votes if f['vote'] > 1 and f['vote'] > s/5]
    if not friend_votes:
        continue
    
    # Merge the surviving friend locations into the user's own list.
    if 'locations' not in user_data[uid]:
        user_data[uid]['locations'] = []
    for floc in friend_votes:
        sw = 0
        floc = floc['location']
        
        # sw = 1 when the user already has an identical location
        # (same country, state and city).
        for uloc in user_data[uid]['locations']:
            if floc['country'] == uloc['country'] and \
                    floc.get('state') ==  uloc.get('state') and \
                    floc.get('city') ==  uloc.get('city'):
                sw = 1
                
        # Not present yet: add the friend location as a new candidate.
        # (441 additions when last run, according to the original note.)
        if sw == 0:
            user_data[uid]['locations'].append(floc)

* newloc consists of user loc and friends loc
# Store the merged (own + friends') candidate locations under 'newloc'.
for uid, data in user_data.items():
    # Users for whom no location could be derived have no 'locations' key;
    # store an empty list for them instead of raising KeyError.
    user_info.update_one({'id': uid}, {'$set': {'newloc': data.get('locations', [])}})

## 1.6 Process attendance data

In [None]:
# Create attendance dictionaries Provided in both directions
def insert_attendance(uid, eid, att_type):
    """Insert a new attendance record flagging ``att_type`` for (uid, eid).

    The record is ``{'uid': uid, 'eid': eid, att_type: True}`` and is written
    to the module-level ``attendance`` collection.
    """
    record = {'uid': uid, 'eid': eid}
    record[att_type] = True
    attendance.insert_one(record)

def update_attendance(uid, eid, att_type):
    """Set the ``att_type`` flag on the attendance record of (uid, eid).

    The record in the module-level ``attendance`` collection is created when
    it does not exist yet (upsert).
    """
    key = {'uid': uid, 'eid': eid}
    changes = {**key, att_type: True}
    attendance.update_one(key, {'$set': changes}, upsert=True)

* insert attendance info into db

# The creator of an event is recorded as attending it ('yes') and as
# interested in it.
events = pd.read_csv('/home/ruifan/Downloads/event-recommendation-engine-challenge/events.csv',
                     usecols=['event_id', 'user_id'])
for event_id, user_id in zip(events['event_id'], events['user_id']):
    eid = int(event_id)
    uid = int(user_id)

    attendance.insert_one({'uid': uid, 'eid': eid, 'yes': True, 'interested': True})

* Insert train and event_attendees data into the attendance collection (before update: 3137972, after update: 3143114)

# One attendance record per (uid, eid) pair.
attendance.create_index([('uid', 1), ('eid', 1)], unique=True)
from dateutil.parser import parse
# NOTE: from here on `train` is the training DataFrame; the name no longer
# refers to the `train` Mongo collection handle bound at the top of the notebook.
train = pd.read_csv("/home/ruifan/Downloads/event-recommendation-engine-challenge/train.csv",
                    converters={"timestamp": parse})

# Copy every flag that is set in a training row onto the attendance record.
for _idx, pair in train.iterrows():
    uid, eid = pair['user'], pair['event']
    for attr in ('invited', 'interested', 'not_interested'):
        if pair[attr]:
            update_attendance(uid, eid, attr)

* Process event_attendees file to update attendance collection

# For every event, flag each listed user with the response column
# (yes / maybe / invited / no) in which the user appears.
event_attendees = pd.read_csv("/home/ruifan/Downloads/event-recommendation-engine-challenge/event_attendees.csv")
for _idx, event in event_attendees.iterrows():
    eid = event['event']
    for attr in ('yes', 'maybe', 'invited', 'no'):
        users = event[attr]
        # An empty response column is read as float NaN.
        if not isinstance(users, float):
            users = [int(u) for u in users.split()]
            for uid in users:
                update_attendance(uid, eid, attr)