## Importing the necessary packages

In [1]:
import pandas as pd
import datetime
import warnings
import numpy as np
import matplotlib.pyplot as plt
import math

## Function to open the csv.

We drop a number of columns (mostly geographical data that is not needed for the analysis). We also drop the rows where data is missing (in particular when the nationality is not filled in).

In [2]:
def opening(name):
    """Load the reviews CSV and return a cleaned DataFrame.

    Cleaning steps, in order:
      1. drop the saved index column and the geographical/administrative columns;
      2. treat the placeholder date '0000-00-00' as missing;
      3. drop every row that has at least one missing value;
      4. parse 'date_review' as a datetime (format YYYY-MM-DD);
      5. drop the rows whose 'country' is the placeholder '-'.

    Parameters
    ----------
    name : path or file-like accepted by ``pd.read_csv``.

    Returns
    -------
    pandas.DataFrame with the remaining columns; the original row labels
    are kept (the index is not reset).

    Raises
    ------
    KeyError if one of the columns to drop is absent from the file.
    """
    unused_columns = [
        'Unnamed: 0',
        'gid_0', 'name_0',
        'gid_1', 'name_1',
        'gid_2', 'name_2',
        'gid_3', 'name_3',
        'gid_4', 'name_4',
        'gid_5', 'name_5',
        'idplace',
    ]

    reviews = pd.read_csv(name, sep = ',').drop(columns = unused_columns)

    # The placeholder date must become NaN *before* dropna so those rows go away too.
    reviews['date_review'] = reviews['date_review'].replace('0000-00-00', np.nan)
    complete = reviews.dropna()

    parsed_dates = pd.to_datetime(complete['date_review'], format = '%Y-%m-%d')
    complete = complete.assign(date_review = parsed_dates)

    has_country = complete['country'] != '-'
    return complete[has_country]

In [3]:
# Load and clean the sample reviews; the path is relative to the notebook's
# working directory.
df = opening("../sample_data.csv")
df.head()

Unnamed: 0,nom,latitude,longitude,id,idauteur,date_review,country
0,"L'Insoumise, France",48835045,228915,328246480,AEC731BAB7B8CCA02944C293198E3FC6,2015-11-21,Italy
1,Cinq Mondes Paris,48871681,232899,331590265,26ABC72A68434467E4D20CEC4F1B3DC3,2015-12-09,France
2,"L'Envie, France",4883414,2317883,262837346,36B0E60FD21BFE2377C526209C3FF4D5,2015-03-31,France
3,"Le Ciel de Paris, France",4884219,232198,266833774,72C94635808B0207A5A664DD646F3744,2015-04-20,France
4,"Cafe Les Deux Magots, France",48853973,2333158,304112023,F007B2D8152512B20BB4EB3E7C5ACA2C,2015-08-27,Brazil


### Number of different locations

In [4]:
# Number of distinct place names in the cleaned data.
differentLocations = len(df['nom'].unique())
differentLocations

9420

### Total number of reviews

In [5]:
# One row per review, so the row count is the number of reviews.
numberReviews = df.shape[0]
numberReviews

56264

## Selection of the year for the analysis

In [6]:
def year_selection(annee, df):
    """Return the rows of ``df`` whose review date falls in the year ``annee``.

    Parameters
    ----------
    annee : year to keep (e.g. 2015).
    df : DataFrame with a datetime column 'date_review'.

    Returns
    -------
    The filtered DataFrame; row labels are those of ``df``.
    """
    review_year = df['date_review'].dt.year
    # Strict bounds on both sides: only years strictly between annee-1 and annee+1.
    after_previous_year = review_year > annee - 1
    before_next_year = review_year < annee + 1
    return df[after_previous_year & before_next_year]

In [7]:
# Keep only the reviews published in 2015.
df2 = year_selection(2015, df)
df2.head()

Unnamed: 0,nom,latitude,longitude,id,idauteur,date_review,country
0,"L'Insoumise, France",48835045,228915,328246480,AEC731BAB7B8CCA02944C293198E3FC6,2015-11-21,Italy
1,Cinq Mondes Paris,48871681,232899,331590265,26ABC72A68434467E4D20CEC4F1B3DC3,2015-12-09,France
2,"L'Envie, France",4883414,2317883,262837346,36B0E60FD21BFE2377C526209C3FF4D5,2015-03-31,France
3,"Le Ciel de Paris, France",4884219,232198,266833774,72C94635808B0207A5A664DD646F3744,2015-04-20,France
4,"Cafe Les Deux Magots, France",48853973,2333158,304112023,F007B2D8152512B20BB4EB3E7C5ACA2C,2015-08-27,Brazil


In [8]:
# (rows, columns) of the 2015 subset.
df2.shape

(56264, 7)

## Selection of the number of places to be treated (the top k first)

To do so, we group the reviews by place name and count the reviews in each group (using the id column). The result is then sorted in descending order so that the most-reviewed places come first.

In [9]:
def top_k(k, df):
    """Return the ``k`` most-reviewed places.

    Parameters
    ----------
    k : number of places to keep.
    df : DataFrame with a 'nom' column (place name) and an 'id' column
        (one value per review).

    Returns
    -------
    pandas.Series named 'id', indexed by place name ('nom'), holding the
    number of non-null 'id' values per place, sorted in descending order
    and cut to the first ``k`` entries.
    """
    reviews_per_place = df.groupby('nom')['id'].count()
    ranked = reviews_per_place.sort_values(ascending = False)
    return ranked.iloc[0:k]

In [10]:
# Rank the places on the reviews of the selected year (df2), not on the
# unfiltered frame, so the year selection actually drives the top-k.
tp_k = top_k(10, df2)
tp_k

nom
Musée du Louvre                          1549
Eiffel Tower                             1516
Cathédrale Notre-Dame de Paris            749
Arc de Triomphe                           711
Jardin du Luxembourg                      589
La Seine                                  478
Musée d'Orsay                             413
Basilique du Sacré-Cœur de Montmartre     367
Champs-Élysées                            348
Opéra Garnier                             333
Name: id, dtype: int64

## We can now reduce the DataFrame to the top k of locations 

In [11]:
def to_keep(df, top_k):
    """Return the rows of ``df`` that belong to the places listed in ``top_k``.

    Rows are grouped place by place, following the order of ``top_k.index``
    (i.e. the ranking order); inside a place the original row order and row
    labels of ``df`` are kept.

    Parameters
    ----------
    df : DataFrame with a 'nom' column (place name).
    top_k : pandas object whose index holds the place names to keep,
        e.g. the Series returned by ``top_k(k, df)``.

    Returns
    -------
    pandas.DataFrame with the same columns as ``df``.
    """
    # Build every slice first and concatenate once: DataFrame.append no longer
    # exists in pandas >= 2.0, and growing a frame inside a loop copies the
    # accumulated rows at every iteration.
    slices = [df[df['nom'] == place] for place in top_k.index]
    if not slices:
        # pd.concat refuses an empty list; return an empty frame with the same columns.
        return pd.DataFrame(columns = df.columns)
    return pd.concat(slices)

In [12]:
# Restrict the reviews of the selected year (df2) to the top-k places, so that
# df3 is a subset of df2 and the ratio len(df3) / len(df2) is meaningful.
df3 = to_keep(df2, tp_k).reset_index(drop = True)
df3.head()

Unnamed: 0,nom,latitude,longitude,id,idauteur,date_review,country
0,Musée du Louvre,48861,2335833,311501513,4B018C46529B92190134DEBD3D429BCD,2015-09-18,Italy
1,Musée du Louvre,48861,2335833,288015394,294FB3D2C00277C2A2F563F2D4DA7A55,2015-07-11,Ireland
2,Musée du Louvre,48861,2335833,316627370,39AE30708C84823D0A6DEFB86D7D2370,2015-10-06,Argentina
3,Musée du Louvre,48861,2335833,277869091,2D508690DD4E5281BF88B235E1121A2C,2015-06-04,Turkey
4,Musée du Louvre,48861,2335833,249606817,F6883072C58471A6A04D7FBFB9608DA2,2015-01-16,Brazil


In [13]:
# (rows, columns) of the frame restricted to the top-k places.
df3.shape

(7053, 7)

### This is the fraction of the chosen year's reviews that the top-k places represent.

In [14]:
# Share of the selected year's reviews that fall on the top-k places
# (a fraction between 0 and 1, not a percentage).
reductionRate = df3.shape[0] / df2.shape[0]
reductionRate

0.12535546708374803

## In order to create the sequences, for the sake of clarity and processing, we assign a number to each location (Group_ID), then encode it as a single character in the ascii column: chr(Group_ID + 48), i.e. the digits '0' to '9' when there are at most 10 locations.

The Alergia algorithm can be modified later so that it can handle two-digit identifiers.

In [15]:
# Number each place from 0 to (number of places - 1), following the sorted
# order of the place names. GroupBy.ngroup() is the public API for this; the
# `.grouper.group_info` accessor is a pandas internal that is deprecated in
# pandas 2.x.
df3['Group_ID'] = df3.groupby('nom').ngroup()
# Encode the number as one character: 48 is ord('0'), so 0-9 become '0'-'9'.
df3['ascii'] = df3.Group_ID.apply(lambda x: chr(x+48))
df3.head()

Unnamed: 0,nom,latitude,longitude,id,idauteur,date_review,country,Group_ID,ascii
0,Musée du Louvre,48861,2335833,311501513,4B018C46529B92190134DEBD3D429BCD,2015-09-18,Italy,8,8
1,Musée du Louvre,48861,2335833,288015394,294FB3D2C00277C2A2F563F2D4DA7A55,2015-07-11,Ireland,8,8
2,Musée du Louvre,48861,2335833,316627370,39AE30708C84823D0A6DEFB86D7D2370,2015-10-06,Argentina,8,8
3,Musée du Louvre,48861,2335833,277869091,2D508690DD4E5281BF88B235E1121A2C,2015-06-04,Turkey,8,8
4,Musée du Louvre,48861,2335833,249606817,F6883072C58471A6A04D7FBFB9608DA2,2015-01-16,Brazil,8,8


In [16]:
def asciiToName(df):
    """Build the lookup table from one-character code to place name.

    Parameters
    ----------
    df : DataFrame with the columns 'ascii' (code) and 'nom' (place name).

    Returns
    -------
    dict mapping each distinct 'ascii' value to its 'nom', with the keys
    inserted in sorted order so the result is identical from run to run.
    An empty frame gives an empty dict.
    """
    # Deduplicate with pandas and sort, instead of going through a set whose
    # iteration order depends on string hashing.
    pairs = df[['ascii', 'nom']].drop_duplicates().sort_values(['ascii', 'nom'])
    return dict(zip(pairs['ascii'], pairs['nom']))

In [17]:
# Lookup table from the one-character code back to the place name.
asciiToName(df3)

{'5': 'Jardin du Luxembourg',
 '0': 'Arc de Triomphe',
 '4': 'Eiffel Tower',
 '9': 'Opéra Garnier',
 '2': 'Cathédrale Notre-Dame de Paris',
 '1': 'Basilique du Sacré-Cœur de Montmartre',
 '7': "Musée d'Orsay",
 '6': 'La Seine',
 '8': 'Musée du Louvre',
 '3': 'Champs-Élysées'}

## We now move on to the creation of sequences for each user. The particularity is that we are going to separate the sequences according to the number of days between 2 reviews.

We start by grouping the places in chronological order according to the author's id. In other words, we are going to put in a list the reviewed places with their respective date of publication for each user. 
Then, we compare the time delta between review *i* and review *i + 1*. If it is less than or equal to the desired parameter, we add the element to the current sequence *a*. Otherwise, we end the sequence and separate the two reviews into 2 distinct sequences *a* and *b*.

In [18]:
def create_sequencesv2(df, threshold_days):
    """Build per-author visit sequences, split on long gaps between reviews.

    For every author with at least two reviews, the reviews are taken in
    chronological order and their 'ascii' symbols are concatenated into a
    string. Whenever two consecutive reviews are more than
    ``threshold_days`` days apart, the current string is closed and a new
    one is started. Sequences of length 1 can therefore be produced.

    Parameters
    ----------
    df : DataFrame with the columns 'idauteur', 'ascii', 'date_review'
        (datetime) and 'nom'.
    threshold_days : maximum number of days allowed between two consecutive
        reviews of the same sequence (inclusive).

    Returns
    -------
    (lieux, sequences) : the unique values of ``df['nom']`` and the list of
    sequence strings, authors being processed in groupby (sorted) order.
    """
    chronological = df.sort_values('date_review', ascending = True)
    per_author = (
        chronological
        .groupby('idauteur')
        .apply(lambda grp: grp[['ascii', 'date_review']].values.tolist())
        .reset_index(name = 'col')
    )

    # An author with a single review cannot form a sequence.
    per_author = per_author[per_author['col'].map(len) >= 2]

    sequences = []
    for visits in per_author['col']:
        current = ''
        # Walk over consecutive (previous, visit) pairs; each item is [symbol, date].
        for position, (previous, visit) in enumerate(zip(visits, visits[1:]), start = 1):
            gap_days = abs(previous[1] - visit[1]).days
            first_pair = position == 1

            if gap_days <= threshold_days:
                # Same sequence: on the first pair the opening symbol is still missing.
                if first_pair:
                    current += previous[0]
                current += visit[0]
            else:
                # Gap too long: close what was accumulated so far ...
                if len(current) != 0:
                    sequences.append(current)
                # ... on the first pair the opening review forms a sequence on its own ...
                if first_pair:
                    sequences.append(previous[0])
                # ... and the current review opens a new sequence.
                current = visit[0]

        # Flush the sequence still open when the author's reviews run out.
        if len(current) != 0:
            sequences.append(current)

    return df['nom'].unique(), sequences

In [19]:
# A user's sequence is cut whenever two consecutive reviews are more than 7 days apart.
lieux, sequences = create_sequencesv2(df3, 7)

### The sequences are cut according to the number of days between 2 reviews. However, some of them (e.g. the 5th one) have size 1, so we keep only the sequences of size >= 2. The cell below shows the total number of sequences before filtering.

In [20]:
# Total number of sequences, including those of length 1.
len(sequences)

790

In [21]:
def keepSize2(sequences):
    """Return the sequences that contain at least two items, order preserved."""
    return [seq for seq in sequences if len(seq) >= 2]

In [22]:
# Drop the sequences of length 1.
seqSize2 = keepSize2(sequences)

In [23]:
def averageLen(lst):
    """Return the mean length of the items of ``lst``.

    Returns the float average, or the integer 0 when ``lst`` is empty.
    """
    sizes = list(map(len, lst))
    if len(sizes) == 0:
        return 0
    return float(sum(sizes)) / len(sizes)

### This is the mean length of our sequences: 

In [24]:
# Mean length of the retained sequences.
averageLen(seqSize2) 

2.0862533692722374

### Here are the first 5 sequences returned by the algorithm. We do not have access to any user data, only to sequences of items (each symbol represents a place).

### To explain the sequences format:
96 means that a tourist visited item 9 and then item 6, i.e. Opéra Garnier and then La Seine.

In [25]:
# Preview the first five retained sequences.
seqSize2[:5]

['96', '10', '72', '19', '806']

In [26]:
# Save the retained sequences to sequences.txt, one sequence per line.
with open('sequences.txt', 'w') as f:
    f.writelines("%s\n" % seq for seq in seqSize2)