## Importing the necessary packages

In [187]:
import pandas as pd
import datetime
import warnings
import numpy as np
import matplotlib.pyplot as plt
import math

## Function to open the csv.

We drop a number of columns (mostly geographical data that is not needed for the analysis). Rows with missing data are dropped as well, including rows whose review date is the placeholder `0000-00-00` and rows whose nationality is not filled in (`-`).

In [188]:
def opening(name):
    """Read the reviews CSV at ``name`` and return a cleaned DataFrame.

    The geographical/administrative columns are removed, the placeholder
    date ``'0000-00-00'`` is treated as missing, every row containing a
    missing value is discarded, ``date_review`` is parsed as a datetime and
    rows whose ``country`` is ``'-'`` are filtered out.
    """
    unused_columns = [
        'Unnamed: 0',
        'gid_0', 'name_0',
        'gid_1', 'name_1',
        'gid_2', 'name_2',
        'gid_3', 'name_3',
        'gid_4', 'name_4',
        'gid_5', 'name_5',
        'idplace',
    ]
    reviews = (
        pd.read_csv(name, sep=',')
        .drop(columns=unused_columns)
        # Turn the placeholder date into NaN so that dropna() removes those rows too.
        .assign(date_review=lambda frame: frame['date_review'].replace('0000-00-00', np.nan))
        .dropna()
        .assign(date_review=lambda frame: pd.to_datetime(frame['date_review'], format='%Y-%m-%d'))
    )
    has_country = reviews['country'] != '-'
    return reviews[has_country]

In [189]:
# Load the sample reviews through `opening` and preview the first rows.
df = opening("../sample_data.csv")
df.head()

Unnamed: 0,nom,latitude,longitude,id,idauteur,date_review,country
0,"Les Voiles, France",48880596,2293592,290314837,7724A40D7C566C125D6B88F8D481C4E1,2015-07-19,France
1,Pont des Arts,48858337,2337505,287451636,1DDFC1D5B95914F3EB95F4F3B8E14244,2015-07-09,Brazil
2,"Le Pario, France",48846455,2283529,270727530,243CB7D159E407D5CA591ECDDD0ABF08,2015-05-08,France
3,"Pierre sang In Oberkampf, France",48864666,2372361,275010751,5F2118B1EE8355F6E2667D310A57FBAD,2015-05-27,France
4,"Angelina Paris, France",48865074,2328488,317345160,40945E906CA6C96440A58769CA0D3760,2015-10-09,France


### Number of different locations

In [190]:
# Number of distinct place names among the reviews.
distinct_names = set(df['nom'])
differentLocations = len(distinct_names)
differentLocations

3891

### Total number of reviews

In [191]:
# One row per review, so the row count is the number of reviews.
numberReviews = df.shape[0]
numberReviews

8471

## Selection of the year for the analysis

In [192]:
def year_selection(annee, df):
    """Return the rows of ``df`` whose ``date_review`` year lies strictly
    between ``annee - 1`` and ``annee + 1``."""
    review_year = df['date_review'].dt.year
    in_window = review_year.gt(annee - 1) & review_year.lt(annee + 1)
    return df[in_window]

In [193]:
# Keep only the reviews dated in 2015 and preview the result.
df2 = year_selection(2015, df)
df2.head()

Unnamed: 0,nom,latitude,longitude,id,idauteur,date_review,country
0,"Les Voiles, France",48880596,2293592,290314837,7724A40D7C566C125D6B88F8D481C4E1,2015-07-19,France
1,Pont des Arts,48858337,2337505,287451636,1DDFC1D5B95914F3EB95F4F3B8E14244,2015-07-09,Brazil
2,"Le Pario, France",48846455,2283529,270727530,243CB7D159E407D5CA591ECDDD0ABF08,2015-05-08,France
3,"Pierre sang In Oberkampf, France",48864666,2372361,275010751,5F2118B1EE8355F6E2667D310A57FBAD,2015-05-27,France
4,"Angelina Paris, France",48865074,2328488,317345160,40945E906CA6C96440A58769CA0D3760,2015-10-09,France


In [194]:
# (rows, columns) of the year-filtered frame.
df2.shape

(8471, 7)

## Selection of the number of places to be treated (the top k first)

To do so, we group the reviews by place name and count the reviews (the `id` column) of each place. The result is then sorted in descending order, and the first k entries give the k most-reviewed places.

In [195]:
def top_k(k, df):
    """Return the ``k`` places with the most reviews.

    The result is a Series indexed by ``nom`` holding the number of non-null
    ``id`` values per place, sorted from most to least reviewed.
    """
    reviews_per_place = df.groupby('nom')['id'].count()
    ranked = reviews_per_place.sort_values(ascending=False)
    return ranked.iloc[:k]

In [196]:
# Ten most-reviewed places with their review counts.
# NOTE(review): computed on `df`, not on the year-filtered `df2` — confirm this is intended.
tp_k = top_k(10, df)
tp_k

nom
Musée du Louvre                          232
Eiffel Tower                             224
Cathédrale Notre-Dame de Paris           112
Arc de Triomphe                          109
Jardin du Luxembourg                      95
Opéra Garnier                             56
La Seine                                  55
Basilique du Sacré-Cœur de Montmartre     55
Musée d'Orsay                             54
Pont Alexandre-III                        46
Name: id, dtype: int64

## We can now reduce the DataFrame to the top k of locations 

In [197]:
def to_keep(df, top_k):
    """Return the rows of ``df`` whose ``nom`` is one of the places in ``top_k``.

    Parameters
    ----------
    df : pandas.DataFrame
        Reviews; must contain a ``nom`` column.
    top_k : pandas.Series
        Series whose index holds the place names to keep (as returned by the
        ``top_k`` function).

    Returns
    -------
    pandas.DataFrame
        The matching rows, grouped place by place in the order of
        ``top_k.index``. The original row index is preserved.
    """
    # DataFrame.append no longer exists in pandas >= 2.0; collect the slices
    # and concatenate them once instead of growing a frame inside the loop.
    selected = [df[df['nom'] == place] for place in top_k.index]
    if not selected:
        # pd.concat refuses an empty list: return an empty frame with the same columns.
        return df.iloc[0:0]
    return pd.concat(selected)

In [198]:
# Rows of the top-k places, re-indexed from 0.
# NOTE(review): rows are taken from `df`, not from the year-filtered `df2` — confirm this is intended.
df3 = to_keep(df, tp_k).reset_index(drop = True)
df3.head()

Unnamed: 0,nom,latitude,longitude,id,idauteur,date_review,country
0,Musée du Louvre,48861,2335833,257516122,ECD857D6120B8C7E6CD11C13597D0E35,2015-03-03,United States
1,Musée du Louvre,48861,2335833,262132208,9D91051F1F8CBF353A383833CA57B4CF,2015-03-27,France
2,Musée du Louvre,48861,2335833,284360586,B41FAD4E6B32B1044AD40B41A71EA1C6,2015-06-30,Venezuela
3,Musée du Louvre,48861,2335833,274336334,2BF7CF57E6EBA327A3351235F90AA454,2015-05-25,Germany
4,Musée du Louvre,48861,2335833,295583186,2959E31C67D07A5DD24D33B67CB49CD6,2015-08-04,United States


In [199]:
# (rows, columns) of the frame restricted to the top-k places.
df3.shape

(1038, 7)

### Share of the chosen year's reviews that the top k places represent (expressed as a fraction between 0 and 1).

In [200]:
# Fraction of the year's reviews that belong to the top-k places.
kept_rows = df3.shape[0]
year_rows = df2.shape[0]
reductionRate = kept_rows / year_rows
reductionRate

0.12253571006964939

## In order to create the sequences, for the sake of clarity and processing, we assign a number to each location, then map that number to a single character stored in the `ascii` column.

The Alergia algorithm can be modified later so that it can take into account the double digits.

In [201]:
# Number the places 0..n-1 following the sorted order of `nom`.
# `ngroup()` is the public API for this; the internal `grouper.group_info`
# accessor is deprecated in recent pandas releases.
df3['Group_ID'] = df3.groupby('nom').ngroup()
# Map each group number to one character: 0 -> '0', 1 -> '1', ... (code point 48 is '0').
df3['ascii'] = df3.Group_ID.apply(lambda x: chr(x+48))
df3.head()

Unnamed: 0,nom,latitude,longitude,id,idauteur,date_review,country,Group_ID,ascii
0,Musée du Louvre,48861,2335833,257516122,ECD857D6120B8C7E6CD11C13597D0E35,2015-03-03,United States,7,7
1,Musée du Louvre,48861,2335833,262132208,9D91051F1F8CBF353A383833CA57B4CF,2015-03-27,France,7,7
2,Musée du Louvre,48861,2335833,284360586,B41FAD4E6B32B1044AD40B41A71EA1C6,2015-06-30,Venezuela,7,7
3,Musée du Louvre,48861,2335833,274336334,2BF7CF57E6EBA327A3351235F90AA454,2015-05-25,Germany,7,7
4,Musée du Louvre,48861,2335833,295583186,2959E31C67D07A5DD24D33B67CB49CD6,2015-08-04,United States,7,7


In [202]:
def asciiToName(df):
    """Return a dict mapping each ``ascii`` symbol of ``df`` to its ``nom``."""
    unique_pairs = {tuple(row) for row in df[['ascii', 'nom']].values}
    return dict(unique_pairs)

In [203]:
# Lookup table from one-character symbol to place name.
asciiToName(df3)

{'2': 'Cathédrale Notre-Dame de Paris',
 '1': 'Basilique du Sacré-Cœur de Montmartre',
 '3': 'Eiffel Tower',
 '0': 'Arc de Triomphe',
 '9': 'Pont Alexandre-III',
 '8': 'Opéra Garnier',
 '5': 'La Seine',
 '7': 'Musée du Louvre',
 '4': 'Jardin du Luxembourg',
 '6': "Musée d'Orsay"}

## We now move on to the creation of sequences for each user. The particularity is that we separate the sequences according to the number of days between 2 reviews.

We start by grouping the places in chronological order according to the author's id. In other words, we are going to put in a list the reviewed places with their respective date of publication for each user. 
Then, we will compare the time delta between the review i and the i + 1. If it is less than or equal to the desired parameter, we add the element to the sequence *a* . Otherwise, we end the sequence, and separate the two reviews into 2 distinct sequences *a* and *b*.

In [204]:
def create_sequencesv2(df, threshold_days):
    """Build per-author sequences of places, cut wherever two reviews are too far apart.

    Reviews are ordered by ``date_review`` and grouped by ``idauteur``.
    Authors with fewer than two reviews are ignored. For every other author,
    the ``ascii`` symbols of consecutive reviews are concatenated into one
    string for as long as the gap between two consecutive reviews is at most
    ``threshold_days`` days; a larger gap closes the current sequence and
    starts a new one.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``idauteur``, ``ascii`` (string symbols),
        ``date_review`` (datetimes) and ``nom``.
    threshold_days : int
        Maximum number of days between two consecutive reviews of a sequence.

    Returns
    -------
    tuple
        ``(lieux, sequences_finales)``: the unique values of ``df['nom']``
        and the list of sequence strings. Sequences of length 1 are included.
    """
    ordered = df.sort_values('date_review', ascending = True)

    sequences_finales = []

    # Iterating over the groups (rather than groupby.apply + reset_index)
    # also works when the frame is empty: the loop simply does not run.
    for _, visits in ordered.groupby('idauteur'):
        if len(visits) < 2:
            continue  # a single review cannot form a sequence of visits

        symbols = visits['ascii'].tolist()
        dates = visits['date_review'].tolist()

        current = symbols[0]  # every author's first review opens a sequence
        for i in range(1, len(symbols)):
            gap = abs(dates[i-1] - dates[i]).days
            if gap <= threshold_days:
                # Close enough to the previous review: extend the running sequence.
                current += symbols[i]
            else:
                # Gap too large: close the running sequence and start a new one.
                sequences_finales.append(current)
                current = symbols[i]
        sequences_finales.append(current)  # flush the last running sequence

    lieux = df['nom'].unique()

    return lieux, sequences_finales

In [205]:
# Build the sequences for the top-k places.
lieux, sequences = create_sequencesv2(df3, 7) # at most 7 days between 2 consecutive reviews

### Here are the first 5 sequences cut out according to the number of days between 2 reviews. However, we can see that the 5th sequence is of size 1, so we keep only the sequences of size >= 2.

In [206]:
# Total number of sequences, including those of length 1.
len(sequences)

21

In [207]:
def keepSize2(sequences):
    """Return, in order, the sequences that contain at least two elements."""
    return [seq for seq in sequences if not len(seq) < 2]

In [208]:
# Discard the sequences shorter than 2 symbols.
seqSize2 = keepSize2(sequences)

In [209]:
def averageLen(lst):
    """Return the mean length of the items of ``lst`` as a float, or 0 when ``lst`` is empty."""
    total_length = 0
    item_count = 0
    for item in lst:
        total_length += len(item)
        item_count += 1
    if item_count == 0:
        return 0
    return float(total_length) / item_count

### This is the mean length of our sequences: 

In [210]:
# Mean number of symbols per kept sequence.
averageLen(seqSize2) 

2.0

### Here are the first 5 sequences returned by the algorithm. We do not have access to any user data, only to sequences of items (each symbol represents a place).

### To explain the sequences format:
`38` means that a tourist went to item 3 and then to item 8, i.e. the Eiffel Tower and then the Opéra Garnier.

In [213]:
# Preview the first five kept sequences.
seqSize2[:5]

['38', '93', '37', '03', '37']

In [214]:
# Persist the kept sequences, one per line.
with open('sequences.txt', 'w') as out_file:
    out_file.writelines("%s\n" % sequence for sequence in seqSize2)