## Importing the necessary packages

In [1]:
import pandas as pd
import datetime
import warnings
import numpy as np
import matplotlib.pyplot as plt
import math

## Function to open the csv.

We drop the geographical columns (`gid_0`…`gid_5`, `name_0`…`name_5`, `idplace`) because they are not needed for the analysis. Rows with missing data are dropped as well, including reviews whose date is the placeholder `0000-00-00` and reviews whose nationality is not filled in (`-`).

In [5]:
def opening(name):
    """Load the semicolon-separated review file `name` and return it cleaned.

    Cleaning steps, in order: the geographical hierarchy columns are dropped,
    the '0000-00-00' placeholder date is turned into a missing value, every
    row containing a missing value is removed, `date_review` is parsed as a
    datetime, and rows whose `country` is '-' are filtered out.
    """
    unused_columns = ['gid_0', 'name_0', 'gid_1', 'name_1', 'gid_2', 'name_2',
                      'gid_3', 'name_3', 'gid_4', 'name_4', 'gid_5', 'name_5',
                      'idplace']
    reviews = (
        pd.read_csv(name, sep=';')
        .drop(columns=unused_columns)
        # The placeholder date must become NaN *before* dropna() so that
        # those rows are removed together with the other incomplete ones.
        .assign(date_review=lambda frame: frame['date_review'].replace('0000-00-00', np.nan))
        .dropna()
        .assign(date_review=lambda frame: pd.to_datetime(frame['date_review'], format='%Y-%m-%d'))
    )
    return reviews[reviews.country != '-']

In [6]:
# Load and clean the reviews stored in 'parisTripAdvisor.csv', then preview the first rows.
df = opening('parisTripAdvisor.csv')
df.head()

Unnamed: 0,nom,latitude,longitude,id,idauteur,date_review,country
0,Cirque d'hiver Bouglione,48863281,2367029,360852537,788B1AE23D1FC81E0FF4A49B3C4A2157,2016-04-02,France
1,Cirque d'hiver Bouglione,48863281,2367029,360468722,4255E1F5C307DAFBA22D535AE3A8D2D4,2016-03-31,France
2,Cirque d'hiver Bouglione,48863281,2367029,359792359,E491A2C945CC14E3494A8B9ADE26BB07,2016-03-29,France
4,Cirque d'hiver Bouglione,48863281,2367029,358292121,7865F05D0EFB35546F76C4E56FB02FEF,2016-03-24,France
5,Cirque d'hiver Bouglione,48863281,2367029,355005087,461566E3859745CE0D5CF90401230397,2016-03-13,France


### Number of different locations

In [7]:
# Number of distinct place names. nunique() counts them without first
# materialising every value in a Python set and list (NaN is not counted,
# and opening() has already removed rows with missing values).
df['nom'].nunique()

23458

### Total number of reviews

In [8]:
len(df)

3216637

## Selection of the year for the analysis

In [11]:
def year_selection(annee, df):
    """Return the rows of `df` reviewed during the year `annee`.

    A row is kept when the year of its `date_review` is strictly greater than
    `annee - 1` and strictly smaller than `annee + 1`.
    """
    review_year = df['date_review'].dt.year
    after_previous_year = review_year.gt(annee - 1)
    before_next_year = review_year.lt(annee + 1)
    return df[after_previous_year & before_next_year]

In [12]:
# Restrict the analysis to the reviews dated in 2015.
df2 = year_selection(2015, df)

In [13]:
df2.head()

Unnamed: 0,nom,latitude,longitude,id,idauteur,date_review,country
41,Cirque d'hiver Bouglione,48863281,2367029,336292512,D4F7A4362302309DB909D5C283A81077,2015-12-31,France
42,Cirque d'hiver Bouglione,48863281,2367029,336260205,AB9592F3EA0F772D1907760A6C6EBA18,2015-12-31,France
45,Cirque d'hiver Bouglione,48863281,2367029,335413180,62B54358651EBA0EFFE76922FFBC647C,2015-12-28,France
46,Cirque d'hiver Bouglione,48863281,2367029,335407171,E37C49BA1605152F8554A63B7A5E7807,2015-12-28,France
48,Cirque d'hiver Bouglione,48863281,2367029,334586249,A98DF8823A2741328EA4B2D052C11100,2015-12-23,France


## Selection of the number of places to be treated (the top k first)

To do so, we group the reviews by place name and count the reviews of each place (using the `id` column). The result is then sorted in descending order so that the k most-reviewed places come first.

In [9]:
def top_k(k, df):
    """Return the `k` places with the most reviews.

    The result is a Series named 'id', indexed by `nom`, holding the number of
    non-missing `id` values per place, sorted from most to least reviewed.
    """
    reviews_per_place = df.groupby('nom')['id'].count()
    ranked = reviews_per_place.sort_values(ascending=False)
    return ranked.iloc[:k]

In [14]:
# Review counts of the 6 most-reviewed places among the selected year's reviews.
tp_k = top_k(6, df2)

In [15]:
tp_k

nom
Eiffel Tower                      15226
Musée du Louvre                   15000
Cathédrale Notre-Dame de Paris     7780
Arc de Triomphe                    6929
Jardin du Luxembourg               5860
La Seine                           4673
Name: id, dtype: int64

## We can now reduce the DataFrame to the top k of locations 

In [16]:
def to_keep(df, top_k):
    """Keep only the rows of `df` whose `nom` is one of the places in `top_k`.

    Parameters
    ----------
    df : pandas.DataFrame
        Reviews; must contain a `nom` column.
    top_k : pandas.Series
        Only its index is used: the place names to keep, in ranking order.

    Returns
    -------
    pandas.DataFrame
        The matching rows, grouped place by place in the order of
        `top_k.index`, with their original row labels and column dtypes.
        An empty frame with the columns of `df` when `top_k` is empty.
    """
    # DataFrame.append was removed in pandas 2.0 and re-copied the growing
    # result on every iteration; collect the slices and concatenate once.
    per_place = [df[df['nom'] == place] for place in top_k.index]
    if not per_place:
        return df.iloc[0:0].copy()
    return pd.concat(per_place)

In [17]:
# reset_index() renumbers the rows from 0 and keeps the previous row labels in a new 'index' column.
df3 = to_keep(df2, tp_k).reset_index()

### Share of the chosen year's reviews that the top k places represent (a fraction between 0 and 1, not a percentage).

In [18]:
# Fraction of the selected year's reviews that belong to the top k places.
len(df3) / len(df2)

0.09851642798912319

## In order to create the sequences, for the sake of clarity and processing, we assign a number (`Group_ID`) to each location, then encode that number as a single character in the `ascii` column.

The Alergia algorithm can be modified later so that it can take into account the double digits.

In [19]:
# Number the places 0, 1, 2, ... following the sorted order of `nom`.
# GroupBy.ngroup() is the public way to get these group numbers; the
# `.grouper.group_info` attribute used before is pandas-internal and deprecated.
df3['Group_ID'] = df3.groupby('nom').ngroup()
# Encode each number as the single character with code point Group_ID + 48
# ('0' for 0, '1' for 1, ...), so that one symbol stands for one place.
df3['ascii'] = df3['Group_ID'].map(lambda group_id: chr(group_id + 48))
df3.head()

Unnamed: 0,index,nom,latitude,longitude,id,idauteur,date_review,country,Group_ID,ascii
0,3238162,Eiffel Tower,48858353,2294464,336273655,53A823D6A6A185E38D26C8FF282C5594,2015-12-31,Israel,2,2
1,3238163,Eiffel Tower,48858353,2294464,336240024,14D9897C309C9853AD74F8C5739381EE,2015-12-31,France,2,2
2,3238164,Eiffel Tower,48858353,2294464,336228711,2E302BCDA005648A175AB205C71E6C1D,2015-12-31,France,2,2
3,3238165,Eiffel Tower,48858353,2294464,336196997,D98EBFB4F881946F5B2ADB00E36CBF7D,2015-12-31,France,2,2
4,3238166,Eiffel Tower,48858353,2294464,336189517,49343346C346360282FD93D1AFBA488E,2015-12-31,France,2,2


In [20]:
def asciiToName(df):
    """Build the {symbol: place name} lookup from the distinct (`ascii`, `nom`) pairs of `df`."""
    distinct_pairs = {
        (symbol, place)
        for symbol, place in df[['ascii', 'nom']].itertuples(index=False, name=None)
    }
    return {symbol: place for symbol, place in distinct_pairs}

In [21]:
asciiToName(df3)

{'4': 'La Seine',
 '3': 'Jardin du Luxembourg',
 '1': 'Cathédrale Notre-Dame de Paris',
 '2': 'Eiffel Tower',
 '5': 'Musée du Louvre',
 '0': 'Arc de Triomphe'}

## We now move on to the creation of sequences for each user. The particularity is that we are going to split the sequences according to the number of days between 2 consecutive reviews.

We start by grouping the reviews by author id, in chronological order. In other words, for each user we build the list of reviewed places with their respective review dates. Users with a single review are left out. 
Then, we compare the time delta between review i and review i + 1. If it is less than or equal to the chosen threshold (in days), review i + 1 is added to the current sequence *a*. Otherwise, we end sequence *a* and start a new sequence *b* with review i + 1.

In [22]:
def create_sequencesv2(df, threshold_days):
    """Split each author's chronological reviews into sequences of place symbols.

    For every author with at least two reviews, the `ascii` symbols of the
    reviewed places are concatenated in order of `date_review`. Whenever two
    consecutive reviews are more than `threshold_days` days apart, the current
    sequence is closed and a new one starts with the later review. Sequences
    of a single symbol can therefore be produced.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns `idauteur`, `ascii` (one string per review),
        `date_review` (datetime) and `nom`.
    threshold_days : int
        Largest gap, in whole days, allowed between two consecutive reviews
        of the same sequence (the bound is inclusive).

    Returns
    -------
    tuple
        `(lieux, sequences_finales)`: the unique values of `df['nom']` and the
        list of sequences (strings), authors being visited in sorted order of
        `idauteur`.
    """
    # A stable sort keeps reviews posted on the same day in their input order,
    # so the sequences are reproducible from one run to the next.
    ordered = df.sort_values('date_review', ascending=True, kind='mergesort')

    sequences_finales = []
    for _, reviews in ordered.groupby('idauteur'):
        if len(reviews) < 2:
            # A single review cannot form a transition between two places.
            continue

        symbols = reviews['ascii'].tolist()
        dates = reviews['date_review'].tolist()

        current = symbols[0]
        for i in range(1, len(symbols)):
            gap_days = abs(dates[i] - dates[i - 1]).days
            if gap_days <= threshold_days:
                # Close enough to the previous review: same sequence.
                current += symbols[i]
            else:
                # Too far apart: close the running sequence, start a new one.
                sequences_finales.append(current)
                current = symbols[i]
        sequences_finales.append(current)

    lieux = df['nom'].unique()

    return lieux, sequences_finales

In [23]:
lieux, sequences = create_sequencesv2(df3, 7) # at most 7 days between two consecutive reviews of the same sequence

### Here are the first 5 sequences, cut according to the number of days between 2 reviews. However, we can see that the 5th sequence has size 1, so we keep only the sequences of size ≥ 2.

In [24]:
sequences[:5]

['25', '51', '5240', '203', '1']

In [25]:
def keepSize2(sequences):
    """Return, in their original order, the sequences holding at least two symbols."""
    return [sequence for sequence in sequences if len(sequence) > 1]

In [26]:
# Discard the sequences made of a single symbol.
seqSize2 = keepSize2(sequences)

In [27]:
def averageLen(lst):
    """Return the mean length of the items of `lst`, or 0 when `lst` is empty."""
    sizes = list(map(len, lst))
    if not sizes:
        return 0
    return float(sum(sizes)) / len(sizes)

### This is the mean length of our sequences: 

In [29]:
averageLen(seqSize2) 

2.53369366227879

### Here are the first 5 sequences returned by the algorithm. We do not have access to any user data, only to sequences of items (each symbol represents a place).

### To explain the sequences format:
`25` means that a tourist reviewed item 2 and then item 5, i.e. the Eiffel Tower and then the Musée du Louvre.

In [30]:
seqSize2[:5]

['25', '51', '5240', '203', '42035']