In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.simplefilter("ignore")
from datetime import datetime
import plotly.express as px
from skimage import io
import math
import time

In [None]:
# Load the raw Friday movement log and derive per-visitor features from it.
Firday = pd.read_csv('park-movement-Fri-FIXED-2.0.csv')
Firday["id"] = Firday["id"].astype(str)
# Encode the record type numerically: check-in -> 1, movement -> 0.
Firday = Firday.replace('check-in', 1).replace('movement', 0)
# Convert the textual timestamps to integer epoch seconds (interpreted in the local timezone by mktime).
Firday["Timestamp"] = [
    int(time.mktime(time.strptime(stamp, "%Y-%m-%d %H:%M:%S")))
    for stamp in Firday["Timestamp"]
]
# Number of records logged for each visitor, repeated on every one of that visitor's rows.
Firday["visit_count"] = Firday.groupby(["id"])["id"].transform("count")
# Span between each visitor's first and last record, in seconds, then floored to whole minutes.
T = Firday.groupby('id')['Timestamp'].agg(np.ptp).reset_index()
T["visit_duration"] = T['Timestamp'] // 60
Firday_III = Firday.merge(T[["id", "visit_duration"]], on="id")
# Pair the coordinates into one (X, Y) tuple per record.
Firday_III["Facility_coordinates"] = list(zip(Firday_III["X"], Firday_III["Y"]))

In [None]:
# The cell above processes the raw data, which is then saved to disk. The saved file still needs some light processing after it is reloaded (see the next cell).

In [None]:
# Reload the previously saved, pre-processed Friday data; drop the index column written
# by the earlier save and make sure visitor ids are strings.
Firday = (
    pd.read_csv('park-movement-Fri_New.csv')
    .drop(columns=["Unnamed: 0"])
    .assign(id=lambda frame: frame["id"].astype(str))
)

In [None]:
# Count the missing values in every column of the reloaded data.
Firday.isna().sum()

In [None]:
# Summarise column dtypes, non-null counts and memory usage.
Firday.info()

In [None]:
# 2-D density heatmap of the X/Y positions of all Friday records.
px.density_heatmap(Firday, x='X', y='Y')

In [None]:
# Number of records per 60-second interval over the whole day.
# The stop value is pushed one step past the latest timestamp: range() excludes its stop
# and np.histogram ignores samples beyond the last edge, so without this the final
# partial interval (always containing the latest record) was silently dropped.
counts, bins = np.histogram(
    Firday.Timestamp,
    bins=range(Firday["Timestamp"].min(), Firday["Timestamp"].max() + 60, 60),
)
bins = bins[1:]  # label each interval by its right edge so x and y have equal length
px.line(x=bins, y=counts, labels={'x':'timestamp', 'y':'count'})

### Criterion One: Relative time difference and relative distance (Euclidean distance)

In [None]:
# Classify visitors by comparing each visitor's trajectory with those of the next
# few visitors in id order. IDs are appended in pairs and may appear many times;
# the lists are de-duplicated later.
Group = []
# Families, relatives, couples, or any close small group
Crowd = []
# Groups of people who do not belong to each other in small groups, but have similar behavioral patterns
Single = []
# Tourists who do not belong to the above two groups

NEIGHBOUR_WINDOW = 10  # each ID is compared with the next 10 IDs only
NEAR_DISTANCE = 2      # two records count as "close" when at most this far apart (Euclidean)
NEAR_SHARE = 0.7       # minimum share of close records for two visitors to be related
SYNC_SECONDS = 60      # maximum median time offset for two visitors to be a small group

# Build every visitor's time-ordered trajectory exactly once instead of re-filtering and
# re-sorting the full frame for every pair. A stable sort keeps ties in file order.
visitor_ids = Firday_III["id"].unique()
tracks = {
    visitor_id: (rows["Timestamp"].to_numpy(), rows[["X", "Y"]].to_numpy(dtype=float))
    for visitor_id, rows in Firday_III.sort_values("Timestamp", kind="mergesort").groupby("id", sort=False)
}

for index, ID in enumerate(visitor_ids):
    times_1, coords_1 = tracks[ID]
    Len1 = len(times_1)  # number of records logged for this ID

    for ID2 in visitor_ids[index + 1:index + 1 + NEIGHBOUR_WINDOW]:
        times_2, coords_2 = tracks[ID2]
        Len2 = len(times_2)

        # Share of record positions (k-th record vs k-th record) at which the two visitors
        # are close. Positions that exist in only one trajectory count as "not close",
        # so the denominator is the longer trajectory.
        overlap = min(Len1, Len2)
        near = np.linalg.norm(coords_1[:overlap] - coords_2[:overlap], axis=1) <= NEAR_DISTANCE
        Ddiff = near.sum() / max(Len1, Len2)

        if Ddiff < NEAR_SHARE:
            # Not close most of the time. This also covers pairs with the same number of
            # records, which previously fell through without being classified at all.
            Single.extend((ID, ID2))
        elif Len1 == Len2 and np.median(np.abs(times_1 - times_2)) <= SYNC_SECONDS:
            # Same number of records, close most of the time, and the median time offset
            # between matching records is within the threshold: they act together.
            Group.extend((ID, ID2))
        else:
            # Close most of the time but not synchronised (or different record counts).
            Crowd.extend((ID, ID2))


In [None]:
import csv

# Persist each classification list as a one-row CSV file.
# The ".csv" extension matches the names the reload cell reads back, and newline=""
# is what the csv module requires to avoid blank rows on some platforms.
for csv_path, id_list in (
    ("FridayGroupCount.csv", Group),
    ("FridayCrowdCount.csv", Crowd),
    ("FridaySingleCount.csv", Single),
):
    with open(csv_path, "w", newline="") as f:
        csv.writer(f).writerow(id_list)

In [None]:
# Reload the saved classification lists. The pairwise comparison takes more than
# 10 hours to run, so its results are kept on disk and cleaned up after reading.
d1, d2, d3 = (
    pd.read_csv(csv_name)
    for csv_name in ('FridayGroupCount.csv', 'FridayCrowdCount.csv', 'FridaySingleCount.csv')
)

# The IDs come back as column labels; everything from the first "." onwards is
# stripped (presumably the ".1", ".2", ... suffixes pandas adds to repeated labels).
Group, Crowd, Single = (
    [label.split(".", 1)[0] for label in frame.columns]
    for frame in (d1, d2, d3)
)

In [None]:
# Remove duplicate IDs and make the three classes mutually exclusive:
# Group takes precedence over Crowd, and both take precedence over Single.
# The three class sizes are expected to add up to the number of distinct ids in the dataset.
group_ids = set(Group)
crowd_ids = set(Crowd)
Firday_Group = list(group_ids)
Firday_Crowd = list(crowd_ids - group_ids)
Firday_Single = list(set(list(set(Single) - group_ids)) - crowd_ids)

In [None]:
# Sanity check: number of distinct visitor ids, followed by the size of each class.
len(Firday.id.unique()),len(Firday_Group), len(Firday_Crowd), len(Firday_Single)

### Criterion Two: Grouping by earliest arrival time

In [None]:
# Boundaries of the arrival classes as epoch seconds (local timezone, via mktime).
TimePointMorning = time.mktime(time.strptime("2014-6-06 11:00:00", "%Y-%m-%d %H:%M:%S"))
TimePointAfterNoon = time.mktime(time.strptime("2014-6-06 16:00:00", "%Y-%m-%d %H:%M:%S"))

# Take each visitor's earliest record as the arrival. The frame is sorted by time first
# so the result no longer depends on the row order of the input file (the stable sort
# leaves an already chronological file untouched).
ArrivalTime = Firday.sort_values("Timestamp", kind="mergesort").drop_duplicates(subset='id')

# Arrival class: 0 = before 11:00, 1 = from 11:00 up to and including 16:00, 2 = after 16:00.
ArrivalTime["ArrivalTime"] = np.select(
    [
        ArrivalTime["Timestamp"] < TimePointMorning,
        ArrivalTime["Timestamp"] <= TimePointAfterNoon,
    ],
    [0, 1],
    default=2,
)
Early = list(ArrivalTime[ArrivalTime["ArrivalTime"]==0].id.unique())
Noon =  list(ArrivalTime[ArrivalTime["ArrivalTime"]==1].id.unique())
Late = list(ArrivalTime[ArrivalTime["ArrivalTime"]==2].id.unique())
len(Firday.id.unique()), len(Early), len(Noon), len(Late)

### Criterion Three: Duration of stay

In [None]:
# Split visitors into two groups by how long they stayed, using the average stay as the threshold.
# The average is taken over visitors (one row per id). Previously it was the mean of the
# *distinct* duration values, which ignores how many visitors share each duration.
# NOTE(review): assumes visit_duration is constant across all rows of one id - confirm against the saved file.
mean_duration = Firday.drop_duplicates(subset="id")["visit_duration"].mean()
HighDuration = list(Firday[Firday["visit_duration"] >= mean_duration].id.unique())
LowDuration = list(Firday[Firday["visit_duration"] < mean_duration].id.unique())
len(Firday.id.unique()), len(HighDuration), len(LowDuration)

#### Group 1

In [None]:
# Group 1: small-group visitors who arrived early and are in the high-duration set.
G1 = list(set(Firday_Group) & set(Early) & set(HighDuration))
G1P = Firday[Firday['id'].isin(G1)]  # all records belonging to this cohort
len(G1)

In [None]:
# 2-D density heatmap of the X/Y positions recorded for Group 1.
px.density_heatmap(G1P, x='X', y='Y')

In [None]:
# Number of Group 1 records per 60-second interval.
# The stop value is pushed one step past the latest timestamp: range() excludes its stop
# and np.histogram ignores samples beyond the last edge, so without this the final
# partial interval (always containing the latest record) was silently dropped.
counts, bins = np.histogram(
    G1P.Timestamp,
    bins=range(G1P["Timestamp"].min(), G1P["Timestamp"].max() + 60, 60),
)
bins = bins[1:]  # label each interval by its right edge so x and y have equal length
px.line(x=bins, y=counts, labels={'x':'timestamp', 'y':'count'})

#### Group 2

In [None]:
# Group 2: crowd visitors who arrived early and are in the high-duration set.
G2 = list(set(Firday_Crowd) & set(Early) & set(HighDuration))
G2P = Firday[Firday['id'].isin(G2)]  # all records belonging to this cohort
len(G2)

In [None]:
# 2-D density heatmap of the X/Y positions recorded for Group 2.
px.density_heatmap(G2P, x='X', y='Y')

In [None]:
# Number of Group 2 records per 60-second interval.
# The stop value is pushed one step past the latest timestamp: range() excludes its stop
# and np.histogram ignores samples beyond the last edge, so without this the final
# partial interval (always containing the latest record) was silently dropped.
counts, bins = np.histogram(
    G2P.Timestamp,
    bins=range(G2P["Timestamp"].min(), G2P["Timestamp"].max() + 60, 60),
)
bins = bins[1:]  # label each interval by its right edge so x and y have equal length
px.line(x=bins, y=counts, labels={'x':'timestamp', 'y':'count'})

#### Group 3

In [None]:
# Group 3: single visitors who arrived early and are in the high-duration set.
G3 = list(set(Firday_Single) & set(Early) & set(HighDuration))
G3P = Firday[Firday['id'].isin(G3)]  # all records belonging to this cohort
len(G3)

In [None]:
# 2-D density heatmap of the X/Y positions recorded for Group 3.
px.density_heatmap(G3P, x='X', y='Y')

In [None]:
# Number of Group 3 records per 60-second interval.
# The stop value is pushed one step past the latest timestamp: range() excludes its stop
# and np.histogram ignores samples beyond the last edge, so without this the final
# partial interval (always containing the latest record) was silently dropped.
counts, bins = np.histogram(
    G3P.Timestamp,
    bins=range(G3P["Timestamp"].min(), G3P["Timestamp"].max() + 60, 60),
)
bins = bins[1:]  # label each interval by its right edge so x and y have equal length
px.line(x=bins, y=counts, labels={'x':'timestamp', 'y':'count'})

#### Group 4

In [None]:
# Group 4: small-group visitors who arrived in the 11:00-16:00 window.
G4 = list(set(Firday_Group) & set(Noon))
G4P = Firday[Firday['id'].isin(G4)]  # all records belonging to this cohort
len(G4)

In [None]:
# 2-D density heatmap of the X/Y positions recorded for Group 4.
px.density_heatmap(G4P, x='X', y='Y')

In [None]:
# Number of Group 4 records per 60-second interval.
# The stop value is pushed one step past the latest timestamp: range() excludes its stop
# and np.histogram ignores samples beyond the last edge, so without this the final
# partial interval (always containing the latest record) was silently dropped.
counts, bins = np.histogram(
    G4P.Timestamp,
    bins=range(G4P["Timestamp"].min(), G4P["Timestamp"].max() + 60, 60),
)
bins = bins[1:]  # label each interval by its right edge so x and y have equal length
px.line(x=bins, y=counts, labels={'x':'timestamp', 'y':'count'})

#### Group 5

In [None]:
# Group 5: crowd visitors who arrived in the 11:00-16:00 window.
G5 = list(set(Firday_Crowd) & set(Noon))
G5P = Firday[Firday['id'].isin(G5)]  # all records belonging to this cohort
len(G5)

In [None]:
# 2-D density heatmap of the X/Y positions recorded for Group 5.
px.density_heatmap(G5P, x='X', y='Y')

In [None]:
# Number of Group 5 records per 60-second interval.
# The stop value is pushed one step past the latest timestamp: range() excludes its stop
# and np.histogram ignores samples beyond the last edge, so without this the final
# partial interval (always containing the latest record) was silently dropped.
counts, bins = np.histogram(
    G5P.Timestamp,
    bins=range(G5P["Timestamp"].min(), G5P["Timestamp"].max() + 60, 60),
)
bins = bins[1:]  # label each interval by its right edge so x and y have equal length
px.line(x=bins, y=counts, labels={'x':'timestamp', 'y':'count'})

#### Group 6

In [None]:
# Group 6: single visitors who are in the low-duration set.
G6 = list(set(Firday_Single) & set(LowDuration))
G6P = Firday[Firday['id'].isin(G6)]  # all records belonging to this cohort
len(G6)

In [None]:
# 2-D density heatmap of the X/Y positions recorded for Group 6.
px.density_heatmap(G6P, x='X', y='Y')

In [None]:
# Number of Group 6 records per 60-second interval.
# The stop value is pushed one step past the latest timestamp: range() excludes its stop
# and np.histogram ignores samples beyond the last edge, so without this the final
# partial interval (always containing the latest record) was silently dropped.
counts, bins = np.histogram(
    G6P.Timestamp,
    bins=range(G6P["Timestamp"].min(), G6P["Timestamp"].max() + 60, 60),
)
bins = bins[1:]  # label each interval by its right edge so x and y have equal length
px.line(x=bins, y=counts, labels={'x':'timestamp', 'y':'count'})

#### Group 7 

In [None]:
# Group 7: small-group visitors in the low-duration set who arrived early.
G7 = list(set(Firday_Group) & set(LowDuration) & set(Early))
G7P = Firday[Firday['id'].isin(G7)]  # all records belonging to this cohort
len(G7)

In [None]:
# 2-D density heatmap of the X/Y positions recorded for Group 7.
px.density_heatmap(G7P, x='X', y='Y')

In [None]:
# Number of Group 7 records per 60-second interval.
# The stop value is pushed one step past the latest timestamp: range() excludes its stop
# and np.histogram ignores samples beyond the last edge, so without this the final
# partial interval (always containing the latest record) was silently dropped.
counts, bins = np.histogram(
    G7P.Timestamp,
    bins=range(G7P["Timestamp"].min(), G7P["Timestamp"].max() + 60, 60),
)
bins = bins[1:]  # label each interval by its right edge so x and y have equal length
px.line(x=bins, y=counts, labels={'x':'timestamp', 'y':'count'})

#### Group 8 

In [None]:
# Group 8: small-group visitors who arrived after 16:00.
G8 = list(set(Firday_Group) & set(Late))
G8P = Firday[Firday['id'].isin(G8)]  # all records belonging to this cohort
len(G8)

In [None]:
# 2-D density heatmap of the X/Y positions recorded for Group 8.
px.density_heatmap(G8P, x='X', y='Y')

In [None]:
# Number of Group 8 records per 60-second interval.
# The stop value is pushed one step past the latest timestamp: range() excludes its stop
# and np.histogram ignores samples beyond the last edge, so without this the final
# partial interval (always containing the latest record) was silently dropped.
counts, bins = np.histogram(
    G8P.Timestamp,
    bins=range(G8P["Timestamp"].min(), G8P["Timestamp"].max() + 60, 60),
)
bins = bins[1:]  # label each interval by its right edge so x and y have equal length
px.line(x=bins, y=counts, labels={'x':'timestamp', 'y':'count'})

### Criterion Four: Time interval between each visitor's consecutive activities

In [None]:
# For every visitor, measure the gaps (in seconds) between consecutive records.
# The three lists are index-aligned: visitors_td[i] is the visitor id, max_td[i] that
# visitor's largest gap and med_td[i] the median gap.
# Example: if the longest pause among all of visitor A's actions is 100 seconds,
# then 100 is stored for A in max_td, and so on for the next visitor.
visitors_td = []
max_td = []
med_td = []
# groupby(sort=False) keeps visitors in order of first appearance and avoids
# re-filtering the whole frame once per visitor.
for ID, visitor_times in Firday.groupby('id', sort=False)['Timestamp']:
    # Gaps between this visitor's own consecutive records. The earlier version subtracted
    # the first rows of the *whole* frame instead of the visitor's own timestamps.
    # Sorting first makes the gaps true consecutive intervals regardless of row order.
    gaps = np.diff(np.sort(visitor_times.to_numpy()))
    if gaps.size == 0:
        # A visitor with a single record has no gap; max() on an empty array would raise.
        continue
    visitors_td.append(ID)
    max_td.append(gaps.max())
    med_td.append(np.median(gaps))

In [None]:
import csv

# Persist the per-visitor gap statistics, one single-row CSV file per list.
# newline="" is what the csv module requires to avoid blank rows on some platforms.
for out_path, values in (
    ("FridayGapID", visitors_td),
    ("FridayGapMax", max_td),
    ("FridayGapMed", med_td),
):
    with open(out_path, "w", newline="") as f:
        csv.writer(f).writerow(values)

## Visitors with the largest gaps between actions (the 150 visitors with the highest maximum gap)

In [None]:
# Pick the 150 visitors whose largest gap between actions is highest and map where they were.
inds1 = np.argsort(max_td)  # ascending, so the largest gaps sit at the end
GapH = np.take(visitors_td, inds1[-150:])
GapHV = Firday[Firday['id'].isin(GapH)]
px.density_heatmap(GapHV, x='X', y='Y')

In [None]:
# Number of records per 60-second interval for the large-gap visitors.
# The stop value is pushed one step past the latest timestamp: range() excludes its stop
# and np.histogram ignores samples beyond the last edge, so without this the final
# partial interval (always containing the latest record) was silently dropped.
counts, bins = np.histogram(
    GapHV.Timestamp,
    bins=range(GapHV["Timestamp"].min(), GapHV["Timestamp"].max() + 60, 60),
)
bins = bins[1:]  # label each interval by its right edge so x and y have equal length
px.line(x=bins, y=counts, labels={'x':'timestamp', 'y':'count'})

## Fast-moving tourists (low gap between actions)

In [None]:
# Pick the 150 visitors with the smallest median gap between actions and map where they were.
inds2 = np.argsort(med_td)  # ascending, so the smallest medians come first
Faster = np.take(visitors_td, inds2[:150])
FasterV = Firday[Firday['id'].isin(Faster)]
px.density_heatmap(FasterV, x='X', y='Y')

In [None]:
# Number of records per 60-second interval for the fast-moving visitors.
# The stop value is pushed one step past the latest timestamp: range() excludes its stop
# and np.histogram ignores samples beyond the last edge, so without this the final
# partial interval (always containing the latest record) was silently dropped.
counts, bins = np.histogram(
    FasterV.Timestamp,
    bins=range(FasterV["Timestamp"].min(), FasterV["Timestamp"].max() + 60, 60),
)
bins = bins[1:]  # label each interval by its right edge so x and y have equal length
px.line(x=bins, y=counts, labels={'x':'timestamp', 'y':'count'})

## Slow-moving tourists (high gap between actions)

In [None]:
# Pick the 150 visitors with the largest median gap between actions and map where they were.
inds3 = np.argsort(med_td)  # ascending, so the largest medians sit at the end
Slower = np.take(visitors_td, inds3[-150:])
SlowerV = Firday[Firday['id'].isin(Slower)]
px.density_heatmap(SlowerV, x='X', y='Y')

In [None]:
# Number of records per 60-second interval for the slow-moving visitors.
# The stop value is pushed one step past the latest timestamp: range() excludes its stop
# and np.histogram ignores samples beyond the last edge, so without this the final
# partial interval (always containing the latest record) was silently dropped.
counts, bins = np.histogram(
    SlowerV.Timestamp,
    bins=range(SlowerV["Timestamp"].min(), SlowerV["Timestamp"].max() + 60, 60),
)
bins = bins[1:]  # label each interval by its right edge so x and y have equal length
px.line(x=bins, y=counts, labels={'x':'timestamp', 'y':'count'})

### Group EX

In [None]:
# Morning window (09:00-12:00 on 2014-06-06) as epoch seconds in the local timezone.
ShowTimeS = time.mktime(time.strptime("2014-6-06 09:00:00", "%Y-%m-%d %H:%M:%S"))
ShowTimeE = time.mktime(time.strptime("2014-6-06 12:00:00", "%Y-%m-%d %H:%M:%S"))

CheckPointM = Firday.copy()
# Keep the records that fall inside the window and inside the rectangle
# 20 <= X <= 40, 30 <= Y <= 40 (all bounds inclusive).
in_morning_window = CheckPointM["Timestamp"].between(ShowTimeS, ShowTimeE)
in_morning_area = CheckPointM["X"].between(20, 40) & CheckPointM["Y"].between(30, 40)
SuspiciousIN = CheckPointM[in_morning_window & in_morning_area]

In [None]:
# Afternoon window (14:00-17:00 on 2014-06-06) as epoch seconds in the local timezone.
ShowTimeS2 = time.mktime(time.strptime("2014-6-06 14:00:00", "%Y-%m-%d %H:%M:%S"))
ShowTimeE2 = time.mktime(time.strptime("2014-6-06 17:00:00", "%Y-%m-%d %H:%M:%S"))

CheckPointA = Firday.copy()
# Keep the records that fall inside the window and inside the rectangle
# 20 <= X <= 40, 30 <= Y <= 40 (all bounds inclusive).
in_afternoon_window = CheckPointA["Timestamp"].between(ShowTimeS2, ShowTimeE2)
in_afternoon_area = CheckPointA["X"].between(20, 40) & CheckPointA["Y"].between(30, 40)
SuspiciousIN2 = CheckPointA[in_afternoon_window & in_afternoon_area]

In [None]:
# Visitors recorded inside the checked area during both the morning and the afternoon window.
SuspiciousGroup = list(set(SuspiciousIN.id.unique()) & set(SuspiciousIN2.id.unique()))
GSP = Firday[Firday['id'].isin(SuspiciousGroup)]  # all records of those visitors
len(SuspiciousGroup)

In [None]:
import csv

# Persist the ids found in the checked area during both windows as a single CSV row.
# newline="" is what the csv module requires to avoid blank rows on some platforms.
with open("FridayAreaCheck", "w", newline="") as f:
    csv.writer(f).writerow(SuspiciousGroup)

In [None]:
# 2-D density heatmap of the X/Y positions recorded for the visitors in SuspiciousGroup.
px.density_heatmap(GSP, x='X', y='Y')

In [None]:
# Number of records per 60-second interval for the visitors in SuspiciousGroup.
# The stop value is pushed one step past the latest timestamp: range() excludes its stop
# and np.histogram ignores samples beyond the last edge, so without this the final
# partial interval (always containing the latest record) was silently dropped.
counts, bins = np.histogram(
    GSP.Timestamp,
    bins=range(GSP["Timestamp"].min(), GSP["Timestamp"].max() + 60, 60),
)
bins = bins[1:]  # label each interval by its right edge so x and y have equal length
px.line(x=bins, y=counts, labels={'x':'timestamp', 'y':'count'})