In [None]:
import os
import pandas as pd

def convert_plt_to_csv_and_delete(root_directory):
    """Convert every ``.plt`` file below ``root_directory`` to CSV, then delete it.

    The first 6 lines of each ``.plt`` file are skipped (assumed to be header
    lines that are irrelevant for the CSV; adjust ``skiprows`` if your files
    differ). The remaining rows are written next to the source file with the
    same base name and a ``.csv`` extension, with pandas' default integer
    column labels as the header row. The ``.plt`` file is removed only after
    the CSV has been written.
    """
    plt_suffix = ".plt"
    for folder, _subfolders, filenames in os.walk(root_directory):
        for filename in filenames:
            if not filename.endswith(plt_suffix):
                continue
            plt_file_path = os.path.join(folder, filename)
            table = pd.read_csv(plt_file_path, skiprows=6, header=None)
            # The name is known to end in ".plt", so slicing the suffix off
            # yields the base path.
            csv_file_path = plt_file_path[:-len(plt_suffix)] + '.csv'
            table.to_csv(csv_file_path, index=False)
            os.remove(plt_file_path)
            print(f"Converted and deleted: {plt_file_path}")

# Root of the directory tree to convert (machine-specific absolute path; change it for your setup).
# NOTE: the call below deletes each .plt file after writing its CSV counterpart.
root_directory = 'C:\\Users\\ss6365\\Desktop\\Datasets\\Geolife\\Geolife Trajectories 1.3\\Geolife Trajectories 1.3\\Data'
convert_plt_to_csv_and_delete(root_directory)


In [None]:
import os
import pandas as pd

def rename_csv_columns(root_directory):
    """Give every 7-column CSV below ``root_directory`` descriptive column names.

    Each ``.csv`` file found while walking the tree is read with its first row
    as the header. Files with exactly 7 columns are rewritten in place with the
    new names; any other file is left untouched and reported on stdout.
    """
    new_column_names = ['latitude', 'longitude', 'all', 'altitude', 'data_as_number_of_day_since', 'Date_as_string', 'time']

    for folder, _subfolders, filenames in os.walk(root_directory):
        for filename in filenames:
            if not filename.endswith(".csv"):
                continue
            csv_file_path = os.path.join(folder, filename)
            table = pd.read_csv(csv_file_path)
            found = len(table.columns)
            if found != 7:
                # Leave files with an unexpected layout alone.
                print(f"Column count mismatch in {csv_file_path}. Expected 7 columns, found {found}.")
                continue
            table.columns = new_column_names
            table.to_csv(csv_file_path, index=False)
            print(f"Columns renamed for: {csv_file_path}")

# Root of the directory tree whose CSV files get renamed columns
# (machine-specific absolute path; change it for your setup).
root_directory = 'C:\\Users\\ss6365\\Desktop\\Datasets\\Geolife\\Geolife Trajectories 1.3\\Geolife Trajectories 1.3\\Data'
rename_csv_columns(root_directory)


In [None]:
import os
import shutil

def copy_csv_files_to_target_folder(root_directory, target_folder):
    """Copy every ``.csv`` file below ``root_directory`` into one flat folder.

    Args:
        root_directory: Directory tree to search (walked recursively).
        target_folder: Folder that receives the copies. It is created,
            including missing parents, if it does not exist yet.

    When a file name is already taken in ``target_folder`` the copy is stored
    as ``<name>_<n><ext>``, where ``n`` is a per-name counter that is advanced
    until an unused name is found, so existing files are never overwritten.
    If ``target_folder`` lies inside ``root_directory`` it is skipped during
    the walk so that freshly written copies are not copied again.
    """
    # shutil.copy2 does not create missing directories.
    os.makedirs(target_folder, exist_ok=True)
    target_abs = os.path.abspath(target_folder)

    # Counter for filename conflicts, keyed by the original file name
    file_counter = {}

    for subdir, dirs, files in os.walk(root_directory):
        # Prune the target folder from the walk (in-place edit, as os.walk requires).
        dirs[:] = [d for d in dirs if os.path.abspath(os.path.join(subdir, d)) != target_abs]

        for file in files:
            if not file.endswith(".csv"):
                continue
            source_file_path = os.path.join(subdir, file)
            target_file_path = os.path.join(target_folder, file)

            # Keep bumping the counter until the candidate name is free; a
            # single bump could still hit a file that already exists.
            name, ext = os.path.splitext(file)
            while os.path.exists(target_file_path):
                file_counter[file] = file_counter.get(file, 0) + 1
                new_filename = f"{name}_{file_counter[file]}{ext}"
                target_file_path = os.path.join(target_folder, new_filename)

            # Copy the file (with metadata) to the target folder
            shutil.copy2(source_file_path, target_file_path)
            print(f"Copied {source_file_path} to {target_file_path}")

# Root of the directory tree that is searched for CSV files
# (machine-specific absolute path; change it for your setup).
root_directory = 'C:\\Users\\ss6365\\Desktop\\Datasets\\Geolife\\Geolife Trajectories 1.3\\Geolife Trajectories 1.3\\Data'
# Flat folder that receives a copy of every CSV found under root_directory.
target_folder = 'C:\\Users\\ss6365\\Desktop\\Datasets\\Geolife\\dataset_preprocessed'
copy_csv_files_to_target_folder(root_directory, target_folder)


In [None]:
import os
import pandas as pd

# Define the source and destination directories:
# source_dir is read for the CSV files to split, destination_dir receives the segments.
source_dir = 'C:\\Users\\ss6365\\Desktop\\Datasets\\Geolife\\dataset_preprocessed'
destination_dir = 'C:\\Users\\ss6365\\Desktop\\Datasets\\Geolife\\dataset_preprocessed_split'

# Function to process and split each CSV file
def process_and_split_csv(file_path, destination_dir, max_gap_minutes=30):
    """Split one CSV into segments wherever consecutive rows are far apart in time.

    Args:
        file_path: CSV to read; it must contain ``Date_as_string`` and ``time``
            columns that together parse as a timestamp.
        destination_dir: Folder that receives the segment files. Created
            (including parents) when missing.
        max_gap_minutes: A new segment starts at every row whose timestamp is
            more than this many minutes after the previous row. Defaults to 30.

    Segments are written as ``<input base name>_<i>.csv`` with ``i`` counting
    from 0, and contain exactly the columns of the input file. Rows are taken
    in file order; they are not sorted by time.
    """
    df = pd.read_csv(file_path)
    # Remember the input columns by name so the helper columns added below are
    # excluded from the output regardless of their position.
    original_columns = list(df.columns)

    # Combine 'Date_as_string' and 'time' columns into a 'datetime' column
    df['datetime'] = pd.to_datetime(df['Date_as_string'] + ' ' + df['time'])

    # Calculate the time difference between consecutive rows
    df['time_diff'] = df['datetime'].diff()

    # Positions (not index labels) of rows that open a new segment; positions
    # are what iloc expects below.
    is_gap = (df['time_diff'] > pd.Timedelta(minutes=max_gap_minutes)).tolist()
    gap_positions = [pos for pos, flag in enumerate(is_gap) if flag]
    # Add the first and one-past-the-last position to close the outer segments
    split_indices = [0] + gap_positions + [len(df)]

    original_file_base_name = os.path.splitext(os.path.basename(file_path))[0]

    # to_csv does not create missing directories.
    os.makedirs(destination_dir, exist_ok=True)

    # Split and save each segment
    for i in range(len(split_indices) - 1):
        start_index = split_indices[i]
        end_index = split_indices[i + 1]
        segment = df.iloc[start_index:end_index]
        output_file_name = f'{original_file_base_name}_{i}.csv'
        output_file_path = os.path.join(destination_dir, output_file_name)
        segment.to_csv(output_file_path, index=False, columns=original_columns)

    print(f'Split {os.path.basename(file_path)} into {len(split_indices)-1} files.')

# Run the splitter on each CSV file found directly inside the source directory
for file_name in os.listdir(source_dir):
    if not file_name.endswith(".csv"):
        continue
    file_path = os.path.join(source_dir, file_name)
    process_and_split_csv(file_path, destination_dir)

print('All files processed and split according to the new logic.')


In [None]:
import os
import random
import shutil

# Set the source and destination directories
source_dir = 'C:\\Users\\ss6365\\Desktop\\Datasets\\Geolife\\dataset_preprocessed_split'
destination_dir = 'C:\\Users\\ss6365\\Desktop\\Datasets\\Geolife\\dataset_preprocessed_split_subset'

# Fraction of the files copied into the subset (0.05 == 5%)
sample_fraction = 0.05

# Ensure reproducibility by setting a random seed
random.seed(42)

# Get a list of all files in the source directory.
# os.listdir returns names in arbitrary order, so sort them: otherwise the
# seeded sample below could differ between machines or runs.
all_files = sorted(f for f in os.listdir(source_dir) if os.path.isfile(os.path.join(source_dir, f)))

# Number of files that make up the requested fraction (rounded down)
num_files_to_select = int(len(all_files) * sample_fraction)

# Randomly select that many files
selected_files = random.sample(all_files, num_files_to_select)

# shutil.copy2 does not create missing directories
os.makedirs(destination_dir, exist_ok=True)

# Copy the selected files to the destination directory
for file in selected_files:
    source_path = os.path.join(source_dir, file)
    destination_path = os.path.join(destination_dir, file)
    shutil.copy2(source_path, destination_path)

print(f"Copied {len(selected_files)} files to {destination_dir}")


In [None]:
import os
import pandas as pd

# Specify the directory containing the CSV files
directory = 'C:\\Users\\ss6365\\Desktop\\Datasets\\Geolife\\dataset_preprocessed_split_subset'

# List all CSV files in the directory.
# Sorted, because os.listdir returns names in arbitrary order and the
# identifier given to each file below depends on its position in this list.
csv_files = sorted(f for f in os.listdir(directory) if f.endswith('.csv'))

# Initialize an identifier starting from 1
identifier = 1

# Iterate over each CSV file
for file_name in csv_files:
    # Construct the full file path
    file_path = os.path.join(directory, file_name)

    # Load the CSV file into a DataFrame
    df = pd.read_csv(file_path)

    # Add (or overwrite) the 'identifier' column; every row of a file shares one value
    df['identifier'] = identifier

    # Save the modified DataFrame back to the same CSV file
    df.to_csv(file_path, index=False)

    # Increment the identifier for the next file
    identifier += 1

print('All files have been processed and updated with an identifier.')


In [2]:
import os
import pandas as pd
import numpy as np

def haversine(lon1, lat1, lon2, lat2):
    """
    Great-circle distance in kilometers between two points on the earth.

    All four arguments are in decimal degrees; note the argument order is
    longitude first, then latitude, for each point. Another ``haversine``
    defined later in this notebook takes latitude first.
    """
    earth_radius_km = 6371  # Use 3956 for miles

    # Work in radians from here on
    lon_a, lat_a, lon_b, lat_b = (np.radians(angle) for angle in (lon1, lat1, lon2, lat2))

    # Haversine formula
    delta_lon = lon_b - lon_a
    delta_lat = lat_b - lat_a
    hav = np.sin(delta_lat/2)**2 + np.cos(lat_a) * np.cos(lat_b) * np.sin(delta_lon/2)**2
    central_angle = 2 * np.arcsin(np.sqrt(hav))
    return central_angle * earth_radius_km

def calculate_distance_for_file(csv_file):
    """Return ``(path_length_km, row_count)`` for one trajectory CSV.

    The path length is the sum of the haversine distances between each pair of
    consecutive rows, using the ``latitude`` and ``longitude`` columns in file
    order. A file with fewer than two rows has length 0.
    """
    # Load the CSV file
    df = pd.read_csv(csv_file)
    row_count = len(df)

    # Plain arrays: slicing pandas Series and combining them would align on
    # index labels instead of pairing row i with row i + 1.
    lat = df['latitude'].to_numpy()
    lon = df['longitude'].to_numpy()

    if row_count < 2:
        return 0.0, row_count

    # One vectorized call over all consecutive pairs instead of a per-row loop.
    # Keyword arguments keep this correct even if the notebook's other
    # haversine definition (latitude-first positional order) is the one
    # currently bound to the name.
    step_distances = haversine(lon1=lon[:-1], lat1=lat[:-1], lon2=lon[1:], lat2=lat[1:])

    # Sum the distances
    return step_distances.sum(), row_count

def calculate_cumulative_distance_and_row_count(directory):
    """Aggregate path length over every CSV file directly inside ``directory``.

    Returns:
        ``(total_distance, average_distance)`` where ``total_distance`` is the
        sum of the per-file distances in kilometers and ``average_distance`` is
        ``total_distance`` divided by the total number of rows, converted to
        meters. When no rows were read at all, the average is ``0.0``.
    """
    total_distance = 0
    total_row_count = 0

    for filename in os.listdir(directory):
        if filename.endswith(".csv"):
            file_path = os.path.join(directory, filename)
            distance, row_count = calculate_distance_for_file(file_path)
            total_distance += distance
            total_row_count += row_count

    # No CSV files (or only empty ones): avoid dividing by zero.
    if total_row_count == 0:
        return total_distance, 0.0

    # km per row -> m per row
    average_distance = (total_distance / total_row_count) * 1000
    return total_distance, average_distance

# Example usage: report the total path length and the per-row average for one folder of CSVs
directory = 'C:\\Users\\ss6365\\Desktop\\location_privacy_final\\geolife\\data\\utility'  # Replace this with the path to your directory
cumulative_distance, average_distance = calculate_cumulative_distance_and_row_count(directory)
# The total is in kilometers; the average was already converted to meters by the function
print(f"Cumulative Distance Traversed: {cumulative_distance} km")
print(f"Average Distance: {average_distance} m")


Cumulative Distance Traversed: 23061.329459818542 km
Average Distance: 19.323166319482965 m


In [3]:
import pandas as pd
import os
import numpy as np

# Specify the directory containing your CSV files
directory_path = 'C:\\Users\\ss6365\\Desktop\\Datasets\\Geolife\\dataset_preprocessed_split_subset'  # Update this path

# Per-file results: (file name, mean interval in seconds), or
# (file name, explanatory message) when the file has too few rows
average_intervals = []

# Sum of all average intervals
total_average_interval = 0

# Count of files processed for the overall average calculation
file_count = 0

# Iterate over each file in the directory
for file_name in os.listdir(directory_path):
    # Check if the file is a CSV
    if file_name.endswith('.csv'):
        # Construct the full file path
        file_path = os.path.join(directory_path, file_name)

        # Load the CSV file
        df = pd.read_csv(file_path)

        # Combine 'Date_as_string' and 'time' columns into a datetime format
        df['date_time'] = pd.to_datetime(df['Date_as_string'] + ' ' + df['time'])

        # Ensure the data is sorted by 'date_time'
        df = df.sort_values(by='date_time')

        # Calculate differences (intervals) between each timestamp
        df['sampling_interval_seconds'] = df['date_time'].diff().dt.total_seconds()

        # The first row has no predecessor, so its interval is NaN; drop it
        valid_intervals = df['sampling_interval_seconds'].dropna()

        if not valid_intervals.empty:
            # Calculate the average sampling interval
            average_sampling_interval = valid_intervals.mean()

            # Add the result to the list for individual file averages
            average_intervals.append((file_name, average_sampling_interval))

            # Update the total average and file count
            total_average_interval += average_sampling_interval
            file_count += 1
        else:
            # Handle files with insufficient data for interval calculation
            average_intervals.append((file_name, 'Insufficient data for interval calculation'))

# Overall value is the unweighted mean of the per-file means
overall_average_interval = total_average_interval / file_count if file_count > 0 else 'No valid data found in any file'

# Print the list of average sampling intervals for each file.
# Only numeric results get the unit; message entries are printed as-is
# (previously they came out as "Insufficient data ... seconds").
for file_name, interval in average_intervals:
    if isinstance(interval, str):
        print(f"{file_name}: {interval}")
    else:
        print(f"{file_name}: {interval} seconds")

# Print the overall average sampling interval (unit only when it is a number)
if file_count > 0:
    print(f"\nOverall average sampling interval across all files: {overall_average_interval} seconds")
else:
    print(f"\nOverall average sampling interval across all files: {overall_average_interval}")


20070413013238_2.csv: 40.82608695652174 seconds
20070413221434_0.csv: 41.6390977443609 seconds
20070416032733_0.csv: 11.09375 seconds
20070417144619_0.csv: 26.27777777777778 seconds
20070418133622_0.csv: 25.6 seconds
20070420010223_0.csv: 34.73913043478261 seconds
20070420041022_0.csv: 11.15 seconds
20070421035851_0.csv: Insufficient data for interval calculation seconds
20070423013051_0.csv: 18.72 seconds
20070502051941_0.csv: 28.614285714285714 seconds
20070504235730_1.csv: 37.660377358490564 seconds
20070507052941_0.csv: 22.8 seconds
20070511092102_1.csv: 25.307692307692307 seconds
20070514105835_0.csv: 57.741379310344826 seconds
20070521142156_0.csv: 18.282608695652176 seconds
20070524103937_1.csv: 17.52 seconds
20070526110036_0.csv: 23.857142857142858 seconds
20070604103524_1.csv: 37.3828125 seconds
20070609152605_0.csv: 21.928571428571427 seconds
20070612124105_0.csv: 25.944444444444443 seconds
20070616065209_4.csv: 64.0 seconds
20070619005814_0.csv: 6.5 seconds
20070622125528_0.

In [None]:
########################## Data Security

In [4]:
import os
import pandas as pd


# Folder holding the individual CSV files to merge
input_directory = 'C:\\Users\\ss6365\\Desktop\\location_privacy_final\\geolife\\data\\utility'

# Folder that receives the single merged CSV file
output_directory = 'C:\\Users\\ss6365\\Desktop\\location_privacy_final\\geolife\\data'

# Only these columns are read from each file
important_columns = ['identifier', 'longitude', 'latitude']  # Replace with your column names

# One DataFrame per input file, collected for a single concat at the end
dataframes = []

for filename in os.listdir(input_directory):
    # Anything that is not a CSV is ignored
    if not filename.endswith('.csv'):
        continue
    file_path = os.path.join(input_directory, filename)
    df = pd.read_csv(file_path, usecols=important_columns)
    dataframes.append(df)

# Stack all rows and renumber the index from 0
merged_df = pd.concat(dataframes, ignore_index=True)

# Write the merged table without the index column
output_file_path = os.path.join(output_directory, 'merged_all_utility_subset.csv')
merged_df.to_csv(output_file_path, index=False)

print(f"Merged CSV file saved to {output_file_path}")

Merged CSV file saved to C:\Users\ss6365\Desktop\location_privacy_final\geolife\data\merged_all_utility_subset.csv


In [5]:
import numpy as np


# Load the merged CSV from disk. This cell only imports numpy, so `pd` must
# already be available from a previously executed cell.
df = pd.read_csv(r'C:\Users\ss6365\Desktop\location_privacy_final\geolife\data\merged_all_utility_subset.csv')

def haversine(lat1, lon1, lat2, lon2):
    """Great-circle distance in kilometers between two points given in decimal degrees.

    Argument order is latitude first, then longitude, for each point. An
    earlier ``haversine`` in this notebook takes longitude first; this
    definition replaces it once the cell has run.
    """
    earth_radius_km = 6371.0

    # Degrees -> radians
    phi1, lam1, phi2, lam2 = (np.radians(angle) for angle in (lat1, lon1, lat2, lon2))

    # Haversine formula, using the arctan2 form for the central angle
    delta_phi = phi2 - phi1
    delta_lam = lam2 - lam1
    hav = np.sin(delta_phi/2)**2 + np.cos(phi1) * np.cos(phi2) * np.sin(delta_lam/2)**2
    central_angle = 2 * np.arctan2(np.sqrt(hav), np.sqrt(1 - hav))
    return earth_radius_km * central_angle

def find_square_boundaries(lat, lon, distance_km):
    """Return ``(lat_min, lat_max, lon_min, lon_max)`` of a box centred on ``(lat, lon)``.

    The box reaches ``distance_km`` from the centre in each of the four
    directions, using the approximation of 111 km per degree of latitude and
    111 * cos(latitude) km per degree of longitude.
    """
    km_per_degree = 111

    # Degrees of latitude / longitude that correspond to distance_km
    delta_lat = distance_km / km_per_degree
    delta_lon = distance_km / (km_per_degree * np.cos(np.radians(lat)))

    south, north = lat - delta_lat, lat + delta_lat
    west, east = lon - delta_lon, lon + delta_lon
    return south, north, west, east


# Centre the region on the median latitude and longitude of all loaded points
central_lat = df['latitude'].median()
central_lon = df['longitude'].median()

# Box reaching 2 km from that centre in each direction
lat_min, lat_max, lon_min, lon_max = find_square_boundaries(central_lat, central_lon, 2)

# Keep only the points that fall inside the box (bounds inclusive)
inside_lat = df['latitude'].between(lat_min, lat_max)
inside_lon = df['longitude'].between(lon_min, lon_max)
df_limit = df[inside_lat & inside_lon]

df_limit

Unnamed: 0,latitude,longitude,identifier
0,39.991650,116.327567,1
1,39.992233,116.326083,1
2,39.992317,116.325750,1
3,39.992067,116.325667,1
4,39.991850,116.325917,1
...,...,...,...
1193227,39.984544,116.318676,1524
1193228,39.984533,116.317468,1524
1193229,39.984492,116.316219,1524
1193230,39.984461,116.314947,1524


In [6]:
# Horizontal_check = haversine(df_limit['latitude'].min(), df_limit['longitude'].min(), df_limit['latitude'].min(), df_limit['longitude'].max())
# Horizontal_check

# veritcal_check = haversine(df_limit['latitude'].min(), df_limit['longitude'].min(), df_limit['latitude'].max(), df_limit['longitude'].min())
# veritcal_check

In [7]:
import glob
import os
import pandas as pd

# Input and output directories
input_directory = r'C:\Users\ss6365\Desktop\location_privacy_final\geolife\data\utility'
output_directory = r'C:\Users\ss6365\Desktop\location_privacy_final\geolife\data\security'

# Distance parameter (can be adjusted as needed). In this cell it only tags
# the output file names; the actual bounds come from df_limit below.
distance_km = 2

# Bounding box of the points in df_limit, which must already have been
# computed by an earlier cell. The same box is applied to every input file.
lat_min = df_limit['latitude'].min()
lat_max = df_limit['latitude'].max()
lon_min = df_limit['longitude'].min()
lon_max = df_limit['longitude'].max()

# to_csv does not create missing directories
os.makedirs(output_directory, exist_ok=True)

# Iterate through CSV files in the input directory
for csv_file in glob.glob(os.path.join(input_directory, '*.csv')):
    # Load the CSV file
    df = pd.read_csv(csv_file)

    # Keep only the rows inside the bounding box (bounds inclusive)
    df_square = df[(df['latitude'] >= lat_min) & (df['latitude'] <= lat_max) &
                   (df['longitude'] >= lon_min) & (df['longitude'] <= lon_max)]

    # Check if the filtered DataFrame is empty (no data within boundaries)
    if df_square.empty:
        continue  # Skip saving if no data matches the criteria

    # Extract the base filename without extension
    base_filename = os.path.splitext(os.path.basename(csv_file))[0]

    # Create the new filename with distance_km
    new_filename = f"{base_filename}_{distance_km}km.csv"

    # Save the filtered DataFrame to the output directory with the new filename
    output_path = os.path.join(output_directory, new_filename)
    df_square.to_csv(output_path, index=False)

print("Processing complete.")

Processing complete.
