In [None]:
import pandas as pd
import os

# Set the directory path where your TXT files are located
txt_files_directory = 'C:\\Users\\ss6365\\Desktop\\Datasets\\Tdrive\\T-drive\\release\\taxi_log_2008_by_id'

# Set the directory path where you want to save the CSV files
csv_files_directory =  'C:\\Users\\ss6365\\Desktop\\Datasets\\Tdrive\\dataset_preprocessed'
# Ensure the CSV directory exists
os.makedirs(csv_files_directory, exist_ok=True)

# Define the column names for the new CSV files
column_names = ['taxi_id', 'date_time', 'longitude', 'latitude']

# Iterate over each file in the TXT files directory
for filename in os.listdir(txt_files_directory):
    if filename.endswith('.txt'):
        txt_file_path = os.path.join(txt_files_directory, filename)
        csv_file_path = os.path.join(csv_files_directory, filename.replace('.txt', '.csv'))

        # Read the TXT file
        try:
            df = pd.read_csv(txt_file_path, header=None, names=column_names)

            # Save the DataFrame to CSV
            df.to_csv(csv_file_path, index=False)
            print(f"Converted {filename} to CSV.")
        except Exception as e:
            print(f"Error processing file {filename}: {e}")

print("All TXT files have been converted to CSV.")

In [None]:
import os
import pandas as pd

# Define the source and destination directories
source_dir = 'C:\\Users\\ss6365\\Desktop\\Datasets\\Tdrive\\dataset_preprocessed'
destination_dir = 'C:\\Users\\ss6365\\Desktop\\Datasets\\Tdrive\\dataset_preprocessed_split'

# List all CSV files in the source directory
csv_files = [file for file in os.listdir(source_dir) if file.endswith('.csv')]

# Process each CSV file
for file_name in csv_files:
    # Construct the full file path
    file_path = os.path.join(source_dir, file_name)
    # Load the dataset
    df = pd.read_csv(file_path)
    # Ensure date_time is in datetime format
    df['date_time'] = pd.to_datetime(df['date_time'])
    # Calculate the time difference between consecutive rows
    df['time_diff'] = df['date_time'].diff()
    # Identify indices where the time difference is greater than 30 minutes
    split_indices = df.index[df['time_diff'] > pd.Timedelta(minutes=30)].tolist()
    # Adjust split indices to include the first and last index
    split_indices = [0] + split_indices + [len(df)]

    # Prepare the original file's base name for naming split files
    original_file_base_name = os.path.splitext(file_name)[0]

    # Split the DataFrame and save each segment as a new CSV
    for i in range(len(split_indices) - 1):
        start_index = split_indices[i]
        end_index = split_indices[i + 1]
        segment = df.iloc[start_index:end_index]
        output_file_name = f'{original_file_base_name}_{i}.csv'
        output_file_path = os.path.join(destination_dir, output_file_name)
        segment.to_csv(output_file_path, index=False)

    print(f'Split {file_name} into {len(split_indices)-1} files.')

# Print completion message
print('All files processed and split according to the logic.')


In [None]:
import os
import random
import shutil

# Set the source and destination directories
source_dir = 'C:\\Users\\ss6365\\Desktop\\Datasets\\Tdrive\\dataset_preprocessed_split'
destination_dir = 'C:\\Users\\ss6365\\Desktop\\Datasets\\Tdrive\\dataset_preprocessed_split_subset'

# Ensure reproducibility by setting a random seed
random.seed(42)

# Get a list of all files in the source directory
all_files = [f for f in os.listdir(source_dir) if os.path.isfile(os.path.join(source_dir, f))]

# Calculate 10% of the total number of files
num_files_to_select = int(len(all_files) * 0.05)

# Randomly select 10% of the files
selected_files = random.sample(all_files, num_files_to_select)

# Copy the selected files to the destination directory
for file in selected_files:
    source_path = os.path.join(source_dir, file)
    destination_path = os.path.join(destination_dir, file)
    shutil.copy2(source_path, destination_path)

print(f"Copied {len(selected_files)} files to {destination_dir}")


In [None]:
import os
import pandas as pd

# Specify the directory containing the CSV files
directory = 'C:\\Users\\ss6365\\Desktop\\Datasets\\Tdrive\\dataset_preprocessed_split_subset'

# List all CSV files in the directory
csv_files = [f for f in os.listdir(directory) if f.endswith('.csv')]

# Initialize an identifier starting from 1
identifier = 1

# Iterate over each CSV file
for file_name in csv_files:
    # Construct the full file path
    file_path = os.path.join(directory, file_name)
    
    # Load the CSV file into a DataFrame
    df = pd.read_csv(file_path)
    
    # Add a new column 'identifier' with all rows having the same identifier
    df['identifier'] = identifier
    
    # Save the modified DataFrame back to the same CSV file
    df.to_csv(file_path, index=False)
    
    # Increment the identifier for the next file
    identifier += 1

print('All files have been processed and updated with an identifier.')


In [None]:
import os
import pandas as pd


# Input directory containing CSV files
input_directory = 'C:\\Users\\ss6365\\Desktop\\Datasets\\Tdrive\\dataset_preprocessed_split_subset'

# Output directory where the merged CSV file will be saved
output_directory = 'C:\\Users\\ss6365\\Desktop\\Datasets\\Tdrive'



# # List of important columns to keep
important_columns = ['taxi_id','identifier', 'date_time', 'longitude', 'latitude']  # Replace with your column names

# # Create a list to store dataframes from individual CSV files
dataframes = []

# # Iterate through CSV files in the input directory
for filename in os.listdir(input_directory):
    if filename.endswith('.csv'):
        file_path = os.path.join(input_directory, filename)
        df = pd.read_csv(file_path, usecols=important_columns)
        dataframes.append(df)

# # Concatenate dataframes vertically (along rows)
merged_df = pd.concat(dataframes, ignore_index=True)

# # Output file path for the merged CSV file
output_file_path = os.path.join(output_directory, 'merged_all_utility_subset.csv')

# # Save the merged dataframe to a CSV file
merged_df.to_csv(output_file_path, index=False)

print(f"Merged CSV file saved to {output_file_path}")

In [None]:
df = pd.read_csv('C:\\Users\\ss6365\\Desktop\\Datasets\\Tdrive\\merged_all_utility_subset.csv')
df

In [2]:
import pandas as pd
import os

# Path to the CSV file
file_path = "C:\\Users\\ss6365\\Desktop\\Datasets\\Tdrive\\merged_all_utility_subset.csv"  # Update this path to your file's location

# Load the dataset
df = pd.read_csv(file_path)

# Convert 'time' column to datetime without specifying the format
df['time'] = pd.to_datetime(df['date_time'], errors='coerce')

# Calculate the minimum and maximum dates in the 'time' column
min_date = df['time'].min()
max_date = df['time'].max()

# Print the results
print(f"The earliest date of data logging is: {min_date}")
print(f"The latest date of data logging is: {max_date}")


The earliest date of data logging is: 2008-02-02 13:30:57
The latest date of data logging is: 2008-02-08 17:39:19


In [None]:
# import pandas as pd
# import folium
# from folium.plugins import HeatMap

# # Load your CSV file
# csv_file_path = 'C:\\Users\\ss6365\\Desktop\\Datasets\\Tdirve\\merged_all_utility_subset.csv'
# df = pd.read_csv(csv_file_path)

# # Assuming your columns are named 'Latitude' and 'Longitude'
# latitude = 'latitude'
# longitude = 'longitude'

# # Create a map centered around the average location
# map_center = [df[latitude].mean(), df[longitude].mean()]
# m = folium.Map(location=map_center, zoom_start=10)

# # Create a HeatMap layer and add it to the map
# heat_data = [[row[latitude], row[longitude]] for index, row in df.iterrows()]
# HeatMap(heat_data).add_to(m)

# # Save the map to an HTML file
# output_html = 'C:\\Users\\ss6365\\Desktop\\Datasets\\Tdirve\\map_tdrive.html'
# m.save(output_html)

# print(f"Heatmap saved to {output_html}")

In [3]:
import os
import pandas as pd
import numpy as np

def haversine(lon1, lat1, lon2, lat2):
    """
    Calculate the great circle distance in kilometers between two points 
    on the earth (specified in decimal degrees).
    """
    # convert decimal degrees to radians 
    lon1, lat1, lon2, lat2 = map(np.radians, [lon1, lat1, lon2, lat2])

    # haversine formula 
    dlon = lon2 - lon1 
    dlat = lat2 - lat1 
    a = np.sin(dlat/2)**2 + np.cos(lat1) * np.cos(lat2) * np.sin(dlon/2)**2
    c = 2 * np.arcsin(np.sqrt(a)) 
    r = 6371 # Radius of earth in kilometers. Use 3956 for miles
    return c * r

def calculate_distance_for_file(csv_file):
    # Load the CSV file
    df = pd.read_csv(csv_file)
    
    # Calculate the distances
    distances = [
        haversine(lon1, lat1, lon2, lat2) 
        for lat1, lon1, lat2, lon2 in zip(df['latitude'][:-1], df['longitude'][:-1], df['latitude'][1:], df['longitude'][1:])
    ]
    
    # Sum the distances
    return sum(distances), len(df)

def calculate_cumulative_distance_and_row_count(directory):
    
    total_distance = 0
    total_row_count = 0
    
    for filename in os.listdir(directory):
        if filename.endswith(".csv"):
            file_path = os.path.join(directory, filename)
            distance, row_count = calculate_distance_for_file(file_path)
            total_distance += distance
            total_row_count += row_count

    average_distance =  (total_distance/ total_row_count)*1000
    return total_distance, average_distance

# Example usage
directory = 'C:\\Users\\ss6365\\Desktop\\location_privacy_final\\tdrive\\data\\utility'  # Replace this with the path to your directory
cumulative_distance, average_distance = calculate_cumulative_distance_and_row_count(directory)
print(f"Cumulative Distance Traversed: {cumulative_distance} km")
print(f"Average Distance: {average_distance} m")


Cumulative Distance Traversed: 4343434.605329861 km
Average Distance: 5367.732079376972 m


In [4]:
import pandas as pd
import os
import numpy as np

# Specify the directory containing your CSV files
directory_path = 'C:\\Users\\ss6365\\Desktop\\Datasets\\Tdrive\\dataset_preprocessed_split_subset'  # Update this path

# Initialize a list to store the results
average_intervals = []

# Sum of all average intervals
total_average_interval = 0

# Count of files processed for the overall average calculation
file_count = 0

# Iterate over each file in the directory
for file_name in os.listdir(directory_path):
    # Check if the file is a CSV
    if file_name.endswith('.csv'):
        # Construct the full file path
        file_path = os.path.join(directory_path, file_name)
        
        # Load the CSV file
        df = pd.read_csv(file_path)
        
        # Convert the 'time' column to datetime
        df['date_time'] = pd.to_datetime(df['date_time'])
        
        # Ensure the data is sorted by 'time'
        df = df.sort_values(by='date_time')
        
        # Calculate differences (intervals) between each timestamp
        df['sampling_interval_seconds'] = df['date_time'].diff().dt.total_seconds()
        
        # Ignore NaN values for the average calculation
        valid_intervals = df['sampling_interval_seconds'].dropna()
        
        if not valid_intervals.empty:
            # Calculate the average sampling interval
            average_sampling_interval = valid_intervals.mean()
            
            # Add the result to the list for individual file averages
            average_intervals.append((file_name, average_sampling_interval))
            
            # Update the total average and file count
            total_average_interval += average_sampling_interval
            file_count += 1
        else:
            # Handle files with insufficient data for interval calculation
            average_intervals.append((file_name, 'Insufficient data for interval calculation'))

# Calculate the overall average sampling interval across all files
overall_average_interval = total_average_interval / file_count if file_count > 0 else 'No valid data found in any file'

# Print the list of average sampling intervals for each file
for file_name, interval in average_intervals:
    print(f"{file_name}: {interval} seconds")

# Print the overall average sampling interval
print(f"\nOverall average sampling interval across all files: {overall_average_interval} seconds")

10001_7.csv: 232.33333333333334 seconds
10005_11.csv: 239.5084745762712 seconds
10005_15.csv: 148.6 seconds
10005_18.csv: 156.83333333333334 seconds
10008_5.csv: 175.93413173652695 seconds
1000_24.csv: 178.1 seconds
10017_11.csv: Insufficient data for interval calculation seconds
10017_9.csv: 298.9655172413793 seconds
10018_1.csv: 305.51950718685833 seconds
10019_5.csv: 416.42857142857144 seconds
10023_0.csv: 157.67796610169492 seconds
10023_14.csv: 141.66666666666666 seconds
1002_18.csv: 177.40816326530611 seconds
10030_13.csv: 101.70786516853933 seconds
10030_9.csv: 415.4 seconds
10032_25.csv: 204.6021505376344 seconds
10036_11.csv: 318.8809523809524 seconds
10038_20.csv: 301.8 seconds
10039_8.csv: 204.0 seconds
1003_10.csv: 176.0099502487562 seconds
10041_15.csv: 188.5 seconds
10045_4.csv: 159.92424242424244 seconds
10053_5.csv: Insufficient data for interval calculation seconds
10055_11.csv: 240.75 seconds
1005_12.csv: 165.66666666666666 seconds
1005_19.csv: 171.58510638297872 seco

In [None]:
########### Data Security tdrive

In [11]:
import os
import pandas as pd


# Input directory containing CSV files
input_directory = 'C:\\Users\\ss6365\\Desktop\\location_privacy_final\\tdrive\\data\\utility'

# Output directory where the merged CSV file will be saved
output_directory = 'C:\\Users\\ss6365\\Desktop\\location_privacy_final\\tdrive\\data'



# # List of important columns to keep
important_columns = ['identifier', 'longitude', 'latitude']  # Replace with your column names

# # Create a list to store dataframes from individual CSV files
dataframes = []

# # Iterate through CSV files in the input directory
for filename in os.listdir(input_directory):
    if filename.endswith('.csv'):
        file_path = os.path.join(input_directory, filename)
        df = pd.read_csv(file_path, usecols=important_columns)
        dataframes.append(df)

# # Concatenate dataframes vertically (along rows)
merged_df = pd.concat(dataframes, ignore_index=True)

# # Output file path for the merged CSV file
output_file_path = os.path.join(output_directory, 'merged_all_utility_subset.csv')

# # Save the merged dataframe to a CSV file
merged_df.to_csv(output_file_path, index=False)

print(f"Merged CSV file saved to {output_file_path}")

Merged CSV file saved to C:\Users\ss6365\Desktop\location_privacy_final\tdrive\data\merged_all_utility_subset.csv


In [12]:
import numpy as np
import pandas as pd

df = pd.read_csv(r'C:\Users\ss6365\Desktop\location_privacy_final\tdrive\data\merged_all_utility_subset.csv')
central_lat = df['latitude'].median()
central_lon = df['longitude'].median()


In [13]:
central_lat

39.92308

In [14]:
central_lon

116.41439

In [15]:
import numpy as np


df = pd.read_csv(r'C:\Users\ss6365\Desktop\location_privacy_final\tdrive\data\merged_all_utility_subset.csv')

def haversine(lat1, lon1, lat2, lon2):
    # Radius of the Earth in km
    R = 6371.0
    # Convert coordinates from degrees to radians
    lat1, lon1, lat2, lon2 = map(np.radians, [lat1, lon1, lat2, lon2])
    # Haversine formula
    dlat = lat2 - lat1
    dlon = lon2 - lon1
    a = np.sin(dlat/2)**2 + np.cos(lat1) * np.cos(lat2) * np.sin(dlon/2)**2
    c = 2 * np.arctan2(np.sqrt(a), np.sqrt(1 - a))
    distance = R * c
    return distance

def find_square_boundaries(lat, lon, distance_km):
    # Approximate conversions
    delta_lat = distance_km / 111  # 111 km per degree of latitude
    delta_lon = distance_km / (111 * np.cos(np.radians(lat)))  # Adjust for longitude
    return lat - delta_lat, lat + delta_lat, lon - delta_lon, lon + delta_lon


# Calculate the median (or mean) latitude and longitude
central_lat = df['latitude'].median()
central_lon = df['longitude'].median()


# Define the square region boundaries
lat_min, lat_max, lon_min, lon_max = find_square_boundaries(central_lat, central_lon, 3)

# Filter the DataFrame for points within the 1 km square
df_limit = df[(df['latitude'] >= lat_min) & (df['latitude'] <= lat_max) &
               (df['longitude'] >= lon_min) & (df['longitude'] <= lon_max)]

df_limit

Unnamed: 0,longitude,latitude,identifier
39,116.39714,39.94757,1
83,116.40070,39.90440,2
84,116.40255,39.91160,2
85,116.40261,39.91164,2
86,116.40044,39.90342,2
...,...,...,...
809077,116.42419,39.93530,6947
809078,116.43078,39.93959,6947
809079,116.43711,39.93974,6947
809102,116.41907,39.94891,6947


In [16]:
Horizontal_check = haversine(df_limit['latitude'].min(), df_limit['longitude'].min(), df_limit['latitude'].min(), df_limit['longitude'].max())
Horizontal_check

veritcal_check = haversine(df_limit['latitude'].min(), df_limit['longitude'].min(), df_limit['latitude'].max(), df_limit['longitude'].min())
veritcal_check

6.008973835871767

In [17]:
import glob
import os
import pandas as pd

# Input and output directories
input_directory = r'C:\Users\ss6365\Desktop\location_privacy_final\tdrive\data\utility'
output_directory = r'C:\Users\ss6365\Desktop\location_privacy_final\tdrive\data\security'


# Calculate the boundaries based on the current file
lat_min = df_limit['latitude'].min()
lat_max = df_limit['latitude'].max()
lon_min = df_limit['longitude'].min()
lon_max = df_limit['longitude'].max()


# Iterate through CSV files in the input directory
for csv_file in glob.glob(os.path.join(input_directory, '*.csv')):
    # Load the CSV file
    df = pd.read_csv(csv_file)



    # Distance parameter (can be adjusted as needed)
    distance_km = 3
    # Define the square region boundaries and filter the DataFrame
    df_square = df[(df['latitude'] >= lat_min) & (df['latitude'] <= lat_max) &
                   (df['longitude'] >= lon_min) & (df['longitude'] <= lon_max)]

    # Check if the filtered DataFrame is empty (no data within boundaries)
    if df_square.empty:
        continue  # Skip saving if no data matches the criteria

    # Extract the base filename without extension
    base_filename = os.path.splitext(os.path.basename(csv_file))[0]
    
    # Create the new filename with distance_km
    new_filename = f"{base_filename}_{distance_km}km.csv"
    
    # Save the filtered DataFrame to the output directory with the new filename
    output_path = os.path.join(output_directory, new_filename)
    df_square.to_csv(output_path, index=False)

print("Processing complete.")

Processing complete.


In [18]:
import os
import pandas as pd


# Input directory containing CSV files
input_directory = 'C:\\Users\\ss6365\\Desktop\\location_privacy_final\\tdrive\\data\\security'

# Output directory where the merged CSV file will be saved
output_directory = 'C:\\Users\\ss6365\\Desktop\\location_privacy_final\\tdrive\\data'



# # List of important columns to keep
important_columns = ['identifier', 'longitude', 'latitude']  # Replace with your column names

# # Create a list to store dataframes from individual CSV files
dataframes = []

# # Iterate through CSV files in the input directory
for filename in os.listdir(input_directory):
    if filename.endswith('.csv'):
        file_path = os.path.join(input_directory, filename)
        df = pd.read_csv(file_path, usecols=important_columns)
        dataframes.append(df)

# # Concatenate dataframes vertically (along rows)
merged_df = pd.concat(dataframes, ignore_index=True)

# # Output file path for the merged CSV file
output_file_path = os.path.join(output_directory, 'merged_all_security_subset_3km.csv')

# # Save the merged dataframe to a CSV file
merged_df.to_csv(output_file_path, index=False)

print(f"Merged CSV file saved to {output_file_path}")

Merged CSV file saved to C:\Users\ss6365\Desktop\location_privacy_final\tdrive\data\merged_all_security_subset_3km.csv
