In [None]:
import os
import shutil

# Source tree to scan and flat destination folder (edit for your machine)
source_directory_path = 'C:\\Users\\ss6365\\Desktop\\Datasets\\Collected\\dataset_location-main\\dataset_location-main'
destination_directory_path = 'C:\\Users\\ss6365\\Desktop\\Datasets\\Collected\\dataset_preprocessed'

# Create the destination folder up front; a no-op when it already exists
os.makedirs(destination_directory_path, exist_ok=True)

# Flatten every CSV found under the source tree into the destination folder,
# prefixing each file name with the name of the folder it was found in.
# NOTE(review): two source folders sharing a basename and a file name map to
# the same destination path, so the later copy silently overwrites the earlier.
for current_dir, child_dirs, file_names in os.walk(source_directory_path):
    # A path ending in 'Speed_data' necessarily contains 'Speed_data', so the
    # suffix test alone decides whether to stop descending below this folder.
    if current_dir.strip('\\').endswith('Speed_data'):
        del child_dirs[:]  # in-place clear so os.walk skips the subdirectories

    folder_label = os.path.basename(os.path.normpath(current_dir))
    for csv_name in (name for name in file_names if name.endswith(".csv")):
        shutil.copy(
            os.path.join(current_dir, csv_name),
            os.path.join(destination_directory_path, f"{folder_label}_{csv_name}"),
        )

print("CSV files have been copied and renamed successfully.")


In [None]:
import os
import pandas as pd

# Folder holding the flattened CSV files (edit for your machine)
directory_path = 'C:\\Users\\ss6365\\Desktop\\Datasets\\Collected\\dataset_preprocessed'

# Remove every CSV whose column count differs from the 17 expected columns
for file_name in os.listdir(directory_path):
    if not file_name.endswith(".csv"):
        continue

    file_path = os.path.join(directory_path, file_name)
    try:
        df = pd.read_csv(file_path)
        if len(df.columns) == 17:
            continue  # expected layout, keep the file
        os.remove(file_path)
        print(f"Deleted: {file_name}")
    except Exception as e:
        # Unreadable files are reported and left in place
        print(f"Error processing {file_name}: {e}")

print("Processing complete.")


In [None]:
import os
import pandas as pd

# Replace this with the path to your directory
directory_path = 'C:\\Users\\ss6365\\Desktop\\Datasets\\Collected\\dataset_preprocessed'

# Header names written to every file, in column order (17 names)
column_names = ['deviceID', 'date', 'longitude', 'latitude', 'altitude', 'accelerometer x'
                , 'accelerometer y', 'accelerometer z', 'userAccelerometer x', 'userAccelerometer y'
                , 'userAccelerometer z', 'gyroscope x', 'gyroscope y', 'gyroscope z', 'magnetometer x', 'magnetometer y', 'magnetometer z'
]

# Rewrite each header-less CSV in place with the header row above
for file_name in os.listdir(directory_path):
    if file_name.endswith(".csv"):
        file_path = os.path.join(directory_path, file_name)
        try:
            # Read the CSV file without header, so the first line is data
            df = pd.read_csv(file_path, header=None)

            # Re-run guard: if the first row already is the header written by
            # a previous run of this cell, rewriting would keep that row as
            # data and add a second header line. Skip such files.
            if len(df) > 0 and df.iloc[0].astype(str).tolist() == column_names:
                print(f"Skipped (header already present): {file_name}")
                continue

            # Assign the column names (raises if the file does not have
            # exactly len(column_names) columns; reported below)
            df.columns = column_names
            # Save the file back, overwriting the original
            df.to_csv(file_path, index=False)
            print(f"Processed: {file_name}")
        except Exception as e:
            print(f"Error processing {file_name}: {e}")

print("Processing complete.")


In [None]:
import os
import pandas as pd

# Folder scanned by the validity report in this cell (edit for your machine)
directory_path = 'C:\\Users\\ss6365\\Desktop\\Datasets\\Collected\\dataset_preprocessed'

# Column names looked up in each CSV for the latitude / longitude range checks
latitude_column_name = 'latitude'
longitude_column_name = 'longitude'

# Latitude range check
def is_valid_latitude(lat):
    """Return True when ``lat`` lies in the closed range [-90, 90].

    NaN fails both comparisons and is therefore reported as not valid.
    """
    return lat >= -90 and lat <= 90

# Longitude range check
def is_valid_longitude(lon):
    """Return True when ``lon`` lies in the closed range [-180, 180].

    NaN fails both comparisons and is therefore reported as not valid.
    """
    return lon >= -180 and lon <= 180

# Report, file by file, rows whose latitude or longitude is out of range
for file_name in os.listdir(directory_path):
    if not file_name.endswith(".csv"):
        continue

    file_path = os.path.join(directory_path, file_name)
    try:
        df = pd.read_csv(file_path)

        # Both coordinate columns must be present to run the checks
        if not (latitude_column_name in df.columns and longitude_column_name in df.columns):
            print(f"File '{file_name}' does not contain the specified latitude and longitude columns.")
            continue

        # Rows failing each range check
        latitude_ok = df[latitude_column_name].apply(is_valid_latitude)
        invalid_latitudes = df[~latitude_ok]
        longitude_ok = df[longitude_column_name].apply(is_valid_longitude)
        invalid_longitudes = df[~longitude_ok]

        has_bad_latitude = not invalid_latitudes.empty
        has_bad_longitude = not invalid_longitudes.empty

        if not (has_bad_latitude or has_bad_longitude):
            print(f"File '{file_name}' has all valid latitude and longitude values.")
            continue

        # Report findings
        print(f"File '{file_name}' contains invalid latitude or longitude values:")
        if has_bad_latitude:
            print(f"Invalid latitudes:\n{invalid_latitudes[[latitude_column_name]]}")
        if has_bad_longitude:
            print(f"Invalid longitudes:\n{invalid_longitudes[[longitude_column_name]]}")
    except Exception as e:
        print(f"Error processing {file_name}: {e}")

print("Processing complete.")


In [None]:
import os
import pandas as pd

# Folder whose CSVs are checked (and possibly deleted) by this cell (edit for your machine)
directory_path = 'C:\\Users\\ss6365\\Desktop\\Datasets\\Collected\\dataset_preprocessed'

# Column names looked up in each CSV for the latitude / longitude range checks
latitude_column_name = 'latitude'
longitude_column_name = 'longitude'

# Latitude range check
def is_valid_latitude(lat):
    """Return True when ``lat`` lies in the closed range [-90, 90].

    NaN fails both comparisons and is therefore reported as not valid.
    """
    return lat >= -90 and lat <= 90

# Longitude range check
def is_valid_longitude(lon):
    """Return True when ``lon`` lies in the closed range [-180, 180].

    NaN fails both comparisons and is therefore reported as not valid.
    """
    return lon >= -180 and lon <= 180

# Delete every CSV that has at least one out-of-range latitude or longitude
for file_name in os.listdir(directory_path):
    if not file_name.endswith(".csv"):
        continue

    file_path = os.path.join(directory_path, file_name)
    try:
        df = pd.read_csv(file_path)

        # Files lacking either coordinate column are reported and kept
        if not (latitude_column_name in df.columns and longitude_column_name in df.columns):
            print(f"File '{file_name}' does not contain the specified latitude and longitude columns.")
            continue

        # Rows failing each range check
        latitude_ok = df[latitude_column_name].apply(is_valid_latitude)
        invalid_latitudes = df[~latitude_ok]
        longitude_ok = df[longitude_column_name].apply(is_valid_longitude)
        invalid_longitudes = df[~longitude_ok]

        # One offending row in either column is enough to drop the file
        if not (invalid_latitudes.empty and invalid_longitudes.empty):
            os.remove(file_path)
            print(f"Deleted '{file_name}' due to invalid latitude or longitude values.")
    except Exception as e:
        print(f"Error processing {file_name}: {e}")

print("Processing complete. Invalid files have been deleted.")


In [None]:
import os
import pandas as pd

# Folder whose CSVs are rewritten in place (edit for your machine)
directory_path = 'C:\\Users\\ss6365\\Desktop\\Datasets\\Collected\\dataset_preprocessed'

# Columns retained in every file
important_columns = ['date', 'longitude', 'latitude']

# Trim each CSV down to the important columns and overwrite it
for file_name in os.listdir(directory_path):
    if not file_name.endswith(".csv"):
        continue

    file_path = os.path.join(directory_path, file_name)
    try:
        df = pd.read_csv(file_path)

        # If any important column is absent the frame is written back untrimmed
        absent_columns = [col for col in important_columns if col not in df]
        if absent_columns:
            df_filtered = df
        else:
            df_filtered = df[important_columns]

        df_filtered.to_csv(file_path, index=False)
        print(f"Processed and saved: {file_name}")
    except Exception as e:
        print(f"Error processing {file_name}: {e}")

print("Processing complete. Files have been overwritten with only important columns.")


In [None]:
import os
import pandas as pd

# Folder scanned by the NaN/zero report in this cell (edit for your machine)
directory_path = 'C:\\Users\\ss6365\\Desktop\\Datasets\\Collected\\dataset_preprocessed'

# Column names looked up in each CSV for the NaN / zero checks
latitude_column_name = 'latitude'
longitude_column_name = 'longitude'

# NaN / zero detector for a single column
def has_invalid_values(series):
    """Return a truthy value when ``series`` holds any null or any value equal to 0."""
    contains_null = series.isnull().any()
    if contains_null:
        return contains_null
    # The zero comparison is only evaluated when no null was found
    return (series == 0).any()

# Report, file by file, whether the coordinate columns hold NaN or zero values
for file_name in os.listdir(directory_path):
    if not file_name.endswith(".csv"):
        continue

    file_path = os.path.join(directory_path, file_name)
    try:
        df = pd.read_csv(file_path)

        # Both coordinate columns must be present to run the checks
        if not (latitude_column_name in df and longitude_column_name in df):
            print(f"File '{file_name}' does not contain the specified latitude and longitude columns.")
            continue

        # Evaluate both columns before printing anything
        invalid_latitudes = has_invalid_values(df[latitude_column_name])
        invalid_longitudes = has_invalid_values(df[longitude_column_name])

        findings = []
        if invalid_latitudes:
            findings.append(" - Invalid values in latitude column")
        if invalid_longitudes:
            findings.append(" - Invalid values in longitude column")

        if findings:
            print(f"File '{file_name}' contains NaN or zero values:")
            for finding in findings:
                print(finding)
        else:
            print(f"File '{file_name}' has valid latitude and longitude values.")
    except Exception as e:
        print(f"Error processing {file_name}: {e}")

print("Processing complete.")


In [None]:
import os
import pandas as pd

# Folder whose CSVs are checked (and possibly deleted) by this cell (edit for your machine)
directory_path = 'C:\\Users\\ss6365\\Desktop\\Datasets\\Collected\\dataset_preprocessed'

# Column names looked up in each CSV for the NaN / zero checks
latitude_column_name = 'latitude'
longitude_column_name = 'longitude'

# NaN / zero detector for a single column
def has_invalid_values(series):
    """Return a truthy value when ``series`` holds any null or any value equal to 0."""
    contains_null = series.isnull().any()
    if contains_null:
        return contains_null
    # The zero comparison is only evaluated when no null was found
    return (series == 0).any()

# Delete every CSV whose latitude or longitude column holds a NaN or a zero
for file_name in os.listdir(directory_path):
    if not file_name.endswith(".csv"):
        continue

    file_path = os.path.join(directory_path, file_name)
    try:
        df = pd.read_csv(file_path)

        # Files lacking either coordinate column are reported and kept
        if not (latitude_column_name in df and longitude_column_name in df):
            print(f"File '{file_name}' does not contain the specified latitude and longitude columns.")
            continue

        # Both columns are always evaluated, latitude first
        invalid_latitudes = has_invalid_values(df[latitude_column_name])
        invalid_longitudes = has_invalid_values(df[longitude_column_name])

        if not (invalid_latitudes or invalid_longitudes):
            continue

        os.remove(file_path)
        print(f"Deleted '{file_name}' due to invalid latitude or longitude values.")
    except Exception as e:
        print(f"Error processing {file_name}: {e}")

print("Processing complete. Invalid files have been deleted.")


In [None]:
# import os
# import pandas as pd

# # Replace this with the path to your directory
# directory_path = 'C:\\Users\\ss6365\\Desktop\\Datasets\\Collected\\test'

# # Names of the latitude and longitude columns
# latitude_column_name = 'latitude'
# longitude_column_name = 'longitude'

# # Function to remove consecutive duplicates
# def remove_consecutive_duplicates(df, column_name):
#     # Identify rows where the value in the specified column is equal to that in the previous row
#     duplicates = df[column_name].eq(df[column_name].shift())
#     # Keep rows where there is no such duplication
#     return df[~duplicates]

# # Iterate over each file in the directory
# for file_name in os.listdir(directory_path):
#     if file_name.endswith(".csv"):
#         file_path = os.path.join(directory_path, file_name)
#         try:
#             df = pd.read_csv(file_path)
            
#             # Ensure both specified columns exist
#             if latitude_column_name in df.columns and longitude_column_name in df.columns:
#                 # Remove consecutive duplicates for latitude and longitude
#                 df = remove_consecutive_duplicates(df, latitude_column_name)
#                 df = remove_consecutive_duplicates(df, longitude_column_name)
                
#                 # Save the modified DataFrame back to the CSV, overwriting the original
#                 df.to_csv(file_path, index=False)
#                 print(f"Processed and updated: {file_name}")
#             else:
#                 print(f"File '{file_name}' does not contain the specified latitude and longitude columns.")
#         except Exception as e:
#             print(f"Error processing {file_name}: {e}")

# print("Processing complete. Consecutive duplicate rows have been removed.")


In [None]:
import os
import pandas as pd
from dateutil.parser import parse

def infer_date_format(date_str):
    """Parse ``date_str`` leniently and return it as 'YYYY-MM-DD HH:MM:SS'.

    Despite the name, the result is the re-formatted date itself, not a
    format pattern.

    Parameters:
        date_str: Text to parse; handed to ``dateutil.parser.parse`` with
            ``fuzzy=True``.

    Returns:
        The normalised timestamp string, or ``None`` when the value cannot be
        parsed.
    """
    try:
        parsed_date = parse(date_str, fuzzy=True)
        return parsed_date.strftime('%Y-%m-%d %H:%M:%S')
    except (ValueError, OverflowError, TypeError):
        # ValueError: unrecognised date text.
        # OverflowError: parsed number too large for the platform.
        # TypeError: a non-string value (e.g. a numeric cell) was passed.
        # All three mean "no usable date", so they share the None result
        # instead of escaping to the caller.
        return None

# Folder holding the CSV files to inspect (edit for your machine)
directory_path = 'C:\\Users\\ss6365\\Desktop\\Datasets\\Collected\\dataset_preprocessed_remove_duplicate_rows'

# file name -> normalised first date, or a message explaining why none was found
date_formats = {}

for file_name in os.listdir(directory_path):
    if not file_name.endswith('.csv'):
        continue

    file_path = os.path.join(directory_path, file_name)
    try:
        df = pd.read_csv(file_path)

        # 'date' is the assumed column name; adjust as necessary
        if 'date' not in df.columns:
            date_formats[file_name] = 'Date column not found'
            continue

        # The first non-null entry stands in for the whole column
        first_date_str = df['date'].dropna().iloc[0]
        date_formats[file_name] = infer_date_format(first_date_str)
    except Exception as e:
        # Any failure while loading or sampling the file is recorded here
        date_formats[file_name] = f'Error reading file: {e}'

# One summary line per file
for file_name, date_format in date_formats.items():
    print(f"{file_name}: {date_format}")


In [5]:
import pandas as pd
import os
import numpy as np
from datetime import timedelta

def correct_24_hour_time(date_str):
    """Convert ``date_str`` to a datetime, rolling an hour of "24" to the next day.

    The hour is read from the two characters that start eight positions from
    the end of the string. When they are "24" the hour is rewritten to "00"
    and one day is added to the parsed result; otherwise the string is parsed
    as is.
    """
    hour_field = date_str[-8:-6]
    if hour_field != "24":
        # Ordinary timestamp: parse unchanged
        return pd.to_datetime(date_str)

    # Swap the "24" for "00", parse, then move forward one day
    midnight_str = date_str[:-8] + "00" + date_str[-6:]
    return pd.to_datetime(midnight_str) + timedelta(days=1)

# Folder holding the CSV files to analyse (edit for your machine)
directory_path = 'C:\\Users\\ss6365\\Desktop\\Datasets\\Collected\\dataset_preprocessed_remove_duplicate_rows_without_flights'

# (file name, mean interval or explanatory message) for every file with a 'date' column
average_intervals = []

# Running sum of the per-file means and the number of files contributing to it
total_average_interval = 0
file_count = 0

for file_name in os.listdir(directory_path):
    if not file_name.endswith('.csv'):
        continue

    file_path = os.path.join(directory_path, file_name)
    df = pd.read_csv(file_path)

    # Files without a 'date' column are skipped silently
    if 'date' not in df.columns:
        continue

    # Parse timestamps (handling the "24:MM:SS" case) and order them
    df['date'] = df['date'].apply(correct_24_hour_time)
    df = df.sort_values(by='date')

    # Seconds between consecutive timestamps; the first row has no predecessor
    df['sampling_interval_seconds'] = df['date'].diff().dt.total_seconds()
    valid_intervals = df['sampling_interval_seconds'].dropna()

    if valid_intervals.empty:
        average_intervals.append((file_name, 'Insufficient data for interval calculation'))
        continue

    average_sampling_interval = valid_intervals.mean()
    average_intervals.append((file_name, average_sampling_interval))
    total_average_interval += average_sampling_interval
    file_count += 1

# Mean of the per-file means (each file weighs the same regardless of length)
if file_count > 0:
    overall_average_interval = total_average_interval / file_count
else:
    overall_average_interval = 'No valid data found in any file'

for file_name, interval in average_intervals:
    print(f"{file_name}: {interval} seconds")

print(f"\nOverall average sampling interval across all files: {overall_average_interval} seconds")


Hu_Samsung_S22_csv-1706551395849.csv: 1.2313019390581716 seconds
Hu_Samsung_S22_csv-1706554333642.csv: 1.8215527230590962 seconds
Hu_Samsung_S22_csv-1706556488903.csv: 2.0304373522458627 seconds
Hu_Samsung_S22_csv-1706629866070.csv: 14.495664739884393 seconds
Hu_Samsung_S22_csv-1706645959366.csv: 15.088645262333594 seconds
Hu_Samsung_S22_csv-1706648412755.csv: 13.77317719224871 seconds
Hu_Samsung_S22_csv-1706719083524.csv: 19.207185764479203 seconds
Hu_Samsung_S22_csv-1707167485257.csv: 0.7369337979094077 seconds
Hu_Samsung_S22_csv-1707168946553.csv: 1.4846335697399526 seconds
Hu_Samsung_S22_csv-1707169047177.csv: 1.3952180028129395 seconds
Hu_Samsung_S22_csv-1707169378518.csv: 4.142857142857143 seconds
Hu_Samsung_S22_csv-1707169583100.csv: 1.7659425367904695 seconds
Hu_Samsung_S22_csv-1707171684371.csv: 43.113207547169814 seconds
Hu_Samsung_S22_csv-1707183508643.csv: 0.8452173913043478 seconds
Hu_Samsung_S22_csv-1707228280586.csv: 54.99149453219927 seconds
Hu_Samsung_S22_csv-170722923

In [11]:
import pandas as pd
import os
import numpy as np
from datetime import timedelta

def correct_24_hour_time(date_str):
    """Convert ``date_str`` to a datetime, rolling an hour of "24" to the next day.

    The hour is read from the two characters that start eight positions from
    the end of the string. When they are "24" the hour is rewritten to "00"
    and one day is added to the parsed result; otherwise the string is parsed
    as is.
    """
    hour_field = date_str[-8:-6]
    if hour_field != "24":
        # Ordinary timestamp: parse unchanged
        return pd.to_datetime(date_str)

    # Swap the "24" for "00", parse, then move forward one day
    midnight_str = date_str[:-8] + "00" + date_str[-6:]
    return pd.to_datetime(midnight_str) + timedelta(days=1)

# Folder holding the CSV files to analyse (edit for your machine)
directory_path = 'C:\\Users\\ss6365\\Desktop\\Datasets\\Collected\\dataset_preprocessed_remove_duplicate_rows'

# (file name, mean interval or explanatory message) for every file with a 'date' column
average_intervals = []

# Running sum of the per-file means and the number of files contributing to it
total_average_interval = 0
file_count = 0

for file_name in os.listdir(directory_path):
    if not file_name.endswith('.csv'):
        continue

    file_path = os.path.join(directory_path, file_name)
    df = pd.read_csv(file_path)

    # Files without a 'date' column are skipped silently
    if 'date' not in df.columns:
        continue

    # Parse timestamps (handling the "24:MM:SS" case) and order them
    df['date'] = df['date'].apply(correct_24_hour_time)
    df = df.sort_values(by='date')

    # Seconds between consecutive timestamps; the first row has no predecessor
    df['sampling_interval_seconds'] = df['date'].diff().dt.total_seconds()
    valid_intervals = df['sampling_interval_seconds'].dropna()

    if valid_intervals.empty:
        average_intervals.append((file_name, 'Insufficient data for interval calculation'))
        continue

    average_sampling_interval = valid_intervals.mean()
    average_intervals.append((file_name, average_sampling_interval))
    total_average_interval += average_sampling_interval
    file_count += 1

# Mean of the per-file means (each file weighs the same regardless of length)
if file_count > 0:
    overall_average_interval = total_average_interval / file_count
else:
    overall_average_interval = 'No valid data found in any file'

for file_name, interval in average_intervals:
    print(f"{file_name}: {interval} seconds")

print(f"\nOverall average sampling interval across all files: {overall_average_interval} seconds")


Hu_Samsung_S22_csv-1706551395849.csv: 1.2313019390581716 seconds
Hu_Samsung_S22_csv-1706554333642.csv: 1.8215527230590962 seconds
Hu_Samsung_S22_csv-1706556488903.csv: 2.0304373522458627 seconds
Hu_Samsung_S22_csv-1706629866070.csv: 14.495664739884393 seconds
Hu_Samsung_S22_csv-1706645959366.csv: 15.088645262333594 seconds
Hu_Samsung_S22_csv-1706648412755.csv: 13.77317719224871 seconds
Hu_Samsung_S22_csv-1706719083524.csv: 19.207185764479203 seconds
Hu_Samsung_S22_csv-1707167485257.csv: 0.7369337979094077 seconds
Hu_Samsung_S22_csv-1707168946553.csv: 1.4846335697399526 seconds
Hu_Samsung_S22_csv-1707169047177.csv: 1.3952180028129395 seconds
Hu_Samsung_S22_csv-1707169378518.csv: 4.142857142857143 seconds
Hu_Samsung_S22_csv-1707169583100.csv: 1.7659425367904695 seconds
Hu_Samsung_S22_csv-1707171684371.csv: 43.113207547169814 seconds
Hu_Samsung_S22_csv-1707183508643.csv: 0.8452173913043478 seconds
Hu_Samsung_S22_csv-1707228280586.csv: 54.99149453219927 seconds
Hu_Samsung_S22_csv-170722923

In [10]:
import os
import pandas as pd
import numpy as np

def haversine(lon1, lat1, lon2, lat2):
    """
    Calculate the great circle distance in kilometers between two points
    on the earth (specified in decimal degrees).

    Note the argument order: longitude comes before latitude for each point.
    Scalars and NumPy arrays are both accepted, since only NumPy ufuncs are used.
    """
    # convert decimal degrees to radians
    lon1, lat1, lon2, lat2 = map(np.radians, [lon1, lat1, lon2, lat2])

    # haversine formula
    dlon = lon2 - lon1
    dlat = lat2 - lat1
    a = np.sin(dlat/2)**2 + np.cos(lat1) * np.cos(lat2) * np.sin(dlon/2)**2
    # Floating-point rounding can push `a` marginally outside [0, 1] (e.g. for
    # near-antipodal points), which would make sqrt/arcsin return NaN.
    a = np.clip(a, 0.0, 1.0)
    c = 2 * np.arcsin(np.sqrt(a))
    r = 6371 # Radius of earth in kilometers. Use 3956 for miles
    return c * r

def calculate_distance_for_file(csv_file):
    """Return ``(path_length_km, row_count)`` for one CSV file.

    The path length is the sum of haversine distances between each row and the
    row after it, using the 'latitude' and 'longitude' columns in file order.
    """
    df = pd.read_csv(csv_file)
    latitudes = df['latitude']
    longitudes = df['longitude']

    # Pair every row with its successor and accumulate the step distances
    path_length = 0
    consecutive_points = zip(latitudes[:-1], longitudes[:-1], latitudes[1:], longitudes[1:])
    for lat1, lon1, lat2, lon2 in consecutive_points:
        path_length += haversine(lon1, lat1, lon2, lat2)

    return path_length, len(df)

def calculate_cumulative_distance_and_row_count(directory):
    """Sum the travelled distance over every CSV file in ``directory``.

    Parameters:
        directory: Folder to scan; only names ending in ".csv" are read.

    Returns:
        ``(total_distance, average_distance)`` where ``total_distance`` is in
        kilometers and ``average_distance`` is the total divided by the total
        number of rows, converted to meters. When no rows were read the
        average is ``0.0`` instead of raising ``ZeroDivisionError``.

    NOTE(review): the average divides by the row count, not by the number of
    consecutive-point segments (one fewer per file) -- confirm that is intended.
    """
    total_distance = 0
    total_row_count = 0

    for filename in os.listdir(directory):
        if filename.endswith(".csv"):
            file_path = os.path.join(directory, filename)
            distance, row_count = calculate_distance_for_file(file_path)
            total_distance += distance
            total_row_count += row_count

    # No CSV files (or only empty ones): nothing to average over
    if total_row_count == 0:
        return total_distance, 0.0

    average_distance = (total_distance / total_row_count) * 1000  # km -> m
    return total_distance, average_distance

# Run the distance summary over the de-duplicated dataset (edit the path for your machine)
directory = 'C:\\Users\\ss6365\\Desktop\\Datasets\\Collected\\dataset_preprocessed_remove_duplicate_rows'  # Replace this with the path to your directory
distance_summary = calculate_cumulative_distance_and_row_count(directory)
cumulative_distance, average_distance = distance_summary
print(f"Cumulative Distance Traversed: {cumulative_distance} km")
print(f"Average Distance: {average_distance} m")


Cumulative Distance Traversed: 8473.671461837745 km
Average Distance: 66.48728471092323 m


In [None]:
import os
import pandas as pd
import numpy as np

def haversine(lon1, lat1, lon2, lat2):
    """
    Calculate the great circle distance in kilometers between two points
    on the earth (specified in decimal degrees).

    Note the argument order: longitude comes before latitude for each point.
    Scalars and NumPy arrays are both accepted, since only NumPy ufuncs are used.
    """
    # convert decimal degrees to radians
    lon1, lat1, lon2, lat2 = map(np.radians, [lon1, lat1, lon2, lat2])

    # haversine formula
    dlon = lon2 - lon1
    dlat = lat2 - lat1
    a = np.sin(dlat/2)**2 + np.cos(lat1) * np.cos(lat2) * np.sin(dlon/2)**2
    # Floating-point rounding can push `a` marginally outside [0, 1] (e.g. for
    # near-antipodal points), which would make sqrt/arcsin return NaN.
    a = np.clip(a, 0.0, 1.0)
    c = 2 * np.arcsin(np.sqrt(a))
    r = 6371 # Radius of earth in kilometers. Use 3956 for miles
    return c * r

def calculate_distance_for_file(csv_file):
    """Return the path length in kilometers traced by the rows of one CSV file.

    The length is the sum of haversine distances between each row and the row
    after it, using the 'latitude' and 'longitude' columns in file order.
    """
    df = pd.read_csv(csv_file)
    latitudes = df['latitude']
    longitudes = df['longitude']

    # Pair every row with its successor and accumulate the step distances
    path_length = 0
    consecutive_points = zip(latitudes[:-1], longitudes[:-1], latitudes[1:], longitudes[1:])
    for lat1, lon1, lat2, lon2 in consecutive_points:
        path_length += haversine(lon1, lat1, lon2, lat2)

    return path_length

def calculate_cumulative_distance(directory):
    """Return the summed path length of every ".csv" file found in ``directory``."""
    csv_paths = (
        os.path.join(directory, filename)
        for filename in os.listdir(directory)
        if filename.endswith(".csv")
    )
    # sum() starts from 0 and adds in listing order, like a running total
    return sum(calculate_distance_for_file(path) for path in csv_paths)

# Run the cumulative distance over the de-duplicated dataset (edit the path for your machine)
directory = 'C:\\Users\\ss6365\\Desktop\\Datasets\\Collected\\dataset_preprocessed_remove_duplicate_rows'  # Replace this with the path to your directory
cumulative_distance = calculate_cumulative_distance(directory)
distance_report = f"Cumulative Distance Traversed: {cumulative_distance} km"
print(distance_report)

In [None]:
import os
import pandas as pd

# Specify the directory containing the CSV files
directory = 'C:\\Users\\ss6365\\Desktop\\Datasets\\Collected\\dataset_preprocessed'

# List all CSV files in the directory. os.listdir returns names in arbitrary
# order, so sort them: otherwise the identifier each file receives could
# differ between runs or machines.
csv_files = sorted(f for f in os.listdir(directory) if f.endswith('.csv'))

# Initialize an identifier starting from 1
identifier = 1

# Iterate over each CSV file
for file_name in csv_files:
    # Construct the full file path
    file_path = os.path.join(directory, file_name)

    # Load the CSV file into a DataFrame
    df = pd.read_csv(file_path)

    # Add (or overwrite) the 'identifier' column; every row of one file
    # carries the same value
    df['identifier'] = identifier

    # Save the modified DataFrame back to the same CSV file
    df.to_csv(file_path, index=False)

    # Increment the identifier for the next file
    identifier += 1

print('All files have been processed and updated with an identifier.')


In [None]:
################# Data Security ####################

In [12]:
import os
import pandas as pd


# Folder with the per-trace CSV files to merge
input_directory = 'C:\\Users\\ss6365\\Desktop\\location_privacy_final\\collected\\data\\utility'

# Folder that receives the merged CSV file
output_directory = 'C:\\Users\\ss6365\\Desktop\\location_privacy_final\\collected\\data'

# Only these columns are loaded from each file
important_columns = ['identifier', 'longitude', 'latitude']  # Replace with your column names

# Load the selected columns of every CSV in the input folder
csv_paths = [
    os.path.join(input_directory, filename)
    for filename in os.listdir(input_directory)
    if filename.endswith('.csv')
]
dataframes = [pd.read_csv(csv_path, usecols=important_columns) for csv_path in csv_paths]

# Stack the frames row-wise with a fresh 0..n-1 index
merged_df = pd.concat(dataframes, ignore_index=True)

# Write the merged table next to the input folder
output_file_path = os.path.join(output_directory, 'merged_all_utility_subset.csv')
merged_df.to_csv(output_file_path, index=False)

print(f"Merged CSV file saved to {output_file_path}")

Merged CSV file saved to C:\Users\ss6365\Desktop\location_privacy_final\collected\data\merged_all_utility_subset.csv


In [13]:
import numpy as np


# Load the merged subset CSV (same path the merge cell writes to).
# NOTE: this cell imports only numpy, so `pd` must already be in the kernel
# namespace from an earlier cell's `import pandas as pd`.
df = pd.read_csv(r'C:\Users\ss6365\Desktop\location_privacy_final\collected\data\merged_all_utility_subset.csv')

def haversine(lat1, lon1, lat2, lon2):
    """Return the great-circle distance in kilometers between two points.

    Coordinates are in decimal degrees, latitude before longitude for each
    point. Scalars and NumPy arrays are both accepted, since only NumPy
    ufuncs are used.
    """
    # Radius of the Earth in km
    R = 6371.0
    # Convert coordinates from degrees to radians
    lat1, lon1, lat2, lon2 = map(np.radians, [lat1, lon1, lat2, lon2])
    # Haversine formula
    dlat = lat2 - lat1
    dlon = lon2 - lon1
    a = np.sin(dlat/2)**2 + np.cos(lat1) * np.cos(lat2) * np.sin(dlon/2)**2
    # Floating-point rounding can push `a` marginally outside [0, 1]; without
    # the clamp sqrt(1 - a) would then be NaN for near-antipodal points.
    a = np.clip(a, 0.0, 1.0)
    c = 2 * np.arctan2(np.sqrt(a), np.sqrt(1 - a))
    distance = R * c
    return distance

def find_square_boundaries(lat, lon, distance_km):
    """Return ``(lat_min, lat_max, lon_min, lon_max)`` of a box centred on a point.

    The box reaches ``distance_km`` from ``(lat, lon)`` in each direction,
    using the approximation of 111 km per degree of latitude and scaling the
    longitude span by the cosine of the latitude.
    """
    km_per_degree = 111  # approximate length of one degree of latitude
    half_height_deg = distance_km / km_per_degree
    half_width_deg = distance_km / (km_per_degree * np.cos(np.radians(lat)))

    south, north = lat - half_height_deg, lat + half_height_deg
    west, east = lon - half_width_deg, lon + half_width_deg
    return south, north, west, east


# Centre the region on the median coordinate of the merged data
central_lat = df['latitude'].median()
central_lon = df['longitude'].median()

# Box reaching 2 km from the centre in each direction (about 4 km per side)
lat_min, lat_max, lon_min, lon_max = find_square_boundaries(central_lat, central_lon, 2)

# Keep only the points inside the box (bounds inclusive)
inside_lat_band = df['latitude'].between(lat_min, lat_max)
inside_lon_band = df['longitude'].between(lon_min, lon_max)
df_limit = df[inside_lat_band & inside_lon_band]

df_limit

Unnamed: 0,longitude,latitude,identifier
0,-77.680333,43.083838,1
1,-77.680991,43.083803,1
2,-77.681017,43.083802,1
3,-77.681042,43.083802,1
4,-77.681090,43.083802,1
...,...,...,...
99218,-77.680445,43.083879,46
99219,-77.680442,43.083868,46
99220,-77.680441,43.083863,46
99221,-77.680442,43.083860,46


In [14]:
import glob
import os
import pandas as pd

# Input and output directories
input_directory = r'C:\Users\ss6365\Desktop\location_privacy_final\collected\data\utility'
output_directory = r'C:\Users\ss6365\Desktop\location_privacy_final\collected\data\security'

# Create the output folder if needed so the first to_csv() cannot fail on a
# missing directory
os.makedirs(output_directory, exist_ok=True)

# Distance label embedded in the output file names. It does not drive the
# filter here: the bounds below come from `df_limit`.
# NOTE(review): keep this in sync with the distance used to build `df_limit`.
distance_km = 2

# Bounding box = extent of the points already selected in `df_limit`
# (built in a previous cell); the same box is applied to every input file.
lat_min = df_limit['latitude'].min()
lat_max = df_limit['latitude'].max()
lon_min = df_limit['longitude'].min()
lon_max = df_limit['longitude'].max()


# Iterate through CSV files in the input directory
for csv_file in glob.glob(os.path.join(input_directory, '*.csv')):
    # Load the CSV file
    df = pd.read_csv(csv_file)

    # Keep only the rows inside the bounding box (bounds inclusive)
    df_square = df[(df['latitude'] >= lat_min) & (df['latitude'] <= lat_max) &
                   (df['longitude'] >= lon_min) & (df['longitude'] <= lon_max)]

    # Check if the filtered DataFrame is empty (no data within boundaries)
    if df_square.empty:
        continue  # Skip saving if no data matches the criteria

    # Extract the base filename without extension
    base_filename = os.path.splitext(os.path.basename(csv_file))[0]

    # Create the new filename with distance_km
    new_filename = f"{base_filename}_{distance_km}km.csv"

    # Save the filtered DataFrame to the output directory with the new filename
    output_path = os.path.join(output_directory, new_filename)
    df_square.to_csv(output_path, index=False)

print("Processing complete.")

Processing complete.
