# Search Duplicates in CSV 
This function identifies the rows with duplicate values in a specified column of a pandas DataFrame and prints them. It marks all duplicates (every occurrence, not only the repeats) and shows the row indices where those duplicates occur, helping to pinpoint where the repeated values are found.

In [None]:
import pandas as pd

# Function to load the first X rows from the CSV file
def load_csv_with_limit(file_path, num_rows):
    """Read at most ``num_rows`` leading data rows of a CSV into a DataFrame.

    Args:
        file_path: Path (or any source ``pd.read_csv`` accepts) of the CSV.
        num_rows: Upper bound on the number of data rows to read.

    Returns:
        The DataFrame produced by ``pd.read_csv`` with ``nrows=num_rows``.
    """
    print("Opening file...")
    # nrows caps the read so only the head of a large file is parsed.
    limited_frame = pd.read_csv(file_path, nrows=num_rows)
    return limited_frame

def find_duplicates(column_name, data):
    """Print the row indices of every duplicated value in one column.

    Every occurrence of a repeated value is reported, including the first
    one. Missing values (NaN/None) are never reported as duplicates.

    Args:
        column_name: Label of the column to inspect.
        data: pandas DataFrame that should contain ``column_name``.

    Returns:
        None. The findings are written to stdout. If the column is absent,
        a message is printed and nothing else happens.
    """
    if column_name not in data.columns:
        print(f"Column '{column_name}' does not exist.")
        return

    column = data[column_name]
    # keep=False flags all occurrences of a repeated value, not only the
    # later ones. dropna() states explicitly that repeated missing values
    # are not treated as duplicates (groupby would drop them anyway).
    repeated = column[column.duplicated(keep=False)].dropna()

    if repeated.empty:
        print(f"No duplicates found in column '{column_name}'.")
        return

    print(f"Duplicates found in column '{column_name}':")
    # Group just the column by its own values: this avoids applying a
    # function over the whole frame (and the deprecation warning that
    # DataFrameGroupBy.apply raises when it touches the grouping column).
    # Groups come back sorted by value, which is groupby's default.
    for _, occurrences in repeated.groupby(repeated):
        print(f"Found duplicated value in rows: {list(occurrences.index)}")

# Example run -- adjust the three settings below for your own data.
file_path = 'hackacity_wifi_data.csv'  # CSV file to inspect
num_rows_to_check = 100  # number of leading rows to read
column_to_check = 'callingstationid'  # column scanned for repeated values

# Load the sample, then report the duplicated values found in that column.
data = load_csv_with_limit(file_path, num_rows_to_check)
find_duplicates(column_to_check, data)


# Deductions:
Analyzing the output of this function, we determined that users' MAC addresses share the same session ID.
Additionally, the MAC addresses are only hashed, without any added noise (i.e., no salt).
A possible improvement would be to introduce noise (e.g., a salt) before or after the hashing process. This would create truly unique IDs and enhance security by making the hashes unpredictable and preventing the use of rainbow tables.
