In [6]:
import csv

def replace_values(filename):
    """Recode the COHORT column and write a filtered copy of a CSV file.

    COHORT values are mapped 'Control' -> '0' and 'PD' -> '1'. Only rows whose
    COHORT is '0' or '1' after that mapping are kept, and only the columns
    PATNO, COHORT, TESTNAME and TESTVALUE are written. The result is saved
    next to the input as ``<input name without extension>_modified.csv``.

    Two lines are printed: the output path, and the number of distinct PATNO
    values seen across *all* input rows (rows that were filtered out are
    counted too).

    Args:
        filename: Path to the input CSV file; its header must contain PATNO,
            COHORT, TESTNAME and TESTVALUE.

    Raises:
        KeyError: If one of the required columns is missing from the header.
    """
    import os  # imported locally so this cell stays runnable on its own

    # Only the final extension is swapped. str.replace('.csv', ...) would also
    # rewrite '.csv' inside directory names and, for a path without '.csv',
    # return the input path unchanged so the source file got overwritten.
    root, _extension = os.path.splitext(filename)
    new_filename = root + '_modified.csv'

    fieldnames = ['PATNO', 'COHORT', 'TESTNAME', 'TESTVALUE']
    cohort_codes = {'Control': '0', 'PD': '1'}
    unique_values = set()
    filtered_data = []

    # newline='' lets the csv module handle line endings itself, so quoted
    # fields containing line breaks are read back unchanged.
    with open(filename, 'r', newline='') as file:
        for row in csv.DictReader(file):
            row['COHORT'] = cohort_codes.get(row['COHORT'], row['COHORT'])

            # Counted before filtering: the total covers every input row.
            unique_values.add(row['PATNO'])

            if row['COHORT'] in ('0', '1'):
                filtered_data.append({field: row[field] for field in fieldnames})

    with open(new_filename, 'w', newline='') as file:
        writer = csv.DictWriter(file, fieldnames=fieldnames)
        writer.writeheader()
        writer.writerows(filtered_data)

    print(f"Modified CSV file saved as {new_filename}")
    print(f"Number of unique values in PATNO column: {len(unique_values)}")

# Input file for this step; change 'PPMI_4.csv' to the path of your own CSV file.
replace_values('PPMI_4.csv')


Modified CSV file saved as PPMI_3_modified.csv
Number of unique values in PATNO column: 200


In [4]:
import csv

def restructure_data(filename):
    """Pivot a long-format test CSV into one row per patient.

    The input must have the columns PATNO, COHORT, TESTNAME and TESTVALUE.
    The output has the columns ``Patient``, ``PD`` (the COHORT value from the
    patient's first row) and one column per distinct TESTNAME, sorted
    alphabetically. Tests a patient does not have are left empty; if a
    patient has several rows for the same TESTNAME, the last value wins.

    The output is written next to the input: a trailing ``_modified`` in the
    file name is dropped and ``_restructured.csv`` is appended, so
    ``X_modified.csv`` becomes ``X_restructured.csv``.

    Args:
        filename: Path to the long-format CSV file.

    Raises:
        KeyError: If one of the required columns is missing from the header.
    """
    import os  # imported locally so this cell stays runnable on its own

    # Derive the output name from the file stem. The earlier
    # str.replace('_modified.csv', ...) returned the input path itself when the
    # suffix was absent, which made the function overwrite its own input.
    root, _extension = os.path.splitext(filename)
    marker = '_modified'
    if root.endswith(marker):
        root = root[:-len(marker)]
    new_filename = root + '_restructured.csv'

    patients = {}      # PATNO -> {'PD': cohort, testname: testvalue, ...}
    testnames = set()  # every TESTNAME seen, for the output header

    # newline='' lets the csv module handle line endings inside quoted fields.
    with open(filename, 'r', newline='') as file:
        for row in csv.DictReader(file):
            testname = row['TESTNAME']
            testnames.add(testname)

            # The first row seen for a patient fixes the 'PD' value.
            # NOTE: a TESTNAME literally equal to 'PD' or 'Patient' would
            # collide with the fixed columns.
            record = patients.setdefault(row['PATNO'], {'PD': row['COHORT']})
            record[testname] = row['TESTVALUE']

    fieldnames = ['Patient', 'PD'] + sorted(testnames)

    with open(new_filename, 'w', newline='') as file:
        writer = csv.DictWriter(file, fieldnames=fieldnames)
        writer.writeheader()
        for patno, patient_data in patients.items():
            # DictWriter fills columns missing from the row with ''.
            writer.writerow({'Patient': patno, **patient_data})

    print(f"Restructured CSV file saved as {new_filename}")

# Input file for this step; change 'PPMI_1_modified.csv' to the path of your own modified CSV file.
restructure_data('PPMI_1_modified.csv')


Restructured CSV file saved as PPMI_1_restructured.csv


In [8]:
import csv
from collections import Counter

def find_duplicates(filename):
    """Report values that occur more than once in the first column of a CSV.

    The first row is treated as a header and skipped. For every first-column
    value that appears more than once, the number of occurrences and their
    locations are printed, followed by the total number of surplus rows.
    Locations are 0-based positions among the rows after the header. Blank
    lines are ignored but still occupy a position. An empty file is reported
    as having no duplicates.

    Args:
        filename: Path to the CSV file to inspect.
    """
    locations = {}  # first-column value -> positions where it occurs

    # newline='' lets the csv module handle line endings inside quoted fields.
    with open(filename, 'r', newline='') as file:
        reader = csv.reader(file)
        next(reader, None)  # skip the header; the default tolerates an empty file
        for index, row in enumerate(reader):
            if not row:
                # csv.reader yields [] for a blank line; row[0] would raise.
                continue
            locations.setdefault(row[0], []).append(index)

    # One pass above replaces rescanning the whole column per duplicate.
    # Dicts keep insertion order, so values are reported in order of first
    # appearance.
    duplicates = {value: found for value, found in locations.items() if len(found) > 1}
    total_duplicates = sum(len(found) - 1 for found in duplicates.values())

    if duplicates:
        print("Duplicate values found in the first column:")
        for value, found in duplicates.items():
            print(f"{value}: {len(found)} occurrences")
            print("Locations: ", found)
        print(f"Total duplicates: {total_duplicates}")
    else:
        print("No duplicate values found in the first column.")

# Input file for this step; change 'PPMI_All.csv' to the path of your own CSV file.
find_duplicates('PPMI_All.csv')


Duplicate values found in the first column:
3363: 2 occurrences
Locations:  [0, 416]
3361: 2 occurrences
Locations:  [1, 417]
3108: 2 occurrences
Locations:  [2, 418]
3111: 2 occurrences
Locations:  [3, 419]
3023: 3 occurrences
Locations:  [4, 403, 420]
3467: 2 occurrences
Locations:  [5, 421]
3515: 2 occurrences
Locations:  [6, 422]
3967: 2 occurrences
Locations:  [7, 423]
3314: 2 occurrences
Locations:  [8, 424]
4071: 2 occurrences
Locations:  [9, 425]
3284: 2 occurrences
Locations:  [10, 426]
3029: 2 occurrences
Locations:  [11, 427]
3221: 2 occurrences
Locations:  [12, 428]
3309: 3 occurrences
Locations:  [13, 195, 429]
3540: 4 occurrences
Locations:  [14, 265, 353, 430]
70818: 3 occurrences
Locations:  [15, 339, 431]
55395: 4 occurrences
Locations:  [16, 247, 335, 432]
58550: 2 occurrences
Locations:  [17, 433]
75421: 2 occurrences
Locations:  [18, 434]
3173: 4 occurrences
Locations:  [19, 274, 382, 435]
3316: 2 occurrences
Locations:  [20, 436]
4096: 2 occurrences
Locations:  [21

In [None]:
import csv
from collections import Counter

def create_unique_file(filename, output_filename='unique_file.csv'):
    """Write a copy of a CSV that keeps one row per first-column value.

    The first row is treated as a header and copied once. Of the remaining
    rows, only the first occurrence of each first-column value is kept, in
    the original file order. Blank lines are dropped.

    Args:
        filename: Path to the CSV file to de-duplicate.
        output_filename: Where to write the result. Defaults to
            'unique_file.csv' in the current working directory.

    Returns:
        The kept data rows (header excluded), each as a list of strings.
    """
    seen = set()      # first-column values already kept
    unique_data = []

    # Decide which rows to keep while reading. The previous two-pass version
    # computed positions with the header skipped but applied them to an
    # enumeration that included the header, so every kept position was off by
    # one and the header was written twice.
    # newline='' lets the csv module handle line endings inside quoted fields.
    with open(filename, 'r', newline='') as source:
        reader = csv.reader(source)
        header = next(reader, None)  # None for an empty file
        for row in reader:
            if not row or row[0] in seen:
                continue  # blank line, or a repeat of an already-kept value
            seen.add(row[0])
            unique_data.append(row)

    # The input is closed before writing, so the output path may safely be
    # the same as the input path.
    with open(output_filename, 'w', newline='') as target:
        writer = csv.writer(target)
        if header is not None:
            writer.writerow(header)
        writer.writerows(unique_data)

    print(f"Unique file '{output_filename}' created with only one occurrence of each duplicate value.")

    return unique_data

# Input file for this step; change 'PPMI_All.csv' to the path of your own CSV file.
unique_data = create_unique_file('PPMI_All.csv')
