In [1]:
import os
import pandas as pd

def process_files(input_directory, output_directory):
    # Ensure the output directory exists
    if not os.path.exists(output_directory):
        os.makedirs(output_directory)

    # Iterate over all files in the input directory
    for filename in os.listdir(input_directory):
        if filename.endswith(".csv"):
            # Construct the full file path
            file_path = os.path.join(input_directory, filename)
            
            # Load the CSV file
            data = pd.read_csv(file_path)
            
            # Calculate descriptive statistics and drop 'count'
            descriptive_stats = data.describe().drop('count')
            
            # Flatten the DataFrame to a single row per feature set
            descriptive_stats_flat = descriptive_stats.unstack().to_frame().T
            
            # Create new column names by combining the statistic with the feature name
            descriptive_stats_flat.columns = ['{}_{}'.format(stat, feature) for feature, stat in descriptive_stats_flat.columns]
            
            # Construct the output file path
            base_filename = os.path.splitext(filename)[0]
            output_filename = f"{base_filename}_stats.csv"
            output_filepath = os.path.join(output_directory, output_filename)
            
            # Save the flattened descriptive statistics to CSV
            descriptive_stats_flat.to_csv(output_filepath, index=False)
            
            print(f"Processed and saved: {output_filename}")

# Define the input and output directories
input_directory = 'D:\\Parkinson\'s\\Feature extraction\\Features_Extraction_5mins_overlap'
output_directory = 'D:\\Parkinson\'s\\Feature extraction\\Features_Extraction_5mins_overlap\\Stats'

# Process all files
process_files(input_directory, output_directory)


Processed and saved: 12593_03182019_TSFEL_features_stats.csv
Processed and saved: 13039_10012019_TSFEL_features_stats.csv
Processed and saved: 14281_02192019_TSFEL_features_stats.csv
Processed and saved: 14331_06052019_TSFEL_features_stats.csv
Processed and saved: 18567_03202019_TSFEL_features_stats.csv
Processed and saved: 3002_03152019_TSFEL_features_stats.csv
Processed and saved: 3003_03292019_TSFEL_features_stats.csv
Processed and saved: 3004_03292019_TSFEL_features_stats.csv
Processed and saved: 3013_12112019_TSFEL_features_stats.csv
Processed and saved: 3020_04122019_TSFEL_features_stats.csv
Processed and saved: 3023_06062019_TSFEL_features_stats.csv
Processed and saved: 3028_03122019_TSFEL_features_stats.csv
Processed and saved: 3029_05312019_TSFEL_features_stats.csv
Processed and saved: 3050_03052019_TSFEL_features_stats.csv
Processed and saved: 3055_12182019_TSFEL_features_stats.csv
Processed and saved: 3057_02042019_TSFEL_features_stats.csv
Processed and saved: 3061_04012019_

In [2]:
import os
import pandas as pd

def combine_files_with_patient_number(input_directory, output_filename):
    combined_df = pd.DataFrame()  # Initialize an empty DataFrame to hold combined data
    
    # Iterate over all files in the input directory
    for filename in os.listdir(input_directory):
        if filename.endswith(".csv"):  # Ensure we're only processing CSV files
            file_path = os.path.join(input_directory, filename)
            # Extract the patient number from the filename
            patient_number = filename.split('_')[0]
            # Read the current file into a DataFrame
            temp_df = pd.read_csv(file_path)
            # Add the patient number as a new column
            temp_df['PatientNumber'] = patient_number
            # Append the data from the current file to the combined DataFrame
            combined_df = pd.concat([combined_df, temp_df], axis=0, ignore_index=True)
    
    # Save the combined DataFrame to an Excel file
    combined_df.to_excel(output_filename, index=False)
    print(f"Combined file with patient numbers saved as: {output_filename}")

# Define the input directory and output filename
input_directory = 'D:\\Parkinson\'s\\Feature extraction\\Features_Extraction_5mins_overlap\\Stats'
output_filename = 'D:\\Parkinson\'s\\Feature extraction\\Features_Extraction_5mins_overlap\\TSFEL_features_5mins_overlap_combine.xlsx'


# Combine all files and include patient numbers
combine_files_with_patient_number(input_directory, output_filename)



Combined file with patient numbers saved as: D:\Parkinson's\Feature extraction\Features_Extraction_5mins_overlap\TSFEL_features_5mins_overlap_combine.xlsx


In [3]:
import pandas as pd

# Load the combined TSFEL features Excel file
combined_path = 'D:\\Parkinson\'s\\Feature extraction\\Features_Extraction_5mins_overlap\\TSFEL_features_5mins_overlap_combine.xlsx'
combined_df = pd.read_excel(combined_path)

# Load the additional data Excel file and the specific sheet
additional_data_path = 'D:\\Data Count (2).xlsx'
additional_df = pd.read_excel(additional_data_path, sheet_name='2019 Data Match')

# We're interested in the following columns from the additional data
additional_columns = ['PATNO', 'MOCA(2019)', 'Age', 'Sex', 'EDUCATION YEARS', 'Parkinson\'s Vs Control']

# Filter the additional data DataFrame to only include these columns
additional_df_filtered = additional_df[additional_columns]

# Convert PATNO to string if it's not already, to ensure matching works correctly
additional_df_filtered['PATNO'] = additional_df_filtered['PATNO'].astype(str)
combined_df['PatientNumber'] = combined_df['PatientNumber'].astype(str)

# Merge the combined DataFrame with the additional information
# This assumes 'PatientNumber' in the combined_df matches 'PATNO' in the additional_df_filtered
combined_updated_df = pd.merge(combined_df, additional_df_filtered, left_on='PatientNumber', right_on='PATNO', how='left')

# Drop the PATNO column after the merge, as it's redundant
combined_updated_df.drop('PATNO', axis=1, inplace=True)

# Save the updated combined DataFrame back to Excel
combined_updated_df.to_excel(combined_path, index=False)

print(f"Combined file has been updated and saved back to: {combined_path}")



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  additional_df_filtered['PATNO'] = additional_df_filtered['PATNO'].astype(str)


Combined file has been updated and saved back to: D:\Parkinson's\Feature extraction\Features_Extraction_5mins_overlap\TSFEL_features_5mins_overlap_combine.xlsx
