In [1]:
import pandas as pd
import numpy as np

In [2]:
# Path to the raw participant-information CSV (relative path; presumably
# resolved against the notebook's working directory).
data_path = "Example/Data/PDFEinfo.csv"

# First attempt: read the file with pandas' default comma delimiter.
df = pd.read_csv(filepath_or_buffer=data_path, encoding="latin-1")

In [3]:
# Preview the first five rows of the frame produced by the default-delimiter read.
df.head(n=5)

Unnamed: 0,"ID;Age;Weight (kg);Height (m);Gender;Disease duration (years);Handedness;More affected side;L-Dopa equivalent units (mgday-1);sessions #;""Session 1 -"
0,"Mini-Mental (score)"";""Session 1 -"
1,"NFoG-Q (score)"";Session 1 - H&Y (score);Sessio..."
2,"TUG dual-task time (s)"";Session 1 - time of Fo..."
3,"Mini-Mental (score)"";""Session 2 -"
4,"NFoG-Q (score)"";Session 2 - H&Y (score);Sessio..."


### Cleaning the .csv file

We notice that the raw .csv file is very difficult to read and visualize when loaded this way: its fields are separated by semicolons, not commas. One option would be to replace the semicolons with commas so that pandas can parse the data more easily.

Instead, we can simply tell pandas to split the fields on semicolons rather than on its default comma delimiter, using the code below.

In [4]:
# Re-read the same file, this time declaring the semicolon as the field separator.
df = pd.read_csv(
    data_path,
    sep=";",
    encoding="latin-1",
)

# Preview the first five rows of the re-parsed frame.
df.head(n=5)

Unnamed: 0,ID,Age,Weight (kg),Height (m),Gender,Disease duration (years),Handedness,More affected side,L-Dopa equivalent units (mgday-1),sessions #,...,Session 3 - total time in FoG (s),Session 3 - numbers of FoG episodes (n),Session 3 - FoG ratio (score),PDQ-39 (score),MoCA (score),FAB (score),DSST (score),STROOP-I time (s),STROOP-II time (s),STROOP-III time (s)
0,PDFE01,56.0,67.0,1.5,F,6.0,Right,Right,800.0,2.0,...,-,-,-,49.4,23,16,45,18.3,19.3,28.0
1,PDFE02,78.0,60.0,1.65,M,7.0,Right,Left,300.0,1.0,...,-,-,-,28.8,0,11,25,22.3,33.7,37.4
2,PDFE03,70.0,78.0,1.75,M,7.0,Right,Right,625.0,3.0,...,1.44,1,11.73,37.2,12,9,1,42.1,48.8,60.9
3,PDFE04,66.0,83.0,1.89,M,10.0,Right,Left,700.0,3.0,...,-,-,-,25.0,12,13,4,32.1,49.3,131.2
4,PDFE05,62.0,78.0,1.79,M,4.0,Right,Left,1100.0,2.0,...,-,-,-,27.6,21,16,36,15.9,23.5,29.9


In [5]:
# Summarize the data frame: index range, column names, non-null counts and dtypes.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 41 entries, 0 to 40
Data columns (total 71 columns):
 #   Column                                   Non-Null Count  Dtype  
---  ------                                   --------------  -----  
 0   ID                                       35 non-null     object 
 1   Age                                      35 non-null     float64
 2   Weight (kg)                              35 non-null     float64
 3   Height (m)                               35 non-null     float64
 4   Gender                                   35 non-null     object 
 5   Disease duration (years)                 35 non-null     float64
 6   Handedness                               35 non-null     object 
 7   More affected side                       35 non-null     object 
 8   L-Dopa equivalent units (mgday-1)       35 non-null     float64
 9   sessions #                               35 non-null     float64
 10  Session 1 - 
Mini-Mental (score)         35 non-null

In [6]:
# Lift pandas' limit on displayed columns so no column is elided with "...".
# This is a global display option and stays in effect for later cells.
pd.options.display.max_columns = None

# Render the frame with every column visible.
df

Unnamed: 0,ID,Age,Weight (kg),Height (m),Gender,Disease duration (years),Handedness,More affected side,L-Dopa equivalent units (mgday-1),sessions #,Session 1 - \nMini-Mental (score),Session 1 - \nNFoG-Q (score),Session 1 - H&Y (score),Session 1 - UPDRS-II (score),Session 1 - UPDRS-III (score),Session 1 - PIGD (score),Session 1 - Dyskinesia (score),Session 1 - HADS (score),Session 1 - HADS-A (score),Session 1 - HADS-D (score),Session 1 - FES-I (score),Session 1 - MiniBestTest (score),Session 1 - TUG time (s),Session 1 - \nTUG dual-task time (s),Session 1 - time of FoG (s),Session 1 - total time in FoG (s),Session 1 - numbers of FoG episodes (n),Session 1 - FoG ratio (score),Session 2 - \nMini-Mental (score),Session 2 - \nNFoG-Q (score),Session 2 - H&Y (score),Session 2 - UPDRS-II (score),Session 2 - UPDRS-III (score),Session 2 - PIGD (score),Session 2 - Dyskinesia (score),Session 2 - HADS (score),Session 2 - HADS-A (score),Session 2 - HADS-D (score),Session 2 - FES-I (score),Session 2 - MiniBestTest (score),Session 2 - TUG time (s),Session 2 - \nTUG dual-task time (s),Session 2 - time of FoG (s),Session 2 - total time in FoG (s),Session 2 - numbers of FoG episodes (n),Session 2 - FoG ratio (score),Session 3 - \nMini-Mental (score),Session 3 - \nNFoG-Q (score),Session 3 - H&Y (score),Session 3 - UPDRS-II (score),Session 3 - UPDRS-III (score),Session 3 - PIGD (score),Session 3 - Dyskinesia (score),Session 3 - HADS (score),Session 3 - HADS-A (score),Session 3 - HADS-D (score),Session 3 - FES-I (score),Session 3 - MiniBestTest (score),Session 3 - TUG time (s),Session 3 - \nTUG dual-task time (s),Session 3 - time of FoG (s),Session 3 - total time in FoG (s),Session 3 - numbers of FoG episodes (n),Session 3 - FoG ratio (score),PDQ-39 (score),MoCA (score),FAB (score),DSST (score),STROOP-I time (s),STROOP-II time (s),STROOP-III time (s)
0,PDFE01,56.0,67.0,1.5,F,6.0,Right,Right,800.0,2.0,25.0,20.0,3.0,6.0,16.0,6.0,0.0,14.0,12.0,2.0,35.0,17.0,16.73,28.43,[1.383-35.768; 36.696-65.969; 67.328-105.162; ...,115.1,4.0,43.34,29,23,3,6,14,7,0,15,10,5,37,19,10.6,18.5,[0-46.502; 50.549-120],115.96,2,17.94,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,49.4,23,16,45,18.3,19.3,28
1,PDFE02,78.0,60.0,1.65,M,7.0,Right,Left,300.0,1.0,23.0,19.0,3.0,8.0,40.0,10.0,0.0,16.0,9.0,7.0,52.0,7.0,10.25,11.5,[0.024-44.578; 53.817-98.191; 99.232-110.814; ...,107.59,4.0,44.84,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,28.8,0,11,25,22.3,33.7,37.4
2,PDFE03,70.0,78.0,1.75,M,7.0,Right,Right,625.0,3.0,23.0,10.0,3.0,7.0,27.0,7.0,0.0,13.0,6.0,7.0,44.0,20.0,11.2,14.07,[35.035-36.286],1.27,1.0,8.15,25,7,3,6,23,4,0,11,4,7,40,23,9.8,11.1,0,0.00,0,6.84,29,7,3,5,21,4,0,14,6,8,38,22,12.00,14.20,[58.216-59.646],1.44,1,11.73,37.2,12,9,1,42.1,48.8,60.9
3,PDFE04,66.0,83.0,1.89,M,10.0,Right,Left,700.0,3.0,27.0,1.0,2.0,4.0,27.0,1.0,3.0,8.0,5.0,3.0,24.0,25.0,9.03,9.4,[34.613-35.744; 48.299-49.882; 58.567-59.890; ...,17.02,8.0,15.13,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,25.0,12,13,4,32.1,49.3,131.2
4,PDFE05,62.0,78.0,1.79,M,4.0,Right,Left,1100.0,2.0,26.0,12.0,2.0,4.0,12.0,4.0,1.0,21.0,10.0,11.0,25.0,23.0,8.53,9.2,[55.637-58.816; 84.866-85.431; 88.892-90.623; ...,11.87,5.0,6.63,29,10,2,4,12,2,2,22,10,12,27,24,8.8,9.1,[37.609-9.659; 53.753-54.65; 56.187-58.492; 89...,11.52,5,6.11,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,27.6,21,16,36,15.9,23.5,29.9
5,PDFE06,67.0,78.0,1.54,F,1.0,Right,Right,225.0,2.0,30.0,5.0,3.0,7.0,17.0,4.0,0.0,17.0,10.0,7.0,47.0,20.0,8.75,10.65,[112.224-114.486],2.27,1.0,0.65,30,3,3,9,19,3,0,10,5,5,46,24,8.7,10.8,0,0.00,0,1.02,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-
6,PDFE07,77.0,57.0,1.65,M,7.0,Right,Right,1150.0,3.0,22.0,28.0,3.0,8.0,48.0,11.0,0.0,17.0,15.0,2.0,23.0,12.0,20.19,60.31,[0-120],120.0,1.0,7.22,24,27,3,8,41,10,0,16,14,2,16,20,19.9,54.42,[0.332-61.209; 65.504-120],115.39,2,12.49,28,27,3,8,36,10,0,18,15,3,19,18,38.23,55.20,[0-120],120.00,1,10.60,48.1,23,10,8,76.3,95.7,113.4
7,PDFE08,84.0,67.0,1.7,F,18.0,Right,Left,1100.0,1.0,24.0,22.0,4.0,14.0,41.0,11.0,0.0,23.0,12.0,11.0,47.0,6.0,23.47,32.63,[3.666-25.872; 30.15-32.346; 34.816-41.713; 50...,81.83,7.0,20.99,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,69.2,12,9,0,44.7,71.4,164
8,PDFE09,53.0,95.0,1.8,M,9.0,Right,Left,500.0,2.0,28.0,10.0,2.0,1.0,29.0,1.0,0.0,7.0,4.0,3.0,24.0,26.0,7.44,8.87,0,0.0,0.0,1.29,30,7,2,2,25,1,0,5,3,2,20,28,7.3,8.2,[24.635-26.982; 53.528-65.385; 89.455-93.765; ...,20.85,4,3.45,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,36.5,23,10,25,45.6,23.5,47.2
9,PDFE10,55.0,88.0,1.65,M,2.0,Right,Left,375.0,2.0,28.0,14.0,3.0,7.0,19.0,4.0,0.0,9.0,5.0,4.0,44.0,23.0,9.86,9.39,0,0.0,0.0,0.59,28,13,3,7,20,3,0,5,3,2,28,28,8.3,8.6,0,0.00,0,0.62,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-


In [7]:
# Export the parsed frame to an .xlsx workbook so the data can be inspected
# in a spreadsheet viewer and compared for similarities.
#
# The path uses forward slashes, like data_path: backslashes in a regular
# string literal form invalid escape sequences ("\D", "\P") that trigger a
# warning, and they are not treated as directory separators outside Windows.
# Forward slashes work on every platform.
excel_path = "Example/Data/PDFEinfo_formatted.xlsx"

# index=False keeps the RangeIndex out of the sheet; openpyxl is the writer engine.
df.to_excel(excel_path, index=False, engine="openpyxl")

In [8]:
# Suppose we do not want certain columns or rows to be taken into account when analyzing the dataframe.
# They can be removed, as shown in the example below.

In [9]:
# Build a reduced frame without the 'ID' and 'Age' columns; drop() returns a
# new DataFrame, so `df` itself is left unchanged.
df_filtered = df.drop(labels=["ID", "Age"], axis="columns")