In [1]:
import pandas as pd 
import matplotlib.pyplot as plt
import numpy as np

# Loading the Dataset

In [2]:
# Read the OASIS longitudinal MRI dataset (CSV expected in the working directory).
csv_path = 'oasis_longitudinal.csv'
df = pd.read_csv(csv_path)

# Preview the first five rows.
df.head(5)

Unnamed: 0,Subject ID,MRI ID,Group,Visit,MR Delay,M/F,Hand,Age,EDUC,SES,MMSE,CDR,eTIV,nWBV,ASF
0,OAS2_0001,OAS2_0001_MR1,Nondemented,1,0,M,R,87,14,2.0,27.0,0.0,1987,0.696,0.883
1,OAS2_0001,OAS2_0001_MR2,Nondemented,2,457,M,R,88,14,2.0,30.0,0.0,2004,0.681,0.876
2,OAS2_0002,OAS2_0002_MR1,Demented,1,0,M,R,75,12,,23.0,0.5,1678,0.736,1.046
3,OAS2_0002,OAS2_0002_MR2,Demented,2,560,M,R,76,12,,28.0,0.5,1738,0.713,1.01
4,OAS2_0002,OAS2_0002_MR3,Demented,3,1895,M,R,80,12,,22.0,0.5,1698,0.701,1.034


In [3]:
# Column metadata for the dataset loaded above.
# Kept as a bare string literal, so the notebook echoes it as this cell's output.

'''

1) Subject ID - ID of the individual subject
2) MRI ID - ID of the MRI scan
3) Group - Demented / Non Demented i.e. variable to be predicted
4) Visit - Count of visit of the subject
5) MR Delay - Delay in days between this MRI scan and the first visit of the subject (0 at visit 1)
6) M/F - Gender
7) Hand - Dominant Hand
8) Age - Age of Subject
9) EDUC - Educational level of subject [years of education]
10) SES - Socio Economic Status
11) MMSE - Mini Mental State Exam - A general test of cognitive function; the maximum score is 30, with 30 being the healthiest
12) CDR - Clinical Dementia Rating
13) eTIV - Estimated Total IntraCranial Volume
14) nWBV - Normalized Whole Brain Volume
15) ASF - Atlas Scaling Factor



'''

'\n\n1) Subject ID - ID of the individual subject\n2) MRI ID - ID of the MRI scan\n3) Group - Demented / Non Demented i.e. variable to be predicted\n4) Visit - Count of visit of the subject\n5) MR Delay - Instrument error\n6) M/F - Gender\n7) Hand - Dominant Hand\n8) Age - Age of Subject\n9) EDUC - Educational level of subject [years of education]\n10) SES - Socio Economic Status\n11) MMSE - Mini Mental State Exam - An all round test of the brain which can yield a max score of 30 being the healthiest \n12) CDR - Clinical Dementia Rating\n13) eTIV - Estimated Total IntraCranial Volume\n14) nWBV - Normalized Whole Brain Volume\n15) ASF - Atlas Scaling Factor\n\n\n\n'

# Data Cleaning

In [11]:
# Show the dtype pandas inferred for every column.
dtypes_heading = "\n Data Types of Columns"
print(dtypes_heading)
df.dtypes


 Data Types of Columns


Subject ID     object
MRI ID         object
Group          object
Visit           int64
MR Delay        int64
M/F            object
Hand           object
Age             int64
EDUC            int64
SES           float64
MMSE          float64
CDR           float64
eTIV            int64
nWBV          float64
ASF           float64
dtype: object

In [12]:
# Report the dataset dimensions as (rows, columns).
shape_heading = "Shape of Data"
print(shape_heading)
df.shape

Shape of Data


(373, 15)

In [16]:
# Report how many missing values each column contains.
col = df.columns

print("\nNull Values in Each column\n")

for column_name in col:
    missing_count = df[column_name].isna().sum()
    print(column_name, " = ", missing_count)


Null Values in Each column

Subject ID  =  0
MRI ID  =  0
Group  =  0
Visit  =  0
MR Delay  =  0
M/F  =  0
Hand  =  0
Age  =  0
EDUC  =  0
SES  =  19
MMSE  =  2
CDR  =  0
eTIV  =  0
nWBV  =  0
ASF  =  0


In [30]:
# Missing SES values will be imputed with the median, which is less
# sensitive to outliers than the mean.
med_SES = df.loc[:, 'SES'].median(skipna=True)
med_SES


2.0

In [31]:
# Replace missing SES values with the median computed above.
# Assign the filled column back rather than calling fillna(inplace=True) on
# df['SES']: that chained in-place form raises a FutureWarning in pandas 2.x
# and leaves df unchanged once Copy-on-Write is enabled.
df['SES'] = df['SES'].fillna(med_SES)

In [32]:
# Sanity check: the SES column should now contain zero missing values.
df['SES'].isna().sum()

0

In [33]:
# Similarly, impute the missing MMSE values with the column median.
med_MMSE = df['MMSE'].median(skipna=True)
# Assign the filled column back rather than calling fillna(inplace=True) on
# df['MMSE']: that chained in-place form raises a FutureWarning in pandas 2.x
# and leaves df unchanged once Copy-on-Write is enabled.
df['MMSE'] = df['MMSE'].fillna(med_MMSE)

In [34]:
# Sanity check: the MMSE column should now contain zero missing values.
df['MMSE'].isna().sum()

0

In [None]:
# Our data is now ready for visualization and analysis 