In [21]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import tensorflow as tf
from tensorflow import keras
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import LabelEncoder

In [22]:
# Load the OASIS cross-sectional CSV and preview its first five rows
df = pd.read_csv(filepath_or_buffer='oasis_cross-sectional.csv')
df.head(n=5)

Unnamed: 0,ID,M/F,Hand,Age,Educ,SES,MMSE,CDR,eTIV,nWBV,ASF,Delay
0,OAS1_0001_MR1,F,R,74,2.0,3.0,29.0,0.0,1344,0.743,1.306,
1,OAS1_0002_MR1,F,R,55,4.0,1.0,29.0,0.0,1147,0.81,1.531,
2,OAS1_0003_MR1,F,R,73,4.0,3.0,27.0,0.5,1454,0.708,1.207,
3,OAS1_0004_MR1,M,R,28,,,,,1588,0.803,1.105,
4,OAS1_0005_MR1,M,R,18,,,,,1737,0.848,1.01,


# Columns:
- ID: Unique identifier for each entry.
- M/F: Gender of the individual (M for male, F for female).
- Hand: Dominant hand of the individual (R for right, L for left).
- Age: Age of the individual.
- Educ: Years of education.
- SES: Socioeconomic status.
- MMSE: Mini-Mental State Examination score, a measure of cognitive function.
- CDR: Clinical Dementia Rating.
- eTIV: Estimated total intracranial volume.
- nWBV: Normalized whole-brain volume.
- ASF: Atlas scaling factor.
- Delay: Delay in years.

## Handling the Missing Values

In [23]:
# Count the missing (NaN) entries in every column
missing_values = df.isnull().sum()
print(missing_values)

ID         0
M/F        0
Hand       0
Age        0
Educ     201
SES      220
MMSE     201
CDR      201
eTIV       0
nWBV       0
ASF        0
Delay    416
dtype: int64


In [24]:
# Handle missing values
# Missing entries in each column listed below are replaced with that column's mean.
# NOTE(review): CDR is also the input to the dementia label; mean-imputed CDR
# values (0.285 in the preview) end up labelled 'No Dementia' - confirm intended.
imputer = SimpleImputer(strategy='mean')
for column in ('Educ', 'SES', 'MMSE', 'CDR', 'Delay'):
    # The imputer is re-fitted on one column at a time
    df[column] = imputer.fit_transform(df[[column]])

## Encoding the categorical variables

In [25]:
# Encoding categorical variables
# A single encoder is re-fitted for each column, so once this cell finishes
# it only holds the classes of the last column ('Hand').
label_encoder = LabelEncoder()
for column in ('M/F', 'Hand'):
    df[column] = label_encoder.fit_transform(df[column])

In [26]:
# Preview the first five rows after imputation and encoding
df.head(n=5)

Unnamed: 0,ID,M/F,Hand,Age,Educ,SES,MMSE,CDR,eTIV,nWBV,ASF,Delay
0,OAS1_0001_MR1,0,0,74,2.0,3.0,29.0,0.0,1344,0.743,1.306,20.55
1,OAS1_0002_MR1,0,0,55,4.0,1.0,29.0,0.0,1147,0.81,1.531,20.55
2,OAS1_0003_MR1,0,0,73,4.0,3.0,27.0,0.5,1454,0.708,1.207,20.55
3,OAS1_0004_MR1,1,0,28,3.178723,2.490741,27.06383,0.285106,1588,0.803,1.105,20.55
4,OAS1_0005_MR1,1,0,18,3.178723,2.490741,27.06383,0.285106,1737,0.848,1.01,20.55


## Defining Category based on CDR

In [27]:
# Defining the ranges
# CDR bin edges and one interval label per pair of neighbouring edges.
# NOTE(review): neither name is referenced by the other cells visible here - confirm they are still needed.
bins = [0, 0.5, 0.75, 1, float('inf')]
labels = [
    '0-0.5',
    '0.5-0.75',
    '0.75-1',
    'Above 1',
]

In [28]:
# Adding the column
def categorize_dementia(cdr_value):
    """Map a Clinical Dementia Rating (CDR) score to a category label.

    Parameters
    ----------
    cdr_value : float
        CDR score of a single subject.

    Returns
    -------
    str or float
        'No Dementia'       for CDR < 0.5
        'Mild Dementia'     for 0.5 <= CDR < 0.75
        'Moderate Dementia' for 0.75 <= CDR < 1
        'Severe Dementia'   for CDR >= 1
        NaN when the score itself is missing (NaN or None).
    """
    # A NaN fails every comparison below and would otherwise fall through to
    # 'Severe Dementia'; propagate the missing value instead of mislabelling it.
    if pd.isna(cdr_value):
        return np.nan
    if cdr_value < 0.5:
        return 'No Dementia'
    if cdr_value < 0.75:
        return 'Mild Dementia'
    if cdr_value < 1:
        return 'Moderate Dementia'
    return 'Severe Dementia'

# Store the label under 'Dementia_Category': that is the column name the
# value_counts and target-encoding cells read, so writing it anywhere else
# makes those cells raise KeyError on a fresh kernel.
df['Dementia_Category'] = df['CDR'].apply(categorize_dementia)

In [37]:
# Display the frame with the newly added category column
df

Unnamed: 0,ID,M/F,Hand,Age,Educ,SES,MMSE,CDR,eTIV,nWBV,ASF,Delay,Dementia_Category
0,OAS1_0001_MR1,0,0,74,2.000000,3.000000,29.000000,0.000000,1344,0.743000,1.306000,20.550000,No Dementia
1,OAS1_0002_MR1,0,0,55,4.000000,1.000000,29.000000,0.000000,1147,0.810000,1.531000,20.550000,No Dementia
2,OAS1_0003_MR1,0,0,73,4.000000,3.000000,27.000000,0.500000,1454,0.708000,1.207000,20.550000,Mild Dementia
3,OAS1_0004_MR1,1,0,28,3.178723,2.490741,27.063830,0.285106,1588,0.803000,1.105000,20.550000,No Dementia
4,OAS1_0005_MR1,1,0,18,3.178723,2.490741,27.063830,0.285106,1737,0.848000,1.010000,20.550000,No Dementia
...,...,...,...,...,...,...,...,...,...,...,...,...,...
996,OAS1_0997_MR1,0,0,47,2.000000,2.990049,27.036311,1.648629,1607,0.864848,1.469493,43.026837,Severe Dementia
997,OAS1_0998_MR1,1,0,39,5.000000,2.863306,17.855143,1.730793,1214,0.670942,1.223898,11.752060,Severe Dementia
998,OAS1_0999_MR1,0,0,65,2.000000,2.573398,22.168675,0.848167,1525,0.647972,0.917636,85.404965,Moderate Dementia
999,OAS1_1000_MR1,1,0,32,3.000000,1.614035,27.041336,0.181308,1938,0.676139,1.389838,60.035106,No Dementia


In [36]:
# Tally the rows per dementia category, most frequent category first
dementia_category_counts = df['Dementia_Category'].value_counts(sort=True, ascending=False)
print(dementia_category_counts)

Dementia_Category
No Dementia          493
Severe Dementia      306
Mild Dementia        137
Moderate Dementia     65
Name: count, dtype: int64


In [None]:
# Encoding target
# Fit a fresh encoder on the category names, then replace the names with their
# integer codes. LabelEncoder orders its classes by sorting them, so the codes
# follow alphabetical order of the names rather than severity.
label_encoder = LabelEncoder().fit(df['Dementia_Category'])
df['Dementia_Category'] = label_encoder.transform(df['Dementia_Category'])

## Target Column

1. No Dementia (CDR < 0.5) — encoded as 2
2. Mild Dementia (0.5 <= CDR < 0.75) — encoded as 0
3. Moderate Dementia (0.75 <= CDR < 1) — encoded as 1
4. Severe Dementia (CDR >= 1) — encoded as 3

## Count:
- No Dementia          493
- Severe Dementia      306
- Mild Dementia        137
- Moderate Dementia     65

In [39]:
# Display the frame after the target column has been label-encoded
df

Unnamed: 0,ID,M/F,Hand,Age,Educ,SES,MMSE,CDR,eTIV,nWBV,ASF,Delay,Dementia_Category
0,OAS1_0001_MR1,0,0,74,2.000000,3.000000,29.000000,0.000000,1344,0.743000,1.306000,20.550000,2
1,OAS1_0002_MR1,0,0,55,4.000000,1.000000,29.000000,0.000000,1147,0.810000,1.531000,20.550000,2
2,OAS1_0003_MR1,0,0,73,4.000000,3.000000,27.000000,0.500000,1454,0.708000,1.207000,20.550000,0
3,OAS1_0004_MR1,1,0,28,3.178723,2.490741,27.063830,0.285106,1588,0.803000,1.105000,20.550000,2
4,OAS1_0005_MR1,1,0,18,3.178723,2.490741,27.063830,0.285106,1737,0.848000,1.010000,20.550000,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...
996,OAS1_0997_MR1,0,0,47,2.000000,2.990049,27.036311,1.648629,1607,0.864848,1.469493,43.026837,3
997,OAS1_0998_MR1,1,0,39,5.000000,2.863306,17.855143,1.730793,1214,0.670942,1.223898,11.752060,3
998,OAS1_0999_MR1,0,0,65,2.000000,2.573398,22.168675,0.848167,1525,0.647972,0.917636,85.404965,1
999,OAS1_1000_MR1,1,0,32,3.000000,1.614035,27.041336,0.181308,1938,0.676139,1.389838,60.035106,2


In [40]:
# Write the processed frame to an Excel workbook, leaving out the row index
df.to_excel(excel_writer='FINAL.xlsx', index=False)