# 1. Loading and Initial Inspection


In [47]:
import pandas as pd
import numpy as np

# Raw frailty measurements; the path is relative to the notebook's working directory.
file_path = 'Frailty_Raw_Dataset.csv'

# Parse the CSV into a DataFrame with pandas' default settings.
data = pd.read_csv(filepath_or_buffer=file_path)

# Trailing bare expression so the notebook renders the loaded table.
data

Unnamed: 0,Height (Inches),Weight (Pounds),Age,Grip strength,Frailty
0,65.8,112,30,30,N
1,71.5,136,19,31,N
2,69.4,153,45,29,N
3,68.2,142,22,28,Y
4,67.8,144,29,24,Y
5,68.7,123,50,26,N
6,69.8,141,51,22,Y
7,70.1,136,23,20,Y
8,67.9,112,17,19,N
9,66.8,120,39,31,N


Display the first few rows of the dataset to understand its structure.

Check the basic information about the dataset.

Display summary statistics for numerical columns.

Check for missing values in the dataset

In [48]:
# A notebook cell renders only its LAST bare expression. Intermediate results are
# therefore passed to display() explicitly; without it, head() and describe()
# are computed and silently discarded.
# display() is available without an import inside an IPython/Jupyter kernel.
display(data.head())       # first five rows, to check the column layout
data.info()                # prints dtypes and non-null counts itself; returns None
display(data.describe())   # summary statistics for the numeric columns
data.isnull().sum()        # per-column count of missing values (rendered as the cell result)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10 entries, 0 to 9
Data columns (total 5 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   Height (Inches)   10 non-null     float64
 1   Weight (Pounds)   10 non-null     int64  
 2   Age               10 non-null     int64  
 3   Grip strength     10 non-null     int64  
 4   Frailty           10 non-null     object 
dtypes: float64(1), int64(3), object(1)
memory usage: 528.0+ bytes


Unnamed: 0,0
Height (Inches),0
Weight (Pounds),0
Age,0
Grip strength,0
Frailty,0


# 2. Handling Missing Values

Drop rows with missing values

In [49]:
# Keep only the rows in which every column is populated.
# dropna() returns a new frame, so `data` itself is left unchanged here.
data_cleaned = data.dropna(axis=0, how='any')


Impute missing values with median for numerical and mode for categorical columns.

Impute numerical columns with median.

Impute categorical columns with mode.

In [50]:
# Numeric columns: fill gaps with the column median.
# The filled Series is assigned back to the column instead of calling
# `data[col].fillna(..., inplace=True)`. That chained in-place form acts on an
# intermediate Series: it raises a FutureWarning in pandas 2.x and does not
# update `data` at all once Copy-on-Write is enabled.
numerical_cols = data.select_dtypes(include=[np.number]).columns
for col in numerical_cols:
    median_value = data[col].median()
    data[col] = data[col].fillna(median_value)

# Categorical (object dtype) columns: fill gaps with the most frequent value.
categorical_cols = data.select_dtypes(include=['object']).columns
for col in categorical_cols:
    modes = data[col].mode()
    if modes.empty:
        # The column is entirely missing, so there is no mode to impute with;
        # leave it untouched rather than failing when indexing the first mode.
        continue
    mode_value = modes.iloc[0]
    data[col] = data[col].fillna(mode_value)

# Per-column count of missing values after imputation.
data.isnull().sum()

Unnamed: 0,0
Height (Inches),0
Weight (Pounds),0
Age,0
Grip strength,0
Frailty,0


# 3. Handling Categorical Variables

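Map the 'Frailty' codes 'N'/'Y' to 'No'/'Yes' in a new 'frailty' column, then encode that column as 0/1 in 'frailty_binary'. The original 'Frailty' column is kept.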

In [51]:
# Lookup tables: raw single-letter code -> readable label -> numeric flag.
label_by_code = {'N': 'No', 'Y': 'Yes'}
flag_by_label = {'No': 0, 'Yes': 1}

# Readable labels go into a new lowercase column; the original 'Frailty' column stays.
data['frailty'] = data['Frailty'].replace(label_by_code)
# 0/1 encoding derived from the readable labels.
data['frailty_binary'] = data['frailty'].map(flag_by_label)

data.head()

Unnamed: 0,Height (Inches),Weight (Pounds),Age,Grip strength,Frailty,frailty,frailty_binary
0,65.8,112,30,30,N,No,0
1,71.5,136,19,31,N,No,0
2,69.4,153,45,29,N,No,0
3,68.2,142,22,28,Y,Yes,1
4,67.8,144,29,24,Y,Yes,1


# 4. Removing Duplicates

In [52]:
# Boolean mask marking each row that exactly repeats an earlier row
# (the first occurrence of a row is not flagged).
duplicates = data.duplicated()
# Report the count with print(): a bare expression in the middle of a cell is
# evaluated but never rendered, so the number would otherwise be lost.
print(f"Duplicate rows found: {duplicates.sum()}")

# Keep the first occurrence of each row and drop the repeats.
data = data.drop_duplicates()

data.head()

Unnamed: 0,Height (Inches),Weight (Pounds),Age,Grip strength,Frailty,frailty,frailty_binary
0,65.8,112,30,30,N,No,0
1,71.5,136,19,31,N,No,0
2,69.4,153,45,29,N,No,0
3,68.2,142,22,28,Y,Yes,1
4,67.8,144,29,24,Y,Yes,1


# 5. Outlier Detection and Handling

Using the IQR method to detect outliers in 'Grip strength'.

Define lower and upper bounds for 'Grip strength'.

Find outliers in 'Grip strength'.

Optionally, you can remove these outliers or handle them differently

In [53]:
# Column under inspection, looked up once instead of on every line.
grip = data['Grip strength']

# First and third quartiles and the inter-quartile range.
Q1 = grip.quantile(0.25)
Q3 = grip.quantile(0.75)
IQR = Q3 - Q1

# IQR fences: values more than 1.5 * IQR beyond a quartile count as outliers.
lower_bound = Q1 - 1.5 * IQR
upper_bound = Q3 + 1.5 * IQR

# Rows strictly outside the fences.
outliers = data[(grip < lower_bound) | (grip > upper_bound)]
# Report the count with print(): a bare expression in the middle of a cell is
# evaluated but never rendered, so the number would otherwise be lost.
print(f"Grip strength outliers outside [{lower_bound}, {upper_bound}]: {outliers.shape[0]}")

# Rows on or inside the fences are kept as the cleaned dataset.
data_cleaned = data[(grip >= lower_bound) & (grip <= upper_bound)]

data_cleaned.head()

Unnamed: 0,Height (Inches),Weight (Pounds),Age,Grip strength,Frailty,frailty,frailty_binary
0,65.8,112,30,30,N,No,0
1,71.5,136,19,31,N,No,0
2,69.4,153,45,29,N,No,0
3,68.2,142,22,28,Y,Yes,1
4,67.8,144,29,24,Y,Yes,1


#  Final Save

In [54]:
# Destination for the cleaned dataset, relative to the notebook's working directory.
cleaned_file_path = 'Frailty_cleaned_data.csv'
# index=False leaves the row index out of the written file.
data_cleaned.to_csv(path_or_buf=cleaned_file_path, index=False)