## Data Cleansing for Health Facilities Dataset

### Load and Examine the Dataset

In [None]:
import pandas as pd

# Read the uploaded CSV into a DataFrame; the path is kept in one place
# so it is easy to repoint at another copy of the file.
csv_path = '/mnt/data/cfafrica-_-data-team-_-outbreak-_-covid19-_-data-_-openafrica-uploads-_-kenya-hospital-ke (1).csv'
data = pd.read_csv(csv_path)

# Preview the first rows to confirm the load worked
data.head()

### Handling Missing Values

In [None]:
# Per-column count of missing cells
missing_values = data.isna().sum()

# The same counts expressed as a percentage of the total number of rows
missing_values_percentage = missing_values / len(data) * 100

# Put both measures side by side, one row per column of `data`
missing_data = pd.concat(
    [missing_values, missing_values_percentage],
    axis=1,
    keys=['Missing Values', 'Percentage'],
)

# Show the columns with the most missing data first
missing_data.sort_values('Percentage', ascending=False)

In [None]:
# Remove the 'Service_names' column entirely
# NOTE(review): presumably dropped because it is mostly empty - confirm
# against the missing-value summary before relying on this.
data.drop(columns=['Service_names'], inplace=True)

# Discard every row that has no 'Registration_number'
data.dropna(axis=0, subset=['Registration_number'], inplace=True)

# Inspect the dtype pandas inferred for each remaining column
data_types = data.dtypes
data_types

### Checking and Correcting Data Types

In [None]:
import numpy as np

# Treat the literal string 'None' in 'Code' as a genuinely missing value
data['Code'] = data['Code'].replace('None', np.nan)

# Convert 'Beds' to float, stripping thousands separators such as "1,200".
# The `.str` accessor only exists on string-like columns, so the comma
# stripping is skipped when 'Beds' is already numeric (for example when the
# CSV had no comma-formatted values, or when this cell is run a second time).
# `regex=False` makes the comma a literal match on every pandas version.
beds = data['Beds']
if not pd.api.types.is_numeric_dtype(beds):
    beds = beds.str.replace(',', '', regex=False)
data['Beds'] = beds.astype(float)

# Recheck the data types after the conversions
updated_data_types = data.dtypes

updated_data_types

### Checking for Duplicates

In [None]:
# Select the rows flagged by `duplicated()`, i.e. rows whose values in
# every column repeat those of an earlier row
duplicate_rows = data.loc[data.duplicated()]

# How many such repeated rows there are
duplicate_count = duplicate_rows.shape[0]
duplicate_count

### Visualizing Outliers

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Draw one box plot per capacity column, side by side, to inspect outliers
plt.figure(figsize=(15, 5))

for panel, column in enumerate(('Beds', 'Cots', 'Beds and Cots'), start=1):
    plt.subplot(1, 3, panel)
    # Pass the series as `x=` explicitly. A bare positional argument is read
    # as `x` by seaborn < 0.12 (with a FutureWarning) but as `data` by
    # seaborn >= 0.12, so the plot orientation would depend on the installed
    # version. The keyword form gives a horizontal box plot on both.
    sns.boxplot(x=data[column])
    plt.title(column)

plt.tight_layout()
plt.show()

### Correcting Inconsistent Data Entries

In [None]:
# Normalise capitalisation in 'Facility type' with title case, so that
# entries differing only in letter case collapse to a single spelling
facility_type_titled = data['Facility type'].str.title()
data['Facility type'] = facility_type_titled

# List the distinct facility types that remain after normalisation
unique_facility_types = data['Facility type'].unique()
unique_facility_types