In [5]:
# Step 1: Import required libraries
import pandas as pd
import numpy as np

In [16]:
# Step 2: Set full path to your Excel file in Downloads
file_path = r"C:\Users\hp\Downloads\TL20251023Mock Survey.xlsx"

In [17]:
# Step 3: Load the Excel file (first sheet)
df = pd.read_excel(file_path, sheet_name=0) 

In [55]:
# Step 4: Verify that the dataset loaded correctly
print(" Dataset loaded successfully!")
print("Number of rows:", df.shape[0])
print("Number of columns:", df.shape[1])



 Dataset loaded successfully!
Number of rows: 123
Number of columns: 11


In [20]:
df.head()

Unnamed: 0,Submission Date,Region,Survey Type,Respondent ID,Gender,Age Group,Q1: I have access to basic health services,Q2: My household has enough food to eat,Q3: My children can attend school safely,Q4: I feel safe in my shelter,Q5: I have access to clean water
0,2025-07-24 16:22:10.753,East Africa,Shelter,RESP1072,Prefer not to say,18-24,5,5,5,2,1
1,2025-07-25 16:22:10.753,East Africa,Water & Sanitation,RESP1010,Prefer not to say,25-34,4,4,2,5,3
2,2025-07-26 16:22:10.753,Middle East,Education,RESP1098,Female,Under 18,3,2,3,5,5
3,2025-07-26 16:22:10.753,Latin America,Health Access,RESP1100,Non-binary,25-34,5,2,1,4,5
4,2025-07-27 16:22:10.753,South Asia,He alth Access,RESP1042,Prefer not to say,25-34,5,5,4,2,5


In [56]:
# Step 5: Inspect data structure, types, and quality

#View column names and data types
print("Dataset information:")
df.info()

Dataset information:
<class 'pandas.core.frame.DataFrame'>
Index: 123 entries, 0 to 129
Data columns (total 11 columns):
 #   Column                                      Non-Null Count  Dtype         
---  ------                                      --------------  -----         
 0   submission_date                             123 non-null    datetime64[ns]
 1   region                                      123 non-null    object        
 2   survey_type                                 123 non-null    object        
 3   respondent_id                               123 non-null    object        
 4   gender                                      123 non-null    object        
 5   age_group                                   123 non-null    object        
 6   q1:_i_have_access_to_basic_health_services  123 non-null    int64         
 7   q2:_my_household_has_enough_food_to_eat     123 non-null    int64         
 8   q3:_my_children_can_attend_school_safely    123 non-null    int64         

In [22]:
#Check for missing (null) values


print("\n Missing values per column:")
print(df.isnull().sum())


 Missing values per column:
Submission Date                               0
Region                                        0
Survey Type                                   0
Respondent ID                                 0
Gender                                        0
Age Group                                     0
Q1: I have access to basic health services    0
Q2: My household has enough food to eat       0
Q3: My children can attend school safely      0
Q4: I feel safe in my shelter                 0
Q5: I have access to clean water              0
dtype: int64


In [23]:
# Check for duplicate rows
duplicate_count = df.duplicated().sum()
print(f"\n Number of duplicate rows: {duplicate_count}")


 Number of duplicate rows: 7


In [24]:
#Check random rows for context

df.sample(5)

Unnamed: 0,Submission Date,Region,Survey Type,Respondent ID,Gender,Age Group,Q1: I have access to basic health services,Q2: My household has enough food to eat,Q3: My children can attend school safely,Q4: I feel safe in my shelter,Q5: I have access to clean water
90,2025-09-30 16:22:10.753,Middle East,Education,RESP1092,Non-binary,Under 18,4,5,1,2,3
47,2025-08-22 16:22:10.753,East Africa,He alth Access,RESP1085,Female,35-44,1,3,3,5,5
48,2025-08-22 16:22:10.753,East Africa,Food Security,RESP1059,Non-binary,35-44,2,1,1,3,5
24,2025-08-08 16:22:10.753,West Africa,Health Access,RESP1031,Male,18-24,2,4,5,4,3
62,2025-09-07 16:22:10.753,East Africa,Health Access,RESP1103,Female,55+,5,1,1,5,2


Step 3: Clean Column Names and Text Data

In this step, I standardized the column names and text entries to ensure consistency throughout the dataset.  
This included:
- Removing spaces and converting names to lowercase for easier reference.
- Trimming extra spaces in text cells.
- Ensuring all textual categories (like *Region*, *Gender*, *Age Group*, etc.) use consistent formatting.

These steps help prevent issues during analysis (for example, `"North "` vs `"North"` would be treated as the same value).


In [57]:
# Step 6: Clean column names and text data

# Standardize column names (remove spaces, lowercase, replace spaces with underscores)

df.columns = df.columns.str.strip().str.lower().str.replace(" ", "_")
print(" Cleaned column names:")
print(df.columns.tolist())

 Cleaned column names:
['submission_date', 'region', 'survey_type', 'respondent_id', 'gender', 'age_group', 'q1:_i_have_access_to_basic_health_services', 'q2:_my_household_has_enough_food_to_eat', 'q3:_my_children_can_attend_school_safely', 'q4:_i_feel_safe_in_my_shelter', 'q5:_i_have_access_to_clean_water']


In [26]:
# Identify all string (object) columns


string_cols = df.select_dtypes(include='object').columns

In [27]:
# Trim 

for col in string_cols:
    df[col] = df[col].astype(str).str.strip()

In [28]:
print("\n Trimmed extra spaces in all text columns.")


 Trimmed extra spaces in all text columns.


In [29]:
# Display a few cleaned rows to verify

df.head()

Unnamed: 0,submission_date,region,survey_type,respondent_id,gender,age_group,q1:_i_have_access_to_basic_health_services,q2:_my_household_has_enough_food_to_eat,q3:_my_children_can_attend_school_safely,q4:_i_feel_safe_in_my_shelter,q5:_i_have_access_to_clean_water
0,2025-07-24 16:22:10.753,East Africa,Shelter,RESP1072,Prefer not to say,18-24,5,5,5,2,1
1,2025-07-25 16:22:10.753,East Africa,Water & Sanitation,RESP1010,Prefer not to say,25-34,4,4,2,5,3
2,2025-07-26 16:22:10.753,Middle East,Education,RESP1098,Female,Under 18,3,2,3,5,5
3,2025-07-26 16:22:10.753,Latin America,Health Access,RESP1100,Non-binary,25-34,5,2,1,4,5
4,2025-07-27 16:22:10.753,South Asia,He alth Access,RESP1042,Prefer not to say,25-34,5,5,4,2,5


### Handle Missing Values and Remove Duplicates

In this step, I checked for and handled any missing or duplicate records to make sure the dataset is complete and reliable.

Things i did:
- **Numeric columns** (e.g., Q1‚ÄìQ4): replaced missing values with the column *median*, which is less affected by outliers than the mean.
- **Categorical columns** (e.g., Region, Gender, Age Group): filled missing values with `"Unknown"` to preserve data consistency.
- **Duplicates**: removed any exact duplicate rows to avoid counting the same response more than once.

These actions ensure there are no gaps or redundant records before analysis.


In [59]:
# Handle missing values and remove duplicates

import numpy as np

# Separate numeric and categorical columns
num_cols = df.select_dtypes(include=np.number).columns
cat_cols = df.select_dtypes(exclude=np.number).columns


In [60]:
# Handle missing values

#replace NaN with Median
for col in num_cols:
    median_val = df[col].median()
    df[col].fillna(median_val, inplace=True)

In [61]:
# Categorical columns: replace NaN with 'Unknown'

for col in cat_cols:
    df[col].fillna('Unknown', inplace=True)

In [38]:
print("Missing values handled successfully")

Missing values handled successfully


In [62]:
# Remove duplicates

before = df.shape[0]
df.drop_duplicates(inplace=True)
after = df.shape[0]

print(f"Removed {before - after} duplicate rows.")

Removed 0 duplicate rows.


In [41]:
#Check no missing values

print("\nüîç Remaining missing values per column:")
print(df.isnull().sum())


üîç Remaining missing values per column:
submission_date                               0
region                                        0
survey_type                                   0
respondent_id                                 0
gender                                        0
age_group                                     0
q1:_i_have_access_to_basic_health_services    0
q2:_my_household_has_enough_food_to_eat       0
q3:_my_children_can_attend_school_safely      0
q4:_i_feel_safe_in_my_shelter                 0
q5:_i_have_access_to_clean_water              0
dtype: int64


### Convert Question Columns to Numeric and Standardize Categories

This step ensures all question columns (Q1‚ÄìQ4) are stored as numeric values and all categorical fields are formatted consistently.

Actions performed:
- Identified all survey question columns that start with "Q".
- Converted them to numeric values (in case some entries were text like "N/A" or "‚Äì").
- Standardized text categories (Region, Gender, Age Group, Survey Type) using *Title Case* to avoid mismatched labels (e.g., "north" and "North").


In [64]:
# Convert survey question columns to numeric and standardize categorical data

import re

# Identify all columns that look like "Q1", "Q2", "Q3", etc.
q_cols = [c for c in df.columns if re.match(r'^q\d+', c)]
print("Detected question columns:", q_cols)

Detected question columns: ['q1:_i_have_access_to_basic_health_services', 'q2:_my_household_has_enough_food_to_eat', 'q3:_my_children_can_attend_school_safely', 'q4:_i_feel_safe_in_my_shelter', 'q5:_i_have_access_to_clean_water']


In [65]:
#Convert these columns to numeric (handles text like 'N/A', blanks, etc.)

for col in q_cols:
    df[col] = pd.to_numeric(df[col], errors='coerce')

print("Converted question columns to numeric type.")

Converted question columns to numeric type.


In [66]:
# Standardize key categorical columns (convert to Title Case)

for col in ['region', 'age_group', 'gender', 'survey_type']:
    if col in df.columns:
        df[col] = df[col].astype(str).str.title().str.strip()
        
        print(" Standardized categorical columns (Region, Gender, Age Group, Survey Type).")

 Standardized categorical columns (Region, Gender, Age Group, Survey Type).
 Standardized categorical columns (Region, Gender, Age Group, Survey Type).
 Standardized categorical columns (Region, Gender, Age Group, Survey Type).
 Standardized categorical columns (Region, Gender, Age Group, Survey Type).


In [68]:
# check data types

print("\nüìä Updated data types:")
print(df.dtypes)


üìä Updated data types:
submission_date                               datetime64[ns]
region                                                object
survey_type                                           object
respondent_id                                         object
gender                                                object
age_group                                             object
q1:_i_have_access_to_basic_health_services             int64
q2:_my_household_has_enough_food_to_eat                int64
q3:_my_children_can_attend_school_safely               int64
q4:_i_feel_safe_in_my_shelter                          int64
q5:_i_have_access_to_clean_water                       int64
dtype: object


In [69]:
#Preview cleaned dataset

df.head()

Unnamed: 0,submission_date,region,survey_type,respondent_id,gender,age_group,q1:_i_have_access_to_basic_health_services,q2:_my_household_has_enough_food_to_eat,q3:_my_children_can_attend_school_safely,q4:_i_feel_safe_in_my_shelter,q5:_i_have_access_to_clean_water
0,2025-07-24 16:22:10.753,East Africa,Shelter,RESP1072,Prefer Not To Say,18-24,5,5,5,2,1
1,2025-07-25 16:22:10.753,East Africa,Water & Sanitation,RESP1010,Prefer Not To Say,25-34,4,4,2,5,3
2,2025-07-26 16:22:10.753,Middle East,Education,RESP1098,Female,Under 18,3,2,3,5,5
3,2025-07-26 16:22:10.753,Latin America,Health Access,RESP1100,Non-Binary,25-34,5,2,1,4,5
4,2025-07-27 16:22:10.753,South Asia,He Alth Access,RESP1042,Prefer Not To Say,25-34,5,5,4,2,5


###  Verify Cleaned Dataset and Save for Analysis

In this final cleaning step, I verified that:
- All missing values were properly handled.
- No duplicate records remain.
- Each column has the correct data type (numeric or categorical).
- The dataset is now complete and ready for analysis.

Finally, the cleaned dataset was exported as a new Excel file, ready for KPI calculations and visualization.


In [70]:
### Final verification and export cleaned dataset

# Check no missing values remain

print("Checking for any remaining missing values:")
print(df.isnull().sum())

Checking for any remaining missing values:
submission_date                               0
region                                        0
survey_type                                   0
respondent_id                                 0
gender                                        0
age_group                                     0
q1:_i_have_access_to_basic_health_services    0
q2:_my_household_has_enough_food_to_eat       0
q3:_my_children_can_attend_school_safely      0
q4:_i_feel_safe_in_my_shelter                 0
q5:_i_have_access_to_clean_water              0
dtype: int64


In [71]:
#Confirm dataset shape (rows √ó columns)

print(f"\n Final dataset shape: {df.shape[0]} rows √ó {df.shape[1]} columns")


 Final dataset shape: 123 rows √ó 11 columns


In [52]:
#Check data types

print("\n Final data types:")
print(df.dtypes)



 Final data types:
submission_date                               datetime64[ns]
region                                                object
survey_type                                           object
respondent_id                                         object
gender                                                object
age_group                                             object
q1:_i_have_access_to_basic_health_services             int64
q2:_my_household_has_enough_food_to_eat                int64
q3:_my_children_can_attend_school_safely               int64
q4:_i_feel_safe_in_my_shelter                          int64
q5:_i_have_access_to_clean_water                       int64
dtype: object


In [53]:
#Show the first few rows

df.head()

Unnamed: 0,submission_date,region,survey_type,respondent_id,gender,age_group,q1:_i_have_access_to_basic_health_services,q2:_my_household_has_enough_food_to_eat,q3:_my_children_can_attend_school_safely,q4:_i_feel_safe_in_my_shelter,q5:_i_have_access_to_clean_water
0,2025-07-24 16:22:10.753,East Africa,Shelter,RESP1072,Prefer Not To Say,18-24,5,5,5,2,1
1,2025-07-25 16:22:10.753,East Africa,Water & Sanitation,RESP1010,Prefer Not To Say,25-34,4,4,2,5,3
2,2025-07-26 16:22:10.753,Middle East,Education,RESP1098,Female,Under 18,3,2,3,5,5
3,2025-07-26 16:22:10.753,Latin America,Health Access,RESP1100,Non-Binary,25-34,5,2,1,4,5
4,2025-07-27 16:22:10.753,South Asia,He Alth Access,RESP1042,Prefer Not To Say,25-34,5,5,4,2,5


In [54]:
#Export the cleaned dataset to Excel

cleaned_file_path = r"C:\Users\hp\Downloads\Cleaned_TL20251023Mock_Survey.xlsx"
df.to_excel(cleaned_file_path, index=False)

print(f"\n Cleaned dataset saved successfully to:\n{cleaned_file_path}")



 Cleaned dataset saved successfully to:
C:\Users\hp\Downloads\Cleaned_TL20251023Mock_Survey.xlsx
