In [3]:
import pandas as pd
import re

# Read the raw input CSV into a DataFrame
raw_csv_path = 'RAW_BGMEA.csv'
df = pd.read_csv(raw_csv_path)

# Function to clean and standardize phone numbers
def clean_phone_number(phone):
    """Normalize a Bangladeshi phone number to ``+880`` followed by 10 digits.

    Every character that is not an ASCII digit is discarded, then the
    remaining digits are matched against the accepted spellings of a number:

    * ``880`` + 10 digits, optionally preceded by ``00``
    * ``0`` + 10 digits (national format)
    * 10 digits without a leading ``0`` (what remains when the value was
      parsed as a number and the leading zero was dropped)

    Args:
        phone: Raw cell value; may be a string, an int, a float, NaN or None.

    Returns:
        ``'+'`` followed by 13 digits (``+880XXXXXXXXXX``), or ``None`` when
        the value is missing or matches none of the accepted spellings.
    """
    if pd.isna(phone):  # Handle missing or NaN values
        return None

    # A numeric column that contains NaN is loaded as float, so a number can
    # arrive as 1712345678.0; stringifying that directly would turn the ".0"
    # into a spurious extra digit once the dot is stripped.
    if isinstance(phone, float) and phone.is_integer():
        phone = int(phone)

    # Keep ASCII digits only; '+' and separators carry no information once
    # the prefix is recognised from the digits themselves.
    digits = re.sub(r'[^0-9]', '', str(phone))

    # Drop the "00" international dialling prefix when it precedes 880.
    if digits.startswith('00880'):
        digits = digits[2:]

    # Reduce every accepted spelling to the 10 digits that follow "+880".
    if len(digits) == 13 and digits.startswith('880'):
        national = digits[3:]
    elif len(digits) == 11 and digits.startswith('0'):
        national = digits[1:]
    elif len(digits) == 10 and not digits.startswith('0'):
        national = digits
    else:
        return None  # Invalid phone number format

    # The length is validated on digits, so the '+' sign is never counted.
    return '+880' + national

# Function to clean email addresses (extract a valid email if multiple exist)
def clean_email(email):
    """Return the first e-mail address found in a cell, or ``None``.

    The value is converted to ``str`` before searching, so NaN/None and other
    non-string cells simply produce no match.

    Args:
        email: Raw cell value, possibly holding several addresses separated
            by arbitrary delimiters.

    Returns:
        The first substring that looks like an e-mail address, or ``None``
        when there is none.
    """
    # The top-level-domain class is [A-Za-z]; a '|' inside a character class
    # is a literal pipe, which let pipe-separated addresses run together.
    match = re.search(
        r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b', str(email)
    )
    if match is None:
        return None  # No valid email found
    return match.group(0)

# Apply the cleaning functions to the relevant columns
for col in ['number']:
    df[col] = df[col].apply(clean_phone_number)

for col in ['mail']:
    df[col] = df[col].apply(clean_email)

# Save the cleaned data to a new CSV file
output_path = 'Clean_BGMEA.csv'
df.to_csv(output_path, index=False)

# Build the message from output_path so it always names the file written.
print(f"Data cleaned and saved to '{output_path}'.")


Data cleaned and saved to 'cleaned_dse.csv'.
