### Removing typos from the dataset with a reusable cleaning function

### Import libraries

In [5]:
import pandas as pd
from rapidfuzz import fuzz,process

In [None]:
# Load the customer-arrivals CSV into the notebook-wide DataFrame `df`.
# NOTE(review): the path is an absolute path on one user's machine; it will
# need to be adjusted before this cell can run anywhere else.
df=pd.read_csv('C:/Users/5530/OneDrive/Desktop/Mall_Customer_Segmentation/Data_files/customer_arrivals_with_outliers.csv')

In [9]:
""" 
Creation of the function to handle all the cleaning process in the dataset which deals
with all the Gender column typos we can also use this function to handle the 
other columns too
"""

def cleaning_typos(df, column_name, valid_values, threshold=85, fill_value='other'):
    """
    Clean a categorical column in place and return the DataFrame.

    1. Fills missing values with the most frequent value (mode).
    2. Corrects typos by fuzzy-matching every distinct value against
       ``valid_values``.

    Comparison is done on stripped, lower-cased text for both the column
    values and ``valid_values``, so ``' MALE '`` and ``'Male'`` are treated
    the same as ``'male'``.

    Parameters:
    - df: DataFrame (``df[column_name]`` is overwritten with the cleaned values)
    - column_name: Column to clean (e.g., 'Gender')
    - valid_values: List of valid/expected values (e.g., ['male', 'female'])
    - threshold: Minimum match score for fuzzy matching (default = 85)
    - fill_value: Value to assign when no match is found (default = 'other').
      It is written as given, whereas matched values are title-cased.

    Returns:
    - df: The same DataFrame object, with the column cleaned. A column with
      no usable values, or an empty ``valid_values``, yields ``fill_value``
      in every row.
    """
    column = df[column_name]

    # Step 1: handle missing values in the column being cleaned.
    if column.isnull().any():
        modes = column.mode()
        if modes.empty:
            # Every row is missing, so there is no mode to borrow and nothing
            # to match: treat the whole column as "no match found".
            df[column_name] = fill_value
            return df
        column = column.fillna(modes[0])

    # Step 2: handle the typos in the column being cleaned.

    # Normalise both sides the same way; fuzz.ratio is case- and
    # whitespace-sensitive, so un-normalised choices would never reach the
    # threshold against lower-cased cell values.
    normalized = column.astype(str).str.strip().str.lower()
    choices = [str(choice).strip().lower() for choice in valid_values]

    # Score each distinct value once rather than once per row.
    mapping_dict = {}
    for value in normalized.unique():
        label = fill_value
        if choices:  # extractOne returns None when there are no choices
            match, score, _ = process.extractOne(value, choices, scorer=fuzz.ratio)
            if score >= threshold:
                label = match.title()
        mapping_dict[value] = label

    df[column_name] = normalized.map(mapping_dict)

    return df

In [None]:
# Accepted spellings that the Gender column is matched against.
valid_genders = ['male', 'female']

# Clean the Gender column using the default threshold and fill value.
df = cleaning_typos(df, 'Gender', valid_genders)

# Persist the cleaned data without the index column.
# NOTE(review): this is the same path the data was read from, so the
# original CSV is overwritten by the cleaned version.
df.to_csv('C:/Users/5530/OneDrive/Desktop/Mall_Customer_Segmentation/Data_files/customer_arrivals_with_outliers.csv',index=False)