# In [4]:
import pandas as pd
 
# Columns that should be float (ratios, percentages, revenue, etc.).
FLOAT_COLS = (
    'nps_score', 'nps_score_all_time', 'avg_nps_rating', 'avg_nps_rating_all_time',
    'current_arr', 'future_arr', 'arr_change', 'fx_impact', 'seat_utilization',
    'seat_utilization_M_before', 'agent_utilization',
)

# Columns that should be stored with the pandas 'category' dtype.
CATEGORY_COLS = ('id', 'crm_industry_current', 'crm_employee_range')


def _is_binary(series):
    """Return True when *series* has no missing values and only holds 0/1."""
    if series.isna().any():
        # Casting NaN/NaT to int64 raises (or reinterprets datetimes), so a
        # column that still has gaps is never treated as a binary flag.
        return False
    observed = series.unique()
    # The empty set is a subset of {0, 1}; require at least one real value.
    return len(observed) > 0 and set(observed).issubset({0, 1})


def clean_dataframe(df, float_cols=FLOAT_COLS, category_cols=CATEGORY_COLS):
    """Return a cleaned copy of *df*; the input frame is left untouched.

    Steps, in order:
      1. Drop exact duplicate rows.
      2. Parse 'score_date' (if present) as datetime; unparseable -> NaT.
      3. Coerce the declared float columns to numeric (unparseable -> NaN).
      4. Fill missing numeric values with the column median.
      5. Fill missing values in object columns with 'Unknown'.
      6. Cast fully-populated 0/1 columns to int64.
      7. Cast the declared float columns to float64.
      8. Cast the declared categorical columns to 'category'.

    Args:
        df: Raw input DataFrame.
        float_cols: Names of columns to store as float64; absent ones are skipped.
        category_cols: Names of columns to store as category; absent ones are skipped.

    Returns:
        A new DataFrame with the steps above applied.
    """
    # Explicit copy so the column assignments below never write into a
    # view of the caller's frame (avoids SettingWithCopyWarning).
    cleaned = df.drop_duplicates().copy()

    # Convert 'score_date' to datetime and retain rows even if parsing fails.
    if 'score_date' in cleaned.columns:
        cleaned['score_date'] = pd.to_datetime(cleaned['score_date'], errors='coerce')

    # Coerce float columns BEFORE imputation: a numeric column that was read
    # as text would otherwise be filled with 'Unknown' in step 5 and then
    # either keep unfilled NaNs or fail the float64 cast.
    present_float = [col for col in float_cols if col in cleaned.columns]
    for col in present_float:
        cleaned[col] = pd.to_numeric(cleaned[col], errors='coerce')

    # Fill missing numerical values with the median.
    # NOTE(review): for a 0/1 flag column the median can be 0.5, which makes
    # the column non-binary; confirm whether flags should use the mode instead.
    num_cols = cleaned.select_dtypes(include=['number']).columns
    if len(num_cols) > 0:
        cleaned[num_cols] = cleaned[num_cols].fillna(cleaned[num_cols].median())

    # Fill missing categorical values with 'Unknown'.
    obj_cols = cleaned.select_dtypes(include=['object']).columns
    if len(obj_cols) > 0:
        cleaned[obj_cols] = cleaned[obj_cols].fillna('Unknown')

    # Ensure binary columns are explicitly int64.  astype() with a mapping
    # replaces the columns, so the new dtype is guaranteed to stick.
    binary_cols = [col for col in cleaned.columns if _is_binary(cleaned[col])]
    cleaned = cleaned.astype({col: 'int64' for col in binary_cols})

    # Ratios / percentages / revenue stay float64 even when they look binary.
    cleaned = cleaned.astype({col: 'float64' for col in present_float})

    # Convert categorical columns to category type (only if they exist).
    present_cat = [col for col in category_cols if col in cleaned.columns]
    cleaned = cleaned.astype({col: 'category' for col in present_cat})

    return cleaned


# File I/O only runs when executed as a script or notebook cell (where
# __name__ == "__main__"); the names stay module-level for later cells.
if __name__ == "__main__":
    # Load the dataset
    file_path = "merged_data.csv"  # Change this to your local file path
    df = pd.read_csv(file_path)

    df_cleaned = clean_dataframe(df)

    # Save the cleaned dataset
    cleaned_file_path = "final_fixed_merged_data.csv"
    df_cleaned.to_csv(cleaned_file_path, index=False)

    print(f"Cleaned dataset saved as: {cleaned_file_path}")

# Output: Cleaned dataset saved as: final_fixed_merged_data.csv
