In [3]:
import pandas as pd
import re
import codecs

df = pd.read_csv("startup_funding_clean.csv")
def clean_startup_name(name):
    if pd.isnull(name):
        return None

    name = str(name).strip()

    # Decode escaped unicode sequences if present
    try:
        name = codecs.decode(name, 'unicode_escape')
    except Exception:
        pass

    # Remove all visible or hidden non-breaking space variants
    name = re.sub(r'(\\x[a-f0-9]{2})+', ' ', name)   # handles \xc2\xa0
    name = re.sub(r'Xc2Xa0', ' ', name, flags=re.IGNORECASE)  # handles literal Xc2Xa0
    name = name.replace('\xa0', ' ').replace('\xc2\xa0', ' ')

    # Remove URLs
    name = re.sub(r'http\S+|www\.\S+', '', name)

    # Replace fancy apostrophes/quotes
    name = name.replace('’', "'").replace('‘', "'").replace('“', '"').replace('”', '"')

    # Remove unwanted characters (keep . ' & / -)
    name = re.sub(r'[^a-zA-Z0-9\s\.\'\&\-/]', '', name)

    # Normalize spaces
    name = re.sub(r'\s+', ' ', name).strip()

    # Title-case formatting
    if name:
        name = name.title()

    return name

# Apply to your dataframe
df['Startup Name'] = df['Startup Name'].apply(clean_startup_name)

print("Unique startup count:", df['Startup Name'].nunique())
print(df['Startup Name'].head(20))


Unique startup count: 1634
0                           Byju'S
1                           Shuttl
2                        Mamaearth
3                                 
4                           Fashor
5                            Pando
6                           Zomato
7                           Ecozen
8                         Cardekho
9                     Dhruva Space
10                          Rivigo
11                      Healthians
12                         Licious
13                          Incred
14                           Trell
15                      Rein Games
16                    Lenskart.Com
17                      Freshworks
18                         Misters
19    Sunstone Eduversity Pvt. Ltd
Name: Startup Name, dtype: object


In [2]:
# Save cleaned dataset
df.to_csv("cleaned_startups_funding.csv", index=False)