In [2]:
# 1) Imports & file path
import pandas as pd
from pathlib import Path

# Input CSV (relative path); change if your file is elsewhere.
csv_path = Path("Customer360.csv")

# Fail fast with a real exception rather than `assert`: assertions are skipped
# when Python runs with -O, and is_file() also rejects a directory of that name.
if not csv_path.is_file():
    raise FileNotFoundError(f"File not found: {csv_path}")


In [3]:
# 2) Read the CSV into `df` and report its size and column names
#    (this is the "before" snapshot, taken prior to any cleaning).
df = pd.read_csv(csv_path)

print(
    "Before de-duplication:",
    f"Rows: {df.shape[0]:,}",
    f"Columns: {list(df.columns)}",
    sep="\n",
)


Before de-duplication:
Rows: 92,910
Columns: ['Lead_Company', 'VW_technology', 'URL', 'Source_Name']


In [4]:
# 3) (Optional but recommended) normalize the Lead_Company values
#    This helps catch duplicates that only differ by case or extra spaces.
#
#    NOTE(review): the normalized text replaces the original values in `df`,
#    so anything written out from this frame later carries the lower-cased,
#    whitespace-collapsed names rather than the original spelling. Confirm that
#    is intended; otherwise normalize into a separate key column.
col = "Lead_Company"
if col not in df.columns:
    raise KeyError(f"Column '{col}' not found in CSV.")

# Record which rows actually have a value *before* casting: astype(str) turns a
# missing value (NaN) into the literal text "nan", which would then be treated
# as a real company name and saved to disk as such.
has_company = df[col].notna()

normalized = (
    df[col]
    .astype(str)                           # ensure string
    .str.strip()                           # trim leading/trailing whitespace
    .str.replace(r"\s+", " ", regex=True)  # collapse runs of whitespace
    .str.lower()                           # make comparisons case-insensitive
)

# Keep missing values missing (NaN) instead of the string "nan".
df[col] = normalized.where(has_company)


In [5]:
# 4) De-duplicate on the `col` column: the first row seen for each value survives
before = df.shape[0]

# Flag every repeat of an already-seen value, then keep only the unflagged rows
# and renumber the index from 0.
is_repeat = df.duplicated(subset=[col], keep="first")
df_dedup = df.loc[~is_repeat].reset_index(drop=True)
after = df_dedup.shape[0]

print(f"Removed {before - after:,} duplicate rows based on '{col}'.")
print(f"After de-duplication: {after:,} rows.")


Removed 69,885 duplicate rows based on 'Lead_Company'.
After de-duplication: 23,025 rows.


In [6]:
# 5) Save back to the SAME file (overwrites the original)
#    Writing straight to csv_path would truncate the only copy of the data
#    before the new content is fully written, so the write is staged instead.

# One-time safety copy of the file as it was before the first overwrite.
# An existing backup is left alone so re-running the notebook never replaces
# it with already de-duplicated data.
backup_path = csv_path.with_name(csv_path.name + ".bak")
if csv_path.exists() and not backup_path.exists():
    backup_path.write_bytes(csv_path.read_bytes())
    print(f"Backed up original file to: {backup_path}")

# Write to a sibling temp file, then swap it into place with a single rename;
# if to_csv fails part-way, csv_path still holds its previous content.
tmp_path = csv_path.with_name(csv_path.name + ".tmp")
try:
    df_dedup.to_csv(tmp_path, index=False)
    tmp_path.replace(csv_path)
finally:
    # After a successful replace() the temp file is gone; this only cleans up
    # a partial file left behind by a failed write.
    if tmp_path.exists():
        tmp_path.unlink()

print(f"Saved de-duplicated data back to: {csv_path}")


Saved de-duplicated data back to: Customer360.csv
