# 🔍 Handling Partial Duplicates

This notebook challenges you to identify and address **partial duplicates** in a dataset.

In [1]:
# Step 1: Load dataset
import pandas as pd
df = pd.DataFrame({
    'id': [101, 102, 103, 104, 105, 106, 107],
    'name': ['Jon Smith', 'John Smith', 'J. Smith', 'Alice B.', 'Alice Brown', 'Bob', 'Bobby'],
    'email': ['jon.smith@mail.com', 'john.smith@mail.com', 'smith.j@mail.com',
              'aliceb@mail.com', 'alice.brown@mail.com', 'bob@mail.com', 'bob@mail.com'],
    'age': [34, 34, 34, 29, 29, 45, 45],
    'city': ['New York', 'New York', 'New York', 'Los Angeles', 'Los Angeles', 'Chicago', 'Chicago']
})
df

Unnamed: 0,id,name,email,age,city
0,101,Jon Smith,jon.smith@mail.com,34,New York
1,102,John Smith,john.smith@mail.com,34,New York
2,103,J. Smith,smith.j@mail.com,34,New York
3,104,Alice B.,aliceb@mail.com,29,Los Angeles
4,105,Alice Brown,alice.brown@mail.com,29,Los Angeles
5,106,Bob,bob@mail.com,45,Chicago
6,107,Bobby,bob@mail.com,45,Chicago


## Step 2: Explore potential duplicates by age and city

In [2]:
# Group entries with same age and city
df.groupby(['age', 'city']).size().reset_index(name='count')

Unnamed: 0,age,city,count
0,29,Los Angeles,2
1,34,New York,3
2,45,Chicago,2


## Step 3: Flag repeated emails

In [3]:
# Count how many times each email appears
df['email_count'] = df.groupby('email')['email'].transform('count')
df[df['email_count'] > 1]

Unnamed: 0,id,name,email,age,city,email_count
5,106,Bob,bob@mail.com,45,Chicago,2
6,107,Bobby,bob@mail.com,45,Chicago,2


## Step 4: Final dataset


In [7]:
# Remove duplicate rows based on all columns except 'id' and keep the first occurrence
df_no_duplicates = df.drop_duplicates(subset=['name', 'email', 'age', 'city'])
#df_no_duplicates = df.drop_duplicates(subset=['email'])
df_no_duplicates

Unnamed: 0,id,name,email,age,city,email_count
0,101,Jon Smith,jon.smith@mail.com,34,New York,1
1,102,John Smith,john.smith@mail.com,34,New York,1
2,103,J. Smith,smith.j@mail.com,34,New York,1
3,104,Alice B.,aliceb@mail.com,29,Los Angeles,1
4,105,Alice Brown,alice.brown@mail.com,29,Los Angeles,1
5,106,Bob,bob@mail.com,45,Chicago,2
6,107,Bobby,bob@mail.com,45,Chicago,2
