# **Handling Duplicate Data**

In [1]:
import pandas as pd
import numpy as np

data = {
    'EmpID': [101, 102, 103, 101, 104, 105, 102, 106, 107, 107],
    'Name': ['Ali', 'Sara', 'Ahmed', 'Ali', 'Zara', 'Usman', 'Sara', 'Hina', 'Bilal', 'Bilal'],
    'Dept': ['IT', 'HR', 'IT', 'IT', 'Finance', 'HR', 'HR', 'IT', 'HR', 'HR'],
    'Email': [
        'ali@corp.com',
        'sara@corp.com',
        'ahmed@corp.com',
        'ali@corp.com',          # duplicate email
        'zara@corp.com',
        'usman@corp.com',
        'sara@corp.com',         # duplicate email
        'hina@corp.com',
        'bilal@corp.com',
        'bilal@corp.com'         # duplicate email
    ],
    'Salary': [50000, 60000, 55000, 50000, 65000, 58000, 60000, 52000, 57000, 57000],
    'Bonus': [5000, 6000, 5500, 5000, 7000, 5800, None, 5200, 6000, 6000],
    'JoinDate': pd.to_datetime([
        '2022-01-01',
        '2021-06-15',
        '2022-03-20',
        '2022-01-01',
        '2020-09-10',
        '2021-11-05',
        '2021-06-15',
        '2022-07-01',
        '2021-04-18',
        '2021-04-18'
    ]),
    'UpdatedAt': pd.to_datetime([
        '2024-01-10',
        '2024-01-12',
        '2024-01-11',
        '2024-02-01',   # later update for Ali
        '2024-01-15',
        '2024-01-18',
        '2024-02-10',   # later update for Sara
        '2024-01-20',
        '2024-01-25',
        '2024-02-05'    # later update for Bilal
    ])
}

df = pd.DataFrame(data)

# Introduce duplicate index deliberately
df.index = [0, 1, 2, 0, 4, 5, 1, 7, 8, 8]

df


Unnamed: 0,EmpID,Name,Dept,Email,Salary,Bonus,JoinDate,UpdatedAt
0,101,Ali,IT,ali@corp.com,50000,5000.0,2022-01-01,2024-01-10
1,102,Sara,HR,sara@corp.com,60000,6000.0,2021-06-15,2024-01-12
2,103,Ahmed,IT,ahmed@corp.com,55000,5500.0,2022-03-20,2024-01-11
0,101,Ali,IT,ali@corp.com,50000,5000.0,2022-01-01,2024-02-01
4,104,Zara,Finance,zara@corp.com,65000,7000.0,2020-09-10,2024-01-15
5,105,Usman,HR,usman@corp.com,58000,5800.0,2021-11-05,2024-01-18
1,102,Sara,HR,sara@corp.com,60000,,2021-06-15,2024-02-10
7,106,Hina,IT,hina@corp.com,52000,5200.0,2022-07-01,2024-01-20
8,107,Bilal,HR,bilal@corp.com,57000,6000.0,2021-04-18,2024-01-25
8,107,Bilal,HR,bilal@corp.com,57000,6000.0,2021-04-18,2024-02-05


---
# **1. Detecting Duplicate Rows**
### **Check if ANY Duplicates Exist**

In [2]:
df.duplicated().any()

np.False_

### **Identify Duplicate Rows**

In [3]:
df[df.duplicated()]

Unnamed: 0,EmpID,Name,Dept,Email,Salary,Bonus,JoinDate,UpdatedAt


### **Count Duplicate Rows**


In [4]:
df.duplicated().sum()

np.int64(0)

### **Detect Duplicates Based On Specific Columns**


In [5]:
df.duplicated(subset=['Email'])
df['Email'].duplicated()

0    False
1    False
2    False
0     True
4    False
5    False
1     True
7    False
8    False
8     True
Name: Email, dtype: bool

### **Get Rows Containing Duplicates Based on Specific Columns `subset=['ColName']`**

In [6]:
df[df['Email'].duplicated()]
df[df.duplicated(subset=['Email'])]
df[df.duplicated(subset=['Name', 'Dept'])]


Unnamed: 0,EmpID,Name,Dept,Email,Salary,Bonus,JoinDate,UpdatedAt
0,101,Ali,IT,ali@corp.com,50000,5000.0,2022-01-01,2024-02-01
1,102,Sara,HR,sara@corp.com,60000,,2021-06-15,2024-02-10
8,107,Bilal,HR,bilal@corp.com,57000,6000.0,2021-04-18,2024-02-05


---
# **2. Removing Duplicate Rows**
### **Remove Duplicates (keep first)**

In [7]:
df.drop_duplicates()

Unnamed: 0,EmpID,Name,Dept,Email,Salary,Bonus,JoinDate,UpdatedAt
0,101,Ali,IT,ali@corp.com,50000,5000.0,2022-01-01,2024-01-10
1,102,Sara,HR,sara@corp.com,60000,6000.0,2021-06-15,2024-01-12
2,103,Ahmed,IT,ahmed@corp.com,55000,5500.0,2022-03-20,2024-01-11
0,101,Ali,IT,ali@corp.com,50000,5000.0,2022-01-01,2024-02-01
4,104,Zara,Finance,zara@corp.com,65000,7000.0,2020-09-10,2024-01-15
5,105,Usman,HR,usman@corp.com,58000,5800.0,2021-11-05,2024-01-18
1,102,Sara,HR,sara@corp.com,60000,,2021-06-15,2024-02-10
7,106,Hina,IT,hina@corp.com,52000,5200.0,2022-07-01,2024-01-20
8,107,Bilal,HR,bilal@corp.com,57000,6000.0,2021-04-18,2024-01-25
8,107,Bilal,HR,bilal@corp.com,57000,6000.0,2021-04-18,2024-02-05


### **Remove Duplicates (keep last)**

In [8]:
df.drop_duplicates(keep='last')

Unnamed: 0,EmpID,Name,Dept,Email,Salary,Bonus,JoinDate,UpdatedAt
0,101,Ali,IT,ali@corp.com,50000,5000.0,2022-01-01,2024-01-10
1,102,Sara,HR,sara@corp.com,60000,6000.0,2021-06-15,2024-01-12
2,103,Ahmed,IT,ahmed@corp.com,55000,5500.0,2022-03-20,2024-01-11
0,101,Ali,IT,ali@corp.com,50000,5000.0,2022-01-01,2024-02-01
4,104,Zara,Finance,zara@corp.com,65000,7000.0,2020-09-10,2024-01-15
5,105,Usman,HR,usman@corp.com,58000,5800.0,2021-11-05,2024-01-18
1,102,Sara,HR,sara@corp.com,60000,,2021-06-15,2024-02-10
7,106,Hina,IT,hina@corp.com,52000,5200.0,2022-07-01,2024-01-20
8,107,Bilal,HR,bilal@corp.com,57000,6000.0,2021-04-18,2024-01-25
8,107,Bilal,HR,bilal@corp.com,57000,6000.0,2021-04-18,2024-02-05


### **Remove All Duplicates (keep none)**

In [9]:
df.drop_duplicates(keep=False)

Unnamed: 0,EmpID,Name,Dept,Email,Salary,Bonus,JoinDate,UpdatedAt
0,101,Ali,IT,ali@corp.com,50000,5000.0,2022-01-01,2024-01-10
1,102,Sara,HR,sara@corp.com,60000,6000.0,2021-06-15,2024-01-12
2,103,Ahmed,IT,ahmed@corp.com,55000,5500.0,2022-03-20,2024-01-11
0,101,Ali,IT,ali@corp.com,50000,5000.0,2022-01-01,2024-02-01
4,104,Zara,Finance,zara@corp.com,65000,7000.0,2020-09-10,2024-01-15
5,105,Usman,HR,usman@corp.com,58000,5800.0,2021-11-05,2024-01-18
1,102,Sara,HR,sara@corp.com,60000,,2021-06-15,2024-02-10
7,106,Hina,IT,hina@corp.com,52000,5200.0,2022-07-01,2024-01-20
8,107,Bilal,HR,bilal@corp.com,57000,6000.0,2021-04-18,2024-01-25
8,107,Bilal,HR,bilal@corp.com,57000,6000.0,2021-04-18,2024-02-05


### **Removing Rows Containing Duplicates Based on Specific Columns `subset=['ColName']`**

In [10]:
df.drop_duplicates(subset=['Email'])

Unnamed: 0,EmpID,Name,Dept,Email,Salary,Bonus,JoinDate,UpdatedAt
0,101,Ali,IT,ali@corp.com,50000,5000.0,2022-01-01,2024-01-10
1,102,Sara,HR,sara@corp.com,60000,6000.0,2021-06-15,2024-01-12
2,103,Ahmed,IT,ahmed@corp.com,55000,5500.0,2022-03-20,2024-01-11
4,104,Zara,Finance,zara@corp.com,65000,7000.0,2020-09-10,2024-01-15
5,105,Usman,HR,usman@corp.com,58000,5800.0,2021-11-05,2024-01-18
7,106,Hina,IT,hina@corp.com,52000,5200.0,2022-07-01,2024-01-20
8,107,Bilal,HR,bilal@corp.com,57000,6000.0,2021-04-18,2024-01-25


---
# **3. Handling Duplicate Indexes**
### **Detect Duplicate Indexes**

In [11]:
df.index.duplicated()

array([False, False, False,  True, False, False,  True, False, False,
        True])

### **Get Rows Containing Duplicate Indexes**


In [12]:
df[df.index.duplicated()]

Unnamed: 0,EmpID,Name,Dept,Email,Salary,Bonus,JoinDate,UpdatedAt
0,101,Ali,IT,ali@corp.com,50000,5000.0,2022-01-01,2024-02-01
1,102,Sara,HR,sara@corp.com,60000,,2021-06-15,2024-02-10
8,107,Bilal,HR,bilal@corp.com,57000,6000.0,2021-04-18,2024-02-05


### **Remove Duplicate Indexes (Keep First)**


In [13]:
df=df[~df.index.duplicated()]
df

Unnamed: 0,EmpID,Name,Dept,Email,Salary,Bonus,JoinDate,UpdatedAt
0,101,Ali,IT,ali@corp.com,50000,5000.0,2022-01-01,2024-01-10
1,102,Sara,HR,sara@corp.com,60000,6000.0,2021-06-15,2024-01-12
2,103,Ahmed,IT,ahmed@corp.com,55000,5500.0,2022-03-20,2024-01-11
4,104,Zara,Finance,zara@corp.com,65000,7000.0,2020-09-10,2024-01-15
5,105,Usman,HR,usman@corp.com,58000,5800.0,2021-11-05,2024-01-18
7,106,Hina,IT,hina@corp.com,52000,5200.0,2022-07-01,2024-01-20
8,107,Bilal,HR,bilal@corp.com,57000,6000.0,2021-04-18,2024-01-25


### **Reset Index (most common fix)**


In [14]:
df.reset_index(drop=True,inplace=True)
df

Unnamed: 0,EmpID,Name,Dept,Email,Salary,Bonus,JoinDate,UpdatedAt
0,101,Ali,IT,ali@corp.com,50000,5000.0,2022-01-01,2024-01-10
1,102,Sara,HR,sara@corp.com,60000,6000.0,2021-06-15,2024-01-12
2,103,Ahmed,IT,ahmed@corp.com,55000,5500.0,2022-03-20,2024-01-11
3,104,Zara,Finance,zara@corp.com,65000,7000.0,2020-09-10,2024-01-15
4,105,Usman,HR,usman@corp.com,58000,5800.0,2021-11-05,2024-01-18
5,106,Hina,IT,hina@corp.com,52000,5200.0,2022-07-01,2024-01-20
6,107,Bilal,HR,bilal@corp.com,57000,6000.0,2021-04-18,2024-01-25


---
# **4. Duplicate Values (Column-Level)**
### **Identify  Duplicated Values in a Column**

In [15]:
df['Email'].duplicated()

0    False
1    False
2    False
3    False
4    False
5    False
6    False
Name: Email, dtype: bool

### **Get Rows Containing Duplicate Values**


In [16]:
df[df['Email'].duplicated()]

Unnamed: 0,EmpID,Name,Dept,Email,Salary,Bonus,JoinDate,UpdatedAt


### **Count Frequency of Duplicates**

In [17]:
df['Email'].value_counts()
df['Email'].value_counts().loc[lambda x:x >1]

Series([], Name: count, dtype: int64)

---
# **5.  Best Techniques (REAL WORLD)**

In [18]:
user_activity = pd.DataFrame({
    'UserID': [1, 1, 1, 2, 2, 3, 4, 4],
    'Email': [
        'ali@gmail.com',
        'ali@gmail.com',
        'ali@gmail.com',   # same user, multiple updates
        'sara@gmail.com',
        'sara@gmail.com',
        'ahmed@gmail.com',
        'bilal@gmail.com',
        'bilal@gmail.com'  # duplicate email
    ],
    'PageViews': [5, 7, 10, 3, 6, 4, 8, 9],
    'PurchaseAmount': [200, 0, 300, 150, 200, 0, 100, 250],
    'Visits': [1, 2, 3, 1, 2, 1, 2, 3],
    'updated_at': pd.to_datetime([
        '2024-01-01',
        '2024-01-10',
        '2024-02-01',   # latest record for User 1
        '2024-01-05',
        '2024-01-20',   # latest record for User 2
        '2024-01-15',
        '2024-01-18',
        '2024-02-05'    # latest record for User 4
    ])
})
user_activity

Unnamed: 0,UserID,Email,PageViews,PurchaseAmount,Visits,updated_at
0,1,ali@gmail.com,5,200,1,2024-01-01
1,1,ali@gmail.com,7,0,2,2024-01-10
2,1,ali@gmail.com,10,300,3,2024-02-01
3,2,sara@gmail.com,3,150,1,2024-01-05
4,2,sara@gmail.com,6,200,2,2024-01-20
5,3,ahmed@gmail.com,4,0,1,2024-01-15
6,4,bilal@gmail.com,8,100,2,2024-01-18
7,4,bilal@gmail.com,9,250,3,2024-02-05


### **Technique 1: Keep Row with Latest Timestamp**

In [19]:
user_activity.sort_values(by='updated_at',inplace=True)
user_activity.drop_duplicates(subset='UserID', keep='last', inplace=True)
user_activity

Unnamed: 0,UserID,Email,PageViews,PurchaseAmount,Visits,updated_at
5,3,ahmed@gmail.com,4,0,1,2024-01-15
4,2,sara@gmail.com,6,200,2,2024-01-20
2,1,ali@gmail.com,10,300,3,2024-02-01
7,4,bilal@gmail.com,9,250,3,2024-02-05


### **Technique 2: Aggregate Instead of Drop**
ðŸ§  Prefer aggregation when data is meaningful, not noise.

In [20]:
user_activity.groupby('UserID',as_index=False).agg({ 'PurchaseAmount':'sum','Visits':'max'})

Unnamed: 0,UserID,PurchaseAmount,Visits
0,1,300,3
1,2,200,2
2,3,0,1
3,4,250,3


### **Technique 3: Deduplicate Before Merge**

In [21]:
# right_df = right_df.drop_duplicates(subset='ID')
# pd.merge(left_df, right_df, on='ID')

### **Technique 4: Flag Instead of Remove**

In [22]:
user_activity['is_duplicate']=user_activity.duplicated(subset='UserID',keep=False)
user_activity

Unnamed: 0,UserID,Email,PageViews,PurchaseAmount,Visits,updated_at,is_duplicate
5,3,ahmed@gmail.com,4,0,1,2024-01-15,False
4,2,sara@gmail.com,6,200,2,2024-01-20,False
2,1,ali@gmail.com,10,300,3,2024-02-01,False
7,4,bilal@gmail.com,9,250,3,2024-02-05,False
