In [1]:
import pandas as pd

In [3]:
# Sample order table, written one order per row so repeated orders are easy to
# spot: orders 1003 and 1008 match 1001, and 1006 matches 1002, in every field
# except OrderID.
order_columns = ['OrderID', 'CustomerID', 'Product', 'Quantity', 'Price', 'OrderDate']
order_rows = [
    (1001, 1, 'Laptop', 1, 1200, '2025-01-15'),
    (1002, 2, 'Mouse', 2, 25, '2025-01-15'),
    (1003, 1, 'Laptop', 1, 1200, '2025-01-15'),
    (1004, 3, 'Keyboard', 1, 75, '2025-01-16'),
    (1005, 4, 'Monitor', 1, 300, '2025-01-17'),
    (1006, 2, 'Mouse', 2, 25, '2025-01-15'),
    (1007, 5, 'Webcam', 3, 40, '2025-01-18'),
    (1008, 1, 'Laptop', 1, 1200, '2025-01-15'),
]

# Pivot the rows into the column -> list-of-values mapping handed to pd.DataFrame.
order_data = {
    column: list(values)
    for column, values in zip(order_columns, zip(*order_rows))
}
df_orders = pd.DataFrame(order_data)

# Show the full table first, then its column summary.
print("Original Order Data (with potential duplicates):", df_orders, sep="\n")
print("\nOriginal Data Info:")
df_orders.info()

Original Order Data (with potential duplicates):
   OrderID  CustomerID   Product  Quantity  Price   OrderDate
0     1001           1    Laptop         1   1200  2025-01-15
1     1002           2     Mouse         2     25  2025-01-15
2     1003           1    Laptop         1   1200  2025-01-15
3     1004           3  Keyboard         1     75  2025-01-16
4     1005           4   Monitor         1    300  2025-01-17
5     1006           2     Mouse         2     25  2025-01-15
6     1007           5    Webcam         3     40  2025-01-18
7     1008           1    Laptop         1   1200  2025-01-15

Original Data Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8 entries, 0 to 7
Data columns (total 6 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   OrderID     8 non-null      int64 
 1   CustomerID  8 non-null      int64 
 2   Product     8 non-null      object
 3   Quantity    8 non-null      int64 
 4   Price       8 non-null     

In [5]:
# With subset=None, duplicated() compares entire rows; keep='first' flags only
# the second and later occurrences. Both are the pandas defaults, spelled out
# here. Every OrderID in the sample data is distinct, so no row is flagged.
is_duplicate_default = df_orders.duplicated(subset=None, keep='first')
print("Rows marked as duplicate (default - checking ALL columns):")
print(is_duplicate_default)

Rows marked as duplicate (default - checking ALL columns):
0    False
1    False
2    False
3    False
4    False
5    False
6    False
7    False
dtype: bool


In [None]:
# Columns that describe what was ordered: every column of the order table
# except the OrderID identifier.
order_content_columns = [
    'CustomerID',
    'Product',
    'Quantity',
    'Price',
    'OrderDate',
]

In [7]:
# Compare rows on the order-content columns only, so OrderID is ignored.
# keep='first' is the pandas default, written out to match the printed label:
# the first occurrence stays False and each later repeat is marked True.
is_duplicate_by_content = df_orders.duplicated(
    subset=order_content_columns,
    keep='first',
)
print("\nRows marked as duplicate (based on order content, keeping first):")
print(is_duplicate_by_content)


Rows marked as duplicate (based on order content, keeping first):
0    False
1    False
2     True
3    False
4    False
5     True
6    False
7     True
dtype: bool


In [8]:
# Use the boolean mask to select only the rows flagged as repeated orders.
duplicate_order_rows = df_orders.loc[is_duplicate_by_content]
print("\nActual duplicate rows based on order content:")
print(duplicate_order_rows)


Actual duplicate rows based on order content:
   OrderID  CustomerID Product  Quantity  Price   OrderDate
2     1003           1  Laptop         1   1200  2025-01-15
5     1006           2   Mouse         2     25  2025-01-15
7     1008           1  Laptop         1   1200  2025-01-15


In [None]:
# drop_duplicates() with no arguments compares whole rows and keeps the first
# occurrence. Every OrderID in the sample data is distinct, so no row is
# dropped here.
df_no_duplicates_all_cols = df_orders.drop_duplicates()
# The label is split across two literals with exactly one separating space.
# Alignment belongs outside the quotes: padding inside a continued string
# literal becomes part of the printed text.
print("\nDataFrame after dropping duplicates "
      "(based on ALL columns, keeping first):")
print(df_no_duplicates_all_cols)
print("\nInfo after dropping duplicates (all columns):")
df_no_duplicates_all_cols.info()


DataFrame after dropping duplicates (based on ALL columns, keeping first):
   OrderID  CustomerID   Product  Quantity  Price   OrderDate
0     1001           1    Laptop         1   1200  2025-01-15
1     1002           2     Mouse         2     25  2025-01-15
2     1003           1    Laptop         1   1200  2025-01-15
3     1004           3  Keyboard         1     75  2025-01-16
4     1005           4   Monitor         1    300  2025-01-17
5     1006           2     Mouse         2     25  2025-01-15
6     1007           5    Webcam         3     40  2025-01-18
7     1008           1    Laptop         1   1200  2025-01-15

Info after dropping duplicates (all columns):
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8 entries, 0 to 7
Data columns (total 6 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   OrderID     8 non-null      int64 
 1   CustomerID  8 non-null      int64 
 2   Product     8 non-null      object
 3   Quantity    8 

In [11]:
# Columns compared when deciding whether two orders have the same content.
# OrderID is not included, so orders that differ only by ID count as repeats.
order_content_columns = [
    'CustomerID',
    'Product',
    'Quantity',
    'Price',
    'OrderDate',
]

In [None]:
# Keep the first occurrence of each distinct order content and drop the later
# repeats. The index is not reset, so surviving rows keep their original labels.
df_no_duplicates_by_content = df_orders.drop_duplicates(
    subset=order_content_columns,
    keep='first',
)
print(
    "\nDataFrame after dropping duplicates (based on order content, keeping first):",
    df_no_duplicates_by_content,
    sep="\n",
)
print("\nInfo after dropping duplicates (by content):")
df_no_duplicates_by_content.info()


DataFrame after dropping duplicates (based on order content, keeping first):
   OrderID  CustomerID   Product  Quantity  Price   OrderDate
0     1001           1    Laptop         1   1200  2025-01-15
1     1002           2     Mouse         2     25  2025-01-15
3     1004           3  Keyboard         1     75  2025-01-16
4     1005           4   Monitor         1    300  2025-01-17
6     1007           5    Webcam         3     40  2025-01-18

Info after dropping duplicates (by content):
<class 'pandas.core.frame.DataFrame'>
Index: 5 entries, 0 to 6
Data columns (total 6 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   OrderID     5 non-null      int64 
 1   CustomerID  5 non-null      int64 
 2   Product     5 non-null      object
 3   Quantity    5 non-null      int64 
 4   Price       5 non-null      int64 
 5   OrderDate   5 non-null      object
dtypes: int64(4), object(2)
memory usage: 280.0+ bytes
