In [1]:
import pandas as pd

In [2]:
# Two rows and three columns; the column label "A" is repeated on purpose.
df1 = pd.DataFrame(
    data=[[0, 1, 2], [3, 4, 5]],
    columns=["A", "A", "B"],
)

## Duplicate Label Detection

In [3]:
# Check whether every row label of df1 is distinct.
df1.index.is_unique

True

In [5]:
# Check whether every column label of df1 is distinct; "A" appears twice, so it is not.
df1.columns.is_unique

False

In [6]:
# Index.duplicated() returns a boolean ndarray with one entry per label; True marks a
# label that repeats an earlier one. df1's row labels are all distinct, so every
# entry here is False.
df1.index.duplicated()

array([False, False])

In [7]:
# Keep only the first row seen for each index label by negating the duplicate mask.
df1.loc[~df1.index.duplicated(keep="first")]


Unnamed: 0,A,A.1,B
0,0,1,2
1,3,4,5


In [8]:
# If you need additional logic to handle duplicate labels, rather than just dropping
# the repeats, grouping on the index with groupby(level=0) is a common trick. Here
# duplicates are resolved by averaging all rows that share a label. df1 has no
# repeated row labels, so each group is a single row and the values simply become floats.
df1.groupby(level=0).mean()

Unnamed: 0,A,A.1,B
0,0.0,1.0,2.0
1,3.0,4.0,5.0


In [9]:
# Disallowing Duplicate Labels
# Build a frame whose labels are all unique, then ask for a new object that
# refuses duplicate labels from here on.
pd.DataFrame(
    data=[[0, 1, 2], [3, 4, 5]],
    columns=["A", "B", "C"],
).set_flags(allows_duplicate_labels=False)


Unnamed: 0,A,B,C
0,0,1,2
1,3,4,5


In [10]:
# The allows_duplicate_labels flag records whether an object may carry duplicate
# labels; it can be read or set through the object's .flags attribute.
df = pd.DataFrame(
    data={"A": [0, 1, 2, 3]},
    index=["x", "y", "X", "Y"],
).set_flags(allows_duplicate_labels=False)

In [11]:
# Read the flag back from df.
df.flags.allows_duplicate_labels

False

In [12]:
# DataFrame.set_flags() returns a new DataFrame with flags such as
# allows_duplicate_labels set to the requested value.

df2 = df.set_flags(allows_duplicate_labels=True)
df2.flags.allows_duplicate_labels

True

In [13]:
# The DataFrame returned by set_flags() is a view on the same data as the original.
# Alternatively, the flag can be assigned directly on an existing object through
# .flags, as done here on df2.

df2.flags.allows_duplicate_labels = False
df2.flags.allows_duplicate_labels

False

In [None]:
# Typical pipeline for raw, messy data: read it in (it may contain duplicate labels),
# deduplicate, then disallow duplicates so later steps cannot reintroduce them.
# NOTE: "..." is a placeholder path; point it at a real CSV file before running.
raw = pd.read_csv("...")
deduplicated = (
    raw.groupby(level=0)
    .first()  # remove duplicates: keep the first row for each index label
    .set_flags(allows_duplicate_labels=False)  # disallow going forward
)

In [None]:
# Setting allows_duplicate_labels=False on a Series or DataFrame with duplicate labels or performing an operation that introduces duplicate labels on a Series or DataFrame that disallows duplicates will raise an errors.DuplicateLabelError.

In [None]:
# Upper-casing the index would make "x"/"X" and "y"/"Y" collide, and df disallows
# duplicate labels, so rename raises DuplicateLabelError. The error is the point of
# this cell: catch it and print it so the notebook still completes under "Run All".
try:
    df.rename(str.upper)
except pd.errors.DuplicateLabelError as err:
    print(f"DuplicateLabelError: {err}")

## Duplicate Label Propagation

In [1]:
# Disallowing duplicates is generally "sticky": the setting is carried through
# operations on the object.
s1 = pd.Series(
    data=0,
    index=["a", "b"],
).set_flags(allows_duplicate_labels=False)
s1

a    0
b    0
dtype: int64

In [None]:
# This is an experimental feature. Currently, many methods fail to propagate the allows_duplicate_labels value. In future versions it is expected that every method taking or returning one or more DataFrame or Series objects will propagate allows_duplicate_labels.

In [None]:
# head() returns a new Series that still disallows duplicate labels, so renaming
# "a" to "b" (which would produce two "b" labels) raises DuplicateLabelError.
# Catch and print the error so the demonstration does not abort "Run All".
try:
    s1.head().rename({"a": "b"})
except pd.errors.DuplicateLabelError as err:
    print(f"DuplicateLabelError: {err}")