In [13]:
import pandas as pd

# Sample rows for the demo: rows 4 and 5 are identical, and within every row
# the last three values (columns D, E and F) are equal to one another.
data = [
    [1, 2, 3, 6, 6, 6],
    [6, 7, 8, 8, 8, 8],
    [11, 12, 13, 8, 8, 8],
    [11, 12, 13, 33, 33, 33],
    [11, 12, 13, 33, 33, 33],
]

# Build the DataFrame, labelling the six columns A through F.
df = pd.DataFrame(data=data, columns=list("ABCDEF"))
df

Unnamed: 0,A,B,C,D,E,F
0,1,2,3,6,6,6
1,6,7,8,8,8,8
2,11,12,13,8,8,8
3,11,12,13,33,33,33
4,11,12,13,33,33,33


# Removing Duplicate Rows

To remove rows that exactly duplicate another row (every column value matches), call the drop_duplicates() method without specifying any columns. The first occurrence of each row is kept and later copies are dropped.

In [14]:
# Return a new DataFrame without the rows that repeat an earlier row in every
# column. subset=None (compare all columns) and keep="first" are the defaults,
# spelled out here for clarity.
df.drop_duplicates(subset=None, keep="first")

Unnamed: 0,A,B,C,D,E,F
0,1,2,3,6,6,6
1,6,7,8,8,8,8
2,11,12,13,8,8,8
3,11,12,13,33,33,33


# Removing Duplicate Columns

In [15]:
# Deduplicate on column "E" alone: a row is dropped when its "E" value has
# already appeared in an earlier row, whatever the other columns hold.
# Only the first row for each distinct "E" value is kept.
df.drop_duplicates(subset="E")

Unnamed: 0,A,B,C,D,E,F
0,1,2,3,6,6,6
1,6,7,8,8,8,8
3,11,12,13,33,33,33


Pandas' drop_duplicates() only removes duplicate rows; it has no option for removing columns whose values duplicate another column. To do that, we first transpose the DataFrame using df.T. Transposing swaps rows and columns, so each original column becomes a row and duplicates can be dropped as if they were rows. Transposing the result back then restores the original orientation.

In [16]:
# Swap rows and columns; transpose() is the method behind the df.T shorthand.
df.transpose()

Unnamed: 0,0,1,2,3,4
A,1,6,11,11,11
B,2,7,12,12,12
C,3,8,13,13,13
D,6,8,8,33,33
E,6,8,8,33,33
F,6,8,8,33,33


In [17]:
# Transpose so that each original column becomes a row, then drop every row
# that repeats an earlier one -- these are the duplicated columns of df.
# The returned frame is assigned directly instead of mutating with
# inplace=True, so `a` is produced by a single expression.
a = df.T.drop_duplicates()

In [18]:
a  # Transposed frame after drop_duplicates(): one row per remaining original column

Unnamed: 0,0,1,2,3,4
A,1,6,11,11,11
B,2,7,12,12,12
C,3,8,13,13,13
D,6,8,8,33,33


In [19]:
# Transpose back so the remaining labels are columns again
# (transpose() is the method behind the .T shorthand).
a.transpose()

Unnamed: 0,A,B,C,D
0,1,2,3,6
1,6,7,8,8
2,11,12,13,8
3,11,12,13,33
4,11,12,13,33
