# Deleting Duplicate Data and Single-Value Columns [EXTRA]

_Demonstrates deleting duplicate data and single-value columns as data preprocessing activities._

In [1]:
# Imports required packages
import pandas as pd

## Deleting Duplicate Data

In [2]:
# Reads the Iris CSV into a DataFrame. header=None tells pandas not to use
# any row as a header, so the columns receive integer labels (0, 1, 2, ...).
data = pd.read_csv(
    "./../Data/iris/iris.csv",
    header=None,
)

# Renders the loaded frame as rich notebook output
display(data)

Unnamed: 0,0,1,2,3,4
0,5.1,3.5,1.4,0.2,Iris-setosa
1,4.9,3.0,1.4,0.2,Iris-setosa
2,4.7,3.2,1.3,0.2,Iris-setosa
3,4.6,3.1,1.5,0.2,Iris-setosa
4,5.0,3.6,1.4,0.2,Iris-setosa
...,...,...,...,...,...
145,6.7,3.0,5.2,2.3,Iris-virginica
146,6.3,2.5,5.0,1.9,Iris-virginica
147,6.5,3.0,5.2,2.0,Iris-virginica
148,6.2,3.4,5.4,2.3,Iris-virginica


In [3]:
# Shows if there are duplicated data

# Boolean mask with one entry per row: True where the row is an exact copy
# of an earlier row. With the pandas default (keep="first") the first
# occurrence of each repeated row is NOT flagged.
duplicate = data.duplicated()

# Counts the flagged rows by summing the mask computed above (True counts
# as 1) instead of recomputing duplicated() and looping with builtin sum()
duplicate_count = int(duplicate.sum())

print("There are", duplicate_count, "duplicates in the data set.\n")

# Shows all duplicate data
if duplicate_count > 0:
    print("Duplicate Data:\n", data[duplicate])

There are 3 duplicates in the data set.

Duplicate Data:
        0    1    2    3               4
34   4.9  3.1  1.5  0.1     Iris-setosa
37   4.9  3.1  1.5  0.1     Iris-setosa
142  5.8  2.7  5.1  1.9  Iris-virginica


In [4]:
# Prints the shape of data before removal of duplicates
print("Data shape before duplicate removal:", data.shape)

# Deletes duplicate rows, keeping the first occurrence of each. The result
# is rebound to `data` rather than mutating the frame in place, so the
# object displayed by the loading cell is not modified behind its back.
data = data.drop_duplicates()

# Prints the shape of data after removal of duplicates
print("Data shape after duplicate removal:", data.shape)

Data shape before duplicate removal: (150, 5)
Data shape after duplicate removal: (147, 5)


## Removing Single-Value Columns

In [5]:
# Reads the oil-spill CSV into a DataFrame. header=None tells pandas not to
# use any row as a header, so the columns receive integer labels (0, 1, ...).
data = pd.read_csv(
    "./../Data/oil-spill.csv",
    header=None,
)

# Renders the loaded frame as rich notebook output
display(data)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,40,41,42,43,44,45,46,47,48,49
0,1,2558,1506.09,456.63,90,6395000.0,40.88,7.89,29780.0,0.19,...,2850.00,1000.00,763.16,135.46,3.73,0,33243.19,65.74,7.95,1
1,2,22325,79.11,841.03,180,55812500.0,51.11,1.21,61900.0,0.02,...,5750.00,11500.00,9593.48,1648.80,0.60,0,51572.04,65.73,6.26,0
2,3,115,1449.85,608.43,88,287500.0,40.42,7.34,3340.0,0.18,...,1400.00,250.00,150.00,45.13,9.33,1,31692.84,65.81,7.84,1
3,4,1201,1562.53,295.65,66,3002500.0,42.40,7.97,18030.0,0.19,...,6041.52,761.58,453.21,144.97,13.33,1,37696.21,65.67,8.07,1
4,5,312,950.27,440.86,37,780000.0,41.43,7.03,3350.0,0.17,...,1320.04,710.63,512.54,109.16,2.58,0,29038.17,65.66,7.35,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
932,200,12,92.42,364.42,135,97200.0,59.42,10.34,884.0,0.17,...,381.84,254.56,84.85,146.97,4.50,0,2593.50,65.85,6.39,0
933,201,11,98.82,248.64,159,89100.0,59.64,10.18,831.0,0.17,...,284.60,180.00,150.00,51.96,1.90,0,4361.25,65.70,6.53,0
934,202,14,25.14,428.86,24,113400.0,60.14,17.94,847.0,0.30,...,402.49,180.00,180.00,0.00,2.24,0,2153.05,65.91,6.12,0
935,203,10,96.00,451.30,68,81000.0,59.90,15.01,831.0,0.25,...,402.49,180.00,90.00,73.48,4.47,0,2421.43,65.97,6.32,0


In [6]:
# Gets the number of unique values for each column; the result is a Series
# indexed by column label. Note: nunique() ignores NaN by default, so a
# column holding one value plus missing entries also counts as single-value.
unique_values_per_attrib = data.nunique()

# Records the LABELS of the columns to delete. Taking them from the Series
# index (rather than enumerate() positions) keeps this correct even when
# column labels are not the default 0..N-1 integers, because drop() below
# matches on labels, not positions.
single_value_columns = unique_values_per_attrib[
    unique_values_per_attrib == 1
].index.tolist()
print("Single-value column indexes", single_value_columns)

# Prints the shape of data before removal of single-value columns
print("Data shape before single-value column removal:", data.shape)

# Deletes single-value columns; rebinds `data` instead of mutating in place
data = data.drop(columns=single_value_columns)

# Prints the shape of data after removal of single-value columns
print("Data shape after single-value column removal:", data.shape)

Single-value column indexes [22]
Data shape before single-value column removal: (937, 50)
Data shape after single-value column removal: (937, 49)
