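An example showing how to detect duplicate rows in a data frame three ways: with pandas `duplicated()`, with a pandas group-size count, and with a data_algebra pipeline run both in pandas and in a database.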

In [1]:
import numpy as np
import pandas as pd
import data_algebra
import data_algebra.db_space

In [2]:
# Fixed seed so the generated example data (and the duplicates found in it)
# are the same on every fresh run of the notebook.
rng = np.random.default_rng(seed=2022)

In [3]:
def generate_example(*, n_columns: int = 5, n_rows: int = 10, generator=None):
    """Build a data frame of random single-letter categorical columns.

    Columns are named ``col_000``, ``col_001``, ... and each holds ``n_rows``
    values drawn independently, with replacement, from ``"a"``, ``"b"``,
    ``"c"`` and ``"d"``.

    :param n_columns: number of columns to generate (must be an ``int``).
    :param n_rows: number of rows to generate (must be an ``int``).
    :param generator: optional ``numpy.random.Generator`` to draw from.
        When ``None`` (the default) the notebook-level ``rng`` is used, so
        existing calls behave exactly as before.
    :return: ``pandas.DataFrame`` with ``n_rows`` rows and ``n_columns`` columns.
    :raises AssertionError: if ``n_columns`` or ``n_rows`` is not an ``int``.
    """
    assert isinstance(n_columns, int)
    assert isinstance(n_rows, int)
    # Resolve the randomness source at call time. Passing an explicit generator
    # makes a call reproducible regardless of how far `rng` has already advanced.
    source = rng if generator is None else generator
    levels = ["a", "b", "c", "d"]
    # Columns are drawn in index order, so the sequence of random draws is
    # the same as in the original single-expression form.
    return pd.DataFrame({
        f"col_{i:03d}": source.choice(levels, size=n_rows, replace=True)
        for i in range(n_columns)
    })

In [4]:
# Small working example: 1000 rows over 10 four-level columns, so an exact
# full-row repeat is rare but possible.
d = generate_example(n_rows=1000, n_columns=10)

d

Unnamed: 0,col_000,col_001,col_002,col_003,col_004,col_005,col_006,col_007,col_008,col_009
0,c,d,d,b,d,a,a,b,d,a
1,a,b,c,d,a,c,c,c,a,b
2,c,c,d,d,a,c,b,d,a,b
3,a,a,d,d,a,c,b,a,a,b
4,a,b,a,a,c,d,a,c,c,b
...,...,...,...,...,...,...,...,...,...,...
995,c,c,d,a,a,c,b,b,b,a
996,d,d,d,a,d,a,c,c,a,b
997,b,a,d,c,c,b,a,d,d,a
998,a,b,c,b,a,d,a,a,b,a


In [5]:
# Method 1: pandas built-in. keep=False flags every member of a repeated
# group, not only the second and later occurrences.
dup_locs_1 = d.duplicated(keep=False)

# Positional indices of the flagged rows.
np.nonzero(dup_locs_1.to_numpy())[0]

array([ 56, 245])

In [6]:

# Show the flagged rows themselves (all columns).
d.loc[dup_locs_1]

Unnamed: 0,col_000,col_001,col_002,col_003,col_004,col_005,col_006,col_007,col_008,col_009
56,c,c,d,a,c,a,b,c,d,b
245,c,c,d,a,c,a,b,c,d,b


In [7]:
# Method 2: group on every column and flag rows whose group has more than one member.
dup_locs_2 = d.groupby(d.columns.tolist()).transform("size").gt(1)
# Both methods must flag exactly the same rows.
assert (dup_locs_1 == dup_locs_2).all()

d.loc[dup_locs_2]

Unnamed: 0,col_000,col_001,col_002,col_003,col_004,col_005,col_006,col_007,col_008,col_009
56,c,c,d,a,c,a,b,c,d,b
245,c,c,d,a,c,a,b,c,d,b


In [8]:
# Method 3: express the duplicate-row query as a data_algebra pipeline.
ops = (
    data_algebra.descr(d=d)
        # add "count": the number of rows that share this row's values in every column
        .extend({"count": "(1).sum()"}, partition_by=d.columns)
        # keep only rows whose full set of values occurs more than once
        .select_rows("count > 1")
        # drop the helper column so only the original columns remain
        .drop_columns(["count"])
)

# Apply the pipeline directly to the pandas data frame.
ops_res = ops.transform(d)
# Row count must match the number of rows flagged by duplicated(keep=False).
assert ops_res.shape[0] == np.sum(dup_locs_1)

# Note: the displayed result is indexed 0..n-1 rather than by the original row labels.
ops_res

Unnamed: 0,col_000,col_001,col_002,col_003,col_004,col_005,col_006,col_007,col_008,col_009
0,c,c,d,a,c,a,b,c,d,b
1,c,c,d,a,c,a,b,c,d,b


In [9]:
# Create a data_algebra DBSpace and load d into it under the key "d",
# the same name the ops pipeline was described with.
# NOTE(review): DBSpace is presumably database-backed — confirm which engine it uses by default.
db_tables = data_algebra.db_space.DBSpace()
db_tables.insert(key="d", value=d)

'd'

In [10]:
# Run the same ops pipeline inside db_tables. The returned description carries
# the result's table name (read via .table_name when retrieving the result).
res_description = db_tables.execute(ops)

In [11]:
# Fetch the table produced by the database run and check that it has one row
# per duplicated row found by pandas.
db_res = db_tables.retrieve(res_description.table_name)
assert len(db_res) == dup_locs_1.sum()

db_res

Unnamed: 0,col_000,col_001,col_002,col_003,col_004,col_005,col_006,col_007,col_008,col_009
0,c,c,d,a,c,a,b,c,d,b
1,c,c,d,a,c,a,b,c,d,b


In [12]:
# Larger example (5 million rows, 20 columns) to exercise the same three methods at scale.
big_example = generate_example(n_rows=5_000_000, n_columns=20)


In [13]:
# Reference answer: number of rows that belong to a repeated full-row group.
sum_1 = big_example.duplicated(keep=False).sum()

sum_1

28

In [14]:
# Group-size method on the large example; it must agree with the reference count.
sum_2 = big_example.groupby(big_example.columns.tolist()).transform("size").gt(1).sum()
assert sum_1 == sum_2

In [15]:
# The same duplicate-row pipeline as for the small example, described against big_example.
big_ops = (
    data_algebra.descr(big_example=big_example)
        # add "count": the number of rows that share this row's values in every column
        .extend({"count": "(1).sum()"}, partition_by=big_example.columns)
        # keep only rows whose full set of values occurs more than once
        .select_rows("count > 1")
        # drop the helper column so only the original columns remain
        .drop_columns(["count"])
)
# Apply in pandas and check against the reference count.
big_res = big_ops.transform(big_example)
assert big_res.shape[0] == sum_1

In [16]:
# Load the large frame into db_tables under the key "big_example",
# the same name the big_ops pipeline was described with.
db_tables.insert(key="big_example", value=big_example)

'big_example'

In [17]:
# Run the large-example pipeline inside db_tables. The returned description
# carries the result's table name (read via .table_name when retrieving).
big_res_description = db_tables.execute(big_ops)

In [18]:
# Fetch the database result for the large example and check its row count
# against the pandas reference.
big_db_res = db_tables.retrieve(big_res_description.table_name)
assert len(big_db_res) == sum_1

In [19]:
# Show the SQL generated for the small-example pipeline; print() is used so the
# multi-line query is rendered with real line breaks.
print(ops.to_sql())

-- data_algebra SQL https://github.com/WinVector/data_algebra
--  dialect: SQLiteModel 1.4.6
--       string quote: '
--   identifier quote: "
WITH
 "extend_0" AS (
  SELECT  -- .extend({ 'count': '(1).sum()'}, partition_by=['col_000', 'col_001', 'col_002', 'col_003', 'col_004', 'col_005', 'col_006', 'col_007', 'col_008', 'col_009'])
   "col_000" ,
   "col_001" ,
   "col_002" ,
   "col_003" ,
   "col_004" ,
   "col_005" ,
   "col_006" ,
   "col_007" ,
   "col_008" ,
   "col_009" ,
   SUM(1) OVER ( PARTITION BY "col_000", "col_001", "col_002", "col_003", "col_004", "col_005", "col_006", "col_007", "col_008", "col_009"  )  AS "count"
  FROM
   "d"
 )
SELECT  -- .select_rows('count > 1')
 "col_000" ,
 "col_001" ,
 "col_002" ,
 "col_003" ,
 "col_004" ,
 "col_005" ,
 "col_006" ,
 "col_007" ,
 "col_008" ,
 "col_009"
FROM
 "extend_0"
WHERE
 "count" > 1



In [20]:
# Close the DBSpace now that all database work in this notebook is finished.
db_tables.close()