Skip to content

Commit

Permalink
Merge pull request #10 from Vivswan/main
Browse files Browse the repository at this point in the history
v1.0.0
  • Loading branch information
Vivswan committed Jun 2, 2023
2 parents 82e932b + 0aafeb0 commit 8714883
Show file tree
Hide file tree
Showing 8 changed files with 99 additions and 26 deletions.
9 changes: 4 additions & 5 deletions .github/workflows/unittest.yaml
Original file line number Diff line number Diff line change
@@ -1,11 +1,10 @@
name: Python package
name: Unittest

on:
push:
branches:
- master
- main
- release
branches: [ master, main, release ]
pull_request:
branches: [ master, main, release ]

jobs:
build:
Expand Down
4 changes: 4 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -1,5 +1,9 @@
# Changelog

## 1.0.1

* Optimize internals and remove redundant code

## 1.0.0

* Public release
5 changes: 5 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
# DeDuplicationDict

[![PyPI version](https://badge.fury.io/py/deduplicationdict.svg)](https://badge.fury.io/py/deduplicationdict)
[![Unittest](https://github.com/Vivswan/DeDuplicationDict/actions/workflows/unittest.yaml/badge.svg)](https://github.com/Vivswan/DeDuplicationDict/actions/workflows/unittest.yaml)
[![Documentation Status](https://readthedocs.org/projects/deduplicationdict/badge/?version=release)](https://deduplicationdict.readthedocs.io/en/release/?badge=release)
[![Python](https://img.shields.io/badge/python-3.7--3.12-blue)](https://badge.fury.io/py/deduplicationdict)
[![License: MPL 2.0](https://img.shields.io/badge/License-MPL_2.0-blue.svg)](https://opensource.org/licenses/MPL-2.0)
Expand Down Expand Up @@ -60,6 +61,9 @@ assert dedup_dict["e"] == [1, 2, 3]
assert DeDuplicationDict.from_json_save_dict(dedup_dict.to_json_save_dict()).to_dict() == dedup_dict.to_dict()
```

For usage with [SqliteDict](https://github.com/RaRe-Technologies/sqlitedict), see [SqliteDeDuplicationDict.py](https://gist.github.com/Vivswan/6fca547b2927e0bf11743869058d4b10).

## Results from Testing

| Method | JSON Memory (MB) | In-Memory (MB) |
Expand All @@ -70,6 +74,7 @@ assert DeDuplicationDict.from_json_save_dict(dedup_dict.to_json_save_dict()).to_
| _Memory Saving_ | **7.868x** | **7.235x** |

[//]: # (![dict vs DeDuplicationDict](https://github.com/Vivswan/DeDuplicationDict/raw/release/docs/_static/dict_vs_DeDuplicationDict.svg))

![dict vs DeDuplicationDict](docs/_static/dict_vs_DeDuplicationDict.svg)

## Documentation
Expand Down
61 changes: 47 additions & 14 deletions deduplicationdict/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -71,11 +71,14 @@ def __init__(self, *args, _value_dict: dict = None, **kwargs):
for k, v in dict(*args, **kwargs).items():
self[k] = v

def _set_value_dict(self, value_dict: dict) -> None:
def _set_value_dict(self, value_dict: dict) -> DeDuplicationDict:
"""Update the value dictionary and propagate the changes to nested DeDuplicationDict instances.
Args:
value_dict (dict): The new value dictionary to use for deduplication.
Return:
DeDuplicationDict: self
"""

value_dict.update(self.value_dict)
Expand All @@ -86,6 +89,8 @@ def _set_value_dict(self, value_dict: dict) -> None:

v._set_value_dict(value_dict)

return self

def __setitem__(self, key: KT, value: VT) -> None:
"""Set the value for the given key, deduplicating the value if necessary.
Expand All @@ -105,7 +110,9 @@ def __setitem__(self, key: KT, value: VT) -> None:
else:
hash_id = sha256(pickle.dumps(value)).hexdigest()[:self.hash_length]
self.key_dict[key] = hash_id
self.value_dict[hash_id] = value

if hash_id not in self.value_dict:
self.value_dict[hash_id] = value

def __getitem__(self, key: KT) -> VT_co:
"""Get the value for the given key.
Expand Down Expand Up @@ -148,15 +155,21 @@ def all_hashes_in_use(self) -> set:
all_hashes_in_use.update(v.all_hashes_in_use())
return all_hashes_in_use

def clean_up(self) -> None:
"""Remove unused hash values from the value dictionary."""
def clean_up(self) -> DeDuplicationDict:
"""Remove unused hash values from the value dictionary.
Return:
DeDuplicationDict: self
"""

all_hashes_in_use = self.all_hashes_in_use()
all_hashes = set(self.value_dict.keys())
not_in_use = all_hashes - all_hashes_in_use
for hash_id in not_in_use:
del self.value_dict[hash_id]

return self

def detach(self) -> DeDuplicationDict:
"""Detach the DeDuplicationDict instance from its value dictionary, creating a standalone instance.
Expand All @@ -166,15 +179,21 @@ def detach(self) -> DeDuplicationDict:

return self.from_json_save_dict(self.to_json_save_dict())

def _del_detach(self) -> None:
"""Detach the DeDuplicationDict instance from its value dictionary and clean up unused hash values."""
def _del_detach(self) -> DeDuplicationDict:
"""Detach the DeDuplicationDict instance from its value dictionary and clean up unused hash values.
Return:
DeDuplicationDict: self
"""

self._set_value_dict({})
self.clean_up()

for k, v in self.value_dict.items():
self.value_dict[k] = copy.deepcopy(v)

return self

def __delitem__(self, key: KT) -> None:
"""Delete the item with the given key.
Expand Down Expand Up @@ -266,6 +285,27 @@ def to_json_save_dict(self) -> dict:
'value_dict': self.value_dict
}

def _set_key_dict(self, key_dict: dict) -> DeDuplicationDict:
    """Populate this instance's key dictionary from a plain nested dictionary.

    Nested ``dict`` values are rebuilt as new instances of this class that
    share this instance's value dictionary; every other value is stored
    unchanged under its key.

    Args:
        key_dict (dict): The key dictionary to load.

    Returns:
        DeDuplicationDict: self
    """

    for name, entry in key_dict.items():
        if not isinstance(entry, dict):
            self.key_dict[name] = entry
            continue

        # A nested mapping becomes a child instance that points at the same
        # value dictionary as its parent before its own keys are loaded.
        child = self.__class__()
        child.value_dict = self.value_dict
        child._set_key_dict(entry)
        self.key_dict[name] = child

    return self

@classmethod
def from_json_save_dict(cls, d: dict, _v: dict = None) -> DeDuplicationDict:
"""Create a DeDuplicationDict instance from a dictionary that was saved to a JSON file.
Expand All @@ -282,13 +322,6 @@ def from_json_save_dict(cls, d: dict, _v: dict = None) -> DeDuplicationDict:
return cls.from_json_save_dict(d['key_dict'], _v=d['value_dict'])

new_dict = cls()
new_dict.key_dict = {}
new_dict.value_dict = _v

for k, v in d.items():
if isinstance(v, dict):
new_dict.key_dict[k] = cls.from_json_save_dict(v, _v=_v)
else:
new_dict.key_dict[k] = v

new_dict._set_key_dict(d)
return new_dict
2 changes: 1 addition & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@ where = ["deduplicationdict"]
[project]
# $ pip install deduplicationdict
name = "deduplicationdict"
version = "1.0.0"
version = "1.0.1"
description = "A dictionary that de-duplicates values."
readme = "README.md"
requires-python = ">=3.7"
Expand Down
File renamed without changes.
File renamed without changes.
44 changes: 38 additions & 6 deletions unit_test/test_main.py → tests/test_main.py
Original file line number Diff line number Diff line change
Expand Up @@ -258,16 +258,48 @@ def test_from_dict(self):
json_data = json.dumps(data, sort_keys=True)
self.assertEqual(json_dd_dict, json_data)

def test_get_key_dict(self):
    """Check that _get_key_dict exposes every key of the source data at every nesting level."""

    source = get_json_test_data()
    dd_dict = DeDuplicationDict(**source)

    # Walk the source data and the key dictionary side by side.
    pending = [(source, dd_dict._get_key_dict())]
    while pending:
        expected, actual = pending.pop()
        for key, value in expected.items():
            self.assertIn(key, actual)

            if isinstance(value, dict):
                pending.append((value, actual[key]))

def test_set_key_dict(self):
    """Check that _set_key_dict rebuilds an equivalent instance from an existing key dictionary."""

    source = get_json_test_data()
    original = DeDuplicationDict(**source)

    # The rebuilt instance reuses the original's value dictionary and only
    # receives the key structure through _set_key_dict.
    rebuilt = DeDuplicationDict()
    rebuilt.value_dict = original.value_dict
    rebuilt._set_key_dict(original._get_key_dict())

    expected_json = json.dumps(source, sort_keys=True)
    original_json = json.dumps(original.to_dict(), sort_keys=True)
    rebuilt_json = json.dumps(rebuilt.to_dict(), sort_keys=True)
    self.assertEqual(expected_json, original_json)
    self.assertEqual(expected_json, rebuilt_json)

def test_size_compression(self):
"""Test the size compression of the DeDuplicationDict class."""

data = get_json_test_data()
dd_dict = DeDuplicationDict(**data)
size_dd_dict = get_size(dd_dict)
size_data = get_size(data)
print(f'size_dd_dict: {size_dd_dict}')
print(f'size_data: {size_data}')
print(f'size_dd_dict / size_data: {size_dd_dict / size_data}')
print(f'size_dd_dict: {size_dd_dict / 1024 / 1024:0.3f} MB')
print(f'size_data: {size_data / 1024 / 1024:0.3f} MB')
print(f'size reduction: {size_data / size_dd_dict:0.3f}x')
self.assertLessEqual(size_dd_dict, size_data)

def test_json_size_compression(self):
Expand All @@ -277,9 +309,9 @@ def test_json_size_compression(self):
dd_dict = DeDuplicationDict(**data)
size_dd_json = get_size(json.dumps(dd_dict.to_json_save_dict()))
size_json = get_size(json.dumps(data))
print(f'size_dd_json: {size_dd_json}')
print(f'size_json: {size_json}')
print(f'size_dd_json / size_json: {size_dd_json / size_json}')
print(f'size_dd_json: {size_dd_json / 1024 / 1024:0.3f} MB')
print(f'size_json: {size_json / 1024 / 1024:0.3f} MB')
print(f'size reduction: {size_json / size_dd_json:0.3f}x')
self.assertLessEqual(size_dd_json, size_json)

def test_cache_dict(self):
Expand Down

0 comments on commit 8714883

Please sign in to comment.