In [13]:
from dataclasses import dataclass
import re
from typing import Callable
import os
import json
import numpy as np
import pandas as pd


SAMPLE_DIR = './sample/samples-1000'


# Data transformation (temporary since it should be treated by another service)

### Files without outliers (from data-exploration.ipynb)

In [4]:
@dataclass
class PatternConverter:
    pattern: re.Pattern
    conversion_factor: Callable[[str], float]


chaos_pattern = PatternConverter(re.compile(r'^~price \d+ chaos$'), lambda s: int(s.split()[1]))
bo_chaos_pattern = PatternConverter(re.compile(r'^~b/o \d+ chaos$'), lambda s: int(s.split()[1]))
divine_pattern = PatternConverter(re.compile(r'^~price \d+ divine$'), lambda s: int(s.split()[1]) * 165)
bo_divine_pattern = PatternConverter(re.compile(r'^~b/o \d+ divine$'), lambda s: int(s.split()[1]) * 165)
chaos_veiled_pattern = PatternConverter(re.compile(r'^~price \d+ chaos veiled$'), lambda s: int(s.split()[1]) * 74.5)
bo_chaos_veiled_pattern = PatternConverter(re.compile(r'^~b/o \d+ chaos veiled$'), lambda s: int(s.split()[1]) * 74.5)

def match_any_patten_and_convert(string: str) -> float | None:
    for pattern in [chaos_pattern, bo_chaos_pattern, divine_pattern, bo_divine_pattern, chaos_veiled_pattern, bo_chaos_veiled_pattern]:
        if pattern.pattern.match(string):
            return pattern.conversion_factor(string)
    return None

In [7]:
@dataclass
class PriceWithFileName:
    name: str
    price_note: str
    price: float | None

item_description = []
for file in os.listdir(SAMPLE_DIR):
    if file.endswith('.json'):
        with open(os.path.join(SAMPLE_DIR, file)) as f:
            data = json.load(f)
            item_description.append(PriceWithFileName(file, data['note'], match_any_patten_and_convert(data['note'])))

item_description_filtered_by_known_price = [item for item in item_description if item.price is not None]

item_prices = np.array([item.price for item in item_description_filtered_by_known_price])

quantile_1 = np.percentile(item_prices, 25)
quantile_3 = np.percentile(item_prices, 75)

inter_quantile_range = quantile_3 - quantile_1

lower_bound = quantile_1 - 1.5 * inter_quantile_range
upper_bound = quantile_3 + 1.5 * inter_quantile_range

item_description_with_known_price_without_outlier = [
    item
    for item in item_description_filtered_by_known_price
    if lower_bound <= item.price <= upper_bound
]
item_description_with_known_price_without_outlier

[PriceWithFileName(name='f3861a3209a17af1e52e975fec04ab806f7ee7b549d62eb957d3663d9eb9692b.json', price_note='~b/o 4 chaos', price=4),
 PriceWithFileName(name='2980dc39bd25c9651fc3a89d43c07bedc158c8142e376ed2068aefde9800dce6.json', price_note='~price 4 chaos', price=4),
 PriceWithFileName(name='7c64b68cdf2d852beae8c519ce7f85f7a66c3b9a39590f2d2bc3a124d757981a.json', price_note='~price 50 chaos', price=50),
 PriceWithFileName(name='705122ad9a23ee31018f5992ab5407d9a6dbd8e7c3f1be28866c16f0036274b8.json', price_note='~b/o 58 chaos', price=58),
 PriceWithFileName(name='ede29fa2b5117c51ac1a8df3558001c966d2e3c2cb003867c277a804a084e6ca.json', price_note='~price 10 chaos', price=10),
 PriceWithFileName(name='ee791acb87c0a588decac34cefa0a75b97c680fe6944a0a3b8ee4fae3af8d726.json', price_note='~price 10 chaos', price=10),
 PriceWithFileName(name='e4a8d4df42a5ef6a4f3418249f373b2f9610a73ac316e7f85c63db382884b825.json', price_note='~price 9 chaos', price=9),
 PriceWithFileName(name='248c6ce8bc85b7278ec

In [8]:
len(item_description_with_known_price_without_outlier)

849

## Transforming json into list of integers

In [11]:
filtered_file_name = [item.name for item in item_description_with_known_price_without_outlier]

items = []
for file in filtered_file_name:
    with open(os.path.join(SAMPLE_DIR, file)) as f:
        data = json.load(f)
        items.append(data)

### rarity

In [15]:
rarities = [item["rarity"] for item in items]
uniq_rarities = set(rarities)
uniq_rarities

{'Magic', 'Rare'}

In [17]:
rarity_distribution = pd.Series(rarities).value_counts().sort_index()
rarity_distribution

Magic     18
Rare     831
Name: count, dtype: int64

In [18]:
def _infer_rarity(item: dict) -> list[int] | None:
    if "rarity" not in item:
        return None
    match item["rarity"]:
        case "Rare":
            return [1, 0]
        case "Magic":
            return [0, 1]
        case _:
            return None

### item level

In [20]:
ilvls = [item["ilvl"] for item in items]
uniq_ilvls = set(ilvls)
uniq_ilvls

{45,
 46,
 48,
 49,
 50,
 51,
 52,
 53,
 54,
 55,
 56,
 57,
 58,
 59,
 61,
 62,
 63,
 64,
 65,
 66,
 67,
 68,
 69,
 70,
 71,
 72,
 73,
 74,
 75,
 76,
 77,
 78,
 79,
 80,
 81,
 82,
 83,
 84,
 85,
 86,
 87}

In [21]:
ilvl_distribution = pd.Series(ilvls).value_counts().sort_index()
ilvl_distribution

45      6
46      1
48      1
49      8
50      3
51      3
52      1
53      3
54      4
55      3
56      5
57      2
58      2
59      5
61      3
62      4
63     10
64      3
65      9
66      2
67     14
68     14
69     14
70     27
71     48
72     40
73     43
74     49
75     33
76     32
77     30
78     43
79     25
80     34
81     34
82     39
83     90
84     31
85    124
86      6
87      1
Name: count, dtype: int64

In [22]:
def _ilvl(item: dict) -> list[int] | None:
    if "ilvl" not in item:
        return None
    return [item["ilvl"]]

### mods

In [29]:
explicits_mods = [item_not_none for item in items if (item_not_none := item["explicitMods"]) is not None]
explicits_mods

[['+13 to all Attributes',
  'Adds 8 to 13 Physical Damage to Attacks',
  '+9% to Global Critical Strike Multiplier',
  '+50 to maximum Mana',
  '+8% to Lightning Resistance'],
 ['+17 to Armour',
  '+43 to maximum Energy Shield',
  '+9% to Cold Resistance',
  '5% increased Flask Effect Duration'],
 ['Adds 1 to 2 Cold Damage to Attacks',
  'Adds 1 to 14 Lightning Damage to Attacks',
  '+91 to Evasion Rating'],
 ['+44 to Dexterity',
  '+34 to Armour',
  '+88 to maximum Life',
  '+29% to Fire Resistance'],
 ['15% increased Global Accuracy Rating',
  '+26 to maximum Life',
  '+10% to all Elemental Resistances',
  '+32% to Lightning Resistance',
  '10% increased Light Radius'],
 ['+25 to Strength',
  '+137 to Armour',
  '+48 to maximum Energy Shield',
  '+41 to maximum Life',
  '21% increased Stun Duration on Enemies',
  '6% increased Flask Charges gained'],
 ['Adds 1 to 5 Lightning Damage to Attacks',
  'Regenerate 2.7 Life per second',
  '+12% to all Elemental Resistances'],
 ['Adds 4 to 

In [30]:
flatten_explicit_mods = [mod for mods in explicits_mods for mod in mods]
flatten_explicit_mods

['+13 to all Attributes',
 'Adds 8 to 13 Physical Damage to Attacks',
 '+9% to Global Critical Strike Multiplier',
 '+50 to maximum Mana',
 '+8% to Lightning Resistance',
 '+17 to Armour',
 '+43 to maximum Energy Shield',
 '+9% to Cold Resistance',
 '5% increased Flask Effect Duration',
 'Adds 1 to 2 Cold Damage to Attacks',
 'Adds 1 to 14 Lightning Damage to Attacks',
 '+91 to Evasion Rating',
 '+44 to Dexterity',
 '+34 to Armour',
 '+88 to maximum Life',
 '+29% to Fire Resistance',
 '15% increased Global Accuracy Rating',
 '+26 to maximum Life',
 '+10% to all Elemental Resistances',
 '+32% to Lightning Resistance',
 '10% increased Light Radius',
 '+25 to Strength',
 '+137 to Armour',
 '+48 to maximum Energy Shield',
 '+41 to maximum Life',
 '21% increased Stun Duration on Enemies',
 '6% increased Flask Charges gained',
 'Adds 1 to 5 Lightning Damage to Attacks',
 'Regenerate 2.7 Life per second',
 '+12% to all Elemental Resistances',
 'Adds 4 to 10 Physical Damage to Attacks',
 '+14 

In [32]:
mod_without_numbers = [re.sub(r'\d+(\.\d+)?', '', mod) for mod in flatten_explicit_mods]
mod_without_numbers_frequency = pd.Series(mod_without_numbers).value_counts().sort_values(ascending=False)
mod_without_numbers_frequency

+ to maximum Energy Shield                                      261
+ to maximum Mana                                               236
+ to maximum Life                                               227
+ to Strength                                                   176
+% to Cold Resistance                                           164
                                                               ... 
% increased Projectile Attack Damage during any Flask Effect      1
Remove Ignite and Burning when you use a Flask                    1
Gain % of Physical Damage as Extra Lightning Damage               1
Projectiles Pierce an additional Target                           1
Grants Level  Anger Skill                                         1
Name: count, Length: 110, dtype: int64