**Importing Libraries**

In [104]:
import pandas as pd
import numpy as np
import random

# Definindo a semente para reprodução
np.random.seed(42)
random.seed(42)

**Reading Files**

In [105]:
magic_items = pd.read_csv('data/magic_items.csv', sep=';')
adventure_gear = pd.read_csv('data/adventuring_gear.csv', sep=';')
armor = pd.read_csv('data/armor.csv', sep=';')
poisons = pd.read_csv('data/poisons.csv', sep=';')
potions = pd.read_csv('data/potions.csv', sep=';')
weapons = pd.read_csv('data/weapons.csv', sep=';')
names = pd.read_csv('data/names.csv', sep=';')

In [106]:
magic_items.head()

Unnamed: 0,item_id,Name,Price,Rarity,Category
0,001-ACo,Ammunition +1 (Per),15 gp,Uncommon,Consumable Items
1,002-ACo,Ammunition +2 (Per),50 gp,Rare,Consumable Items
2,003-ACo,Ammunition +3 (Per),250 gp,Very Rare,Consumable Items
3,004-ACo,Arrow of Slaying,400 gp,Very Rare,Consumable Items
4,005-BCo,Bead of Force,"1,000 gp",Rare,Consumable Items


In [107]:
adventure_gear.head()

Unnamed: 0,item_id,Name,Price,Weight,type
0,01-Ars,Abacus,2 gp,2 lb.,Others
1,02-Ars,Acid (vial),25 gp,1 lb.,Others
2,03-Ars,Alchemist's Fire (flask),50 gp,1 lb.,Others
3,04-Aon,Arrows (20),1 gp,1 lb.,Ammunition
4,05-Bon,Blowgun Needle (50),1 gp,1 lb.,Ammunition


In [108]:
weapons.head()

Unnamed: 0,item_id,Name,Price,Damage,Weight,Properties,type
0,01-Cns,Club,1 sp,1d4 Bludgeon,2 lb.,Light,Simple Melee Weapons
1,02-Dns,Dagger,2 gp,1d4 Piercing,1 lb.,"Finesse, Light, Thrown (20/60)",Simple Melee Weapons
2,03-Gns,Greatclub,2 sp,1d8 Bludgeon,10 lb.,Two-handed,Simple Melee Weapons
3,04-Hns,Handaxe,5 gp,1d6 Slashing,2 lb.,"Light, Thrown (20/60)",Simple Melee Weapons
4,05-Jns,Javelin,5 sp,1d6 Piercing,2 lb.,Thrown (30/120),Simple Melee Weapons


In [109]:
armor.head()

Unnamed: 0,item_id,Name,Price,AC,Weight,Requirements,Stealth,type
0,01-Por,Padded,5 gp,11 + Dex,8 lb.,,Disadvantage,Light Armor
1,02-Lor,Leather,10 gp,11 + Dex,10 lb.,,,Light Armor
2,03-Sor,Studded Leather,45 gp,12 + Dex,13 lb.,,,Light Armor
3,04-Hor,Hide,10 gp,12 + Dex(max2),12 lb.,,,Medium Armor
4,05-Cor,Chain Shirt,50 gp,13 + Dex(max2),20 lb.,,,Medium Armor


In [110]:
potions.head()

Unnamed: 0,item_id,Name,Price,Rarity
0,01-Pon,Potion of Healing,50 gp,Common
1,02-Pon,Potion of Greater Healing,150 gp,Uncommon
2,03-Pre,Potion of Superior Healing,450 gp,Rare
3,04-Pre,Potion of Supreme Healing,"1,350 gp",Very Rare
4,05-Ere,Elixir of Health,120 gp,Rare


In [111]:
poisons.head()

Unnamed: 0,item_id,Name,Price,Type,DC
0,01-Aed,Assassin's blood,150 gp,Ingested,10.0
1,02-Ted,Truth serum,150 gp,Ingested,11.0
2,03-Cct,Carrion crawler mucus,200 gp,Contact,13.0
3,04-Dry,Drow poison,200 gp,Injury,13.0
4,05-Sry,Serpent venom,200 gp,Injury,11.0


****

# **Data Creation**

**Passos:**

1. Filtrar as Raridades dos itens
1. Criar uma tabela que contenha todos os produtos
    * A tabela deve ter apenas informações básicas sobre os itens, como: id | nome | preço | tipo.
1. Criar uma tabela com as informações dos clientes
1. Criar a tabela fato de **vendas**




### **Filtrar Raridades:**

In [112]:
magic_items['Rarity'].value_counts()

Rare         114
Uncommon      83
Very Rare     63
Legendary     38
Common         2
Name: Rarity, dtype: int64

Como eu pretendo simular as vendas de um vendedor comum, não vou usar itens que sejam: Very Rare, Legendary.

In [113]:
# função para filtrar a raridade dos items
def filter_rarity(dataframe):
    unwanted = ['Very Rare', 'Legendary']
    df = dataframe.copy()
    df = df.query("Rarity not in @unwanted")
    return df

In [114]:
magic_items_filtered = filter_rarity(magic_items)

In [115]:
magic_items_filtered['Rarity'].value_counts()

Rare        114
Uncommon     83
Common        2
Name: Rarity, dtype: int64

In [116]:
potions_filtered =  filter_rarity(potions)

In [117]:
potions_filtered['Rarity'].value_counts()

Uncommon    10
Rare        10
Common       2
Name: Rarity, dtype: int64

Nenhum outro Dataset tem a coluna 'Rarity'

### **Criar tabela Produtos**

Primeiramente devo mudar as colunas 'type' para 'cetegory', depois criar uma nova coluna 'type' com o tipo de item de cada dataset.

In [118]:
# função para substituir a coluna type e colocar os nomes das colunas em minúsculo
def replace_lower_column(df):
    df = df.columns.str.lower().str.replace('type', 'category')
    return df

In [119]:
adventure_gear.columns = replace_lower_column(adventure_gear)
magic_items_filtered.columns = replace_lower_column(magic_items_filtered)
armor.columns = replace_lower_column(armor)
weapons.columns = replace_lower_column(weapons)
potions_filtered.columns =  replace_lower_column(potions_filtered)
poisons.columns = replace_lower_column(poisons)

**Criando as novas colunas 'type':**

In [120]:
adventure_gear['type'] = 'adventure_gear'
magic_items_filtered['type'] = 'magic_item'
weapons['type'] = 'weapon'
potions_filtered['type'] = 'potion'
poisons['type'] = 'poison'
armor['type'] = 'armor'

**Criando a tabela Produtos:**

| id | nome | preço | tipo |


In [121]:
wanted_cols = ['item_id', 'name', 'price', 'type']

In [122]:
product = magic_items_filtered[wanted_cols].copy()

In [123]:
product = pd.concat([product,
                     adventure_gear[wanted_cols],
                     weapons[wanted_cols],
                     armor[wanted_cols],
                     potions_filtered[wanted_cols],
                     poisons[wanted_cols]],
                     ignore_index=True)

In [124]:
product['type'].value_counts()

magic_item        199
adventure_gear    109
weapon             37
potion             22
poison             16
armor              13
Name: type, dtype: int64

In [125]:
product.head()

Unnamed: 0,item_id,name,price,type
0,001-ACo,Ammunition +1 (Per),15 gp,magic_item
1,002-ACo,Ammunition +2 (Per),50 gp,magic_item
2,005-BCo,Bead of Force,"1,000 gp",magic_item
3,006-CCo,Chime of Opening,400 gp,magic_item
4,007-DCo,Deck of Illusions,900 gp,magic_item


In [126]:
# testando o relacionamento das tabelas
MI_itemID =  random.choices(product['item_id'].loc[product['type'] == 'magic_item'])
magic_items_filtered.query('item_id == @MI_itemID')

Unnamed: 0,item_id,name,price,rarity,category,type
192,193-ICo,Ioun Stone Awareness,500 gp,Rare,Combat Items,magic_item


O relacionamento está funcionando como o desejado.

## **Criar a Tabela Cliente**

**Atributos do cliente:**


* id
* nome
* sexo
* idade
* raça
* classe
* endereço (cidade) (talvez)
* contato (talvez)

In [127]:
# checando duplicatas
names.duplicated().sum()

101

In [128]:
names.drop_duplicates(inplace=True)

In [129]:
names.shape

(1006, 2)

referência = https://bg3.wiki/wiki/Races

In [130]:
races = pd.read_csv('data/races.csv', sep=';')
races

Unnamed: 0,race,base_age,max_age,maximum_age_range
0,Dragonborn,15,80,80 + 1d20
1,Drow,80,225,225 + 3d100
2,Dwarf,50,350,300 + 2d100
3,Elf,90,750,425 + 5d100
4,Githyanki,30,250,250 + 1d100
5,Gnome,60,200,200 + 3d100
6,Half-Elf,15,125,125 + 3d20
7,Halfling,20,100,100 + 1d100
8,Half-Orc,12,60,90 + 2d20
9,Human,15,90,90 + 2d20


referência: https://www.dndbeyond.com/sources/basic-rules/classes#ClassesSummary

In [131]:
classes = pd.read_csv('data\classes.csv', sep=';')
classes

Unnamed: 0,Class,Armor,Weapon
0,Barbarian,"Light armor, medium armor, shields","Simple weapons, martial weapons"
1,Bard,Light armor,"simple weapons, hand crossbows, longswords, ra..."
2,Cleric,"Light armor, medium armor, shields",Simple weapons
3,Druid,"Light armor, medium armor, shields","Clubs, daggers, darts, javelins, maces, quarte..."
4,Fighter,"Light armor, medium armor, heavy armor, shields","Simple weapons, martial weapons"
5,Monk,,"Simple weapons, shortswords"
6,Paladin,"Light armor, medium armor, heavy armor, shields","Simple weapons, martial weapons"
7,Ranger,"Light armor, medium armor, shields","Simple weapons, martial weapons"
8,Rogue,Light armor,"Simple weapons, hand crossbows, longswords, ra..."
9,Sorcerer,,"Daggers, darts, slings, quarterstaffs, light c..."


A tabela de cliente deve ter a seguinte estrutura:

| customer_id| name    | sex      | race     | age     | class   |
| -------- | ------- | -------- | ------- | -------- | ------- |
| -------- |-------- | -------- | ------- | -------- | ------- |
| -------- | --------| -------- | ------- | -------- | ------- |
| -------- | --------| -------- | ------- | -------- | ------- |

In [132]:
import time
import string

# criando função para gerar IDs
def generate_custom_id():
    """Function to generate random IDs"""
    timestamp = str(int(time.time()))
    random_chars = ''.join(random.choices(string.ascii_uppercase + string.digits, k=6))
    return f"{timestamp}-{random_chars}"   # return new unique ID

In [133]:
# random ID
custom_id = generate_custom_id()
print(custom_id)

1701894440-AJI0Y6


In [134]:
# criando um dataframe com os randoms IDs
customers = pd.DataFrame({
    'customer_id': [generate_custom_id() for i in range(1, names.shape[0] + 1)]})
customers

Unnamed: 0,customer_id
0,1701894440-DPBHSA
1,1701894440-HXTHV3
2,1701894440-A3ZMF8
3,1701894440-MDD4V3
4,1701894440-0T9NT3
...,...
1001,1701894440-3YKYYD
1002,1701894440-U3ASNR
1003,1701894440-M6DFHG
1004,1701894440-6W3STT


In [135]:
# checando IDs duplicados
customers.duplicated().sum()

0

In [136]:
names.isna().sum()

name    0
sex     0
dtype: int64

In [137]:
# resetando os índices
names.reset_index(drop=True, inplace=True)

In [138]:
# inserindo os nomes e sexos na tabela de clientes
customers["name"] = names['name']
customers["sex"] = names['sex']

In [139]:
customers

Unnamed: 0,customer_id,name,sex
0,1701894440-DPBHSA,Veklani Daargen,female
1,1701894440-HXTHV3,Kasaki Wygarthe,female
2,1701894440-A3ZMF8,Rosalyn Faringray,female
3,1701894440-MDD4V3,Atalya Webb,female
4,1701894440-0T9NT3,Grenenzel Lyfalia,female
...,...,...,...
1001,1701894440-3YKYYD,Jeras Serechor,male
1002,1701894440-U3ASNR,Ningyan Ronefel,male
1003,1701894440-M6DFHG,Feck Coyle,male
1004,1701894440-6W3STT,Steveren Arkalis,male


In [140]:
races['race'].unique()

array(['Dragonborn\xa0', 'Drow', 'Dwarf', 'Elf', 'Githyanki', 'Gnome',
       'Half-Elf', 'Halfling', 'Half-Orc', 'Human', 'Tiefling'],
      dtype=object)

In [141]:
# limpando a coluna 'race'
races['race'] = races['race'].str.replace('\xa0', '').str.strip()

In [142]:
customers['race'] = [random.choice(list(races['race'])) for _ in range(names.shape[0])]

Agora vem uma etapa não tão simples, a idade, cada raça tem um período de vida diferente e pretendo conservar essas características também.

In [143]:
# criando a função que irá gerar a idade aleatória baseada da raça que foi dada
def random_age(X):
    """This function returns a random age based on the race"""
    race_infos = races[races['race'] == X].reset_index(drop=True)
    return random.randint(race_infos['base_age'][0], race_infos['max_age'][0])

In [144]:
customers['age'] = customers['race'].apply(random_age)

Agora resta apenas escolher as classes para cada cliente.

In [145]:
customers['class'] = [random.choice(list(classes['Class'])) for _ in range(customers.shape[0])]

Conferindo a estrutura da tabela:

| customer_id| name    | sex      | race     | age     | class   |
| -------- | ------- | -------- | ------- | -------- | ------- |
| -------- |-------- | -------- | ------- | -------- | ------- |
| -------- | --------| -------- | ------- | -------- | ------- |
| -------- | --------| -------- | ------- | -------- | ------- |

In [146]:
customers.sample(n=10, random_state=42)

Unnamed: 0,customer_id,name,sex,race,age,class
927,1701894440-3DR2H2,Veklani Shaulfer,female,Dragonborn,76,Ranger
630,1701894440-O2T8FY,Eandro Goldrudder,male,Dragonborn,23,Wizard
682,1701894440-BIAF92,Remora Coyle,female,Half-Orc,16,Ranger
514,1701894440-83EP9P,Thurlfara Bilger,female,Tiefling,22,Paladin
365,1701894440-18O1BJ,Fenton Van Gandt,male,Half-Orc,36,Wizard
655,1701894440-FCXHGN,Semil Hindergrass,male,Dwarf,303,Paladin
656,1701894440-ZEI3ZR,Tegan Cresthill,male,Dragonborn,61,Warlock
529,1701894440-WEBNVG,Vronwe Irongull,female,Half-Elf,59,Paladin
321,1701894440-MC97C7,Mirabel Irongull,female,Gnome,90,Barbarian
70,1701894440-A11DPG,Lokara Strong,female,Half-Orc,56,Sorcerer


## **Criar a tabela de Vendas**

In [147]:
from datetime import datetime, timedelta

**Minhas tabelas:**
* adventure_gear
* magic_items_filtered
* weapons
* potions_filtered
* poisons
* armor
* product

A tabela sales deve seguir a estrutura:

| sale_id  | date    | customer_id | product_id    | product_name | quantity| product_price | 
| -------- | ------- | ----------- | --------------| -------------| ------- |-------------- |
| -------- |-------- | ----------- | --------------| -------------| ------- |-------------- |
| -------- | --------| ----------- | --------------| -------------| ------- |-------------- |
| -------- | --------| ----------- | --------------| -------------| ------- |-------------- |

In [161]:
# criando função para gerar IDs para tabela sales
def generate_sale_id():
    """Function to generate random sale_IDs.
    
    This will return a random string with length = 11 chars"""
    timestamp = str(int(time.time_ns()))
    random_chars = ''.join(random.choices(string.ascii_uppercase + string.digits, k=5))
    return f"{timestamp[13:]}-{random_chars}"   # return new unique ID

In [149]:
print("Exemplo de sale_id: ", generate_sale_id())

Exemplo de sale_id:  099400-GB9X


In [150]:
product['price'].unique()

array(['15 gp', '50 gp', '1,000 gp', '400 gp', '900 gp', '300 gp',
       '100 gp', '600 gp', '350 gp', '450 gp', '150 gp', '500 gp',
       '1,600 gp', '4,000 gp', '8,000 gp', '120 gp', '200 gp', '25 gp',
       '35 gp', '280 gp', '570 gp', '2,640 gp', '5,280 gp', '275 gp',
       '2,000 gp', '1,250 gp', '1,300 gp', '800 gp', '5,000 gp',
       '1,500 gp', '250 gp', '750 gp', '700 gp', '1,350 gp', '3,000 gp',
       '4,500 gp', '6,000 gp', '5,600 gp', '1,200 gp', '1,0500 gp',
       '550 gp', '3,500 gp', '2,0500 gp', 'Armor + 500gp',
       'Armor + 500 gp', 'Armor + 800 gp', 'Armor + 200 gp', '2,500 gp',
       '40,000 gp', '2 gp', '1 gp', '4 cp', '10 gp', '20 gp', '5 gp',
       '4 sp', '5 sp', '5 cp', '1 cp', nan, '2 cp', ' 5 gp', '1 sp',
       '2 sp', '4 gp', '1000 gp', '30 gp', '6 gp', '3 gp', '12 gp',
       '75 gp', '45 gp', '1500 gp', '450 gp ', '90 gp', '180 gp',
       '270 gp', '960 gp', '1,920 gp', '480 gp'], dtype=object)

Alguns preços possuem informações que estão despadrozinadas como "Armor + 200 gp", precisa ser limpados da tabela de produtos. 

In [151]:
# removendo observações no campo de preço
product['price'] = product['price'].str.replace('Armor + ', '', regex=False).str.strip()

In [152]:
product['price'].unique()

array(['15 gp', '50 gp', '1,000 gp', '400 gp', '900 gp', '300 gp',
       '100 gp', '600 gp', '350 gp', '450 gp', '150 gp', '500 gp',
       '1,600 gp', '4,000 gp', '8,000 gp', '120 gp', '200 gp', '25 gp',
       '35 gp', '280 gp', '570 gp', '2,640 gp', '5,280 gp', '275 gp',
       '2,000 gp', '1,250 gp', '1,300 gp', '800 gp', '5,000 gp',
       '1,500 gp', '250 gp', '750 gp', '700 gp', '1,350 gp', '3,000 gp',
       '4,500 gp', '6,000 gp', '5,600 gp', '1,200 gp', '1,0500 gp',
       '550 gp', '3,500 gp', '2,0500 gp', '500gp', '2,500 gp',
       '40,000 gp', '2 gp', '1 gp', '4 cp', '10 gp', '20 gp', '5 gp',
       '4 sp', '5 sp', '5 cp', '1 cp', nan, '2 cp', '1 sp', '2 sp',
       '4 gp', '1000 gp', '30 gp', '6 gp', '3 gp', '12 gp', '75 gp',
       '45 gp', '1500 gp', '90 gp', '180 gp', '270 gp', '960 gp',
       '1,920 gp', '480 gp'], dtype=object)

In [153]:
product

Unnamed: 0,item_id,name,price,type
0,001-ACo,Ammunition +1 (Per),15 gp,magic_item
1,002-ACo,Ammunition +2 (Per),50 gp,magic_item
2,005-BCo,Bead of Force,"1,000 gp",magic_item
3,006-CCo,Chime of Opening,400 gp,magic_item
4,007-DCo,Deck of Illusions,900 gp,magic_item
...,...,...,...,...
391,12-Wry,Wyvern poison,"1,200 gp",poison
392,13-Med,Midnight tears,"1,500 gp",poison
393,14-Pry,Purple worm poison,"2,000 gp",poison
394,15-Aed,Antitoxin,50 gp,poison


In [156]:
# Criando a tabela de datas para simular vendas ao longo do tempo
data_inicial = datetime(2017, 1, 1)
data_final = datetime(2023, 12, 31)
intervalo_datas = [data_inicial + timedelta(days=i) for i in range((data_final - data_inicial).days + 1)]

A data inicial foi inspirada na data inicial do desenvolvimento do jogo Baldur's Gate 3.

referência: https://larian.com/support/faqs/general-information_46

In [173]:
sales = pd.DataFrame({
          'sale_id': [generate_sale_id() for _ in range(50_832)],
          'date': [random.choice(intervalo_datas) for _ in range(50_832)],
          'customer_id': [random.choice(list(customers['customer_id'])) for _ in range(50_832)],
          'product_id': [random.choice(list(product['item_id'])) for _ in range(50_832)],
          'quantity': [random.randint(1, 5) for _ in range(50_832)]
          })

A quantidade de items comprados é baseado na quantidade base de itens inicias dos personagens nas campanhas  de Dungeons and Dragons.

A quantidade de vendas foi baseada no número de avaliações do ***Player's Handbook: Everything a Player Needs to Create Heroic Characters for the World's Greatest Roleplaying Game***.

Referência: https://www.amazon.com.br/Players-Handbook-Wizards-RPG-Team/dp/0786965606/ref=sr_1_7?keywords=dnd+5e&qid=1701899431&refinements=p_n_feature_nine_browse-bin%3A8529758011&rnid=8529757011&s=books&sr=1-7&ufe=app_do%3Aamzn1.fos.db68964d-7c0e-4bb2-a95c-e5cb9e32eb12

In [174]:
sales

Unnamed: 0,sale_id,date,customer_id,product_id,quantity
0,444700-XU8K6,2019-06-08,1701894440-ZKWDEE,33-Crs,3
1,444700-OY8GQ,2018-11-01,1701894440-JXRPJ1,01-Cns,4
2,444700-PRJQR,2019-05-26,1701894440-9TQVDZ,246-BCo,2
3,444700-GNB0R,2019-08-07,1701894440-IGQPKI,58-Lrs,2
4,444700-XPXST,2021-09-30,1701894440-JXJN6F,33-Bns,5
...,...,...,...,...,...
50827,572000-PCP4V,2022-06-27,1701894440-E3YQY5,97-Vrs,3
50828,572000-ENCIT,2017-10-19,1701894440-IRC9ZD,25-Crs,5
50829,572000-QZ2KX,2023-04-06,1701894440-HDK8LW,294-ICo,3
50830,572000-L3J6R,2018-05-06,1701894440-OY4DD1,085-FNo,1


In [190]:
# renomeando a coluna na tabela produto para dar match na tabela de vendas
product.rename(columns={'item_id': 'product_id',
                        'name': 'product_name'}, inplace=True)

In [193]:
# adiconando os preços e nomes dos produtos na tabela vendas
sales = pd.merge(sales, product[['product_id', 'price', 'product_name']], on='product_id', how='left')

In [194]:
# verificando uma amostra aleatória das vendas
sales.sample(n=10, random_state=42)

Unnamed: 0,sale_id,date,customer_id,product_id,quantity,price,product_name
18577,729900-S2NBK,2019-08-07,1701894440-FLZM76,12-Pre,2,180 gp,Potion of Mind Reading
37264,614600-90GIQ,2019-03-11,1701894440-2KKZMP,267-PCo,2,600 gp,Pipes of Haunting
46966,640700-SD3B6,2017-06-13,1701894440-VGNXB5,21-Pre,2,960 gp,Potion of Clairvoyance
38305,606000-V37W7,2020-01-04,1701894440-VJLZFF,202-SCo,3,"3,000 gp",Staff of Healing
24865,704500-N8BI8,2019-11-17,1701894440-U1G463,22-Brs,4,2 gp,"Bottle, Glass"
15132,764800-5BOU0,2018-09-20,1701894440-OZCYWD,294-ICo,1,"2,000 gp",Instrument of the Bards - Mac-Fuirmidh Cittern
43040,577600-UE417,2020-06-04,1701894440-0L7253,17-Pre,3,300 gp,Potion of Gaseous Form
14164,293800-TP2CJ,2023-03-20,1701894440-P9N56N,85-Srs,1,5 gp,"Scale, merchant's"
33394,605100-6KYGY,2023-05-04,1701894440-SPFDRW,12-Por,3,1500 gp,Plate
49037,582700-37BTY,2021-08-16,1701894440-RXD356,132-BSu,4,"4,000 gp",Bronze Griffon


A tabela está finalizada, porém antes de transformar em um DB, vou criar algumas inconsistências.

## **Criando incosistências na tabela de vendas:**

Motivação:
