**Importing Libraries**

In [363]:
import pandas as pd
import numpy as np
import random

# Definindo a semente para reprodução
np.random.seed(42)
random.seed(42)

**Reading Files**

In [364]:
magic_items = pd.read_csv('data/magic_items.csv', sep=';')
adventure_gear = pd.read_csv('data/adventuring_gear.csv', sep=';')
armor = pd.read_csv('data/armor.csv', sep=';')
poisons = pd.read_csv('data/poisons.csv', sep=';')
potions = pd.read_csv('data/potions.csv', sep=';')
weapons = pd.read_csv('data/weapons.csv', sep=';')
names = pd.read_csv('data/names.csv', sep=';')

In [365]:
magic_items.head()

Unnamed: 0,item_id,Name,Price,Rarity,Category
0,001-ACo,Ammunition +1 (Per),15 gp,Uncommon,Consumable Items
1,002-ACo,Ammunition +2 (Per),50 gp,Rare,Consumable Items
2,003-ACo,Ammunition +3 (Per),250 gp,Very Rare,Consumable Items
3,004-ACo,Arrow of Slaying,400 gp,Very Rare,Consumable Items
4,005-BCo,Bead of Force,"1,000 gp",Rare,Consumable Items


In [366]:
adventure_gear.head()

Unnamed: 0,item_id,Name,Price,Weight,type
0,01-Ars,Abacus,2 gp,2 lb.,Others
1,02-Ars,Acid (vial),25 gp,1 lb.,Others
2,03-Ars,Alchemist's Fire (flask),50 gp,1 lb.,Others
3,04-Aon,Arrows (20),1 gp,1 lb.,Ammunition
4,05-Bon,Blowgun Needle (50),1 gp,1 lb.,Ammunition


In [367]:
weapons.head()

Unnamed: 0,item_id,Name,Price,Damage,Weight,Properties,type
0,01-Cns,Club,1 sp,1d4 Bludgeon,2 lb.,Light,Simple Melee Weapons
1,02-Dns,Dagger,2 gp,1d4 Piercing,1 lb.,"Finesse, Light, Thrown (20/60)",Simple Melee Weapons
2,03-Gns,Greatclub,2 sp,1d8 Bludgeon,10 lb.,Two-handed,Simple Melee Weapons
3,04-Hns,Handaxe,5 gp,1d6 Slashing,2 lb.,"Light, Thrown (20/60)",Simple Melee Weapons
4,05-Jns,Javelin,5 sp,1d6 Piercing,2 lb.,Thrown (30/120),Simple Melee Weapons


In [368]:
armor.head()

Unnamed: 0,item_id,Name,Price,AC,Weight,Requirements,Stealth,type
0,01-Por,Padded,5 gp,11 + Dex,8 lb.,,Disadvantage,Light Armor
1,02-Lor,Leather,10 gp,11 + Dex,10 lb.,,,Light Armor
2,03-Sor,Studded Leather,45 gp,12 + Dex,13 lb.,,,Light Armor
3,04-Hor,Hide,10 gp,12 + Dex(max2),12 lb.,,,Medium Armor
4,05-Cor,Chain Shirt,50 gp,13 + Dex(max2),20 lb.,,,Medium Armor


In [369]:
potions.head()

Unnamed: 0,item_id,Name,Price,Rarity
0,01-Pon,Potion of Healing,50 gp,Common
1,02-Pon,Potion of Greater Healing,150 gp,Uncommon
2,03-Pre,Potion of Superior Healing,450 gp,Rare
3,04-Pre,Potion of Supreme Healing,"1,350 gp",Very Rare
4,05-Ere,Elixir of Health,120 gp,Rare


In [370]:
poisons.head()

Unnamed: 0,item_id,Name,Price,Type,DC
0,01-Aed,Assassin's blood,150 gp,Ingested,10.0
1,02-Ted,Truth serum,150 gp,Ingested,11.0
2,03-Cct,Carrion crawler mucus,200 gp,Contact,13.0
3,04-Dry,Drow poison,200 gp,Injury,13.0
4,05-Sry,Serpent venom,200 gp,Injury,11.0


****

# **Data Creation**

**Passos:**

1. Filtrar as Raridades dos itens
1. Criar uma tabela que contenha todos os produtos
    * A tabela deve ter apenas informações básicas sobre os itens, como: id | nome | preço | tipo.
1. Criar uma tabela com as informações dos clientes
1. Criar a tabela fato de **vendas**




### **Filtrar Raridades:**

In [371]:
magic_items['Rarity'].value_counts()

Rare         114
Uncommon      83
Very Rare     63
Legendary     38
Common         2
Name: Rarity, dtype: int64

Como eu pretendo simular as vendas de um vendedor comum, não vou usar itens que sejam: Very Rare, Legendary.

In [372]:
# função para filtrar a raridade dos items
def filter_rarity(dataframe):
    unwanted = ['Very Rare', 'Legendary']
    df = dataframe.copy()
    df = df.query("Rarity not in @unwanted")
    return df

In [373]:
magic_items_filtered = filter_rarity(magic_items)

In [374]:
magic_items_filtered['Rarity'].value_counts()

Rare        114
Uncommon     83
Common        2
Name: Rarity, dtype: int64

In [375]:
potions_filtered =  filter_rarity(potions)

In [376]:
potions_filtered['Rarity'].value_counts()

Uncommon    10
Rare        10
Common       2
Name: Rarity, dtype: int64

Nenhum outro Dataset tem a coluna 'Rarity'

### **Criar tabela Produtos**

Primeiramente devo mudar as colunas 'type' para 'cetegory', depois criar uma nova coluna 'type' com o tipo de item de cada dataset.

In [377]:
# função para substituir a coluna type e colocar os nomes das colunas em minúsculo
def replace_lower_column(df):
    df = df.columns.str.lower().str.replace('type', 'category')
    return df

In [378]:
adventure_gear.columns = replace_lower_column(adventure_gear)
magic_items_filtered.columns = replace_lower_column(magic_items_filtered)
armor.columns = replace_lower_column(armor)
weapons.columns = replace_lower_column(weapons)
potions_filtered.columns =  replace_lower_column(potions_filtered)
poisons.columns = replace_lower_column(poisons)

**Criando as novas colunas 'type':**

In [379]:
adventure_gear['type'] = 'adventure_gear'
magic_items_filtered['type'] = 'magic_item'
weapons['type'] = 'weapon'
potions_filtered['type'] = 'potion'
poisons['type'] = 'poison'
armor['type'] = 'armor'

**Criando a tabela Produtos:**

| id | nome | preço | tipo |


In [380]:
wanted_cols = ['item_id', 'name', 'price', 'type']

In [381]:
product = magic_items_filtered[wanted_cols].copy()

In [382]:
product = pd.concat([product,
                     adventure_gear[wanted_cols],
                     weapons[wanted_cols],
                     armor[wanted_cols],
                     potions_filtered[wanted_cols],
                     poisons[wanted_cols]],
                     ignore_index=True)

In [383]:
product['type'].value_counts()

magic_item        199
adventure_gear    108
weapon             37
potion             22
poison             16
armor              13
Name: type, dtype: int64

In [384]:
product.head()

Unnamed: 0,item_id,name,price,type
0,001-ACo,Ammunition +1 (Per),15 gp,magic_item
1,002-ACo,Ammunition +2 (Per),50 gp,magic_item
2,005-BCo,Bead of Force,"1,000 gp",magic_item
3,006-CCo,Chime of Opening,400 gp,magic_item
4,007-DCo,Deck of Illusions,900 gp,magic_item


In [385]:
# testando o relacionamento das tabelas
MI_itemID =  random.choices(product['item_id'].loc[product['type'] == 'magic_item'])
magic_items_filtered.query('item_id == @MI_itemID')

Unnamed: 0,item_id,name,price,rarity,category,type
192,193-ICo,Ioun Stone Awareness,500 gp,Rare,Combat Items,magic_item


O relacionamento está funcionando como o desejado.

## **Criar a Tabela Cliente**

**Atributos do cliente:**


* id
* nome
* sexo
* idade
* raça
* classe
* endereço (cidade) (talvez)
* contato (talvez)

In [386]:
# checando duplicatas
names.duplicated().sum()

140

In [387]:
names.drop_duplicates(inplace=True)

In [388]:
names.shape

(1423, 2)

referência = https://bg3.wiki/wiki/Races

In [389]:
races = pd.read_csv('data/races.csv', sep=';')
races

Unnamed: 0,race,base_age,max_age,maximum_age_range
0,Dragonborn,15,80,80 + 1d20
1,Drow,80,225,225 + 3d100
2,Dwarf,50,350,300 + 2d100
3,Elf,90,750,425 + 5d100
4,Githyanki,30,250,250 + 1d100
5,Gnome,60,200,200 + 3d100
6,Half-Elf,15,125,125 + 3d20
7,Halfling,20,100,100 + 1d100
8,Half-Orc,12,60,90 + 2d20
9,Human,15,90,90 + 2d20


referência: https://www.dndbeyond.com/sources/basic-rules/classes#ClassesSummary

In [390]:
classes = pd.read_csv('data\classes.csv', sep=';')
classes

Unnamed: 0,Class,Armor,Weapon
0,Barbarian,"Light armor, medium armor, shields","Simple weapons, martial weapons"
1,Bard,Light armor,"simple weapons, hand crossbows, longswords, ra..."
2,Cleric,"Light armor, medium armor, shields",Simple weapons
3,Druid,"Light armor, medium armor, shields","Clubs, daggers, darts, javelins, maces, quarte..."
4,Fighter,"Light armor, medium armor, heavy armor, shields","Simple weapons, martial weapons"
5,Monk,,"Simple weapons, shortswords"
6,Paladin,"Light armor, medium armor, heavy armor, shields","Simple weapons, martial weapons"
7,Ranger,"Light armor, medium armor, shields","Simple weapons, martial weapons"
8,Rogue,Light armor,"Simple weapons, hand crossbows, longswords, ra..."
9,Sorcerer,,"Daggers, darts, slings, quarterstaffs, light c..."


A tabela de cliente deve ter a seguinte estrutura:

| customer_id| name    | sex      | race     | age     | class   |
| -------- | ------- | -------- | ------- | -------- | ------- |
| -------- |-------- | -------- | ------- | -------- | ------- |
| -------- | --------| -------- | ------- | -------- | ------- |
| -------- | --------| -------- | ------- | -------- | ------- |

**Função para gerar IDs:**

In [391]:
import time
import string

# criando função para gerar IDs
def generate_custom_id(i):
    """Function to generate random IDs"""
    timestamp = str(int(time.time()) + i)
    random_chars = ''.join(random.choices(string.ascii_uppercase + string.digits, k=6))
    return f"{timestamp[4:]}-{random_chars}"   # return new unique ID

In [392]:
for i in range(20):
    print(generate_custom_id(i))

983637-AJI0Y6
983638-DPBHSA
983639-HXTHV3
983640-A3ZMF8
983641-MDD4V3
983642-0T9NT3
983643-W5UZBI
983644-KCIDKW
983645-NNHJ7X
983646-VG0FN9
983647-XUY41I
983648-BLJH75
983649-LXO6QJ
983650-IUJV6O
983651-H9SDBD
983652-W2PCN9
983653-T84AZY
983654-TJXEPQ
983655-85JSG6
983656-5KXVF1


In [393]:
# random ID
custom_id = generate_custom_id(1)
print(custom_id)

983638-T2TALA


In [394]:
# criando um dataframe com os randoms IDs
customers = pd.DataFrame({
    'customer_id': [generate_custom_id(i) for i in range(1, names.shape[0] + 1)]})
customers

Unnamed: 0,customer_id
0,983638-753LC5
1,983639-8DRC11
2,983640-ERTJ5P
3,983641-HT0HL9
4,983642-XPSEIM
...,...
1418,985056-CTM09H
1419,985057-ZBGTJ5
1420,985058-4696QZ
1421,985059-6CP7SA


In [395]:
# checando IDs duplicados
customers.duplicated().sum()

0

In [396]:
# verificando NAs
names.isna().sum()

name    0
sex     0
dtype: int64

In [397]:
# removendo espaços antes e depois
names['name'] = names['name'].str.strip()

In [398]:
# resetando os índices
names.reset_index(drop=True, inplace=True)

In [399]:
# inserindo os nomes e sexos na tabela de clientes
customers["name"] = names['name']
customers["sex"] = names['sex']

In [400]:
customers

Unnamed: 0,customer_id,name,sex
0,983638-753LC5,Veklani Daargen,female
1,983639-8DRC11,Kasaki Wygarthe,female
2,983640-ERTJ5P,Rosalyn Faringray,female
3,983641-HT0HL9,Atalya Webb,female
4,983642-XPSEIM,Grenenzel Lyfalia,female
...,...,...,...
1418,985056-CTM09H,Arnan Ramcrown,male
1419,985057-ZBGTJ5,Tavon Stormchapel,male
1420,985058-4696QZ,Dodd Fryft,male
1421,985059-6CP7SA,Markus Shattermast,male


In [401]:
races['race'].unique()

array(['Dragonborn\xa0', 'Drow', 'Dwarf', 'Elf', 'Githyanki', 'Gnome',
       'Half-Elf', 'Halfling', 'Half-Orc', 'Human', 'Tiefling'],
      dtype=object)

In [402]:
# limpando a coluna 'race'
races['race'] = races['race'].str.replace('\xa0', '').str.strip()

In [403]:
customers['race'] = [random.choice(list(races['race'])) for _ in range(names.shape[0])]

Agora vem uma etapa não tão simples, a idade, cada raça tem um período de vida diferente e pretendo conservar essas características também.

In [404]:
# criando a função que irá gerar a idade aleatória baseada da raça que foi dada
def random_age(X):
    """This function returns a random age based on the race"""
    race_infos = races[races['race'] == X].reset_index(drop=True)
    return random.randint(race_infos['base_age'][0], race_infos['max_age'][0])

In [405]:
customers['age'] = customers['race'].apply(random_age)

Agora resta apenas escolher as classes para cada cliente.

In [406]:
customers['class'] = [random.choice(list(classes['Class'])) for _ in range(customers.shape[0])]

Conferindo a estrutura da tabela:

| customer_id| name    | sex      | race     | age     | class   |
| -------- | ------- | -------- | ------- | -------- | ------- |
| -------- |-------- | -------- | ------- | -------- | ------- |
| -------- | --------| -------- | ------- | -------- | ------- |
| -------- | --------| -------- | ------- | -------- | ------- |

In [407]:
customers.sample(n=10, random_state=42)

Unnamed: 0,customer_id,name,sex,race,age,class
1185,984823-ZAQ7QH,Vzani Van Hyden,female,Dwarf,337,Ranger
677,984315-5R96KH,Marnia Skandalor,female,Tiefling,37,Paladin
1084,984722-Q8VA8X,Lokara Talandro,female,Gnome,118,Sorcerer
1005,984643-40KZUJ,Sevenson Zereni,male,Elf,705,Monk
944,984582-MRSEZY,Talfen Ronefel,male,Half-Orc,23,Rogue
538,984176-7OI0E0,Vauldra Velene,female,Half-Orc,54,Wizard
994,984632-UDUAHG,Dalkon Romazi,male,Gnome,168,Cleric
724,984362-212PGY,Mirabel Yellowcrane,female,Half-Orc,26,Bard
70,983708-OZCYWD,Lokara Strong,female,Dragonborn,25,Barbarian
123,983761-2KNFTU,Meklan Van Devries,male,Elf,146,Cleric


## **Criar a tabela de Vendas**

In [269]:
from datetime import datetime, timedelta

**Minhas tabelas:**
* adventure_gear
* magic_items_filtered
* weapons
* potions_filtered
* poisons
* armor
* product
* costumers

A tabela sales deve seguir a estrutura:

| sale_id  | date    | customer_id | product_id    | product_name | quantity| product_price | 
| -------- | ------- | ----------- | --------------| -------------| ------- |-------------- |
| -------- |-------- | ----------- | --------------| -------------| ------- |-------------- |
| -------- | --------| ----------- | --------------| -------------| ------- |-------------- |
| -------- | --------| ----------- | --------------| -------------| ------- |-------------- |

**Função para gerar IDs:**

In [410]:
# criando função para gerar IDs para tabela sales
def generate_sale_id(i):
    """Function to generate random sale_IDs.
    
    This will return a random string with length = 11 chars"""
    timestamp = str(int(time.time_ns()) + i)
    random_chars = ''.join(random.choices(string.ascii_uppercase + string.digits, k=5))
    return f"{timestamp[13:]}-{random_chars}"   # return new unique ID

In [411]:
print("Exemplo de sale_id: ", generate_sale_id(1))

Exemplo de sale_id:  883301-4WBAB


In [None]:
product['price'].unique()

Alguns preços possuem informações que estão despadrozinadas como "Armor + 200 gp", precisa ser limpados da tabela de produtos. 

In [413]:
# removendo observações no campo de preço
product['price'] = product['price'].str.replace('Armor + ', '', regex=False).str.strip()

In [415]:
product['price'].unique()

array(['15 gp', '50 gp', '1,000 gp', '400 gp', '900 gp', '300 gp',
       '100 gp', '600 gp', '350 gp', '450 gp', '150 gp', '500 gp',
       '1,600 gp', '4,000 gp', '8,000 gp', '120 gp', '200 gp', '25 gp',
       '35 gp', '280 gp', '570 gp', '2,640 gp', '5,280 gp', '275 gp',
       '2,000 gp', '1,250 gp', '1,300 gp', '800 gp', '5,000 gp',
       '1,500 gp', '250 gp', '750 gp', '700 gp', '1,350 gp', '3,000 gp',
       '4,500 gp', '6,000 gp', '5,600 gp', '1,200 gp', '1,0500 gp',
       '550 gp', '3,500 gp', '2,0500 gp', '500gp', '2,500 gp',
       '40,000 gp', '2 gp', '1 gp', '4 cp', '10 gp', '20 gp', '5 gp',
       '4 sp', '5 sp', '5 cp', '1 cp', '2 cp', '1 sp', '2 sp', '4 gp',
       '1000 gp', '30 gp', '6 gp', '3 gp', '12 gp', '75 gp', '45 gp',
       '1500 gp', '90 gp', '180 gp', '270 gp', '960 gp', '1,920 gp',
       '480 gp'], dtype=object)

In [None]:
product

In [276]:
# Criando a tabela de datas para simular vendas ao longo do tempo
data_inicial = datetime(2017, 1, 1)
data_final = datetime(2023, 12, 31)
intervalo_datas = [data_inicial + timedelta(days=i) for i in range((data_final - data_inicial).days + 1)]

A data inicial foi inspirada na data inicial do desenvolvimento do jogo Baldur's Gate 3.

referência: https://larian.com/support/faqs/general-information_46

In [416]:
sales = pd.DataFrame({
          'sale_id': [generate_sale_id(i) for i in range(50_832)],
          'date': [random.choice(intervalo_datas) for _ in range(50_832)],
          'customer_id': [random.choice(list(customers['customer_id'])) for _ in range(50_832)],
          'product_id': [random.choice(list(product['item_id'])) for _ in range(50_832)],
          'quantity': [random.randint(1, 5) for _ in range(50_832)]
          })

A quantidade de items comprados é baseado na quantidade base de itens inicias dos personagens nas campanhas  de Dungeons and Dragons.

A quantidade de vendas foi baseada no número de avaliações do ***Player's Handbook: Everything a Player Needs to Create Heroic Characters for the World's Greatest Roleplaying Game***.

Referência: https://www.amazon.com.br/Players-Handbook-Wizards-RPG-Team/dp/0786965606/ref=sr_1_7?keywords=dnd+5e&qid=1701899431&refinements=p_n_feature_nine_browse-bin%3A8529758011&rnid=8529757011&s=books&sr=1-7&ufe=app_do%3Aamzn1.fos.db68964d-7c0e-4bb2-a95c-e5cb9e32eb12

In [417]:
sales

Unnamed: 0,sale_id,date,customer_id,product_id,quantity
0,886200-Q0FOT,2019-09-23,983817-IVSOLM,287-PCo,1
1,886201-10GDM,2019-08-06,985032-MWLMA6,088-GNo,1
2,886202-IPC8Q,2020-01-08,984260-44FTJ5,24-Mns,2
3,886203-O6P43,2023-09-18,984908-LBA117,145-SSu,1
4,886204-AR6JL,2021-05-06,983914-2KKZMP,269-ACo,1
...,...,...,...,...,...
50827,620527-HV4GY,2018-01-18,984508-KCS4KE,03-Sor,1
50828,620528-PBLV1,2019-11-16,984430-QPWBSF,293-ICo,5
50829,620529-QET6W,2023-07-15,985039-PT2ZL1,29-Tns,1
50830,620530-HLIME,2022-06-12,984359-2RVNDF,080-DNo,5


In [418]:
# renomeando a coluna na tabela produto para dar match na tabela de vendas
product.rename(columns={'item_id': 'product_id',
                        'name': 'product_name'}, inplace=True)

In [419]:
# adiconando os preços e nomes dos produtos na tabela vendas
sales = pd.merge(sales, product[['product_id', 'price', 'product_name']], on='product_id', how='left')

In [420]:
# verificando uma amostra aleatória das vendas
sales.sample(n=10, random_state=42)

Unnamed: 0,sale_id,date,customer_id,product_id,quantity,price,product_name
18577,801777-JT795,2019-03-03,984308-YIL8SA,111-RNo,1,"2,000 gp",Ring of Mind Shielding
37264,684164-1HOK1,2020-03-23,984051-DTVV2Y,061-BNo,4,350 gp,Boots of the Winterlands
46966,593066-9DYEP,2022-02-03,984835-PCIFLW,42-Yus,2,10 gp,Yew Wand
38305,683705-IQWY9,2023-09-14,984674-XBFEJ2,286-PCo,1,200 gp,Prayer Bead - Bless
24865,719165-GLQSN,2018-09-25,984449-9KVLS9,080-DNo,5,300 gp,Driftglobe
15132,824432-F73FI,2023-04-03,983812-EJLA0G,54-Hrs,1,5 gp,Hunting Trap
43040,655640-XQY0F,2019-04-25,983878-13C9XQ,84-Srs,3,1 cp,Sack
14164,830264-SURL7,2019-06-22,984254-TKCD0R,159-SCo,1,"1,000 gp",Sword of Life-Stealing
33394,706794-NTBRW,2020-02-08,983712-P08J3T,168-SCo,2,"5,000 gp",Sunblade
49037,624137-EKSN4,2017-06-09,984124-AGKC4L,16-Brs,5,4 sp,Basket


Conferindo a estrutura da tabela:

| sale_id  | date    | customer_id | product_id    | product_name | quantity| product_price | 
| -------- | ------- | ----------- | --------------| -------------| ------- |-------------- |
| -------- |-------- | ----------- | --------------| -------------| ------- |-------------- |
| -------- | --------| ----------- | --------------| -------------| ------- |-------------- |

A tabela está finalizada, porém antes de transformar em um DB, vou criar algumas inconsistências.

## **Criando incosistências na tabela de vendas:**

**Motivação:**

Resolvi criar algumas incosistências para aumentar o nível de dificuldade para as pessoas que vão usar essa database para praticar análise de dados.

**O que será feito?**
- Gerar duplicatas
- Gerar valores vazios
- Gerar Outliers


### **Gerando Duplicatas:**

In [421]:
# duplicando 7% dos dados de forma aleatória
sales_copy = sales.sample(frac=0.07, random_state=42).copy()

In [422]:
sales_copy.head()

Unnamed: 0,sale_id,date,customer_id,product_id,quantity,price,product_name
18577,801777-JT795,2019-03-03,984308-YIL8SA,111-RNo,1,"2,000 gp",Ring of Mind Shielding
37264,684164-1HOK1,2020-03-23,984051-DTVV2Y,061-BNo,4,350 gp,Boots of the Winterlands
46966,593066-9DYEP,2022-02-03,984835-PCIFLW,42-Yus,2,10 gp,Yew Wand
38305,683705-IQWY9,2023-09-14,984674-XBFEJ2,286-PCo,1,200 gp,Prayer Bead - Bless
24865,719165-GLQSN,2018-09-25,984449-9KVLS9,080-DNo,5,300 gp,Driftglobe


Verificando duplicatas na tabela original:

In [423]:
sales.duplicated().sum()

0

In [424]:
sales = pd.concat([sales, sales_copy])

In [425]:
print("Duplicatas na tabela 'sales': ", sales.duplicated().sum())

Duplicatas na tabela 'sales':  3558


### **Gerando valores vazios:**

In [426]:
# verificando se há valores vazios
sales.isna().sum()

sale_id         0
date            0
customer_id     0
product_id      0
quantity        0
price           0
product_name    0
dtype: int64

In [427]:
# escolhendo aleatóriamente os índices das colunas que vou usar
sales_id_NA_idx = sales['sale_id'].sample(frac=0.001, random_state=42).index
customer_id_NA_idx = sales['customer_id'].sample(frac=0.001, random_state=43).index
product_id_NA_idx = sales['product_id'].sample(frac=0.002, random_state=44).index
product_name_NA_idx = sales['product_name'].sample(frac=0.003, random_state=45).index

In [428]:
# criando valores vazios
sales['sale_id'].loc[sales_id_NA_idx] = np.nan
sales['customer_id'].loc[customer_id_NA_idx] = np.nan
sales['product_id'].loc[product_id_NA_idx] = np.nan
sales['product_name'].loc[product_name_NA_idx] = np.nan

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  sales['sale_id'].loc[sales_id_NA_idx] = np.nan
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  sales['customer_id'].loc[customer_id_NA_idx] = np.nan
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  sales['product_id'].loc[product_id_NA_idx] = np.nan
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  sales['product_

In [429]:
# verificando se há valores vazios novamente
sales.isna().sum()

sale_id          66
date              0
customer_id      57
product_id      127
quantity          0
price             0
product_name    183
dtype: int64

### **Gerando Outliers:**

In [430]:
# escolhendo os índices
qty_outlier9_idx = sales['quantity'].sample(frac=0.005, random_state=42).index
qty_outlier6_idx = sales['quantity'].sample(frac=0.002, random_state=43).index
qty_outlier30_idx = sales['quantity'].sample(frac=0.001, random_state=44).index
qty_outlier50_idx = sales['quantity'].sample(frac=0.001, random_state=45).index

In [431]:
# mudando os valores
sales['quantity'].loc[qty_outlier9_idx] = 9999
sales['quantity'].loc[qty_outlier6_idx] = 6666
sales['quantity'].loc[qty_outlier30_idx] = 30
sales['quantity'].loc[qty_outlier50_idx] = 50

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  sales['quantity'].loc[qty_outlier9_idx] = 9999
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  sales['quantity'].loc[qty_outlier6_idx] = 6666
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  sales['quantity'].loc[qty_outlier30_idx] = 30
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  sales['quantity'].loc[qty_ou

In [432]:
sales.sort_values(by='quantity', ascending=False)

Unnamed: 0,sale_id,date,customer_id,product_id,quantity,price,product_name
34685,702885-PCSI9,2020-09-04,984417-8EL501,30-Crs,9999,5 gp,Chest
39583,678283-T84XA,2018-04-26,983679-YNCXLL,088-GNo,9999,300 gp,Gloves of Thievery
49061,624161-81CUM,2020-06-14,984058-ZCTWJL,44-Frs,9999,2 cp,Flask or Tankard
6196,840596-HX4NW,2018-06-24,983981-6L1F90,129-BSu,9999,"8,000 gp",Bowl of Commanding Water Elementals
26960,,2020-03-22,983784-9MOGE4,85-Srs,9999,5 gp,"Scale, merchant's"
...,...,...,...,...,...,...,...
13658,795258-HOI1L,2017-08-07,984556-JG2Y8R,077-DNo,1,"5,000 gp",Daern's Instant Fortress
27920,741220-4K1P8,2021-06-23,983862-RR7S20,21-Lns,1,10 gp,Lance
34692,702892-9LDDA,2018-10-12,984659-5U6JZV,182-MCo,1,"3,000 gp",Mace of Disruption
46500,636800-UA32T,2019-02-22,984893-CO3S65,12-Por,1,1500 gp,Plate


Agora o dataset está pronto para ser tramfsormado em um bando de dados SQL.

<hr style="height:3px;border-width:0;color:blue;background-color:blue">

# **Transformando em um BD SQLite.**

Como eu pretendo disponibilizar esse BD no Kaggle, eu vou tranformar em um BD SQLite por ser mais comum na plataforma.

**Minhas tabelas:**
* adventure_gear
* magic_items_filtered
* weapons
* potions_filtered
* poisons
* armor
* product
* sales
* costumers

In [433]:
import sqlite3

In [434]:
adventure_gear.head()

Unnamed: 0,item_id,name,price,weight,category,type
0,01-Ars,Abacus,2 gp,2 lb.,Others,adventure_gear
1,02-Ars,Acid (vial),25 gp,1 lb.,Others,adventure_gear
2,03-Ars,Alchemist's Fire (flask),50 gp,1 lb.,Others,adventure_gear
3,04-Aon,Arrows (20),1 gp,1 lb.,Ammunition,adventure_gear
4,05-Bon,Blowgun Needle (50),1 gp,1 lb.,Ammunition,adventure_gear


In [435]:
magic_items_filtered.head()

Unnamed: 0,item_id,name,price,rarity,category,type
0,001-ACo,Ammunition +1 (Per),15 gp,Uncommon,Consumable Items,magic_item
1,002-ACo,Ammunition +2 (Per),50 gp,Rare,Consumable Items,magic_item
4,005-BCo,Bead of Force,"1,000 gp",Rare,Consumable Items,magic_item
5,006-CCo,Chime of Opening,400 gp,Rare,Consumable Items,magic_item
6,007-DCo,Deck of Illusions,900 gp,Uncommon,Consumable Items,magic_item


In [436]:
weapons.head()

Unnamed: 0,item_id,name,price,damage,weight,properties,category,type
0,01-Cns,Club,1 sp,1d4 Bludgeon,2 lb.,Light,Simple Melee Weapons,weapon
1,02-Dns,Dagger,2 gp,1d4 Piercing,1 lb.,"Finesse, Light, Thrown (20/60)",Simple Melee Weapons,weapon
2,03-Gns,Greatclub,2 sp,1d8 Bludgeon,10 lb.,Two-handed,Simple Melee Weapons,weapon
3,04-Hns,Handaxe,5 gp,1d6 Slashing,2 lb.,"Light, Thrown (20/60)",Simple Melee Weapons,weapon
4,05-Jns,Javelin,5 sp,1d6 Piercing,2 lb.,Thrown (30/120),Simple Melee Weapons,weapon


In [437]:
armor.head()

Unnamed: 0,item_id,name,price,ac,weight,requirements,stealth,category,type
0,01-Por,Padded,5 gp,11 + Dex,8 lb.,,Disadvantage,Light Armor,armor
1,02-Lor,Leather,10 gp,11 + Dex,10 lb.,,,Light Armor,armor
2,03-Sor,Studded Leather,45 gp,12 + Dex,13 lb.,,,Light Armor,armor
3,04-Hor,Hide,10 gp,12 + Dex(max2),12 lb.,,,Medium Armor,armor
4,05-Cor,Chain Shirt,50 gp,13 + Dex(max2),20 lb.,,,Medium Armor,armor


In [438]:
potions_filtered.head()

Unnamed: 0,item_id,name,price,rarity,type
0,01-Pon,Potion of Healing,50 gp,Common,potion
1,02-Pon,Potion of Greater Healing,150 gp,Uncommon,potion
2,03-Pre,Potion of Superior Healing,450 gp,Rare,potion
4,05-Ere,Elixir of Health,120 gp,Rare,potion
5,06-Pon,Potion of Poison,100 gp,Uncommon,potion


In [439]:
poisons.head()

Unnamed: 0,item_id,name,price,category,dc,type
0,01-Aed,Assassin's blood,150 gp,Ingested,10.0,poison
1,02-Ted,Truth serum,150 gp,Ingested,11.0,poison
2,03-Cct,Carrion crawler mucus,200 gp,Contact,13.0,poison
3,04-Dry,Drow poison,200 gp,Injury,13.0,poison
4,05-Sry,Serpent venom,200 gp,Injury,11.0,poison


In [440]:
product.head()

Unnamed: 0,product_id,product_name,price,type
0,001-ACo,Ammunition +1 (Per),15 gp,magic_item
1,002-ACo,Ammunition +2 (Per),50 gp,magic_item
2,005-BCo,Bead of Force,"1,000 gp",magic_item
3,006-CCo,Chime of Opening,400 gp,magic_item
4,007-DCo,Deck of Illusions,900 gp,magic_item


In [441]:
sales.head()

Unnamed: 0,sale_id,date,customer_id,product_id,quantity,price,product_name
0,886200-Q0FOT,2019-09-23,983817-IVSOLM,287-PCo,1,300 gp,Prayer Bead - Smiting
1,886201-10GDM,2019-08-06,985032-MWLMA6,088-GNo,1,300 gp,Gloves of Thievery
2,886202-IPC8Q,2020-01-08,984260-44FTJ5,24-Mns,2,15 gp,Morningstar
3,886203-O6P43,2023-09-18,984908-LBA117,145-SSu,1,"5,600 gp",Silver Horn of Valhalla
4,886204-AR6JL,2021-05-06,983914-2KKZMP,269-ACo,1,"3,000 gp",Amulet of Health


In [442]:
customers.head()

Unnamed: 0,customer_id,name,sex,race,age,class
0,983638-753LC5,Veklani Daargen,female,Elf,661,Warlock
1,983639-8DRC11,Kasaki Wygarthe,female,Half-Elf,100,Wizard
2,983640-ERTJ5P,Rosalyn Faringray,female,Halfling,40,Barbarian
3,983641-HT0HL9,Atalya Webb,female,Tiefling,55,Monk
4,983642-XPSEIM,Grenenzel Lyfalia,female,Half-Orc,43,Bard


In [443]:
# criando conexão
conn = sqlite3.connect("adventurer_mart.db")

Inserindo as tabelas no BD:

In [444]:
adventure_gear.to_sql('details_adventure_gear', conn, index=False, if_exists='replace')
magic_items_filtered.to_sql('details_magic_items', conn, index=False, if_exists='replace')
weapons.to_sql('details_weapons', conn, index=False, if_exists='replace')
armor.to_sql('details_armor', conn, index=False, if_exists='replace')
potions_filtered.to_sql('details_potions', conn, index=False, if_exists='replace')
poisons.to_sql('details_poisons', conn, index=False, if_exists='replace')
product.to_sql('all_products', conn, index=False, if_exists='replace')
sales.to_sql('sales', conn, index=False, if_exists='replace')
customers.to_sql('costumers', conn, index=False, if_exists='replace')


1423

In [445]:
#fechando a conexão
conn.close()

**FIM.**