In [None]:
#| default_exp data

# Data -- Getting and processing the data

> A simple package for loading and saving pklmart data

In practice, you probably just need to use: 
```python
from pklshop.data import *
```
which will load all the tables in table_names into pandas dataframes which are then usable as variables. You can probably ignore the rest of the functions here unless you want to directly access the pklmart SQL database.

In [None]:
#| hide
from nbdev.showdoc import *
from fastcore.test import *
import pandas as pd
import sys
sys.path.append('..')


In [None]:
#| export
from pklshop.connect import *
import pkgutil
from io import BytesIO
import pandas as pd

In [None]:
#| export
#Names of the pklmart tables handled by this module
table_names = [
    "tournament",
    "match",
    "game",
    "rally",
    "shot_type_ref",
    "shot",
    "player",
    "team",
]

In [None]:
#Show the list of table names defined above
table_names

['tournament',
 'match',
 'game',
 'rally',
 'shot_type_ref',
 'shot',
 'player',
 'team']

These are the names of the tables that exist in the pklmart database. 

In [None]:
#| export
#Read the csv file of every table into its own dataframe
def load_dfs_from_csv():
    "Returns a dictionary mapping each name in `table_names` to the dataframe read from that table's csv file"
    def _read_table_csv(name):
        # The csv is fetched as raw bytes from the `pklshop` package data, then parsed by pandas
        raw_bytes = pkgutil.get_data('pklshop', f"datasets/{name}.csv")
        return pd.read_csv(BytesIO(raw_bytes))

    # Keys follow the order of `table_names`
    return {name: _read_table_csv(name) for name in table_names}

#Load every table once so that `from pklshop.data import *` exposes the dataframes as variables
table_dict = load_dfs_from_csv()

#One module-level dataframe per entry of table_names, named after the table itself
tournament = table_dict["tournament"]
match = table_dict["match"]
game = table_dict["game"]
rally = table_dict["rally"]
shot_type_ref = table_dict["shot_type_ref"]
shot = table_dict["shot"]
player = table_dict["player"]
team = table_dict["team"]

#Alias kept so existing code that uses the plural name `players` keeps working
players = player

We can load any of these tables into a dataframe using `get_tab_as_df`. Note that pklshop comes with the data conveniently loaded into pandas dataframes, so you don't need to use this function unless you want to connect directly to the database.

In [None]:
#| export
def get_tab_as_df(table_name:str):
    "Returns the database table called `table_name` as a pandas dataframe"
    # The argument is validated first, so no connection is opened for a bad name
    if isinstance(table_name, str):
        if table_name in table_names:
            # Valid name: connect with the configured parameters and pull the table
            db_params = config()
            connection = DbConnection(db_params)
            return connection.pull_data(table_name)
        raise ValueError(f"Table name {table_name} is not a name in table_names")
    raise TypeError(f"table_name must be a string within {table_names}")

Importing `pklshop.data` automatically calls `load_dfs_from_csv`, which loads every table, so the dataframes are already available after the import. Here's an example of displaying the first rows of the dataframe for the `match` table.

In [None]:
#Preview the first rows of the `match` dataframe loaded from csv above
match.head()

Unnamed: 0,match_id,tourn_id,consol_ind,team_id_1,team_id_2,maint_dtm,maint_app,create_dtm,create_app
0,M1,T1,N,T1,T2,2022-04-09 03:19:33.840951+00:00,postgres,2022-04-09 03:19:33.840951+00:00,postgres
1,M2,T2,N,T2,T3,2022-05-26 00:45:11.301752+00:00,postgres,2022-05-26 00:45:11.301752+00:00,postgres
2,M5,T5,N,T6,T5,2022-06-28 00:40:22.948360+00:00,postgres,2022-06-28 00:40:22.948360+00:00,postgres
3,M6,T6,N,T5,T7,2022-07-07 23:01:45.921540+00:00,postgres,2022-07-07 23:01:45.921540+00:00,postgres
4,M7,T7,N,T8,T9,2022-07-11 02:40:50.597016+00:00,postgres,2022-07-11 02:40:50.597016+00:00,postgres


In [None]:
#| hide
#Invalid arguments must be rejected: first a non-string value, then a string that is not a known table name
for bad_name, expected_msg in (
    (1, "table_name must be a string within"),
    ("match1", "Table name match1 is not a name in table_names"),
):
    test_fail(lambda: get_tab_as_df(bad_name), contains=expected_msg)

In [None]:
#| export
#Pull data from the database and save it to csv files. Only need to do this when the database is updated.
def database_tables_to_csv(out_dir="datasets"):
    """Saves every table in `table_names` to `<out_dir>/<table_name>.csv`.

    Each table is fetched with `get_tab_as_df` and written without the dataframe index.
    `out_dir` defaults to "datasets", resolved against the current working directory,
    and is created (including missing parents) if it does not exist yet.
    Returns None.
    """
    from pathlib import Path  # imported locally so this cell needs no change to the import cell

    target_dir = Path(out_dir)
    # Without this, to_csv fails when the directory has not been created beforehand
    target_dir.mkdir(parents=True, exist_ok=True)
    for table_name in table_names:
        df = get_tab_as_df(table_name)
        df.to_csv(target_dir / f"{table_name}.csv", index=False)

Again, `database_tables_to_csv` is only needed when the database itself is updated. You will likely not need to use this function.

In [None]:
#| hide
#Run the nbdev export step for this notebook
import nbdev; nbdev.nbdev_export()