# Data loading


---

## Load data from .csv files

In [7]:
from pathlib import Path
import pandas as pd

# Load every CSV export found in the final data directory.

FINAL_DIR = Path("../data/final")

# Materialise and sort the matches: `Path.glob` returns a one-shot generator in
# filesystem order, so keeping the generator would leave `all_csv_files`
# exhausted (empty) for any later cell and make the dict order below depend on
# the machine the notebook runs on.
all_csv_files = sorted(FINAL_DIR.glob("*.csv"))

# One DataFrame per file, keyed by the file name without its ".csv" suffix.
# low_memory=False makes pandas infer each column's dtype from the whole file
# instead of chunk by chunk.
dataframes = {
    csv_path.stem: pd.read_csv(csv_path, sep=",", low_memory=False)
    for csv_path in all_csv_files
}

# Column names of each DataFrame, keyed the same way as `dataframes`.
variables_per_df = {df_name: df.columns.tolist() for df_name, df in dataframes.items()}

In [9]:
# Print one "<dataframe name>: [column names]" line per loaded DataFrame.
# The loop variable is deliberately not named `vars`: loop variables in a
# notebook cell stay in the global namespace, so that name would shadow the
# built-in `vars()` for every cell executed afterwards.
for df_name, column_names in variables_per_df.items():
    print(f"{df_name}: {column_names}")

root_genres: ['root_id', 'root_name']
books: ['isbn', 'title', 'authors', 'publication_year', 'publisher', 'image_url_s', 'image_url_m', 'image_url_l', 'description', 'price_usd', 'genre', 'root_genres', 'subgenres', 'regional_tags', 'image_alternative', 'previewlink', 'infolink', 'rating_score', 'r_category', 'popularity', 'popularity_cat', 'r_total', 'r_count', 'r_avg', 'r_std', 'recent_count']
book_root_genres: ['isbn', 'root_id']
ratings: ['user_id', 'isbn', 'rating', 'r_seq_user', 'r_seq_book', 'r_cat', 'ratings_seq']
users: ['user_id', 'age', 'age_group', 'gender', 'location', 'country', 'latitude', 'longitude', 'reader_level', 'critic_profile', 'mean_rating', 'median_rating', 'std_rating', 'total_ratings', 'total_books', 'explicit_ratings', 'has_ratings', 'pref_pub_year', 'pref_root_genres', 'pref_subgenres', 'pref_authors', 'pref_publisher', 'pref_price_min', 'pref_price_max', 'pref_price_avg', 'has_preferences']
book_subgenres: ['isbn', 'subgenre_id']
subgenres: ['subgenre_id'


--- 

## Split MySQL and MongoDB data loading

In [10]:
# Describes how the columns of the loaded DataFrames are divided between the
# two databases. Every entry names its source DataFrame through "df_name".
data_loading_variables = {
    # Relational side: one entry per table with the columns it receives.
    "MySQL": {
        "tables": {
            "ratings": {
                "df_name": "ratings",
                "columns": [
                    "user_id", "isbn", "rating",
                    "r_seq_user", "r_seq_book", "r_cat", "ratings_seq",
                ],
            },
            "books": {
                "df_name": "books",
                "columns": ["isbn", "title", "authors", "publication_year", "publisher"],
            },
            "users": {
                "df_name": "users",
                "columns": [
                    "user_id", "age", "age_group", "gender",
                    "location", "country", "latitude", "longitude",
                    "has_ratings", "has_preferences",
                ],
            },
            "book_root_genres": {
                "df_name": "book_root_genres",
                "columns": ["isbn", "root_id"],
            },
            "book_subgenres": {
                "df_name": "book_subgenres",
                "columns": ["isbn", "subgenre_id"],
            },
            "root_genres": {
                "df_name": "root_genres",
                "columns": ["root_id", "root_name"],
            },
            "subgenres": {
                "df_name": "subgenres",
                "columns": ["subgenre_id", "subgenre_name", "root_id"],
            },
        },
    },
    # Document side: one entry per collection, with its id field and the
    # column groups that make up each document.
    "MongoDB": {
        "collections": {
            "books": {
                "df_name": "books",
                "id": "isbn",
                # The book fields might be divided into two groups, because:
                # 1. This group doesn't change; it is only kept here as a
                #    helper so that the MySQL table isn't so sparse.
                "book_extra_metadata": [
                    "price_usd", "genre", "root_genres", "subgenres",
                    "regional_tags", "image_alternative", "previewlink", "infolink",
                    "image_url_s", "image_url_m", "image_url_l", "description",
                ],
                # 2. This group changes every time a new rating is added.
                "book_profile": {
                    "rating_metrics": [
                        "rating_score", "r_category",
                        "r_total", "r_count", "r_avg", "r_std",
                    ],
                    "popularity_metrics": ["recent_count", "popularity", "popularity_cat"],
                },
            },
            "users": {
                "df_name": "users",
                "id": "user_id",
                "user_profile": [
                    # reader_level was derived from the ratings, but it can
                    # also be given by the user.
                    "reader_level",
                    "critic_profile", "mean_rating", "median_rating", "std_rating",
                    "total_ratings", "total_books", "explicit_ratings",
                    "has_ratings", "has_preferences",
                ],
                "user_preferences": [
                    "pref_pub_year", "pref_root_genres", "pref_subgenres",
                    "pref_authors", "pref_publisher",
                    "pref_price_min", "pref_price_max", "pref_price_avg",
                ],
            },
        },
    },
}

The rest is done using the following commands:

>```bash
>cd scripts
>python load_databases.py
>```