<a href="https://colab.research.google.com/github/Pikarz/data_clean_and_validation_from_olddoc_to_cardmaster/blob/main/02%20-%20check_date_lang_consistency.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Check date and language consistency
This notebook's main objective is to check that each date, when present, is in the correct format; when the buying date is missing, it is filled in from the selling date or, failing that, with a default date. The notebook also checks that the language field is consistent with the expansion of the collectible (e.g., expansion $y$ may only contain Japanese cards).

The notebook takes as input the output of the previous notebook and adds the following fields:
- date_comment: states whether the date has a correct format (OK) or not
- language_comment: states if the language is consistent with the expansion (OK) or not

In [None]:
import psycopg2
import pandas as pd
from datetime import datetime

def is_valid_date(date_string):
    """Tell whether a date cell is empty or written in an accepted format.

    Accepted formats are ``YYYY-MM-DD`` and ``YYYY-MM-DD HH:MM:SS``.

    Args:
        date_string: The cell value to validate. Missing values (``None``,
            ``NaN``, ``NaT``) are accepted as valid.

    Returns:
        bool: ``True`` if the value is missing or parses with one of the
        accepted formats, ``False`` otherwise. Non-string values (for
        example a date column that was parsed as numbers) are reported as
        invalid instead of raising ``TypeError``.
    """
    formats = ("%Y-%m-%d", "%Y-%m-%d %H:%M:%S")
    if pd.isna(date_string):  # an empty cell counts as valid
        return True

    # strptime only accepts str; anything else cannot be a well-formed date.
    if not isinstance(date_string, str):
        return False

    for format_str in formats:
        try:
            datetime.strptime(date_string, format_str)
            return True
        except ValueError:
            # Not this format; try the next one.
            continue

    return False

def check_date(db_params, data_path, default_date):
    """Fill in missing dates and record a date-format verdict for every row.

    The CSV at ``data_path`` is read, updated and written back in place.
    For each row whose ``date_comment`` is not already ``'OK'``:

    * a missing ``B-Date`` is replaced by ``S-Date`` when that is present,
      otherwise by ``default_date``;
    * a missing ``S-Date`` is replaced by the buying date, but only when
      ``S-Price`` is present;
    * ``date_comment`` is set to ``'OK'`` or to an ``'ERROR! ...'`` message
      according to ``is_valid_date``.

    Args:
        db_params: Unused. The date check needs no database access; the
            parameter is kept so existing calls keep working.
        data_path: Path of the CSV file to read and overwrite.
        default_date: Value stored in ``B-Date`` when both dates are missing.

    Returns:
        None. The result is written to ``data_path``.
    """
    df = pd.read_csv(data_path)
    if 'date_comment' not in df.columns:
        df['date_comment'] = ''

    # Strings are written into these columns below; an all-empty column is
    # read as float64, and storing a string into it is an incompatible-dtype
    # assignment in recent pandas. Casting to object keeps the values as-is.
    for col in ('B-Date', 'S-Date', 'date_comment'):
        if col in df.columns:
            df[col] = df[col].astype(object)

    try:
        for i, row in df.iterrows():
            # Rows already marked 'OK' were validated by a previous run.
            if row['date_comment'] == 'OK':
                continue

            buying_date = row['B-Date']
            selling_date = row['S-Date']
            if pd.isna(buying_date):
                if not pd.isna(selling_date):
                    print(f"Buying date is missing: inserting it as selling date for row {i}")
                    buying_date = selling_date
                else:
                    print(f"Buying date is missing: inserting it as default date for row {i}")
                    buying_date = default_date
                df.at[i, 'B-Date'] = buying_date
            # A selling date is only required when the row has a selling price.
            if pd.isna(selling_date) and not pd.isna(row['S-Price']):
                print(f"Selling date is missing: inserting it as buying date for row {i}")
                selling_date = buying_date
                df.at[i, 'S-Date'] = selling_date

            is_valid_buyingdate = is_valid_date(buying_date)
            is_valid_sellingdate = is_valid_date(selling_date)
            if not is_valid_buyingdate and not is_valid_sellingdate:
                df.at[i, 'date_comment'] = 'ERROR! Buying date and Selling date are not valid'
            elif not is_valid_buyingdate:
                df.at[i, 'date_comment'] = 'ERROR! Buying date is not valid'
            elif not is_valid_sellingdate:
                df.at[i, 'date_comment'] = 'ERROR! Selling date is not valid'
            else:
                df.at[i, 'date_comment'] = 'OK'
    finally:
        # Write once instead of once per row; the finally clause still saves
        # the rows processed so far if the loop is interrupted.
        df.to_csv(data_path, index=False)


In [None]:
def is_valid_language(curr, expansion, language):
    """Check whether ``language`` is allowed for ``expansion``.

    Args:
        curr: Open database cursor used to run the lookup.
        expansion: Expansion value matched against the ``expansion`` column
            of ``AllowedExpansionLanguage``.
        language: Language value to look for among the returned rows.

    Returns:
        bool: ``True`` when ``language`` equals the first column of at least
        one row returned for ``expansion``, ``False`` otherwise.
    """
    allowed_sql = """
        SELECT language
        FROM AllowedExpansionLanguage
        WHERE expansion=%s
    """
    # Parameterised lookup of every language registered for the expansion.
    curr.execute(allowed_sql, (expansion,))
    allowed_rows = curr.fetchall()

    # Each row is a one-column record; compare its single value.
    return language in (record[0] for record in allowed_rows)

def check_language(db_params, data_path):
    """Record, for every row, whether its language is allowed for its expansion.

    The CSV at ``data_path`` is read, updated and written back in place.
    Rows whose ``language_comment`` is already ``'OK'`` are skipped; every
    other row gets ``'OK'`` or ``'ERROR: invalid language'`` according to
    ``is_valid_language`` applied to its ``Set`` and ``Lang`` values.

    Args:
        db_params: Keyword arguments forwarded to ``psycopg2.connect``.
        data_path: Path of the CSV file to read and overwrite.

    Returns:
        None. If the database connection cannot be opened an error is
        printed and the file is left untouched.
    """
    conn = None
    curr = None
    try:
        conn = psycopg2.connect(**db_params)
        curr = conn.cursor()
    except Exception as e:
        print(f"Error while connecting to the DB: {e}")
        if conn is not None:
            # connect() succeeded but cursor() failed: do not leak the connection.
            conn.close()
        return

    try:
        df = pd.read_csv(data_path)
        if 'language_comment' not in df.columns:
            df['language_comment'] = ''
        # An all-empty column is read as float64; cast so the string verdicts
        # written below are not an incompatible-dtype assignment.
        df['language_comment'] = df['language_comment'].astype(object)

        # Many rows share the same (expansion, language) pair; remember each
        # answer so the database is queried once per distinct pair.
        verdicts = {}
        try:
            for i, row in df.iterrows():
                if row['language_comment'] == 'OK':
                    continue
                expansion = row['Set']
                language = row['Lang']
                key = (expansion, language)
                if key not in verdicts:
                    verdicts[key] = is_valid_language(curr, expansion, language)
                if verdicts[key]:
                    df.at[i, 'language_comment'] = 'OK'
                else:
                    df.at[i, 'language_comment'] = 'ERROR: invalid language'
        finally:
            # Write once instead of once per row; the finally clause still
            # saves the rows processed so far if a query fails midway.
            df.to_csv(data_path, index=False)
    finally:
        # Release the database resources on every exit path.
        curr.close()
        conn.close()

In [None]:
import os

# Location of the CSV produced by the previous notebook. Set POK_SWAP_CSV to
# point at a different file without editing this cell.
data_path = os.environ.get('POK_SWAP_CSV', 'H:/My Drive/pkmn/pok_swap.csv')

# Connection settings passed to psycopg2.connect. Each value can be supplied
# through an environment variable so real credentials never need to be typed
# into (or committed with) the notebook; the literals are placeholder defaults.
db_params = {
    'host': os.environ.get('CARDMASTER_DB_HOST', '192.168.example.example'),
    'port': os.environ.get('CARDMASTER_DB_PORT', '5432'),
    'database': os.environ.get('CARDMASTER_DB_NAME', 'cardmaster'),
    'user': os.environ.get('CARDMASTER_DB_USER', 'example'),
    'password': os.environ.get('CARDMASTER_DB_PASSWORD', 'example')
}

# Run the date check first, then the language check, on the same file.
check_date(db_params, data_path, default_date='2020-01-01')
check_language(db_params, data_path)

Buying date is missing: inserting it as selling date for row 327
Buying date is missing: inserting it as selling date for row 328
Buying date is missing: inserting it as selling date for row 329
Buying date is missing: inserting it as selling date for row 330
Buying date is missing: inserting it as selling date for row 331
Buying date is missing: inserting it as selling date for row 332
Buying date is missing: inserting it as selling date for row 333
Buying date is missing: inserting it as selling date for row 334
Buying date is missing: inserting it as selling date for row 335
Buying date is missing: inserting it as default date for row 336
Buying date is missing: inserting it as selling date for row 337
Buying date is missing: inserting it as selling date for row 338
Buying date is missing: inserting it as selling date for row 339
Buying date is missing: inserting it as default date for row 340
Buying date is missing: inserting it as default date for row 341
Buying date is missing: i

In [None]:
import pandas as pd

# Remove the date_comment column from the CSV (check_date recreates it when it
# is absent). errors='ignore' makes the cell safe to re-run: without it,
# dropping a column that is already gone raises KeyError.
df = pd.read_csv(data_path)
df = df.drop(columns=['date_comment'], errors='ignore')
df.to_csv(data_path, index=False)

KeyError: ignored