<a href="https://colab.research.google.com/github/Pikarz/data_clean_and_validation_from_olddoc_to_cardmaster/blob/main/04%20-%20insert_version.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# insert_version
After notebook_03, the card is coherent and corrected. However, the same card can exist in multiple versions. For example, a Venusaur from the Base Set can be either 'default' or 'First Edition'.

This notebook's objective is to deduce each card's version from the information present in its row, asking for user input when the deduction is not possible or not trivial.

This notebook takes as input the output of notebook 03 and updates it with two new columns:
- version: states the exact version of the card
- version_comment: describes any errors found during the process

In [None]:
import pandas as pd
import psycopg2
import re

def remove_lv_substring(original_string):
    """Strip level markers such as 'lv. 12' or 'LIV.5' from a string.

    A marker is 'lv.' or 'liv.' (any letter case), an optional single
    whitespace character, and one to three digits. Every occurrence is
    deleted and the result is returned without leading/trailing whitespace.
    """
    without_level = re.sub(
        r'(li?v)\.\s?\d{1,3}',
        '',
        original_string,
        flags=re.IGNORECASE,
    )
    return without_level.strip()

def get_card_name(curr, card_expansion, card_number):
  """Look up a card's name by expansion and number.

  The card is searched first directly in the given expansion and, if that
  yields nothing, in any expansion whose super_expansion is the given one.

  Args:
    curr: DB cursor exposing execute() and fetchone().
    card_expansion: expansion identifier used as query parameter.
    card_number: card number used as query parameter.

  Returns:
    A (name, expansion) tuple taken from the first matching row, or
    (None, None) when neither query returns a row.
  """
  direct_query = """
          SELECT CardType.name, CardType.expansion
          FROM CardType
          WHERE CardType.expansion = %s AND CardType.number = %s
  """
  # fallback: the card could be in a sub-expansion
  sub_expansion_query = """
        SELECT CardType.name, Cardexpansion.id
        FROM CardType, Cardexpansion
        WHERE Cardexpansion.super_expansion = %s
        AND CardType.expansion = Cardexpansion.id
        AND CardType.number = %s
      """
  parameters = (card_expansion, card_number,)

  for sql in (direct_query, sub_expansion_query):
    curr.execute(sql, parameters)
    match = curr.fetchone()
    if match:
      found_name, found_expansion = match[0], match[1]
      print(f"The card is {found_name}")
      return found_name, found_expansion

  return None, None

def clean_cardname(cardname, cardexpansion, original_cardname):
  """Reduce a card name to (hopefully) just the Pokemon's name.

  The name is lowercased, stripped and passed through remove_clutter();
  then at most one of the special cases below is applied.

  Args:
    cardname: the card name to clean.
    cardexpansion: expansion code, used to recognise gym cards.
    original_cardname: the card name with its original casing, used to
      recognise Mega EX cards (IE: 'MGyarados EX').

  Returns:
    The cleaned, lowercase, stripped name.
  """
  gym_expansions = ("GH", "GC", "CFTD", "LST")
  new_cardname = remove_clutter(cardname.lower().strip(), original_cardname)

  # Guard the index access: names shorter than two characters cannot be Mega EX cards.
  looks_like_mega_ex = (
      len(original_cardname) > 1
      and original_cardname[1].isupper()
      and new_cardname.startswith('m')
      and new_cardname.endswith(' ex')
  )

  if ' delta' in new_cardname or ' lucente ' in new_cardname:
    # only the first part of the string, before the first space, is the name
    new_cardname = new_cardname.split()[0]
  elif "'s " in new_cardname and cardexpansion in gym_expansions:
    # gym card ("<trainer>'s <pokemon>"): the pokemon is the final part of the string
    new_cardname = new_cardname.split()[-1]
  elif ' di alola' in new_cardname:
    new_cardname = new_cardname.replace(' di alola', '')
  elif ' di galar' in new_cardname:
    new_cardname = new_cardname.replace(' di galar', '')
  elif looks_like_mega_ex:
    # Drop the leading 'm' and the trailing ' ex'. The previous slice
    # ([:1]) kept only the 'm' and discarded the actual name.
    new_cardname = new_cardname[1:-len(' ex')]

  return new_cardname.strip()

def remove_clutter(cardname, original_cardname):
  """Remove known decorations (rarity tags, regional forms, levels) from a name.

  Args:
    cardname: the name to clean; assumed to be already lowercase.
    original_cardname: the name with its original casing; it decides
      whether the ' tg' / ' gg' suffixes and the Mega EX wrapping are removed.

  Returns:
    The cleaned name, stripped of leading/trailing whitespace.
  """
  # Order matters: longer markers go before the shorter ones they contain
  # (' baby shiny' before ' shiny', 'delta species' before ' delta').
  clutter_markers = (
      ' full art',
      ' baby shiny',
      ' shiny',
      ' lucente',
      ' di alola',
      ' di galar',
      'delta species',
      'specie delta',
      ' delta',
      ' vastro',
  )
  for marker in clutter_markers:
    cardname = cardname.replace(marker, '')

  if original_cardname.endswith(' TG'):
    cardname = cardname.replace(' tg', '')
  if original_cardname.endswith(' GG'):
    cardname = cardname.replace(' gg', '')

  # Mega EX cards, IE: MGyarados EX. The check previously referenced an
  # undefined variable (NameError) and sliced off the wrong characters.
  looks_like_mega_ex = (
      len(original_cardname) > 1
      and original_cardname[1].isupper()
      and cardname.startswith('m')
      and cardname.endswith(' ex')
  )
  if looks_like_mega_ex:
    cardname = cardname[1:-len(' ex')]  # removes the M and the EX

  cardname = remove_lv_substring(cardname)

  return cardname.strip()

def removing_cardname(cardname, original_cardname):
    """Return what is left of cardname once the original name's words are removed.

    Each whitespace-separated word of original_cardname is deleted
    (case-insensitively, as a substring) from cardname. If anything is
    left, it is further cleaned with remove_clutter(). The result is stripped.
    """
    leftover = cardname
    for token in original_cardname.split():
        leftover = leftover.lower().replace(token.lower(), '')

    if leftover:
        leftover = remove_clutter(leftover, original_cardname)

    return leftover.strip()

def deduce_version(cardname):
  """Guess the card version from the trailing keyword of a cleaned name.

  Rules are tried in order and the first matching suffix wins, so the
  reverse-holo suffixes are checked before the plain 'holo' one.

  Returns:
    The version label, or None when no known suffix matches.
  """
  suffix_rules = (
      (('holo reverse', 'stamped', 'reverse', ' reverse holo'), 'Reverse Holo'),
      (('cracked ice',), 'Cracked Ice Holo'),
      (('holo',), 'Holo'),
      (('1st ed',), 'First Edition'),
      (('jumbo',), 'Jumbo'),
  )
  for suffixes, label in suffix_rules:
    if cardname.endswith(suffixes):
      return label
  return None


def get_versions(curr, expansion, number):
  """Return the versions registered for a card.

  The card is looked up directly by (expansion, number). If nothing is
  found, the search is repeated over the expansions whose super_expansion
  is the given one.

  Args:
    curr: DB cursor exposing execute() and fetchall().
    expansion: expansion identifier of the card.
    number: card number.

  Returns:
    A list of version names; empty when neither query finds any.
  """
  query = """
    SELECT version
    FROM versioncardtype
    WHERE card_expansion = %s
    AND card_number = %s
  """
  curr.execute(query, (expansion, number))
  versions = [row[0] for row in curr.fetchall()]
  if not versions:
    print('no versions found')
    # The join condition on the expansion id is required: without it the
    # two tables are cross-joined and the versions of ANY card with this
    # number are returned, whatever expansion it belongs to.
    query = """
      SELECT DISTINCT versioncardtype.version
      FROM versioncardtype, cardexpansion
      WHERE versioncardtype.card_number = %s
      AND versioncardtype.card_expansion = cardexpansion.id
      AND cardexpansion.super_expansion = %s
    """
    curr.execute(query, (number, expansion))
    versions = [row[0] for row in curr.fetchall()]
    print(f'new versions: {versions}')
  return versions

def manage_version_manual(df, i, cardname, original_cardname, available_versions, expansion, number):
  """Ask the user to pick the version of a card that could not be deduced.

  The user either types the index of one of available_versions (and then
  confirms with 'y') or types 'e' to flag the row as an error. Invalid
  input (non-numeric or out of range) is rejected and the question is
  asked again.

  Args:
    df: DataFrame to update in place.
    i: index label of the row being handled.
    cardname: card name shown to the user.
    original_cardname: original card name, only shown in the first message.
    available_versions: list of version names the user can choose from.
    expansion: expansion code, only shown in the first message.
    number: card number, only shown in the first message.

  Returns:
    The same DataFrame, with 'version' (and 'version_comment' when flagged
    as an error) set for row i.
  """
  print(f"Cannot recognize the card's version for card {cardname} ({expansion}, {number}) (original cardname {original_cardname})")

  # The menu does not change between attempts, so build it once.
  msg_string = "Available versions are:\n"
  msg_string += "".join(
      f" [{index}]: {version}\n" for index, version in enumerate(available_versions)
  )
  msg_string += "Which do you pick (choose the number associated to the version you want to pick)? Or press 'e' to flag the card as an error\n"

  while True:
    version_input = input(msg_string).strip()
    if version_input.lower() == 'e':
      df.at[i, 'version_comment'] = "ERROR! Couldn't find a version after the manual check"
      df.at[i, 'version'] = 'ERROR_VERSION'
      print("Flagged as error")
      break

    try:
      input_ver = int(version_input)
    except ValueError:
      print('Unrecognized command\n')
      continue

    # Valid indexes are 0..len-1; negative numbers must not wrap around.
    if not 0 <= input_ver < len(available_versions):
      print('Unrecognized command\n')
      continue

    chosen_version = available_versions[input_ver]
    confirm_input = input(f"You choose {chosen_version} for card {cardname}. Is this correct? (y/n)")
    if confirm_input.strip().lower() == 'y':
      # Write only once the choice is confirmed.
      df.at[i, 'version'] = chosen_version
      print("Version successfully inserted!")
      break

  return df

def _flag_version_error(df, i, comment, csv_path):
  """Mark row i as ERROR_VERSION with the given comment and save the CSV."""
  df.at[i, 'version'] = 'ERROR_VERSION'
  df.at[i, 'version_comment'] = comment
  df.to_csv(csv_path, index=False)


def insert_versions(db_params, csv_path):
  """Fill the 'version' and 'version_comment' columns of the CSV at csv_path.

  For every row with status_name == 0 and no valid version yet, the version
  is deduced (default by exclusion, then by keyword, then by matching the
  available version names) or, failing that, asked to the user. The CSV is
  rewritten after every handled row so an interrupted session can be resumed.

  The rows are expected to provide the columns 'status_name', '#', 'Set'
  and 'Card'.

  Args:
    db_params: keyword arguments forwarded to psycopg2.connect().
    csv_path: path of the CSV file, read and overwritten in place.

  Returns:
    None.
  """
  input_check = input("Did you run check_date_consistency(), check_language() and fix_names_inconsistencies()?")
  if input_check.lower() != 'y':
    return

  conn = None
  curr = None
  try:
    conn = psycopg2.connect(**db_params)
    curr = conn.cursor()
  except Exception as e:
    print(f"Error while connecting to the DB: {e}")
    if conn is not None:
      conn.close()
    return

  try:
    df = pd.read_csv(csv_path)
    for column in ('version', 'version_comment'):
      if column not in df.columns:
        # None (not '') so that a freshly created column counts as "missing"
        df[column] = None
        df.to_csv(csv_path, index=False)
      # A column that is entirely empty in the CSV is loaded as float64;
      # make it able to hold the strings written below.
      df[column] = df[column].astype(object)

    for i, row in df.iterrows():
      # if a row has a defined version, then it has already been checked so we skip it
      # (empty strings and ERROR_VERSION do not count as a defined version)
      # if a row has not a status_name equal to 0, then it has not been checked by the
      # fix name inconsistencies step or an error is present in that card
      current_version = row['version']
      has_version = not pd.isna(current_version) and current_version not in ('', 'ERROR_VERSION')
      if row['status_name'] != 0 or has_version:
        continue

      number = str(row['#'])
      if number.startswith('TG0'):
        number = number.replace('TG0', 'TG')
      elif number.startswith('GG0'):
        number = number.replace('GG0', 'GG')
      original_cardname, _ = get_card_name(curr, row['Set'], number)  # also returns the expansion
      available_versions = get_versions(curr, row['Set'], number)

      # error checks
      if len(available_versions) == 0:
        _flag_version_error(df, i, 'ERROR: no versions are associated to this card', csv_path)
        print("No versions were associated. Flagged it as error")
        continue
      if original_cardname is None:
        # without the DB name the cleaning helpers below cannot work
        _flag_version_error(df, i, 'ERROR: card not found in the DB', csv_path)
        print("Card not found in the DB. Flagged it as error")
        continue

      # deducing default version by exclusion
      leftover_cardname = removing_cardname(row['Card'], original_cardname)
      cleaned_cardname = clean_cardname(leftover_cardname, row['Set'], original_cardname)
      # only one available version exists (default) or the string is empty after
      # cleaning, meaning it hasn't any particular version
      if len(available_versions) == 1 or len(cleaned_cardname) == 0:
        if 'default' not in available_versions:
          _flag_version_error(df, i, "ERROR: default version doesn't exist for this card", csv_path)
          print("Default version doesn't exist for this card. Flagged it as error")
          continue
        df.at[i, 'version'] = 'default'
        df.to_csv(csv_path, index=False)
        print(f"Default version added for row {i}!")
        continue

      # deducing by checking keywords in cardname
      version = deduce_version(cleaned_cardname)
      if version is not None and version not in available_versions:
        # a version was found but it's not in the available versions of that card
        print(f"A version was deduced ({version}) but it was not available for this card.")
        df = manage_version_manual(df, i, row['Card'], original_cardname, available_versions, row['Set'], row['#'])
        df.to_csv(csv_path, index=False)
        continue
      if version is not None:
        # a version has been deduced and it is in the available versions
        df.at[i, 'version'] = version
        df.to_csv(csv_path, index=False)
        print(f"Version successfully deduced for card {row['Card']}: {version}")
        continue

      # version is still None: check if an available version appears in the cleaned cardname
      # NOTE(review): cleaned_cardname is lowercase, so this only matches version
      # names that are lowercase too - confirm whether that is intended.
      for available_version in available_versions:
        if available_version in cleaned_cardname:
          version = available_version
          df.at[i, 'version'] = version
          df.to_csv(csv_path, index=False)
          print(f"Version successfully deduced extensively for card {row['Card']}: {version}")
          break
      if version is None:
        # after the extensive search the version is still None: handle it manually
        df = manage_version_manual(df, i, row['Card'], original_cardname, available_versions, row['Set'], row['#'])
        df.to_csv(csv_path, index=False)

    print("### Complete ###")
  finally:
    # always release the DB resources, even if a row raised or the user interrupted
    curr.close()
    conn.close()

In [None]:
import os

import pandas as pd
import psycopg2
data_path = 'H:/My Drive/pkmn/pok_swap.csv'

# Connection parameters for the cardmaster DB. Each value can be overridden
# through an environment variable; the literals are only fallbacks.
# NOTE(review): the fallback password is committed in clear text - prefer
# setting CARDMASTER_DB_PASSWORD and removing the literal from the notebook.
db_params = {
    'host': os.environ.get('CARDMASTER_DB_HOST', '192.168.1.2'),
    'port': os.environ.get('CARDMASTER_DB_PORT', '5432'),
    'database': os.environ.get('CARDMASTER_DB_NAME', 'cardmaster'),
    'user': os.environ.get('CARDMASTER_DB_USER', 'nemo'),
    'password': os.environ.get('CARDMASTER_DB_PASSWORD', '322322'),
}

insert_versions(db_params, data_path)

Did you run check_date_consistency(), check_language() and fix_names_inconsistencies()?y
The card is Zapdos
Default version added for row 0!
The card is Mewtwo
Default version added for row 1!
The card is Sabrina's Mr. Mime
Default version added for row 2!
The card is Sabrina's Slowbro
Default version added for row 3!
The card is Blaine's Growlithe (lv 15)
Default version added for row 4!
The card is Blaine's Ponyta
Default version added for row 5!
The card is Brock's Mankey (lv 10)
Default version added for row 6!
The card is Lt. Surge's Voltorb
Default version added for row 7!
The card is Lt. Surge's Treaty
Default version added for row 8!
The card is Erika's Bulbasaur
Default version added for row 9!
The card is Erika's Jigglypuff
Default version added for row 10!
The card is Koga's Pidgey (Lv 15)
Default version added for row 11!
The card is Misty's Psyduck
Default version added for row 12!
The card is Giovanni
Default version added for row 13!
The card is Lt. Surge's Secret Plan
D

In [None]:
import pandas as pd
data_path = 'H:/My Drive/pkmn/pok_ok.csv'

# Reset utility: remove the columns added by insert_versions() from the CSV.
df = pd.read_csv(data_path)
# errors='ignore' keeps the cell re-runnable when the columns are already gone
df = df.drop(columns=['version', 'version_comment'], errors='ignore')
df.to_csv(data_path, index=False)