<a href="https://colab.research.google.com/github/Pikarz/data_clean_and_validation_from_olddoc_to_cardmaster/blob/main/03%20-%20fix_names_inconsistencies.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Fix names inconsistencies

When inserting a lot of data (cards, in this context), mistakes can happen. For example: I inserted a 'Charizard', but the combination of expansion and number matches a Blastoise. This is certainly wrong: I actually meant a Charizard, because I typed that name in manually.

This notebook checks whether the card name is consistent with the pair (expansion, number). If it is not, it asks for the user's input and updates the .csv file accordingly.

This notebook takes as input the output of notebook 02 and returns an updated .csv with two new attributes:
- status_name: an integer that reflects the state of the card. If status_name=0, the card has already been checked and is consistent. If status_name=-1, the card is yet to be checked. If status_name=1, the card has some kind of error.
- comment_status: a text field used to explain common errors found in the card row.

In [None]:
import pandas as pd
import psycopg2
import re

def remove_lv_substring(original_string):
    """Delete level markers such as 'lv.45' or 'Liv. 7' from a string.

    A marker is 'lv' or 'liv' (any letter case), a literal dot, at most one
    whitespace character and one to three digits. Every occurrence is
    removed; whatever surrounds it is left as is, apart from the final
    strip of leading/trailing whitespace.
    """
    level_marker = r'(li?v)\.\s?\d{1,3}'
    without_level = re.sub(level_marker, '', original_string, flags=re.IGNORECASE)
    return without_level.strip()

def is_japanese_expansion(curr, expansion):
  """Tell whether `expansion` is an id present in the Cardexpansionjap table.

  Args:
    curr: an open DB cursor exposing execute() and fetchone().
    expansion: the expansion id to look up.

  Returns:
    True when the lookup yields a row, False otherwise.
  """
  query = """
    SELECT 1
    FROM Cardexpansionjap
    WHERE id = %s
  """
  curr.execute(query, (expansion,))
  if curr.fetchone():
    return True
  return False

def remove_clutter(cardname, original_cardname):
  """Strip rarity / variant wording from a card name.

  Args:
    cardname: the working name. Every substring tested here is lower case,
      so the caller is expected to pass an already lower-cased name.
    original_cardname: the name as it was typed; only its upper-case
      ' TG', ' GG' and ' VSTAR' endings are inspected.

  Returns:
    The name without the recognised variant words and without any level
    marker (see remove_lv_substring), stripped of surrounding whitespace.
  """
  # Removed wherever they appear. Order matters: a longer phrase must go
  # before the shorter one it contains (' baby shiny' before ' shiny',
  # 'delta species' before ' delta').
  always_removed = (
    ' full art',
    ' baby shiny',
    ' shiny',
    ' lucente',
    ' di alola',
    ' di galar',
    'delta species',
    'specie delta',
    ' delta',
  )
  for noise in always_removed:
    cardname = cardname.replace(noise, '')

  # (ending of the ORIGINAL name, text to find in the working name, replacement)
  # NOTE(review): the last rule rewrites ' vastro' to 'vstar' only when the
  # original ends with ' VSTAR' - confirm this is the intended trigger.
  keyed_on_original = (
    (' TG', ' tg', ''),
    (' GG', ' gg', ''),
    (' VSTAR', ' vastro', 'vstar'),
  )
  for ending, target, replacement in keyed_on_original:
    if original_cardname.endswith(ending):
      cardname = cardname.replace(target, replacement)

  # Dropped only when the working name ends with them (all occurrences then go).
  for trailing in (' alt art', ' hyper'):
    if cardname.endswith(trailing):
      cardname = cardname.replace(trailing, '')

  return remove_lv_substring(cardname).strip()

def remove_version(cardname):
  """Remove print-version wording (holo, reverse, 1st ed, ...) from a name.

  The rules run in the listed order. Each rule is a pair
  (trigger, removed_text): when `trigger` occurs in the name, every
  occurrence of `removed_text` is deleted. The two differ for the first
  rule (' holo reverse' only drops ' reverse', leaving ' holo' for a later
  rule) and for '1st ed' (the removed text carries a leading space).

  Returns the resulting name without leading/trailing whitespace.
  """
  rules = (
    (' holo reverse', ' reverse'),
    (' cracked ice', ' cracked ice'),
    (' holo', ' holo'),
    (' stamped', ' stamped'),
    ('1st ed', ' 1st ed'),
    (' reverse', ' reverse'),
    (' jumbo', ' jumbo'),
  )
  for trigger, removed_text in rules:
    if trigger in cardname:
      cardname = cardname.replace(removed_text, '')

  return cardname.strip()


def clean_cardname(cardname, cardexpansion): # removes all (hopefully) unnecessary stuff leaving only the Pokemon's name.
  """Reduce a hand-typed card name to the bare Pokemon name.

  The name is lower-cased and stripped, then passed through remove_version
  and remove_clutter. Two special cases follow:

  * gym cards: when the name contains "'s " and the expansion is one of
    GH, GC, CFTD or LST, only the last word is kept;
  * Mega EX cards typed as 'M<Name> EX' (IE: MGyarados EX): the leading
    'M' is dropped and the first remaining word is kept.

  Args:
    cardname: the card name as typed by the user.
    cardexpansion: the expansion id of the row.

  Returns:
    The cleaned, lower-case name.
  """
  gym_expansions = ("GH", "GC", "CFTD", "LST")
  typed_name = cardname.strip()
  new_cardname = cardname.lower().strip()

  new_cardname = remove_version(new_cardname)
  new_cardname = remove_clutter(new_cardname, cardname)

  if "'s " in new_cardname and cardexpansion in gym_expansions:
    new_cardname = new_cardname.split()[-1] # if the cardname contains the substring "'s" and its expansion is one of those, then it's a gym card and the pokemon is at the final part of the string.
  elif (len(typed_name) > 1 and typed_name[1].isupper() # an upper-case 2nd letter tells 'MGyarados' apart from e.g. 'Mew'; the length check avoids an IndexError
        and new_cardname.startswith('m') and new_cardname.endswith(' ex')): # to handle Mega EX cards, IE: MGyarados EX
    # Skip the leading 'm' ([1:], not [:1] which kept ONLY the 'm'), then keep
    # the first word so that the trailing 'ex' is dropped as well.
    new_cardname = new_cardname[1:].split()[0]

  return new_cardname.strip()

def get_card_name(curr, card_expansion, card_number):
  """Look up the name of the card identified by (expansion, number).

  The card is first searched in CardType under `card_expansion` itself;
  if nothing is found, it is searched among the expansions whose
  super_expansion is `card_expansion`. The first hit is printed and returned.

  Args:
    curr: an open DB cursor exposing execute() and fetchone().
    card_expansion: the expansion id.
    card_number: the card number inside the expansion.

  Returns:
    A tuple (card_name, expansion_id) where expansion_id is the second
    column of the matching row, or (None, None) when neither lookup
    matches.
  """
  direct_lookup = """
          SELECT CardType.name, CardType.expansion
          FROM CardType
          WHERE CardType.expansion = %s AND CardType.number = %s
  """
  # fallback: the card could be in a sub-expansion
  sub_expansion_lookup = """
        SELECT CardType.name, Cardexpansion.id
        FROM CardType, Cardexpansion
        WHERE Cardexpansion.super_expansion = %s
        AND CardType.expansion = Cardexpansion.id
        AND CardType.number = %s
      """
  params = (card_expansion, card_number,)

  for lookup in (direct_lookup, sub_expansion_lookup):
    curr.execute(lookup, params)
    match = curr.fetchone()
    if match:
      found_name, found_expansion = match[0], match[1]
      print(f"The card is {found_name}")
      return found_name, found_expansion

  return None, None

def update_card(curr, row, is_updated_expansion):
    """Ask the user for a corrected (expansion, number) until it is settled.

    Args:
        curr: an open DB cursor, forwarded to get_card_name.
        row: the CSV row being fixed; its 'Set' value is the starting
            expansion.
        is_updated_expansion: when True the expansion is asked on the first
            attempt too; after any rejected attempt it is always asked.

    Returns:
        A tuple (status, expansion, number) holding the last values typed by
        the user. status is 0 when the user confirms the card found in the
        DB, 1 when the user flags the row as an error.
    """
    new_expansion = row['Set']
    new_number = row['#']  # starting value only; always replaced by the prompt below
    ask_for_expansion = is_updated_expansion

    while True:
        if ask_for_expansion:
            new_expansion = input("Insert the new expansion id (case sensitive)\n")
        new_number = input("Insert the new number value (case sensitive)\n")
        found_name, _ = get_card_name(curr, new_expansion, new_number)

        # Whatever happens next, a further attempt asks for the expansion as well.
        ask_for_expansion = True

        if found_name is None:
            flag_answer = input('Error! The card does not exist\nDo you want to flag the card as an error? (y/n)\n')
            if flag_answer.lower() == 'y':
                return 1, new_expansion, new_number
            continue

        verdict = input(f"The new card is {found_name}. Is this correct? (y/n/e to flag it as an error)\n").lower()
        if verdict == 'y':
            return 0, new_expansion, new_number
        if verdict == 'e':
            return 1, new_expansion, new_number

def choice_for_card(curr, row, error=False):
  """Ask the user how to resolve a row whose name could not be validated.

  Args:
    curr: an open DB cursor, forwarded to update_card.
    row: the CSV row under review; its 'Set' and '#' values are returned
      unchanged unless the user enters new ones.
    error: True when the card was not found in the DB. The '1: OK' option
      is then neither offered nor accepted.

  Returns:
    A tuple (status, expansion, number): status 0 means accepted, 1 means
    flagged as error; for choices 2 and 3 the triple comes from update_card.
  """
  status = None
  sub_expansion = row['Set']
  number = row['#']
  msg_string = "What do you want to do?\n1: OK\n2: Change card number\n3: Change expansion and card number\n4: Flag it as error\n"
  if error:
    # str.replace returns a new string, so the result has to be re-bound;
    # otherwise the menu keeps advertising an option that is rejected below.
    msg_string = msg_string.replace('1: OK\n', '')
  while True:
    choice_input = input(msg_string)
    if choice_input == '1' and not error:
      status = 0
      break
    elif choice_input == '2':
      status, sub_expansion, number = update_card(curr, row, False)
      break
    elif choice_input == '3':
      status, sub_expansion, number = update_card(curr, row, True)
      break
    elif choice_input == '4':
      status = 1 # flag it as error
      break
    else:
      print('Unrecognized command\n')
  return status, sub_expansion, number


def fix_names_inconsistencies(db_params, csv_path):
  """Interactively check every unchecked CSV row against the card database.

  For each row whose status_name is -1 and whose date_comment and
  language_comment are both 'OK', the name stored in the DB for
  (row['Set'], row['#']) is compared with the cleaned row['Card'] name. Rows
  that match (and whose expansion/language pairing is valid) get
  status_name=0; for the others the user is asked what to do through
  choice_for_card. The CSV is rewritten after every processed row, so an
  interrupted session can be resumed.

  Args:
    db_params: keyword arguments forwarded to psycopg2.connect.
    csv_path: path of the CSV to read and update in place. The columns
      'status_name' and 'comment_status' are created when missing.

  Returns:
    None. Returns early when the user does not confirm the preliminary
    checks or when the DB connection cannot be opened.
  """
  input_date_check = input("Did you run check_date_consistency() and check_language()?")
  if input_date_check.lower() != 'y':
    return
  conn = None
  curr = None
  try:
    conn = psycopg2.connect(**db_params)
    curr = conn.cursor()
  except Exception as e:
    print(f"Error while connecting to the DB: {e}")
    if conn is not None: # connect() worked but cursor() failed: do not leak the connection
      conn.close()
    return

  try:
    df = pd.read_csv(csv_path)
    if 'status_name' not in df.columns:
      df['status_name'] = -1
    if 'comment_status' not in df.columns:
      df['comment_status'] = ''
    for i, row in df.iterrows():
      if row['status_name'] != -1 or row['date_comment'] != 'OK' or row['language_comment'] != 'OK': # if a row has a status different than -1 it has already been checked, so we skip it. if a row has not a valid date and language, we skip it.
        continue

      is_jap_expansion = is_japanese_expansion(curr, row['Set'])
      is_jap_lang = row['Lang'] == 'JAP'
      # `row` is the per-iteration Series handed out by iterrows(); the normalised
      # number is stored on it so that choice_for_card sees it too.
      row['#'] = str(row['#']).replace('TG0', 'TG')

      actual_cardname, expansion = get_card_name(curr, row['Set'], row['#']) # returns the card's name associated to the row that will be used to check for inconsistencies. also returns the sub-expansion, so the cards' actual expansion according to DB. IE: CRZ_GG for the GG69 Giratina from Crown Zenith
      cleaned_cardname = clean_cardname(row['Card'], row['Set'])
      if not actual_cardname: # the card doesn't exist in the DB
        print(f"The card {row['Set']} #{row['#']} does not exists in the database.\nCurrent card name: {row['Card']}")
        status, expansion, number = choice_for_card(curr, row, error=True)
      elif cleaned_cardname in actual_cardname.lower() and (is_jap_expansion == is_jap_lang): # the card name is correct and the association expansion-language is valid
        df.at[i, 'status_name'] = 0
        df.to_csv(csv_path, index=False)
        print(f"The card at row {i} is correct!")
        continue
      else: # the card exists but the names do not match
        print(f"The names do not match for card: {row['Set']} #{row['#']}\nCard name: {row['Card']}\n***Expected: {actual_cardname}***")
        status, expansion, number = choice_for_card(curr, row)

      df.at[i, 'status_name'] = status
      df.at[i, 'Set'] = expansion
      df.at[i, '#'] = number
      df.to_csv(csv_path, index=False)

      # the user may have typed a different expansion: re-check it against the language
      is_jap_expansion = is_japanese_expansion(curr, expansion)
      if not (is_jap_expansion == is_jap_lang): # the expansion is japanese if and only if the language is JAP
        print("There's an inconsistency with expansion/language. Flagging it as error")
        df.at[i, 'status_name'] = 1
        df.at[i, 'comment_status'] = 'Expansion / language inconsistency'
        df.to_csv(csv_path, index=False)
      else:
        print(f'Inserted the new status {status} for card in row {i}!\n\n')

    print('### Completed! ###')
  finally:
    # Release the DB resources however the loop ends (completion, error or
    # a user interrupt during one of the prompts).
    curr.close()
    conn.close()

# GENERAL TODO:
# - check if dates have the correct format
# - parse version


In [None]:
data_path = 'H:/My Drive/pkmn/pok_swap.csv'

# Connection settings handed to fix_names_inconsistencies (placeholder values).
db_params = dict(
    host='192.168.example.example',
    port='5432',
    database='example',
    user='example',
    password='example',
)

# The file at data_path may carry the column [status_name], which takes one of three values:
#   -1: unchecked, meaning that the row must still be checked
#    0: OK, meaning that the row has been checked and the tuple (expansion, card_number) is correct
#    1: error; the row requires a manual fix (should only happen in rare cases, IE lamincards or other collectables)
fix_names_inconsistencies(db_params, data_path)

Did you run check_date_consistency() and check_language()?y
The card is Zapdos
The card at row 0 is correct!
The card is Mewtwo
The card at row 1 is correct!
The card is Sabrina's Mr. Mime
The card at row 2 is correct!
The card is Sabrina's Slowbro
The card at row 3 is correct!
The card is Blaine's Growlithe (lv 15)
The card at row 4 is correct!
The card is Blaine's Ponyta
The card at row 5 is correct!
The card is Brock's Mankey (lv 10)
The card at row 6 is correct!
The card is Lt. Surge's Voltorb
The card at row 7 is correct!
The card is Lt. Surge's Treaty
The card at row 8 is correct!
The card is Erika's Bulbasaur
The card at row 9 is correct!
The card is Erika's Jigglypuff
The card at row 10 is correct!
The card is Koga's Pidgey (Lv 15)
The card at row 11 is correct!
The card is Misty's Psyduck
The card at row 12 is correct!
The card is Giovanni
The card at row 13 is correct!
The card is Lt. Surge's Secret Plan
The card at row 14 is correct!
The card is Misty's Wish
The card at row 

In [None]:
# sanity check: every (Set, #) pair of the validated CSV must exist in the DB
data_path = 'H:/My Drive/pkmn/pok_ok.csv'

# NOTE(review): these look like real credentials committed in clear text -
# consider loading them from the environment or from an untracked config file.
db_params = {
    'host': '192.168.1.2',
    'port': '5432',
    'database': 'cardmaster',
    'user': 'nemo',
    'password': '322322'
}

conn = None
curr = None
try:
  conn = psycopg2.connect(**db_params)
  curr = conn.cursor()
except Exception as e:
  print(f"Error while connecting to the DB: {e}")

df = pd.read_csv(data_path)
if curr is not None: # without a cursor there is nothing to check (the connection error was printed above)
  for i, row in df.iterrows():
    # get_card_name returns a (name, expansion) tuple: unpack it, otherwise the
    # `is None` test below can never succeed and missing cards go unreported.
    name, _ = get_card_name(curr, row['Set'], row['#'])
    if name is None:
      print(f"ERROR: row {row} at index {i}")
      break

The card is Claydol
The card is Baltoy
The card is Golurk
The card is Golurk
The card is Lugia EX
The card is Primal Kyogre
The card is Kyurem EX
The card is Bellossom
The card is Electrode
The card is Kingdra
The card is Magneton
The card is Octillery
The card is Apricorn Forest
The card is Azumarill
The card is Vileplume
The card is Octillery
The card is Victreebel
The card is Suicune
The card is Suicune
The card is Umbreon
The card is Steelix
The card is Espeon
The card is Houndour
The card is Umbreon
The card is Houndoom
The card is Umbreon
The card is Vileplume
The card is Umbreon
The card is Ampharos
The card is Arcanine
The card is Ariados
The card is Azumarill
The card is Bellossom
The card is Blissey
The card is Donphan
The card is Electrode
The card is Elekid
The card is Entei
The card is Espeon
The card is Exeggutor
The card is Houndoom
The card is Houndoom
The card is Hypno
The card is Jumpluff
The card is Jynx
The card is Kingdra
The card is Lanturn
The card is Lanturn
The