# Zhifeng's Card Data Cleaning Notebook

date: y2025m01d05

---

In [1]:
lines = open("cards_v0.csv", 'r').readlines()

In [3]:
def transform_line(original_line: str):
  ans_line = ""
  comma_count = 0
  for c in original_line:
    if c == ',':
      comma_count += 1
  curr_comma_count = 0
  for c in original_line:
    if c == ',':
      if curr_comma_count == 0 or curr_comma_count >= comma_count - 4:
        ans_line += ';'
      else:
        ans_line += ','
      curr_comma_count += 1
    else:
      ans_line += c
  return ans_line

In [2]:
lines

['card_id, description, name, keyword1, keyword2, keyword3\n',
 "1, 'A long-term partner with AAA,the company ME is the largest medicine distributor for influenza in Asia. ', 'S1T1C16', 'ME', 'Asia', 'Influenza'\n",
 "2, 'Suppose AAA tries to expand its business sector for influenza treatment in Asia. In that case, the largest medicine distributor in Asia, ME, will be a candidate with potential.', 'S1T1C17', 'ME', 'expansion', 'AAA'\n",
 "3, 'ME had stable growth in their sales revenue of 12%. Last year, the revenue was $450 million.', 'S1T1C1', 'ME', 'growth', 'revenue'\n",
 "4, 'AAA\\'s board members maintained a good relationship with ME\\'s CEO. They had several public and private meetings during the last five years.', 'S1T1C18', 'ME', 'AAA', 'relationship'\n",
 '5, \'The company pleaded guilty to the false promotion of prescription drug "Z" (which is related to influenza) in 2010, including kickbacks to doctors and pharmacists. All fines were paid.\', \'S1T1C25\', \'ME\', \'guilty

In [23]:
transform_line(lines[2])

"2; 'Suppose AAA tries to expand its business sector for influenza treatment in Asia. In that case, the largest medicine distributor in Asia, ME, will be a candidate with potential.'; 'S1T1C17'; 'ME'; 'expansion'; 'AAA'\n"

In [4]:
with open("cards_v1.csv", "w") as cards_v1_file:
  for line in lines[1:]:
    cards_v1_file.write(transform_line(line))

# Phase 2 Clean

Separate "name" column.

In [2]:
import pandas as pd

In [17]:
df = pd.read_csv("cards_v1.csv", sep=";")
df

Unnamed: 0,id,description,name,keyword1,keyword2,keyword3
0,1,"'A long-term partner with AAA,the company ME ...",'S1T1C16','ME','Asia','Influenza'
1,2,'Suppose AAA tries to expand its business sec...,'S1T1C17','ME','expansion','AAA'
2,3,'ME had stable growth in their sales revenue ...,'S1T1C1','ME','growth','revenue'
3,4,'AAA\'s board members maintained a good relat...,'S1T1C18','ME','AAA','relationship'
4,5,'The company pleaded guilty to the false prom...,'S1T1C25','ME','guilty','false promotion'
...,...,...,...,...,...,...
383,384,'The melting ice glaciers can cause massive d...,'S3T3C32','melt','damage','human '
384,385,'Sea level rising can be a false topic. There...,'S3T3C42','sea level','false topic','difference'
385,386,'The change in the geologic structures under ...,'S3T3C43','geologic','change','change'
386,387,'Sea level rise can also benefit human beings...,'S3T3C44','benefit','human ','water'


In [32]:
def parse_name_fn(name_str:str):
  ans_list = [0,0,0]
  parsing_num_flag = False
  num_i = 0
  for c_char in name_str:
    if c_char.isdigit():
      ans_list[num_i] = ans_list[num_i] * 10 + int(c_char)
      parsing_num_flag = True
    else:
      if parsing_num_flag == True:
        parsing_num_flag = False
        num_i += 1

  return (ans_list[0], ans_list[1], ans_list[2])

assert(parse_name_fn("S1T1C16") == (1,1,16)) 

In [14]:
df.columns

Index(['id', ' description', ' name', ' keyword1', ' keyword2', ' keyword3'], dtype='object')

In [75]:
COL_S_ID_NAME = "scenario_id"
COL_T_ID_NAME = "type_id"
COL_C_ID_NAME = "local_card_id"
COL_DESCRIPTION_NAME = "description"
COL_KEY_1_NAME = "keyword1"
COL_KEY_2_NAME = "keyword2"
COL_KEY_3_NAME = "keyword3"

In [76]:
df[COL_S_ID_NAME] = df["name"].transform(lambda x: parse_name_fn(x)[0])
df[COL_T_ID_NAME] = df["name"].transform(lambda x: parse_name_fn(x)[1])
df[COL_C_ID_NAME] = df["name"].transform(lambda x: parse_name_fn(x)[2])

## Clean Description Column

In [77]:
DESCRITPION_TRIM_STR = ' \''
def clean_description_fn(description_str:str):
  return description_str.lstrip(DESCRITPION_TRIM_STR).rstrip(DESCRITPION_TRIM_STR)

assert(clean_description_fn("  'hello world' ") == "hello world")

In [78]:
for col_name in [COL_DESCRIPTION_NAME, COL_KEY_1_NAME, COL_KEY_2_NAME, COL_KEY_3_NAME]:
  df[col_name] = df[col_name].transform(clean_description_fn)
df

Unnamed: 0,id,description,name,keyword1,keyword2,keyword3,scenario_id,turn_id,local_card_id,type_id
0,1,"A long-term partner with AAA,the company ME is...",'S1T1C16',ME,Asia,Influenza,1,1,16,1
1,2,Suppose AAA tries to expand its business secto...,'S1T1C17',ME,expansion,AAA,1,1,17,1
2,3,ME had stable growth in their sales revenue of...,'S1T1C1',ME,growth,revenue,1,1,1,1
3,4,AAA\'s board members maintained a good relatio...,'S1T1C18',ME,AAA,relationship,1,1,18,1
4,5,The company pleaded guilty to the false promot...,'S1T1C25',ME,guilty,false promotion,1,1,25,1
...,...,...,...,...,...,...,...,...,...,...
383,384,The melting ice glaciers can cause massive dam...,'S3T3C32',melt,damage,human,3,3,32,3
384,385,Sea level rising can be a false topic. There a...,'S3T3C42',sea level,false topic,difference,3,3,42,3
385,386,The change in the geologic structures under th...,'S3T3C43',geologic,change,change,3,3,43,3
386,387,Sea level rise can also benefit human beings. ...,'S3T3C44',benefit,human,water,3,3,44,3


In [79]:
FINAL_COL_NAME_LIST = [COL_S_ID_NAME, COL_T_ID_NAME, COL_C_ID_NAME, COL_KEY_1_NAME, COL_KEY_2_NAME, COL_KEY_3_NAME, COL_DESCRIPTION_NAME]

In [80]:
df_1 = df[FINAL_COL_NAME_LIST]

In [81]:
df_1

Unnamed: 0,scenario_id,type_id,local_card_id,keyword1,keyword2,keyword3,description
0,1,1,16,ME,Asia,Influenza,"A long-term partner with AAA,the company ME is..."
1,1,1,17,ME,expansion,AAA,Suppose AAA tries to expand its business secto...
2,1,1,1,ME,growth,revenue,ME had stable growth in their sales revenue of...
3,1,1,18,ME,AAA,relationship,AAA\'s board members maintained a good relatio...
4,1,1,25,ME,guilty,false promotion,The company pleaded guilty to the false promot...
...,...,...,...,...,...,...,...
383,3,3,32,melt,damage,human,The melting ice glaciers can cause massive dam...
384,3,3,42,sea level,false topic,difference,Sea level rising can be a false topic. There a...
385,3,3,43,geologic,change,change,The change in the geologic structures under th...
386,3,3,44,benefit,human,water,Sea level rise can also benefit human beings. ...


## Prepare to Save

In [82]:
df_final = df_1.sort_values(axis=0, by=[COL_S_ID_NAME, COL_T_ID_NAME, COL_C_ID_NAME]).reset_index()[FINAL_COL_NAME_LIST]
df_final

Unnamed: 0,scenario_id,type_id,local_card_id,keyword1,keyword2,keyword3,description
0,1,1,1,ME,growth,revenue,ME had stable growth in their sales revenue of...
1,1,1,2,ME,lobbying,expenditures,To maintain the advantage in bargaining with g...
2,1,1,3,NJ,vaccine,effectiveness,The company is developing a new generation vac...
3,1,1,4,NJ,patent,influenza,NJ owns research centers in East Asia and part...
4,1,1,5,NJ,partnerships,factories,NJ partners with local manufacturers and owns ...
...,...,...,...,...,...,...,...
383,3,3,41,mitigate,flooding,climate,"In Netherlands and Germany, climate dikes were..."
384,3,3,42,sea level,false topic,difference,Sea level rising can be a false topic. There a...
385,3,3,43,geologic,change,change,The change in the geologic structures under th...
386,3,3,44,benefit,human,water,Sea level rise can also benefit human beings. ...


In [85]:
df_final.to_csv("cards.csv", quoting=2, quotechar='"')