#### Loading packages

In [1]:
import glob
import errno
import string
import numpy as np
import csv

#### Creating corpus

In [2]:
def _build_corpus(pattern):
    """Collect cleaned word tokens from every text file matching ``pattern``.

    Each line has the characters in ``string.punctuation`` and every digit
    character removed, is lower-cased and is split on whitespace. Tokens
    are accumulated file by file, line by line.

    Parameters
    ----------
    pattern : str
        Glob pattern selecting the input files.

    Returns
    -------
    list of str
        The tokens of all matched files, concatenated.

    Raises
    ------
    IOError
        Any I/O error other than ``EISDIR`` (a matched path that is a
        directory is silently skipped).
    """
    # Build the punctuation-deleting table once instead of once per line.
    strip_punct = str.maketrans('', '', string.punctuation)
    tokens = []
    # glob yields paths in arbitrary order; sorting keeps the token order
    # (and therefore the saved corpus file) reproducible across runs.
    for name in sorted(glob.glob(pattern)):
        try:
            with open(name, encoding='utf8') as f:
                for line in f:
                    cleaned = line.translate(strip_punct)
                    cleaned = ''.join(ch for ch in cleaned if not ch.isdigit())
                    tokens.extend(cleaned.lower().split())
        except IOError as exc:
            # Directories that happen to match the pattern are ignored.
            if exc.errno != errno.EISDIR:
                raise
    return tokens


# Ground-truth corpus: one token per line in the output file.
truth = _build_corpus('data/ground_truth/*.txt')
np.savetxt('output/truth_corpus.dat', truth, fmt='%s', encoding='utf8')

# Tesseract (OCR output) corpus, cleaned the same way.
tess = _build_corpus('data/tesseract/*.txt')
np.savetxt('output/tess_corpus.dat', tess, fmt='%s', encoding='utf8')

#### Reading data for feature extraction

In [3]:
# Collect the distinct (error, truth) word pairs from the error table.
# Columns 1 and 2 of each row are lower-cased; rows where both are equal
# and pairs already seen are skipped.
Error = []
Truth = []
pair = []
seen_pairs = set()  # hashed copy of `pair` so the duplicate check is O(1)
# newline='' lets the csv module do its own newline handling.
with open('data/Error_df_rules_based.csv', encoding='utf8', newline='') as f:
    for row in csv.reader(f, delimiter=','):
        err = row[1].lower()
        trt = row[2].lower()
        if err == trt or (err, trt) in seen_pairs:
            continue
        seen_pairs.add((err, trt))
        Error.append(err)
        Truth.append(trt)
        pair.append([err, trt])

# Drop the first two collected pairs.
# NOTE(review): presumably these are header / non-data rows -- confirm
# against the CSV. `pair` is not sliced, so it still contains them.
Error = Error[2:]
Truth = Truth[2:]

#### Feature extraction

In [4]:
from lib.feature_scoring import n_gram
from lib.feature_scoring import candidate_search
from lib.feature_scoring import LED_score
from lib.feature_scoring import SS_score
from lib.feature_scoring import LP_score
from lib.feature_scoring import ECP_score
from lib.feature_scoring import RCP_score

In [5]:
# Column accumulators for the feature table. The first element of each list
# is its column label; one entry is appended per (error, candidate) pair.
W_error = ['Typo']
W_truth = ['Truth']
W_cand = ['Candidate']
Label = ['Label']
LED = ['led_score']
SS = ['ss_score']
LP = ['lp_score']
ECP = ['ECP_score']  # never filled: ECP scoring is disabled below

n = 3 # n_gram
for i, (w_e, w_c) in enumerate(zip(Error, Truth)):
    cand_list = candidate_search(truth, w_e)
    print('word ',i+1,', error: ', w_e, ', truth: ', w_c)

    # Score every candidate once and reuse the values below instead of calling
    # LP_score a second time per candidate.
    # NOTE(review): assumes LP_score returns the same value for the same
    # arguments -- confirm in lib.feature_scoring.
    LP_freq = [LP_score(s, truth) for s in cand_list]
    # Hoisted out of the candidate loop; 0 when there are no candidates.
    lp_max = max(LP_freq) if LP_freq else 0

    # ECP / RCP features are currently disabled:
#    gram_list = n_gram(w_e, tess, n)
#    ECP_freq = [ECP_score(gram_list, s, truth, n) for s in cand_list]

    for s, lp_freq in zip(cand_list, LP_freq):
        led = LED_score(w_e, s)
        ss = SS_score(w_e, s, N=3)
        # LP is normalised by the largest LP score among this error's
        # candidates; an all-zero list yields 0 instead of dividing by zero.
        lp = lp_freq / lp_max if lp_max else 0
#        if max(ECP_freq)==0: ecp=0
#        else: ecp = ECP_score(gram_list, s, truth, n)/max(ECP_freq)
#        rcp = RCP_score(w_e, s, tess, truth)
        # 1 when the candidate is exactly the ground-truth word, else 0.
        label = int(s == w_c)
#        print('candidate:', s, '\tscores =', '{:03.2f}'.format(led),', {:03.2f}'.format(ss),', {:06.5f}'.format(lp), '\tlabel=', label)
        W_error.append(w_e)
        W_truth.append(w_c)
        W_cand.append(s)
        Label.append(label)
        LED.append(led)
        SS.append(ss)
        LP.append(lp)
        #ECP.append(ecp)

word  1 , error:  willlam , truth:  william
word  2 , error:  nvolvng , truth:  involving
word  3 , error:  t , truth:  the
word  4 , error:  aflcid , truth:  aflcio
word  5 , error:  cmng , truth:  attaching
word  6 , error:  admlnlstratlons , truth:  administrations
word  7 , error:  d , truth:  gold
word  8 , error:  jurlsdlctlon , truth:  jurisdiction
word  9 , error:  m , truth:  to
word  10 , error:  mm , truth:  bill
word  11 , error:  prohlblt , truth:  prohibit
word  12 , error:  mg , truth:  ing
word  13 , error:  contrlbute , truth:  contribute
word  14 , error:  reportlng , truth:  reporting
word  15 , error:  statlstlcs , truth:  statistics
word  16 , error:  worklng , truth:  working
word  17 , error:  condltlons , truth:  conditions
word  18 , error:  dlstrlbutlon , truth:  distribution
word  19 , error:  advertlslng , truth:  advertising
word  20 , error:  commlttee , truth:  committee
word  21 , error:  cm , truth:  cma
word  22 , error:  a , truth:  qa
word  23 , erro

word  176 , error:  m , truth:  in
word  177 , error:  nustry , truth:  industry
word  178 , error:  thls , truth:  is
word  179 , error:  posltlve , truth:  positive
word  180 , error:  organlzlng , truth:  organizing
word  181 , error:  meanwhlle , truth:  meanwhile
word  182 , error:  ommssmns , truth:  commissions
word  183 , error:  accordlng , truth:  according
word  184 , error:  eondltl , truth:  eonditi
word  185 , error:  ntlng , truth:  nting
word  186 , error:  unerstandlng , truth:  understanding
word  187 , error:  nterdcpllnary , truth:  interdisciplinary
word  188 , error:  mlsslon , truth:  mission
word  189 , error:  attalnlng , truth:  attaining
word  190 , error:  asslgned , truth:  assigned
word  191 , error:  mlbcommlttee , truth:  mibcommittee
word  192 , error:  nd , truth:  finds
word  193 , error:  sclentlsts , truth:  scientists
word  194 , error:  multlple , truth:  multiple
word  195 , error:  descrlbed , truth:  described
word  196 , error:  succlnctly , t

word  346 , error:  scrlptlon , truth:  scription
word  347 , error:  antlclpated , truth:  anticipated
word  348 , error:  mmmum , truth:  minimum
word  349 , error:  mmmcmnms , truth:  manufacturing
word  350 , error:  llmlted , truth:  limited
word  351 , error:  washlngton , truth:  washington
word  352 , error:  servlng , truth:  serving
word  353 , error:  crltlcal , truth:  critical
word  354 , error:  facllltate , truth:  facilitate
word  355 , error:  mcal , truth:  mca
word  356 , error:  beglns , truth:  begins
word  357 , error:  outllnlng , truth:  outlining
word  358 , error:  sollcltlng , truth:  soliciting
word  359 , error:  partlclpatlon , truth:  participation
word  360 , error:  meanlngful , truth:  meaningful
word  361 , error:  provldlng , truth:  providing
word  362 , error:  contrlbutlon , truth:  contribution
word  363 , error:  permltted , truth:  permitted
word  364 , error:  dlmlnlsh , truth:  diminish
word  365 , error:  onsithlrd , truth:  onethird
word  3

word  517 , error:  confllcts , truth:  conflicts
word  518 , error:  remalnlng , truth:  remaining
word  519 , error:  requlrendeed , truth:  requireindeed
word  520 , error:  technlcallegal , truth:  technicallegal
word  521 , error:  partnershlp , truth:  partnership
word  522 , error:  maxlmlze , truth:  maximize
word  523 , error:  porarlly , truth:  porarily
word  524 , error:  strengthisapplng , truth:  strengthsapping
word  525 , error:  exstsbut , truth:  existsbut
word  526 , error:  soph , truth:  sophis
word  527 , error:  ommltteestaff , truth:  committeestaff
word  528 , error:  necesslty , truth:  necessity
word  529 , error:  bulldlng , truth:  building
word  530 , error:  properlymage , truth:  properlyimage
word  531 , error:  t , truth:  importantly
word  532 , error:  unportantly , truth:  i
word  533 , error:  clvll , truth:  civil
word  534 , error:  stcy , truth:  swmc
word  535 , error:  stc , truth:  swmc
word  536 , error:  forebodlngs , truth:  forebodings
wo

word  688 , error:  mstmbm , truth:  distribu
word  689 , error:  twoidlglt , truth:  twodigit
word  690 , error:  ddt , truth:  dot
word  691 , error:  hmrld , truth:  hm
word  692 , error:  m , truth:  hm
word  693 , error:  vltlng , truth:  inviting
word  694 , error:  hmrlm , truth:  hm
word  695 , error:  whlch , truth:  each
word  696 , error:  m , truth:  mtb
word  697 , error:  swltchlng , truth:  switching
word  698 , error:  justlflcatlon , truth:  justification
word  699 , error:  contalnlng , truth:  containing
word  700 , error:  c , truth:  charge
word  701 , error:  shlpment , truth:  shipment
word  702 , error:  handlln , truth:  handling
word  703 , error:  denty , truth:  identify
word  704 , error:  acountlng , truth:  accounting
word  705 , error:  dentcatlon , truth:  identification
word  706 , error:  frelght , truth:  freight
word  707 , error:  welght , truth:  weight
word  708 , error:  welghts , truth:  weights
word  709 , error:  phyl , truth:  physical
word 

word  862 , error:  characterlstlcs , truth:  characteristics
word  863 , error:  solublllty , truth:  solubility
word  864 , error:  cr , truth:  cross
word  865 , error:  f , truth:  f
word  866 , error:  photolytlc , truth:  photolytic
word  867 , error:  reactlvlty , truth:  reactivity
word  868 , error:  w , truth:  sip
word  869 , error:  fl , truth:  j
word  870 , error:  propose , truth:  proposed
word  871 , error:  naa , truth:  naaqs
word  872 , error:  md , truth:  mid
word  873 , error:  adc , truth:  aqc
word  874 , error:  perslstence , truth:  persistence
word  875 , error:  severlty , truth:  severity
word  876 , error:  trlled , truth:  trolled
word  877 , error:  m , truth:  will
word  878 , error:  rlna , truth:  bd
word  879 , error:  lllustrate , truth:  illustrate
word  880 , error:  h , truth:  that
word  881 , error:  e , truth:  like
word  882 , error:  m , truth:  str
word  883 , error:  s , truth:  us
word  884 , error:  b , truth:  billion
word  885 , erro

word  1038 , error:  obtalnlng , truth:  obtaining
word  1039 , error:  foundatlns , truth:  foundations
word  1040 , error:  actlvltls , truth:  activities
word  1041 , error:  endorsemnt , truth:  endorsement
word  1042 , error:  sollclted , truth:  solicited
word  1043 , error:  commlttee , truth:  with
word  1044 , error:  mm , truth:  gulf
word  1045 , error:  nacdsm , truth:  nacosh
word  1046 , error:  ncludng , truth:  including
word  1047 , error:  ncdsh , truth:  nacosh
word  1048 , error:  technlca , truth:  technical
word  1049 , error:  wlthdrawn , truth:  withdrawn
word  1050 , error:  approxlmtely , truth:  approximately
word  1051 , error:  nmmlttees , truth:  committees
word  1052 , error:  technlcal , truth:  ground
word  1053 , error:  relatlvly , truth:  relatively
word  1054 , error:  sufflclently , truth:  sufficiently
word  1055 , error:  vlsltln , truth:  visiting
word  1056 , error:  mm , truth:  edf
word  1057 , error:  anythlng , truth:  anything
word  1058 ,

word  1202 , error:  compnes , truth:  companies
word  1203 , error:  pollce , truth:  policies
word  1204 , error:  dwsl , truth:  qwp
word  1205 , error:  a , truth:  osha
word  1206 , error:  nvstgatlons , truth:  investigations
word  1207 , error:  sc , truth:  scientists
word  1208 , error:  entlsts , truth:  at
word  1209 , error:  flfteen , truth:  at
word  1210 , error:  mm , truth:  possible
word  1211 , error:  foundatlons , truth:  foundations
word  1212 , error:  lwp , truth:  qwp
word  1213 , error:  assclatlons , truth:  associations
word  1214 , error:  llvlng , truth:  living
word  1215 , error:  p , truth:  is
word  1216 , error:  survlvlng , truth:  surviving
word  1217 , error:  thn , truth:  then
word  1218 , error:  beneflts , truth:  benefits
word  1219 , error:  retlrlng , truth:  retiring
word  1220 , error:  cntlnuous , truth:  continuous
word  1221 , error:  c , truth:  f
word  1222 , error:  quallfy , truth:  qualify
word  1223 , error:  wm , truth:  while
w

word  1371 , error:  conflrmatlon , truth:  confirmation
word  1372 , error:  asslgnlng , truth:  assigning
word  1373 , error:  determlnlng , truth:  determining
word  1374 , error:  asslgnments , truth:  assignments
word  1375 , error:  mmsnmm , truth:  president
word  1376 , error:  yardstlck , truth:  yardstick
word  1378 , error:  n , truth:  w
word  1379 , error:  returnlng , truth:  returning
word  1380 , error:  cuttlng , truth:  cutting
word  1381 , error:  bllllomi , truth:  billion
word  1382 , error:  lacklng , truth:  lacking
word  1383 , error:  ms , truth:  irs
word  1384 , error:  clarlfylng , truth:  clarifying
word  1385 , error:  tlvlty , truth:  tivity
word  1386 , error:  ubmlsslon , truth:  submission
word  1387 , error:  dlvlded , truth:  divided
word  1388 , error:  a , truth:  compani
word  1389 , error:  m , truth:  with
word  1390 , error:  emusyicousvmms , truth:  energyconsuming
word  1391 , error:  mdyna , truth:  moyna
word  1392 , error:  addltlve , trut

word  1536 , error:  flrms , truth:  firms
word  1537 , error:  boycottlng , truth:  boycotting
word  1538 , error:  fullflllment , truth:  fullfillment
word  1539 , error:  sprawllng , truth:  sprawling
word  1540 , error:  overlapplng , truth:  overlapping
word  1541 , error:  dlspleasure , truth:  displeasure
word  1542 , error:  ditx , truth:  dtx
word  1543 , error:  tlght , truth:  tight
word  1544 , error:  unconstltutlonal , truth:  unconstitutional
word  1545 , error:  qulckly , truth:  quickly
word  1546 , error:  efflclenc , truth:  efficiency
word  1547 , error:  submlts , truth:  submits
word  1548 , error:  dlscharged , truth:  discharged
word  1549 , error:  abollsh , truth:  abolish
word  1550 , error:  r , truth:  nor
word  1551 , error:  e , truth:  he
word  1552 , error:  a , truth:  any
word  1553 , error:  tr , truth:  transportation
word  1554 , error:  ctlvltles , truth:  activities
word  1555 , error:  chlldren , truth:  children
word  1556 , error:  commm , tru

word  1696 , error:  fertlllzers , truth:  fertilizers
word  1697 , error:  brlnglng , truth:  bringing
word  1698 , error:  englneersa , truth:  engineersa
word  1699 , error:  hlrlng , truth:  hiring
word  1700 , error:  mlnlstrators , truth:  ministrators
word  1701 , error:  ac , truth:  acs
word  1702 , error:  flrstitlme , truth:  firsttime
word  1703 , error:  dm , truth:  dio
word  1704 , error:  tmeson , truth:  timeson
word  1705 , error:  partlclpant , truth:  participant
word  1706 , error:  drthodontlsts , truth:  orthodontists
word  1707 , error:  slxﬂnlnute , truth:  sixminute
word  1708 , error:  lelng , truth:  living
word  1709 , error:  reprlnt , truth:  reprint
word  1710 , error:  chemlcalsone , truth:  chemicalsone
word  1711 , error:  typlcal , truth:  typical
word  1712 , error:  mlllan , truth:  million
word  1713 , error:  multlpllers , truth:  multipliers
word  1714 , error:  flreflghter , truth:  firefighter
word  1715 , error:  ramlflcatlons , truth:  ramif

word  1865 , error:  l , truth:  list
word  1866 , error:  nestlng , truth:  nesting
word  1867 , error:  jolntly , truth:  jointly
word  1868 , error:  sclentlflcally , truth:  scientifically
word  1869 , error:  polychlorlnated , truth:  polychlorinated
word  1870 , error:  pcb , truth:  pcbs
word  1871 , error:  conductlng , truth:  conducting
word  1872 , error:  analyzlng , truth:  analyzing
word  1873 , error:  nonimetalllc , truth:  nonmetalllc
word  1874 , error:  bmn , truth:  bn
word  1875 , error:  nonimetalllc , truth:  nonmetallic
word  1876 , error:  rmommmnmus , truth:  recommendations
word  1877 , error:  impllcatlons , truth:  implications
word  1878 , error:  mbmmlde , truth:  dibromlde
word  1879 , error:  reconstltuted , truth:  reconstituted
word  1880 , error:  dlstrlbutors , truth:  distributors
word  1881 , error:  dlsbandlng , truth:  disbanding
word  1882 , error:  mm , truth:  ibt
word  1883 , error:  rabblts , truth:  rabbits
word  1884 , error:  mcammmnmus

word  2035 , error:  ms , truth:  zinc
word  2036 , error:  dlthlophosphates , truth:  dlthiophosphates
word  2037 , error:  st , truth:  first
word  2038 , error:  ntlatd , truth:  initiated
word  2039 , error:  noiaifact , truth:  noafact
word  2040 , error:  eastlng , truth:  easting
word  2041 , error:  indlvldual , truth:  individual
word  2042 , error:  researc , truth:  research
word  2043 , error:  expense , truth:  expenses
word  2044 , error:  polychlrlnated , truth:  polychlorinated
word  2045 , error:  vinyllne , truth:  vinylidene
word  2046 , error:  dlkyl , truth:  dialkyl
word  2047 , error:  ddltlves , truth:  additives
word  2048 , error:  trlchlroethylene , truth:  trichloroethylene
word  2049 , error:  gtbee , truth:  gamitbee
word  2050 , error:  ccnmlttee , truth:  ccnmittee
word  2051 , error:  llndbergh , truth:  lindbergh
word  2052 , error:  chalrmn , truth:  chairman
word  2053 , error:  cibaighigy , truth:  cibageigy
word  2054 , error:  bfoodrlch , truth:  

word  2203 , error:  pst , truth:  past
word  2204 , error:  expresslng , truth:  expressing
word  2205 , error:  swmm , truth:  superfund
word  2206 , error:  dlstrlc , truth:  district
word  2207 , error:  dlsm , truth:  dismiss
word  2208 , error:  dumpsltes , truth:  dumpsites
word  2209 , error:  flrlo , truth:  florio
word  2210 , error:  eps , truth:  epas
word  2211 , error:  mnkms , truth:  dinkins
word  2212 , error:  crltlzed , truth:  critized
word  2213 , error:  s , truth:  and
word  2214 , error:  exportif , truth:  exportof
word  2215 , error:  notlflctlon , truth:  notification
word  2216 , error:  admlnlsratlon , truth:  administration
word  2217 , error:  prellmlnarles , truth:  preliminaries
word  2218 , error:  try , truth:  will
word  2219 , error:  ignlflcant , truth:  significant
word  2220 , error:  washlngon , truth:  washington
word  2221 , error:  m , truth:  milt
word  2222 , error:  dlstrlbted , truth:  distributed
word  2223 , error:  u , truth:  and
word

word  2370 , error:  thlng , truth:  thing
word  2371 , error:  electrlcal , truth:  electrical
word  2372 , error:  dlsconnectlng , truth:  disconnecting
word  2373 , error:  gb , truth:  sight
word  2374 , error:  elthernstallatlon , truth:  eitherinstallation
word  2375 , error:  slght , truth:  sight
word  2376 , error:  sultch , truth:  switch
word  2377 , error:  stralghtforward , truth:  straightforward
word  2378 , error:  classlflcatlons , truth:  classifications
word  2379 , error:  compllcatlon , truth:  complication
word  2380 , error:  turnplke , truth:  turnpike
word  2381 , error:  foreslghted , truth:  foresighted
word  2382 , error:  federallsm , truth:  federalism
word  2383 , error:  mgnly , truth:  highly
word  2384 , error:  maklng , truth:  making
word  2385 , error:  vixls , truth:  vixis
word  2386 , error:  slmllarly , truth:  similarly
word  2387 , error:  colnclde , truth:  coincide
word  2388 , error:  dlvldends , truth:  dividends
word  2389 , error:  dlre

word  2535 , error:  ppm , truth:  odcb
word  2536 , error:  specles , truth:  species
word  2537 , error:  mrsy , truth:  ets
word  2538 , error:  m , truth:  isobu
word  2539 , error:  announclng , truth:  announcing
word  2540 , error:  stdy , truth:  study
word  2541 , error:  m , truth:  eo
word  2542 , error:  slnmnnn , truth:  s
word  2543 , error:  redlstrlbutlon , truth:  redistribution
word  2544 , error:  currnt , truth:  current
word  2545 , error:  mmn , truth:  egee
word  2546 , error:  egmm , truth:  egme
word  2547 , error:  mm , truth:  egmea
word  2548 , error:  publlsh , truth:  publish
word  2549 , error:  cm , truth:  cna
word  2550 , error:  zlnc , truth:  zinc
word  2551 , error:  testlnq , truth:  testinq
word  2552 , error:  mposslble , truth:  impossible
word  2553 , error:  electlng , truth:  electing
word  2554 , error:  recalcltrant , truth:  recalcitrant
word  2555 , error:  unportantly , truth:  importantly
word  2556 , error:  mlnlmum , truth:  minimum
w

word  2700 , error:  storyllnes , truth:  storylines
word  2701 , error:  msncy , truth:  oshc
word  2702 , error:  publlclsed , truth:  publicised
word  2703 , error:  accompllshi , truth:  accomplish
word  2704 , error:  convlnclng , truth:  convincing
word  2705 , error:  selllng , truth:  selling
word  2706 , error:  pollcymakers , truth:  policymakers
word  2707 , error:  coeaualcatlons , truth:  coeauaicatlons
word  2708 , error:  screenlng , truth:  screening
word  2709 , error:  a , truth:  ossa
word  2710 , error:  coaaunleatlons , truth:  coaaunieations
word  2711 , error:  lbs , truth:  the
word  2712 , error:  cntlnued , truth:  continued
word  2713 , error:  stff , truth:  staff
word  2714 , error:  rm , truth:  th
word  2715 , error:  chlzman , truth:  chairman
word  2716 , error:  tlns , truth:  tions
word  2717 , error:  redoubllng , truth:  redoubling
word  2718 , error:  efflclent , truth:  efficient
word  2719 , error:  consunlty , truth:  consunity
word  2720 , erro

word  2867 , error:  chrlstl , truth:  christi
word  2868 , error:  nsufflcent , truth:  insufficient
word  2869 , error:  xtyx , truth:  sixtysix
word  2870 , error:  presentstlons , truth:  presentstions
word  2871 , error:  onglng , truth:  ongoing
word  2872 , error:  dupllctlon , truth:  duplication
word  2873 , error:  ellmlnat , truth:  eliminate
word  2874 , error:  mm , truth:  officials
word  2875 , error:  ndcai , truth:  indica
word  2876 , error:  amendmnts , truth:  amendments
word  2877 , error:  mt , truth:  met
word  2878 , error:  sklnner , truth:  skinner
word  2879 , error:  p , truth:  posi
word  2880 , error:  solldhazardous , truth:  solidhazardous
word  2881 , error:  iu , truth:  du
word  2882 , error:  vs , truth:  us
word  2883 , error:  addltlo , truth:  addition
word  2884 , error:  dlstrk , truth:  distri
word  2885 , error:  norrls , truth:  norris
word  2886 , error:  cieargbigy , truth:  cibagbigy
word  2887 , error:  verlflcatln , truth:  verification


word  3037 , error:  sltln , truth:  siting
word  3038 , error:  landfllllng , truth:  landfilling
word  3039 , error:  c , truth:  off
word  3040 , error:  transporttlon , truth:  transportation
word  3041 , error:  pennsylva , truth:  pennsylvania
word  3042 , error:  ms , truth:  hb
word  3043 , error:  prenotlflcatlon , truth:  prenotification
word  3044 , error:  shlpment , truth:  shipments
word  3045 , error:  deflnllve , truth:  definitive
word  3046 , error:  wnrklng , truth:  working
word  3047 , error:  ndstrys , truth:  industrys
word  3048 , error:  advrtlslng , truth:  advertising
word  3049 , error:  cltlcal , truth:  critical
word  3050 , error:  flrsn , truth:  first
word  3051 , error:  respctlvely , truth:  respectively
word  3052 , error:  a , truth:  issue
word  3053 , error:  nlckecontalnlngcatalysts , truth:  nickelcontainingcatalysts
word  3054 , error:  oleylmne , truth:  oleylamine
word  3055 , error:  g , truth:  task
word  3056 , error:  slngley , truth:  si

word  3201 , error:  hablcht , truth:  habicht
word  3202 , error:  mmmpy , truth:  uarep
word  3203 , error:  workablllty , truth:  workability
word  3204 , error:  nertng , truth:  inerting
word  3205 , error:  llstlng , truth:  listing
word  3206 , error:  cosvmmcmw , truth:  costeffective
word  3207 , error:  feedstdck , truth:  feedstock
word  3208 , error:  mmmnmu , truth:  information
word  3209 , error:  productlvlty , truth:  improved
word  3210 , error:  by , truth:  x
word  3211 , error:  electrc , truth:  electric
word  3212 , error:  shas , truth:  oshas
word  3213 , error:  intrlnslcally , truth:  intrinsically
word  3214 , error:  mm , truth:  process
word  3215 , error:  e , truth:  computer
word  3216 , error:  hlghly , truth:  successful
word  3217 , error:  umcmn , truth:  director
word  3218 , error:  hdltzman , truth:  holtzman
word  3219 , error:  purchaslng , truth:  purchasing
word  3220 , error:  m , truth:  wi
word  3221 , error:  an , truth:  and
word  3222 ,

word  3374 , error:  grld , truth:  grid
word  3375 , error:  mcsy , truth:  mecs
word  3376 , error:  mm , truth:  epri
word  3377 , error:  dellghted , truth:  delighted
word  3378 , error:  dlsslmllar , truth:  dissimilar
word  3379 , error:  subscrlbei , truth:  subscribe___
word  3380 , error:  nterestclasslcally , truth:  interestclassically
word  3381 , error:  conmlttees , truth:  conmittees
word  3382 , error:  admltted , truth:  admitted
word  3383 , error:  certlfles , truth:  certifies
word  3384 , error:  etc , truth:  btc
word  3385 , error:  sklnmed , truth:  skinmed
word  3386 , error:  nbc , truth:  nec
word  3387 , error:  physlcally , truth:  physically
word  3388 , error:  mllllseconds , truth:  milliseconds
word  3389 , error:  dentlflable , truth:  identifiable
word  3390 , error:  contrlbutes , truth:  contributes
word  3391 , error:  identlflcatlon , truth:  identification
word  3392 , error:  ne , truth:  nec
word  3393 , error:  crlses , truth:  crises
word  3

word  3541 , error:  enmgy , truth:  energy
word  3542 , error:  major , truth:  majority
word  3543 , error:  joh , truth:  john
word  3544 , error:  drnjrl , truth:  dnj
word  3545 , error:  dou , truth:  doug
word  3546 , error:  jm , truth:  jim
word  3547 , error:  h , truth:  dks
word  3548 , error:  korsk , truth:  sikorski
word  3549 , error:  drmnra , truth:  dmn
word  3550 , error:  jams , truth:  james
word  3551 , error:  edwrd , truth:  edward
word  3552 , error:  whlttaker , truth:  whittaker
word  3553 , error:  thoas , truth:  thomas
word  3554 , error:  bllley , truth:  bliley
word  3555 , error:  rivai , truth:  rva
word  3556 , error:  howrd , truth:  howard
word  3557 , error:  riuti , truth:  rut
word  3558 , error:  fre , truth:  fred
word  3559 , error:  rinyid , truth:  rny
word  3560 , error:  cdngm , truth:  congman
word  3561 , error:  commltees , truth:  committees
word  3562 , error:  guldnce , truth:  guidance
word  3563 , error:  ath_cked , truth:  attac

word  3708 , error:  castlng , truth:  casting
word  3709 , error:  llluslon , truth:  illusion
word  3710 , error:  unavallablllty , truth:  unavailability
word  3711 , error:  classlfylng , truth:  classifying
word  3712 , error:  ratlflcatlon , truth:  ratification
word  3713 , error:  csccy , truth:  cscc
word  3714 , error:  nformatn , truth:  information
word  3715 , error:  we , truth:  unep
word  3716 , error:  rlghticoi , truth:  rightco
word  3717 , error:  decd , truth:  oecds
word  3718 , error:  notcatlon , truth:  notification
word  3719 , error:  whst , truth:  whmxs
word  3720 , error:  asslmllate , truth:  assimilate
word  3721 , error:  mmvwcmy , truth:  unepunctc
word  3722 , error:  ngd , truth:  ngo
word  3723 , error:  chmlcal , truth:  chmical
word  3724 , error:  unllmlted , truth:  unlimited
word  3725 , error:  nonifulflllment , truth:  nonfulfillment
word  3726 , error:  usisraell , truth:  usisraeli
word  3727 , error:  opmmny , truth:  opinion
word  3728 , 

word  3870 , error:  mmms , truth:  during
word  3871 , error:  mm , truth:  our
word  3872 , error:  cdmfdrtably , truth:  comfortably
word  3873 , error:  ownwammmsm , truth:  overwhelmingly
word  3874 , error:  mvnmmmm , truth:  environment
word  3875 , error:  mmsgms , truth:  presents
word  3876 , error:  compmmms , truth:  components
word  3877 , error:  cammm , truth:  chemnet
word  3878 , error:  mmmsm , truth:  promised
word  3879 , error:  mmmmmu , truth:  information
word  3880 , error:  pnmmm , truth:  printout
word  3881 , error:  camsmm , truth:  combined
word  3882 , error:  covmzmc , truth:  covering
word  3883 , error:  virtualli , truth:  virtually
word  3884 , error:  ndnremergency , truth:  nonemergency
word  3885 , error:  thedistrieutidn , truth:  thedistribution
word  3886 , error:  commmm , truth:  component
word  3887 , error:  tidn , truth:  tion
word  3888 , error:  muvsnzy , truth:  industry
word  3889 , error:  mmvnm , truth:  provide
word  3890 , error:  m

word  4037 , error:  etngs , truth:  meetings
word  4038 , error:  slgnments , truth:  assignments
word  4039 , error:  meetlns , truth:  meetings
word  4040 , error:  houseente , truth:  housesenate
word  4041 , error:  leaderhlp , truth:  leadership
word  4042 , error:  hbv , truth:  way
word  4043 , error:  eadershlp , truth:  leadership
word  4044 , error:  indlvlwual , truth:  individual
word  4045 , error:  admlnlstrtlon , truth:  administration
word  4046 , error:  dentctlon , truth:  identification
word  4047 , error:  sslgnments , truth:  assignments
word  4048 , error:  transmlttl , truth:  transmittal
word  4049 , error:  assoclatlns , truth:  associations
word  4050 , error:  drlgl , truth:  original
word  4051 , error:  flrt , truth:  first
word  4052 , error:  grammikudmaniﬂolllngs , truth:  grammrudmanhollings
word  4053 , error:  restrlt , truth:  restrict
word  4054 , error:  grannikudmaniﬂolllngs , truth:  grammrudmanhollings
word  4055 , error:  electln , truth:  ele

word  4199 , error:  assoclatlons , truth:  associations
word  4200 , error:  mm , truth:  tcc
word  4201 , error:  statistlcs , truth:  statistics
word  4202 , error:  advlsablllty , truth:  advisability
word  4203 , error:  departmentdsha , truth:  departmentosha
word  4204 , error:  rlng , truth:  ring
word  4205 , error:  m , truth:  e
word  4206 , error:  klngs , truth:  kings
word  4207 , error:  pollcythose , truth:  policythose
word  4208 , error:  slnce , truth:  since
word  4209 , error:  relnforclng , truth:  reinforcing
word  4210 , error:  rhlneﬂilvertype , truth:  rhinerivertype
word  4211 , error:  inhlblted , truth:  inhibited
word  4212 , error:  chemnetcaeralr , truth:  chemnetcaerair
word  4213 , error:  leltlng , truth:  limiting
word  4214 , error:  grovndwater , truth:  groundwater
word  4215 , error:  s , truth:  csis
word  4216 , error:  ertlfled , truth:  ertified
word  4217 , error:  transportlng , truth:  transporting
word  4218 , error:  fulfllllng , truth: 

word  4363 , error:  cmaai , truth:  cmaaihcapinacapma
word  4364 , error:  capinacapma , truth:  july
word  4365 , error:  carclnoenlclty , truth:  carcinogenicity
word  4366 , error:  jeffrson , truth:  jefferson
word  4367 , error:  teratolgy , truth:  teratology
word  4368 , error:  l , truth:  louisiana
word  4369 , error:  lca , truth:  is
word  4370 , error:  commlslon , truth:  commission
word  4371 , error:  represnts , truth:  represents
word  4372 , error:  memberhlps , truth:  memberships
word  4373 , error:  t , truth:  title
word  4374 , error:  e , truth:  iii
word  4375 , error:  iii , truth:  for
word  4376 , error:  ccae , truth:  the
word  4377 , error:  lowlng , truth:  it
word  4378 , error:  dlrectrs , truth:  directors
word  4379 , error:  requlrements , truth:  requirements
word  4380 , error:  workln , truth:  working
word  4381 , error:  shw , truth:  show
word  4382 , error:  exlblllty , truth:  flexibility
word  4383 , error:  cmngress , truth:  congress
wor

word  4530 , error:  cy , truth:  policy
word  4531 , error:  unclls , truth:  councils
word  4532 , error:  wve , truth:  weve
word  4533 , error:  ie , truth:  the
word  4534 , error:  sacrlflclng , truth:  sacrificing
word  4535 , error:  llns , truth:  lines
word  4536 , error:  mprmve , truth:  improve
word  4537 , error:  uptlck , truth:  uptick
word  4538 , error:  concrns , truth:  concerns
word  4539 , error:  earnlng , truth:  earning
word  4540 , error:  ademark , truth:  trademark
word  4541 , error:  hrltte , truth:  written
word  4542 , error:  apprlsed , truth:  apprised
word  4543 , error:  apprval , truth:  approval
word  4544 , error:  crtlfy , truth:  certify
word  4545 , error:  assssment , truth:  assessment
word  4546 , error:  restrltlons , truth:  restrictions
word  4547 , error:  audlts , truth:  audits
word  4548 , error:  e , truth:  epa
word  4549 , error:  tlghtens , truth:  tightens
word  4550 , error:  nustrywde , truth:  industrywide
word  4551 , error: 

In [12]:
# Write the feature table as comma-separated text: one row per entry of the
# parallel column lists, in the column order typo, truth, candidate, LED, SS,
# LP, label. Every value is formatted with '%s'.
feature_rows = list(zip(W_error, W_truth, W_cand, LED, SS, LP, Label))
np.savetxt('output/feature.csv', feature_rows, delimiter=',', fmt='%s', encoding='utf-8')