<a href="https://colab.research.google.com/github/Tstrebe2/predicting-text-difficulty/blob/dave-updates/code/dave_data_cleaning.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import pandas as pd

In [2]:
train_url = 'https://raw.githubusercontent.com/Tstrebe2/predicting-text-difficulty/main/assets/WikiLarge_Train.csv'

In [3]:
train_df = pd.read_csv(train_url)

In [5]:
train_df.sample(20)[['original_text', 
                     'label']].style.set_properties(subset=['original_text'], 
                                                    **{'width': '500px'})

Unnamed: 0,original_text,label
394408,"MNM made their debut in WWE on the April 14 , 2005 edition of SmackDown !",0
224848,Fingers and thumbs are types of digits .,0
245513,Origins,0
278946,Crest,0
85189,Knol is a Google project that aims to include user-written articles on a range of topics .,1
121957,"C-USA was founded in 1995 by the merger of the Metro Conference and Great Midwest Conference , two Division I conferences that did not sponsor football .",1
304572,"New talent such as Triple H and his D-Generation X faction , Mankind and The Rock were elevated to main event status on the WWF 's program .",0
39046,"He played in 24 seasons in the National Hockey League for the Toronto Maple Leafs , New York Rangers , Pittsburgh Penguins , and Buffalo Sabres .",1
209835,"Classical civilizations , notably the Persians , Macedonians , Nubians , Greeks , Parthians , Indians , Japanese , Chinese , and Koreans , fielded large numbers of archers in their armies .",0
166947,"The MiG-29 , along with the Sukhoi Su-27 , were developed to counter new American fighters such as the McDonnell Douglas F-15 Eagle , and the General Dynamics F-16 Fighting Falcon .",1


In [6]:
# Observations of things that need cleaning:
# join " 's " with their associated words; same with contractions  - done
# Some sentences are partials of other sentences within the corpus
# Need to address accents - not sure if I do
# Need to address punctuation (vectorizers) - done
# Address weird quotations - (example at index 11005) '' yogurt ' ''
# Need to address -LRB and -RRB which are lemma references to left 
# and right parentheses - done
# odd character encodings: 
  # Ã translates to é (example at index 207236)
  # à is encoding for ë (example at index 125745)


use ftfy package


#  Other Observations/Thoughts on What Constitutes Difficulty:
  # difficulty can be a combination of hard words, hard to pronounce or unfamiliar
  # names, long-run on sentences, harder topics (eg an example was Linux kernel or
  # referring to dog breeds as an example), or just non-sensical sentences without
  # context




In [7]:
# Found a solution to help with contractions
!pip install contractions
!pip install gensim
!pip install ftfy
!pip install unidecode

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting contractions
  Downloading contractions-0.1.72-py2.py3-none-any.whl (8.3 kB)
Collecting textsearch>=0.0.21
  Downloading textsearch-0.0.24-py2.py3-none-any.whl (7.6 kB)
Collecting anyascii
  Downloading anyascii-0.3.1-py3-none-any.whl (287 kB)
[K     |████████████████████████████████| 287 kB 14.4 MB/s 
[?25hCollecting pyahocorasick
  Downloading pyahocorasick-1.4.4-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (106 kB)
[K     |████████████████████████████████| 106 kB 72.0 MB/s 
[?25hInstalling collected packages: pyahocorasick, anyascii, textsearch, contractions
Successfully installed anyascii-0.3.1 contractions-0.1.72 pyahocorasick-1.4.4 textsearch-0.0.24
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting 

In [17]:
import re
import contractions
from gensim.utils import simple_preprocess
import ftfy
from unidecode import unidecode

def text_processing(s):

  # come up with a list of contractions to evaluate
  contract_lst = ['\'ve','\'ll','\'d','n\'t']

  # contractions were separated by a space, this connects them
  # back to the word (eg. could n't vs couldn't)
  for i in contract_lst:
    s = re.sub('\s' + i, i,s)
  
  # replace contractions
  s = contractions.fix(s)

  # remove empty quotes
  s = re.sub('\'\'','',s)

  # remove possessive "s"
  s = re.sub(' \'s','',s)

  # remove lrb and rrb references
  pattern = r'(-LRB-|-RRB-)+'
  s = re.sub(pattern,'',s)

  # remove ndash
  s = re.sub('\sndash\s','',s)

  # fix issues with incorrect encoding
  s = ftfy.fix_text(s)
  
  # remove punctuation and symbols
  s = re.sub('[$,.!?;:%°#@|&\/\\\]*','',s)

  # remove numbers 
  s = re.sub('[0-9]*','',s)


  # positive lookbehind for cases where dashes proceeded by spaces or
  # other dashes (not suggesting a hyphenated word or name)
  s = re.sub('(?<=[ -])-','',s)

  # believe  â is encoding for -  see it a lot when referencing spans of years
  s= re.sub('â','')





  return s
  # return simple_preprocess(s)



In [19]:
# Example of a challenging Text to understand. Need to consider how we should proceed with this one.
text = 'SOS -LRB- Â Â Â â '' â '' â '' Â Â Â -RRB- is a Morse code . It is used as distress code , to signal danger .'
fixed = text_processing(text)
fixed

'SO IS  Â Â Â â  â  â  Â Â Â  is a Morse code  It is used as distress code  to signal danger '

In [20]:
# %%timeit

#run text_processing on the training corpus 
train_df['processed_text'] = train_df['original_text'].apply(text_processing)

In [23]:
# Comparing original text to the processed
train_df[['original_text','processed_text']].sample(20).style.set_properties(subset=['original_text'], 
                                                    **{'width': '500px'})

Unnamed: 0,original_text,processed_text
249344,"Ciceros first Oration against Catiline is sometimes used in type specimens : Quo usque tandem abutere , Catilina , patientia nostra ?",Ciceros first Oration against Catiline is sometimes used in type specimens Quo usque tandem abutere Catilina patientia nostra
207236,"He remained a bachelor , but not a stranger to love , which he counted the sixth sense : his inscription of the Physiognomie to his beautiful cousin Juliette RÃ camier reads '' Madam , receive kindly and read indulgently the work of an old man .",He remained a bachelor but not a stranger to love which he counted the sixth sense his inscription of the Physiognomie to his beautiful cousin Juliette RÃ camier reads Madam receive kindly and read indulgently the work of an old man
22763,His wife 's name is given as Mama Anawarkhi or Coya Anahurque .,His wife name is given as Mama Anawarkhi or Coya Anahurque
229640,Shamozai is a village and Union Council of Mardan District in the North-West Frontier Province of Pakistan .,Shamozai is a village and Union Council of Mardan District in the North-West Frontier Province of Pakistan
323347,"Some things the newcomer does may seem '' wrong '' , but it may improve Wikipedia .",Some things the newcomer does may seem wrong but it may improve Wikipedia
385479,"The Department of Defense is made up of the Department of the Army , the Department of the Navy , the Department of the Air Force , the National Security Agency , and the Defense Intelligence Agency .",The Department of Defense is made up of the Department of the Army the Department of the Navy the Department of the Air Force the National Security Agency and the Defense Intelligence Agency
224489,"Less than a hundred years later , Giorgio Vasari saw this movement as a '' golden age '' . Giogio Vasari expressed this thought at the head of his Vita of Botticelli .",Less than a hundred years later Giorgio Vasari saw this movement as a golden age Giogio Vasari expressed this thought at the head of his Vita of Botticelli
60831,"This initial success was bettered in the following year with the club winning the French Championship for the first time in its history , qualifying for the European Cup .",This initial success was bettered in the following year with the club winning the French Championship for the first time in its history qualifying for the European Cup
126509,"Kari is about 7 kilometres in diameter , and orbits Saturn at an average distance of 22,305.1 Mm in 1243.71 days , at an inclination of 148.4 ° to the ecliptic -LRB- 151.5 ° to Saturn 's equator -RRB- , in a retrograde direction and with an eccentricity of 0.3405 .",Kari is about kilometres in diameter and orbits Saturn at an average distance of Mm in days at an inclination of ° to the ecliptic ° to Saturn equator in a retrograde direction and with an eccentricity of
99706,Mirza Ghulam Ahmad founded the movement on 23rd March 1889 and termed it the Ahmadiyya Muslim Jama'at -LRB- community -RRB- envisioning it to be a revitalisation of Islam .,Mirza Ghulam Ahmad founded the movement on rd March and termed it the Ahmadiyya Muslim Jama'at community envisioning it to be a revitalisation of Islam


In [24]:
train_df[train_df['original_text'].str.contains('Ã')]

Unnamed: 0,original_text,label,processed_text
32,Simon Boccanegra is an opera with a prologue a...,1,Simon Boccanegra is an opera with a prologue a...
46,"Sergio PÃ rez Mendoza -LRB- born January 26 , ...",1,Sergio PÃ rez Mendoza born January in Guad...
51,AmbÃ rieux-en-Dombes is a commune in the depar...,1,Ambà rieux-en-Dombes is a commune in the depar...
78,BÃ thonsart is a commune in the Pas-de-Calais ...,1,BÃ thonsart is a commune in the Pas-de-Calais ...
238,"Here DvoÅ Ã k met with Harry Burleigh , his pu...",1,Here DvoÅ Ã k met with Harry Burleigh his pup...
...,...,...,...
416655,A BÃ 1\/4 chner flask -LRB- also called a vacu...,0,A BÃ chner flask also called a vacuum flask ...
416663,"In 1981 Mitterrand defeated the conservative ,...",0,In Mitterrand defeated the conservative Valà...
416697,"Because it includes many suburbs of Oslo , not...",0,Because it includes many suburbs of Oslo nota...
416717,It is found in the region Provence-Alpes-C Ã t...,0,It is found in the region Provence-Alpes-C à t...
