<a href="https://colab.research.google.com/github/Tstrebe2/predicting-text-difficulty/blob/dave-updates/code/dave_data_cleaning.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [10]:
import pandas as pd

In [11]:
train_url = 'https://raw.githubusercontent.com/Tstrebe2/predicting-text-difficulty/main/assets/WikiLarge_Train.csv'

In [12]:
train_df = pd.read_csv(train_url)

In [33]:
train_df.iloc[4].values

array(['Geneva -LRB- , ; , ; , ; ; -RRB- is the second-most-populous city in Switzerland -LRB- after Zürich -RRB- and is the most populous city of Romandie -LRB- the French-speaking part of Switzerland -RRB- .',
       1,
       'Geneva  , ; , ; , ; ;  is the second-most-populous city in Switzerland  after Zürich  and is the most populous city of Romandie  the French-speaking part of Switzerland  .'],
      dtype=object)

In [17]:
train_df.sample(20)[['original_text', 
                     'label']].style.set_properties(subset=['original_text'], 
                                                    **{'width': '500px'})

Unnamed: 0,original_text,label
275869,"In 1991 , they started in Boston , Massachusetts .",0
238423,"Korkeasaari is an island in Helsinki , Finland .",0
279771,The Republic of Namibia is a country in southern Africa on the Atlantic coast .,0
360192,"Bryan Adams -LRB- November 9 , 1959 -RRB- is a Canadian rock singer .",0
159516,They soon split into cat-like and dog-like forms -LRB- Feliformia and Caniformia -RRB- .,1
227805,Other websites,0
278723,"So , Hyderabad came into the Telugu speaking community and thus became the capital of Andhra Pradesh .",0
205468,"The club 's home is the 22,250 capacity Headingley Stadium , which is in the suburb of Headingley , northwest Leeds , where they have played since 1890 .",1
130239,"Gushank , p. 251 ; Varley , Paul .",1
77262,"England , Northern Ireland , Scotland and Wales have discrete systems of education .",1


In [None]:
# Observations of things that need cleaning:
# join " 's " with their associated words; same with contractions  - done
# Some sentences are partials of other sentences within the corpus
# Need to address accents - not sure if I do
# Need to address punctuation (vectorizers) - handled through simple_preprocess
# Address weird quotations - (example at index 11005) '' yogurt ' ''
# Need to address -LRB and -RRB which are lemma references to left 
# and right parentheses - done
# remove 'â' (misformatting) - this is incorrect encoding. use ftfy package
# 


# Observations
  # difficulty can be a combination of hard words, hard to pronounce or unfamiliar
  # names, long-run on sentences, harder topics (eg Linux kernel or
  # referring to dog breeds as an example) or just non-sensical sentences without
  # context




In [2]:
# Found a solution to help with contractions
!pip install contractions
!pip install gensim
!pip install ftfy
!pip install unidecode

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting contractions
  Downloading contractions-0.1.72-py2.py3-none-any.whl (8.3 kB)
Collecting textsearch>=0.0.21
  Downloading textsearch-0.0.24-py2.py3-none-any.whl (7.6 kB)
Collecting anyascii
  Downloading anyascii-0.3.1-py3-none-any.whl (287 kB)
[K     |████████████████████████████████| 287 kB 6.9 MB/s 
[?25hCollecting pyahocorasick
  Downloading pyahocorasick-1.4.4-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (106 kB)
[K     |████████████████████████████████| 106 kB 56.7 MB/s 
[?25hInstalling collected packages: pyahocorasick, anyascii, textsearch, contractions
Successfully installed anyascii-0.3.1 contractions-0.1.72 pyahocorasick-1.4.4 textsearch-0.0.24
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting f

In [35]:
import re
import contractions
from gensim.utils import simple_preprocess
import ftfy
from unidecode import unidecode

def text_processing(s):
  contract_lst = {'\'ve': '\'ve',
                ' \'ll': '\'ll',
                ' \'d': '\'d',
                ' n\'t': 'n\'t'}
  for key,value in contract_lst.items():
    # remove spaces between contractions
    s = re.sub(key, value,s)
  
  # replace contractions
  s = contractions.fix(s)

  # remove empty quotes
  s = re.sub('\'\'','',s)

  # remove possessive "s"
  s = re.sub(' \'s','',s)


  # remove lrb and rrb references
  pattern = r'(-LRB-|-RRB-)+'
  s = re.sub(pattern,'',s)


  # remove ndash
  s = re.sub('\sndash\s','',s)

  # fix issues with incorrect encoding
  s = ftfy.fix_text(s)
  
  # remove punctuation
  s = re.sub('[,.!?;]+','',s)

  return s
  # return simple_preprocess(s)



In [36]:
text = 'Geneva  , ; , ; , ; ;  is the second-most-populous city in Switzerland  after Zürich  and is the most populous city of Romandie  the French-speaking part of Switzerland  .'
# text = 'He then married Ditta PÃ sztory , a piano student \'s friend .'
fixed = text_processing(text)
fixed

'Geneva          is the second-most-populous city in Switzerland  after Zürich  and is the most populous city of Romandie  the French-speaking part of Switzerland  '

In [None]:
train_df[train_df['original_text'].str.contains('1\/4')]['original_text'].values

array(['Because of this , Stephen Euin Cobb is somewhere between 1/8 and 1/4 Native American .',
       'Coins Between 1832 and 1834 , copper 1 , 2 , 5 and 10 centime , silver 1/4 , 1/2 , 1 , 2 and 5 franc , and gold 20 and 40 franc coins were introduced .',
       '461/463 -- 476 -RRB- , more known by his nickname Romulus Augustulus -LRB- Little Augustus -RRB- , was the last Western Roman Emperor reigning from the 31 October 475 until his deposition on the 4 September 476 .',
       'The ball is about 3\xa01/4 inches -LRB- 8.3 centimetres -RRB- in diameter and weighs about four ounces -LRB- 113.4 grams -RRB- .',
       'His father , Paul Joseph James Martin , a Franco-Ontarian of 1/4 Irish and 3/4 French descent , served thirty-three years as a member of the Canadian House of Commons , and was a Cabinet minister in four Liberal governments .',
       'Assuming an albedo similar to that of the primary , the magnitude suggests a quite substantial diameter ; perhaps ~ 250 km or about 1/4

In [14]:
train_df['processed_text'] = train_df['original_text'].apply(text_processing)

In [16]:
train_df.head()

Unnamed: 0,original_text,label,formatted_text
0,There is manuscript evidence that Austen conti...,1,There is manuscript evidence that Austen conti...
1,"In a remarkable comparative analysis , Mandaea...",1,"In a remarkable comparative analysis , Mandaea..."
2,"Before Persephone was released to Hermes , who...",1,"Before Persephone was released to Hermes , who..."
3,Cogeneration plants are commonly found in dist...,1,Cogeneration plants are commonly found in dist...
4,"Geneva -LRB- , ; , ; , ; ; -RRB- is the second...",1,"Geneva , ; , ; , ; ; is the second-most-popu..."
