# Textual pre-processing for ML/Statistical Analysis

In [0]:
import re
from urllib.request import Request, urlopen
from bs4 import BeautifulSoup

In [0]:
# The state functions below perform a regex action and choose the next state.
# Input: URL (web address)
# Output: pre-processed list of sentences
class preprocessFSM:
    """Small state machine that fetches a web page and tags its text lines.

    Every line of page text is first filtered (markup residue and lines that
    do not look like a sentence are blanked) and then passed through three
    replacement states.  The current state is kept in ``curstate``:

        1 -- start: filter the line (``state0``)
        2 -- replace phone numbers with ``[phone]``
        3 -- replace e-mail addresses with ``[email]``
        4 -- replace remaining digit runs with ``[digits]``
        0 -- end state (``endStates``)

    Phone numbers and e-mails are handled before plain digits; running the
    digit replacement first would destroy the digits those patterns need.
    """

    def __init__(self, sentence=0):
        """Compile the patterns and reset the machine.

        Args:
            sentence: ``0`` (default) takes the page's stripped text
                fragments and keeps every line that starts with ``A-Z``,
                ``0-9``, ``(``, ``[`` or ``{``.  Any other value splits the
                page text on newlines and keeps only lines that start with a
                capital letter and end with ``.``, ``!`` or ``?``.
        """
        self.srt_list = []        # lines being processed (modified in place)
        self.srt_list_start = []  # untouched copy of the fetched lines
        self.curstate = 1
        self.endStates = 0
        # Verbose-mode regex: whitespace and "#" comments inside are ignored.
        self.pattern_phone = re.compile(r'''(
            \d{1,3}[-\.\s]?\d{1,3}[-\.\s]??\d{1,3}[-\.\s]??\d{4}  # d-d-d-dddd
            |\(\d{3}\)\s*\d{3}[-\.\s]??\d{4}                      # (ddd) ddd-dddd
            |\d{3}[-\.\s]??\d{4}                                  # ddd-dddd
            )''', re.X)
        self.pattern_email = re.compile(r'\w+\.?\w+@\w+\.?\w+')
        self.pattern_digit = re.compile(r'[\d]+')
        self.pattern_del = re.compile(r'(\(function\()|(div class=)')
        self.pattern_sent1 = re.compile(r'^[A-Z].+\b[\.!\?]$', re.M)
        self.pattern_sent = re.compile(r'^[A-Z0-9\(\[\{].+$', re.M)
        self.sentence = sentence

    # launch state machine
    def run(self, url):
        """Download ``url`` and return its pre-processed, non-empty lines.

        Args:
            url: address of the page to fetch.

        Returns:
            List of processed lines with empty entries removed, or ``[]``
            when the URL is malformed or the page yields no text.  The raw
            lines stay available in ``srt_list_start``.

        Raises:
            Errors raised by ``urlopen`` while fetching the page are not
            caught here and propagate to the caller.
        """
        try:
            request = Request(url, headers={'User-Agent': 'Mozilla/5.0'})
        except (ValueError, TypeError, AttributeError):
            # Malformed / non-string URL.  Narrowed from a bare ``except`` so
            # KeyboardInterrupt and SystemExit are no longer swallowed.
            print('Invalid url read: %r' % url)
            return []

        # Context manager closes the HTTP response once the body is read.
        with urlopen(request) as response:
            cont = response.read()
        soup = BeautifulSoup(cont, 'lxml')
        if self.sentence == 0:
            srt_list = list(soup.stripped_strings)
        else:
            srt_list = soup.get_text().split('\n')

        if not srt_list:
            print('Can`t get any content: %r' % url)
            return []
        return self._process_lines(srt_list)

    def _process_lines(self, lines):
        """Run the state machine over ``lines`` and return the non-empty results."""
        self.srt_list = list(lines)
        self.srt_list_start = list(lines)

        for i in range(len(self.srt_list)):
            # every line starts from the initial state
            self.curstate = 1
            self.state0(i)
        self.curstate = 1

        # remove empty values
        return [x for x in self.srt_list if x]

    def get_state_pattern(self):
        """Return ``(compiled_pattern, replacement_tag)`` for ``curstate``.

        State 2 maps to phones and state 3 to e-mails; every other state
        falls back to the digit pattern.
        """
        if self.curstate == 2:
            return self.pattern_phone, '[phone]'
        elif self.curstate == 3:
            return self.pattern_email, '[email]'
        else:
            return self.pattern_digit, '[digits]'

    def state0(self, index):
        """Filter line ``index``; blank it or hand it to the replace states.

        The line is stripped, then blanked when it starts with markup
        residue (``pattern_del``) or does not match the sentence pattern
        selected by ``self.sentence``.

        Returns:
            ``self.endStates`` when the line was blanked, otherwise ``None``
            (after the replacement states have run).
        """
        line = self.srt_list[index].strip()
        self.srt_list[index] = line

        # del redundant string-html
        if self.pattern_del.match(line):
            self.srt_list[index] = ""
            return self.endStates

        if self.sentence == 0:
            # sentence always starts with A-Z 0-9 ( [ {
            sentence_pattern = self.pattern_sent
        else:
            # sentence always starts with a capital letter and
            # finishes with one of these symbols: . ! ?
            sentence_pattern = self.pattern_sent1

        if sentence_pattern.match(line) is None:
            self.srt_list[index] = ""
            return self.endStates

        self.curstate += 1
        self.state(index)

    # apply pattern and replace line
    def state(self, index):
        """Apply the current and all following replace states to line ``index``.

        Iterates instead of recursing; ``>= 4`` guarantees termination even
        if ``curstate`` was set beyond the last state.  Leaves ``curstate``
        at ``endStates``.
        """
        while True:
            pattern, repl_val = self.get_state_pattern()
            self.srt_list[index] = pattern.sub(repl_val, self.srt_list[index])
            if self.curstate >= 4:
                self.curstate = self.endStates
                return
            self.curstate += 1

### I used the contact page 'https://www.tesla.com/contact' to try the tag replacement, because ordinary web pages usually don't include phone numbers or e-mail addresses.
I applied the requirement:
[a sentence always starts with A-Z, 0-9, "(", "[" or "{"]

to filter out redundant HTML strings.


In [0]:
# Default mode (sentence=0): keep every text fragment that starts with
# A-Z, 0-9, "(", "[" or "{".
m = preprocessFSM()

In [0]:
# Fetch the page and run the state machine over its text; the result is the
# list of non-empty processed lines (raw lines stay in m.srt_list_start).
proc_lines = m.run('https://www.tesla.com/contact')

### resulting strings

In [0]:
proc_lines

['Contact Us | Tesla',
 'Tesla, Inc',
 'Model S',
 'Model X',
 'Model [digits]',
 'Used Inventory',
 'Visit a Store',
 'Roadster',
 'Energy',
 'Shop',
 'Shop',
 'Tesla Account',
 'Sign In',
 'More',
 'Used Inventory',
 'Find Us',
 'Support',
 'Get Newsletter',
 'News',
 'Roadster',
 'New Inventory',
 'Charging',
 'Semi',
 'Carbon Impact',
 'Sign In',
 'Tesla Account',
 'Log Out',
 'Contact',
 'About',
 'Careers',
 'Contact',
 'Legal',
 'Investors',
 'Suppliers',
 'Sales',
 'Tesla Motors',
 'Toll free',
 '([digits]) [digits]-TESLA',
 '[phone]',
 'Local',
 '[phone]',
 'Fax',
 '[phone]',
 'Customer Support & Roadside Assistance',
 'Tesla',
 'Customer Support',
 '[email]',
 'Emergency Roadside Assistance',
 '([digits])',
 '[phone]',
 'Phone numbers for other countries',
 'Safety recall information',
 'Careers',
 'Visit our',
 'First Responders',
 'Visit our',
 'Press',
 'North America',
 '[email]',
 'Europe & Middle East',
 '[email]',
 'Australia and Asia',
 '[email]',
 'China',
 'China-[e

### I applied a simple regex to check some of the results in short form

In [0]:
# Quick look at the raw (unprocessed) lines: print every line that has a
# digit run or an "@" with at least one character on both sides.
pattern = re.compile(r'.+[\d]+.+|.+@.+')
raw_list = m.srt_list_start
for raw_line in raw_list:
    hit = pattern.match(raw_line)
    if hit is None:
        continue
    print(hit.group())
  

.async-hide { opacity: 0 !important}
.i18n-ja_JP .hide-for-i18n-ja_JP,
(888) 51-TESLA
(888) 518-3752
(650) 681-5100
(650) 681-5101
CustomerSupport@tesla.com
(877)
798-3752
Press@tesla.com
EUPress@tesla.com
APACPress@tesla.com
China-Press@tesla.com
3500 Deer Creek Road
94304
45500 Fremont Boulevard
94538
89434
Burgemeester Stramanweg 122
1101 EN
Asteriastraat 1-7
5047 RM
Rudolf-Diesel-Strasse 14
54595 Prüm
33 Herbert St
St Leonards, NSW 2065
8F, Tower 3 China Central Place
No.77 Jianguo Road
Units A&C, 27/F
Eastern Aoyama Bldg 4F
8-5-41 Akasaka
Yeongdong-daero 730
No. 6, Lane 11, Section 6, Minquan East Road
Neihu District, Taipei City 114
Tesla © 2019


In [0]:
# Print the leading replacement tag of every processed line that starts
# with [digits], [phone] or [email].
pattern = re.compile(r'(\[digits\]+|\[phone\]+|\[email\]+)')
for processed in proc_lines:
    tag = pattern.match(processed)
    if tag is not None:
        print(tag.group())

[phone]
[phone]
[phone]
[email]
[phone]
[email]
[email]
[email]
[digits]
[digits]
[digits]
[digits]
[digits]
[digits]
[digits]
[digits]
[digits]
[digits]
[digits]
[digits]


### Trying the following requirements:

*   a sentence always starts with a capital letter
*   a sentence ends with one of these symbols: . ! ?
*   a sentence contains only: 0-9 A-Z a-z , @#$%^&*()[]{}

I used the suggested link 'https://en.wikipedia.org/wiki/Polymorphism_(computer_science)'.

The parameter "1" switches sentence matching so that the requirements above are applied to a whole sentence.

In [0]:
# sentence=1: the page text is split on newlines and only lines that start
# with a capital letter and end with ".", "!" or "?" are kept.
m = preprocessFSM(1)
proc_lines = m.run('https://en.wikipedia.org/wiki/Polymorphism_(computer_science)')

In [0]:
proc_lines

['Not to be confused with Polymorphic code.',
 'Ad hoc polymorphism: defines a common interface for an arbitrary set of individually specified types.',
 'Parametric polymorphism: when one or more types are not specified by name but by abstract symbols that can represent any type.',
 'Interest in polymorphic type systems developed significantly in the [digits]s, with practical implementations beginning to appear by the end of the decade.  Ad hoc polymorphism and parametric polymorphism were originally described in Christopher Strachey\'s Fundamental Concepts in Programming Languages[[digits]], where they are listed as "the two main classes" of polymorphism.  Ad hoc polymorphism was a feature of Algol [digits], while parametric polymorphism was the core feature of ML\'s type system.',
 'In a [digits] paper, Peter Wegner and Luca Cardelli introduced the term inclusion polymorphism to model subtypes and inheritance,[[digits]] citing Simula as the first programming language to implement it.

### Regular expressions are a very helpful tool for pre-processing text data. In this project I gained some hands-on experience with them, which confirmed that statement. I also wrote my first Python class, which helped me get deeper into the language.