In [149]:
# Check if running correct version

import sys
# Refuse to run on anything other than Python 3.4+; merely advise when the
# interpreter is older than 3.6. version_info[0] is the major component and
# version_info[1] the minor component.
if sys.version_info[0] != 3:
    raise ValueError("You must use Python 3.")
elif sys.version_info[1] < 4:
    raise ValueError("You must use at least Python 3.4")
elif sys.version_info[1] < 6:
    print("Recommended Python Version is 3.6")

## 1.1 Phone Numbers

The German country code is always written as _+49_ or _0049_. If a country code is given, the digit that follows it cannot be a _0_. If the number after the country code (or after the leading _0_ when no country code is given) starts with a _1_, it is a mobile number. In that case its first three digits (including the _1_) are the area code, and the remaining digits are the subscriber number.
If it is not a mobile number, simply keep all remaining digits together (i.e. no area code detection for non-mobile numbers).

In [150]:
import re
import nltk
# Raw phone numbers in assorted notations (country code, brackets, dashes,
# spaces). validate_phone pairs them by position with gold_phone_numbers,
# so the order of the entries matters.
test_phone_numbers = [
    "+49174321324",
    "0164883423",
    "0049(0)16483311724",
    "+49-8332-3010",
    "00498513994",
    "0164 5554454",
    "0851 509",
    "0851 509-0",
    "(0851) 3394"
]

In [151]:
# Expected normalised form of each entry in test_phone_numbers (same order):
# "+49 <3-digit mobile prefix> <number>" for numbers whose national part
# starts with 1, otherwise "+49 <number>".
gold_phone_numbers = [
    "+49 174 321324",
    "+49 164 883423",
    "+49 164 83311724",
    "+49 83323010",
    "+49 8513994",
    "+49 164 5554454",
    "+49 851509",
    "+49 8515090",
    "+49 8513394"
]

In [152]:
def validate_phone(parsed_numbers):
    """Print a CORRECT/WRONG report for every parsed phone number.

    Entries of ``parsed_numbers`` are paired by position with the module-level
    ``test_phone_numbers`` (raw input) and ``gold_phone_numbers`` (expected
    result). Pairing stops at the shortest of the three sequences. Nothing is
    returned; the report is written to standard output.
    """
    wrong_report = "WRONG   → \n\t Input : {}\n\t Gold  : {}\n\t Parsed: {}"
    correct_report = "CORRECT → \n\t Input : {}\n\t Gold  : {}\n\t Parsed: {}"
    triples = zip(parsed_numbers, test_phone_numbers, gold_phone_numbers)
    for parsed, test, gold in triples:
        report = wrong_report if parsed != gold else correct_report
        print(report.format(test, gold, parsed))

In [153]:
def normalize_phone(raw_number):
    """Rewrite a German phone number in the notebook's canonical form.

    The result is ``"+49 <prefix> <number>"`` when the national number starts
    with ``1`` (mobile: the first three digits form the prefix) and
    ``"+49 <number>"`` otherwise.

    Parameters
    ----------
    raw_number : str
        Phone number in free notation, e.g. ``"0049(0)16483311724"``,
        ``"+49-8332-3010"`` or ``"(0851) 3394"``.

    Returns
    -------
    str
        The normalised number, without trailing whitespace.
    """
    # Drop the separators people put between digit groups.
    compact = re.sub(r"[\s.\-/]", "", raw_number)
    # Remove the country code (optionally followed by a bracketed trunk zero,
    # as in "0049(0)164...") or, without country code, the national trunk
    # zero (possibly behind an opening bracket, as in "(0851)").
    national = re.sub(r"^(?:(?:\+49|0049)(?:\(0\))?|\(?0)", "", compact)
    # Anything that is still not a digit (e.g. a closing bracket) is noise.
    national = re.sub(r"\D", "", national)
    if national.startswith("1"):
        # Mobile number: the first three digits (including the 1) are the prefix.
        formatted = "+49 {} {}".format(national[:3], national[3:])
    else:
        formatted = "+49 " + national
    return formatted.rstrip()


# One normalised entry per raw test number, in the same order, so that
# validate_phone can pair them by position with the gold list.
parsed_numbers_correct = [normalize_phone(number) for number in test_phone_numbers]
validate_phone(parsed_numbers_correct)



CORRECT → 
	 Input : +49174321324
	 Gold  : +49 174 321324
	 Parsed: +49 174 321324
CORRECT → 
	 Input : 0164883423
	 Gold  : +49 164 883423
	 Parsed: +49 164 883423
CORRECT → 
	 Input : 0049(0)16483311724
	 Gold  : +49 164 83311724
	 Parsed: +49 164 83311724
CORRECT → 
	 Input : +49-8332-3010
	 Gold  : +49 83323010
	 Parsed: +49 83323010
CORRECT → 
	 Input : 00498513994
	 Gold  : +49 8513994
	 Parsed: +49 8513994
CORRECT → 
	 Input : 0164 5554454
	 Gold  : +49 164 5554454
	 Parsed: +49 164 5554454
CORRECT → 
	 Input : 0851 509
	 Gold  : +49 851509
	 Parsed: +49 851509
CORRECT → 
	 Input : 0851 509-0
	 Gold  : +49 8515090
	 Parsed: +49 8515090
CORRECT → 
	 Input : (0851) 3394
	 Gold  : +49 8513394
	 Parsed: +49 8513394


## 1.2 Email Addresses

In [154]:
# Raw e-mail addresses, some obfuscated with "at" / "(at)" / "dot" / "(dot)".
# validate_email pairs them by position with gold_emails, so the order of
# the entries matters.
test_emails = [
    "peter.mueller@uni-passau.de",
    "peter dot mueller at uni-passau dot de",
    "peter.mueller(at)uni-passau.de",
    "peter.mueller (at) uni-passau.de",
    "nefullword@gw.uni-passau.de",
    "peter (dot) mueller (at) uni-passau (dot) de",
    "other.host@gmail.com",
    "special-chars (at) live.com"
]

In [155]:
# Expected plain "local@domain" form of each entry in test_emails (same order).
gold_emails = [
    "peter.mueller@uni-passau.de",
    "peter.mueller@uni-passau.de",
    "peter.mueller@uni-passau.de",
    "peter.mueller@uni-passau.de",
    "nefullword@gw.uni-passau.de",
    "peter.mueller@uni-passau.de",
    "other.host@gmail.com",
    "special-chars@live.com"
]

In [156]:
def validate_email(parsed_emails):
    """Print a CORRECT/WRONG report for every parsed e-mail address.

    Entries of ``parsed_emails`` are paired by position with the module-level
    ``test_emails`` (raw input) and ``gold_emails`` (expected result). Pairing
    stops at the shortest of the three sequences. Nothing is returned; the
    report is written to standard output.
    """
    wrong_report = "WRONG   → \n\t Input : {}\n\t Gold  : {}\n\t Parsed: {}"
    correct_report = "CORRECT → \n\t Input : {}\n\t Gold  : {}\n\t Parsed: {}"
    triples = zip(parsed_emails, test_emails, gold_emails)
    for parsed, test, gold in triples:
        report = wrong_report if parsed != gold else correct_report
        print(report.format(test, gold, parsed))

In [157]:
def normalize_email(raw_email):
    """Turn an obfuscated e-mail address into plain ``local@domain`` form.

    The markers ``at`` and ``dot`` are recognised case-insensitively, either
    wrapped in brackets (``(at)``, ``(dot)``, with or without surrounding
    whitespace) or as bare words set off by whitespace. They are replaced by
    ``@`` and ``.`` respectively. Addresses that are already plain are
    returned unchanged apart from stripping leading/trailing whitespace.

    Parameters
    ----------
    raw_email : str
        Address such as ``"peter (dot) mueller (at) uni-passau (dot) de"``.

    Returns
    -------
    str
        The de-obfuscated address.
    """
    # Bare markers must be surrounded by whitespace so that letters inside a
    # name (e.g. the "at" in "nathan") are never rewritten.
    address = re.sub(
        r"\s*\(\s*at\s*\)\s*|\s+at\s+", "@", raw_email.strip(), flags=re.IGNORECASE
    )
    address = re.sub(
        r"\s*\(\s*dot\s*\)\s*|\s+dot\s+", ".", address, flags=re.IGNORECASE
    )
    return address


# One normalised entry per raw test address, in the same order, so that
# validate_email can pair them by position with the gold list.
emailList = [normalize_email(candidate) for candidate in test_emails]
validate_email(emailList)


CORRECT → 
	 Input : peter.mueller@uni-passau.de
	 Gold  : peter.mueller@uni-passau.de
	 Parsed: peter.mueller@uni-passau.de
CORRECT → 
	 Input : peter dot mueller at uni-passau dot de
	 Gold  : peter.mueller@uni-passau.de
	 Parsed: peter.mueller@uni-passau.de
CORRECT → 
	 Input : peter.mueller(at)uni-passau.de
	 Gold  : peter.mueller@uni-passau.de
	 Parsed: peter.mueller@uni-passau.de
CORRECT → 
	 Input : peter.mueller (at) uni-passau.de
	 Gold  : peter.mueller@uni-passau.de
	 Parsed: peter.mueller@uni-passau.de
CORRECT → 
	 Input : nefullword@gw.uni-passau.de
	 Gold  : nefullword@gw.uni-passau.de
	 Parsed: nefullword@gw.uni-passau.de
CORRECT → 
	 Input : peter (dot) mueller (at) uni-passau (dot) de
	 Gold  : peter.mueller@uni-passau.de
	 Parsed: peter.mueller@uni-passau.de
CORRECT → 
	 Input : other.host@gmail.com
	 Gold  : other.host@gmail.com
	 Parsed: other.host@gmail.com
CORRECT → 
	 Input : special-chars (at) live.com
	 Gold  : special-chars@live.com
	 Parsed: special-chars@live