## Processing Input

In [1]:
import re
import phonenumbers

### Regex Expressions
Based on the prompt, only three entry formats are acceptable.

Create regular expressions that match the following:
```
Lastname, Firstname, (703)-742-0996, Blue, 10013 
Firstname Lastname, Red, 11237, 703 955 0373 
Firstname, Lastname, 10013, 646 111 0101, Green
```

In [2]:
# One compiled pattern per accepted entry layout. Each pattern exposes the
# same named groups: first, last, phone, color and zip.
#
# Two details matter for correctness:
#   * Letters are matched with [A-Za-z]. The shorter range [A-z] also covers
#     the ASCII punctuation characters [ \ ] ^ _ ` that sit between 'Z' and
#     'a', so it would let names such as "Smi_th" through.
#   * Every pattern ends with \s*$. `.match()` only anchors the start of the
#     string, so without the end anchor an entry with extra characters after
#     the last field (for example a 9-digit zip code) would be accepted with
#     a silently truncated value. \s* tolerates trailing whitespace.

# Layout 1: Lastname, Firstname, (703)-742-0996, Blue, 10013
valid_one = re.compile(
    r'(?P<last>[A-Za-z]+),\s(?P<first>[A-Za-z. ]+),\s'
    r'(?P<phone>\([0-9]{3}\)-[0-9]{3}-[0-9]{4}),\s'
    r'(?P<color>[A-Za-z ]+),\s(?P<zip>[0-9]{5})\s*$'
)
# Layout 2: Firstname Lastname, Red, 11237, 703 955 0373
valid_two = re.compile(
    r'(?P<first>[A-Za-z. ]+)\s(?P<last>[A-Za-z]+),\s'
    r'(?P<color>[A-Za-z ]+),\s(?P<zip>[0-9]{5}),\s'
    r'(?P<phone>[0-9]{3}\s[0-9]{3}\s[0-9]{4})\s*$'
)
# Layout 3: Firstname, Lastname, 10013, 646 111 0101, Green
valid_three = re.compile(
    r'(?P<first>[A-Za-z. ]+),\s(?P<last>[A-Za-z]+),\s'
    r'(?P<zip>[0-9]{5}),\s(?P<phone>[0-9]{3}\s[0-9]{3}\s[0-9]{4}),\s'
    r'(?P<color>[A-Za-z ]+)\s*$'
)

### Read input file
A data file (`data.in`) with 64 entries was supplied. It is read into a list of strings, one per line.

In [3]:
# Read the whole file first, then split it on line boundaries so that no
# entry keeps its trailing newline.
with open("data.in") as input_file:
    raw_text = input_file.read()
entries = raw_text.splitlines()

### Process Entries
For every entry in the list, check whether it matches one of the regex validators.
If it matches, parse the string into a dictionary for further processing.
If not, add its index to a list that tracks which entries are invalid.

In [4]:
valid_entries = []
errors = []

# Accepted layouts, tried in this order; the first one that matches wins.
entry_patterns = (valid_one, valid_two, valid_three)

for i, entry in enumerate(entries):
    # Run each pattern at most once and keep the first successful match
    # (None when the line fits none of the accepted layouts).
    entry_match = next(
        (m for m in (p.match(entry) for p in entry_patterns) if m),
        None
    )
    if entry_match is None:
        # Record the position of the invalid line and move on.
        errors.append(i)
        continue

    # Normalise the phone number to NNN-NNN-NNNN straight from its digits.
    # Every accepted layout guarantees exactly ten digits in the phone group,
    # so stripping the non-digits and slicing is sufficient.
    # NOTE: phonenumbers.format_number() expects a PhoneNumberFormat constant
    # as its second argument, not a "{}-{}-{}" template, so passing the
    # template left the result dependent on how the library handles an
    # unrecognised format value.
    digits = re.sub(r"\D", "", entry_match.group("phone"))
    phone_number = "{}-{}-{}".format(digits[:3], digits[3:6], digits[6:])

    entry_dict = {
        "first_name": entry_match.group("first"),
        "last_name": entry_match.group("last"),
        "phone_number": phone_number,
        "color": entry_match.group("color"),
        "zipcode": entry_match.group("zip")
    }

    valid_entries.append(entry_dict)

Debugging to ensure processing went properly. 17 of the entries appear to be in an invalid format based on the 3 examples.

In [5]:
# Sanity check: report how the input split into valid and invalid entries.
# A single pre-formatted argument in parentheses prints the same text under
# the Python 2 print statement and the Python 3 print function.
print("Number of entries: {}".format(len(entries)))
print("Number of valid entries: {}".format(len(valid_entries)))
print("Number of Erroneous entries: {}".format(len(errors)))
print("Erroneous entries: {}".format(errors))

Number of entries: 64
Number of valid entries: 47
Number of Erroneous entries: 17
Erroneous entries: [0, 5, 7, 15, 16, 29, 37, 38, 41, 43, 44, 45, 47, 48, 56, 57, 62]


## Serialize Data

In [6]:
from marshmallow import Schema, fields, post_dump, pprint

Create schemata that will make sure inputs are the correct type

In [7]:
class EntrySchema(Schema):
    """Schema for rolodex entry

    Describes a single parsed entry. Every field is declared as a required
    string; the name and phone fields are renamed on output via ``dump_to``.
    """
    # Serialized under the key "firstname".
    first_name = fields.String(required=True, dump_to="firstname")
    # Serialized under the key "lastname".
    last_name = fields.String(required=True, dump_to="lastname")
    # Serialized under the key "phonenumber".
    phone_number = fields.String(required=True, dump_to="phonenumber")
    # Kept as a string rather than an integer field.
    zipcode = fields.String(required=True)
    color = fields.String(required=True)

In [8]:
class OutputSchema(Schema):
    """Schema for output

    Top-level document with two keys: ``entries``, a list of objects shaped
    by ``EntrySchema``, and ``errors``, a list of integers.
    """
    # ``default`` is the callable ``list`` rather than a literal ``[]`` so
    # that a fresh list is produced whenever the default is needed, instead
    # of one mutable list object being shared between dumps.
    entries = fields.Nested(EntrySchema, many=True, required=True, default=list) # Items in list match the EntrySchema
    errors = fields.List(fields.Integer(), default=list)

In [9]:
# Single schema instance used to serialize the processed data.
output_schema = OutputSchema()

In [10]:
# Bundle the parsed entries and the indexes of the rejected lines under the
# two keys that OutputSchema declares.
data = dict(entries=valid_entries, errors=errors)

In [11]:
# Serialize to plain Python structures; the serialized payload is read from
# the ``data`` attribute of the value returned by dump().
marshal_result = output_schema.dump(data)
output_dict = marshal_result.data
pprint(output_dict)

{u'entries': [{u'color': u'aqua marine',
               u'firstname': u'Ria',
               u'lastname': u'Tillotson',
               u'phonenumber': u'196-910-5548',
               u'zipcode': u'97671'},
              {u'color': u'blue',
               u'firstname': u'Annalee',
               u'lastname': u'Loftis',
               u'phonenumber': u'905-329-2054',
               u'zipcode': u'97296'},
              {u'color': u'gray',
               u'firstname': u'James',
               u'lastname': u'Johnston',
               u'phonenumber': u'628-102-3672',
               u'zipcode': u'38410'},
              {u'color': u'yellow',
               u'firstname': u'Quinton',
               u'lastname': u'Liptak',
               u'phonenumber': u'653-889-7235',
               u'zipcode': u'70703'},
              {u'color': u'aqua marine',
               u'firstname': u'George',
               u'lastname': u'Won',
               u'phonenumber': u'488-084-5794',
               u'zipcode': 

In [12]:
# Serialize to a JSON string with sorted keys and 4-space indentation.
output = output_schema.dumps(data, sort_keys=True, indent=4).data
# Parenthesised single-argument print emits the same text under the
# Python 2 print statement and the Python 3 print function.
print(output)

{
    "entries": [
        {
            "color": "aqua marine", 
            "firstname": "Ria", 
            "lastname": "Tillotson", 
            "phonenumber": "196-910-5548", 
            "zipcode": "97671"
        }, 
        {
            "color": "blue", 
            "firstname": "Annalee", 
            "lastname": "Loftis", 
            "phonenumber": "905-329-2054", 
            "zipcode": "97296"
        }, 
        {
            "color": "gray", 
            "firstname": "James", 
            "lastname": "Johnston", 
            "phonenumber": "628-102-3672", 
            "zipcode": "38410"
        }, 
        {
            "color": "yellow", 
            "firstname": "Quinton", 
            "lastname": "Liptak", 
            "phonenumber": "653-889-7235", 
            "zipcode": "70703"
        }, 
        {
            "color": "aqua marine", 
            "firstname": "George", 
            "lastname": "Won", 
            "phonenumber": "488-084-5794", 
            "zipco