In [14]:
import pandas as pd
from collections import Counter, OrderedDict
from operator import itemgetter

# Part I - Regular Expressions

In [2]:
# Load the faculty roster. Apart from the first column, the headers carry a
# leading space (' degree', ' title', ' email'), which is why later cells
# index the columns with that space included.
df = pd.read_csv("faculty.csv")
# Preview the first three rows.
df.head(3)

Unnamed: 0,name,degree,title,email
0,Scarlett L. Bellamy,Sc.D.,Associate Professor of Biostatistics,bellamys@mail.med.upenn.edu
1,Warren B. Bilker,Ph.D.,Professor of Biostatistics,warren@upenn.edu
2,Matthew W Bryan,PhD,Assistant Professor of Biostatistics,bryanma@upenn.edu


## Q1. Find how many different degrees there are, and their frequencies (e.g., PhD, ScD, MD, MPH, BSEd, MS, JD).

In [3]:
# Normalise degree spellings so that variants such as "Ph.D." and "PhD" are
# counted as the same degree: strip the periods, then lower-case.
# regex=False is explicit on purpose: as a regular expression '.' matches any
# character, and the default value of `regex` differs between pandas versions.
df['Degree'] = df[' degree'].str.replace('.', '', regex=False).str.lower()
df[:3]

Unnamed: 0,name,degree,title,email,Degree
0,Scarlett L. Bellamy,Sc.D.,Associate Professor of Biostatistics,bellamys@mail.med.upenn.edu,scd
1,Warren B. Bilker,Ph.D.,Professor of Biostatistics,warren@upenn.edu,phd
2,Matthew W Bryan,PhD,Assistant Professor of Biostatistics,bryanma@upenn.edu,phd


In [4]:
# One normalised degree string per faculty member.
list_degrees = df['Degree'].tolist()

# A single cell may list several degrees separated by whitespace
# (e.g. "md mph phd"), so break every entry into its individual degrees ...
degrees = [entry.split() for entry in list_degrees]

# ... and flatten them into one long list of degree tokens.
deg = []
for tokens in degrees:
    deg.extend(tokens)

# Frequency of each distinct degree, as a plain dict.
dict_degrees = dict(Counter(deg))
dict_degrees

{'0': 1,
 'bsed': 1,
 'jd': 1,
 'ma': 1,
 'md': 1,
 'mph': 2,
 'ms': 2,
 'phd': 31,
 'scd': 6}

## Q2. Find how many different titles there are, and their frequencies (e.g., Assistant Professor, Professor).

In [5]:
# Tally how many faculty members hold each distinct title
# (the column header includes a leading space).
df[' title'].value_counts()

Professor of Biostatistics              13
Assistant Professor of Biostatistics    12
Associate Professor of Biostatistics    12
Name:  title, dtype: int64

## Q3. Search for email addresses and put them in a list. Print the list of email addresses.

In [6]:
# Collect every address from the ' email' column into a plain Python list.
list_emails = df[' email'].tolist()
# Show only the first three to keep the output short.
list_emails[:3]

['bellamys@mail.med.upenn.edu', 'warren@upenn.edu', 'bryanma@upenn.edu']

## Q4. Find how many different email domains there are (Ex: mail.med.upenn.edu, upenn.edu, email.chop.edu, etc.). Print the list of unique email domains.

In [7]:
# The domain is everything after the first '@'. Splitting once and taking the
# last piece yields the whole string unchanged when there is no '@' at all.
domains = [address.split('@', 1)[-1] for address in list_emails]
# Distinct domains together with how often each one occurs.
dict(Counter(domains))

{'cceb.med.upenn.edu': 1,
 'email.chop.edu': 1,
 'mail.med.upenn.edu': 23,
 'upenn.edu': 12}

# Part II - Write to CSV File

## Q5. Write email addresses from Part I to csv file

In [8]:
# Write only the ' email' column to emails.csv; index=False keeps the
# DataFrame's row index out of the file.
df[' email'].to_csv('emails.csv', index = False)

# Part III - Dictionary

## Q6. Create a dictionary in the below format:
```
faculty_dict = { 'Ellenberg': [\
              ['Ph.D.', 'Professor', 'sellenbe@upenn.edu'],\
              ['Ph.D.', 'Professor', 'jellenbe@mail.med.upenn.edu']
                            ],
              'Li': [\
              ['Ph.D.', 'Assistant Professor', 'liy3@email.chop.edu'],\
              ['Ph.D.', 'Associate Professor', 'mingyao@mail.med.upenn.edu'],\
              ['Ph.D.', 'Professor', 'hongzhe@upenn.edu']
                            ]
            }
```

In [9]:
# Re-read the CSV so the frame holds only the file's own columns
# (presumably to leave out the helper 'Degree' column added earlier).
df = pd.read_csv('faculty.csv')
# One mutable list per row; later cells read each row positionally as
# [name, degree, title, email].
df_list = [list(row) for row in df.values]

In [10]:
# Normalise every row of df_list in place:
#   person[2]  title -> rank only, e.g. "Associate Professor of Biostatistics"
#              becomes "Associate Professor"
#   person[0]  name  -> list of name parts, e.g. ["Scarlett", "L.", "Bellamy"]
# The cell is safe to re-run: both steps are no-ops on already-normalised rows.
rank_word = "Professor"
for person in df_list:
    rank_start = person[2].find(rank_word)
    # str.find returns -1 when the word is missing; truncating with that value
    # would silently chop the title to its first 8 characters, so leave such
    # titles untouched instead.
    if rank_start != -1:
        person[2] = person[2][:rank_start + len(rank_word)]
    # Split the name only while it is still a string; on a re-run it is
    # already a list and calling .split() on it would raise AttributeError.
    if isinstance(person[0], str):
        person[0] = person[0].split()

In [11]:
# Group faculty records by last name.
#   - a surname seen once maps to a single flat record [degree, title, email]
#   - a surname shared by several people maps to a list of such records
# Each row of df_list is expected to be [name_parts, degree, title, email].
faculty_dict = {}
for person in df_list:
    last = person[0][-1]      # last name = final name part
    record = person[1:]       # [degree, title, email]
    if last not in faculty_dict:
        faculty_dict[last] = record
    elif isinstance(faculty_dict[last][0], list):
        # Already a list of records: just add the new one.
        faculty_dict[last].append(record)
    else:
        # Second person with this surname: promote the flat record to a list
        # of records. The stored shape is detected from the element type
        # rather than from len(...) == 3, because a list holding three
        # records also has length 3 and would be wrapped a second time when
        # a fourth person with the same surname arrives.
        faculty_dict[last] = [faculty_dict[last], record]

list(faculty_dict.items())[:3]

[('Xie',
  [[' Ph.D.', 'Associate Professor', 'sxie@mail.med.upenn.edu'],
   [' PhD', 'Assistant Professor', 'dxie@upenn.edu']]),
 ('Roy', [' Ph.D.', 'Associate Professor', 'jaroy@mail.med.upenn.edu']),
 ('Joffe', [' MD MPH Ph.D', 'Professor', 'mjoffe@mail.med.upenn.edu'])]

## Q7. The previous dictionary does not have the best design for keys. Create a new dictionary with keys as:

```
professor_dict = {('Susan', 'Ellenberg'): ['Ph.D.', 'Professor', 'sellenbe@upenn.edu'],\
                ('Jonas', 'Ellenberg'): ['Ph.D.', 'Professor', 'jellenbe@mail.med.upenn.edu'],\
                ('Yimei', 'Li'): ['Ph.D.', 'Assistant Professor', 'liy3@email.chop.edu'],\
                ('Mingyao','Li'): ['Ph.D.', 'Associate Professor', 'mingyao@mail.med.upenn.edu'],\
                ('Hongzhe','Li'): ['Ph.D.', 'Professor', 'hongzhe@upenn.edu']
            }
```

In [12]:
# Key every record by (given names, last name) so that people who share a
# surname get separate entries. Given names are all name parts except the
# last, joined with single spaces (e.g. "Scarlett L.").
professor_dict = {}
for person in df_list:
    name_parts = person[0]
    full_name = (" ".join(name_parts[:-1]), name_parts[-1])
    professor_dict[full_name] = person[1:]

list(professor_dict.items())[:3]

[(('Wei-Ting', 'Hwang'),
  [' Ph.D.', 'Associate Professor', 'whwang@mail.med.upenn.edu']),
 (('Dawei', 'Xie'), [' PhD', 'Assistant Professor', 'dxie@upenn.edu']),
 (('Mingyao', 'Li'),
  [' Ph.D.', 'Associate Professor', 'mingyao@mail.med.upenn.edu'])]

## Q8. It looks like the current dictionary is ordered by first name. Sort it by last name and print the first 3 key-value pairs.

In [29]:
# Each item is ((given_names, last_name), record); order the items by the
# last-name component of the key and show the first three.
by_last_name = sorted(professor_dict.items(), key=lambda entry: entry[0][1])
by_last_name[:3]

[(('Scarlett L.', 'Bellamy'),
  [' Sc.D.', 'Associate Professor', 'bellamys@mail.med.upenn.edu']),
 (('Warren B.', 'Bilker'), ['Ph.D.', 'Professor', 'warren@upenn.edu']),
 (('Matthew W', 'Bryan'),
  [' PhD', 'Assistant Professor', 'bryanma@upenn.edu'])]