In [16]:
# HIDDEN
import warnings
# Ignore numpy dtype warnings. These warnings are caused by an interaction
# between numpy and Cython and can be safely ignored.
# Reference: https://stackoverflow.com/a/40846742
warnings.filterwarnings("ignore", message="numpy.dtype size changed")
warnings.filterwarnings("ignore", message="numpy.ufunc size changed")

import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
%matplotlib inline
import ipywidgets as widgets
from ipywidgets import interact, interactive, fixed, interact_manual
import nbinteract as nbi

# Apply seaborn's default plot styling to subsequent matplotlib figures.
sns.set()
# Use seaborn's 'talk' plotting context for the figures in this notebook.
sns.set_context('talk')
# Keep printed arrays short: summarize arrays longer than 20 elements, show
# two decimal places, and suppress scientific notation for small values.
np.set_printoptions(threshold=20, precision=2, suppress=True)
# Limit how many rows/columns of a DataFrame are rendered at once.
pd.set_option('display.max_rows', 10)
pd.set_option('display.max_columns', 8)
# Use the fully qualified option name: the bare 'precision' key is matched as
# a pattern and is ambiguous on pandas versions that also define
# 'styler.format.precision', which makes set_option raise OptionError.
pd.set_option('display.precision', 2)
# This option stops scientific notation for pandas
# pd.set_option('display.float_format', '{:.2f}'.format)

In [2]:
# HIDDEN
def df_interact(df, nrows=7, ncols=7):
    '''
    Outputs sliders that show rows and columns of df.

    Parameters:
        df: the DataFrame to page through.
        nrows: number of rows shown per view (also the row slider step).
        ncols: number of columns shown per view (also the column slider step).

    A slider is only created for an axis that has more entries than fit in
    one view; otherwise that offset is pinned to 0. Prints the total shape of
    df after setting up the widgets. Returns None.
    '''
    def peek(row=0, col=0):
        # Positional window of at most nrows x ncols starting at (row, col).
        return df.iloc[row:row + nrows, col:col + ncols]

    # The upper bound is the last valid offset (length - 1), not the length
    # itself: with an upper bound of len(df), a frame whose length is an exact
    # multiple of nrows would get a final slider stop that shows an empty view.
    row_arg = (0, len(df) - 1, nrows) if len(df) > nrows else fixed(0)
    col_arg = ((0, len(df.columns) - 1, ncols)
               if len(df.columns) > ncols else fixed(0))

    interact(peek, row=row_arg, col=col_arg)
    print('({} rows, {} columns) total'.format(df.shape[0], df.shape[1]))

def display_df(df, rows=pd.options.display.max_rows,
               cols=pd.options.display.max_columns):
    '''
    Displays df with temporary limits on the number of rows and columns.

    rows and cols default to the pandas display limits that were in effect
    when this function was defined. The limits apply only for the duration of
    the call; the previous option values are restored afterwards.
    '''
    overrides = ('display.max_rows', rows, 'display.max_columns', cols)
    with pd.option_context(*overrides):
        display(df)

### 2c) Extract the Company Name From the Email 

Create a function with the following specifications:
- Function Name: extract_company
- Purpose: to extract the company from the email address (i.e., everything after the `@` sign but before the first `.` that follows it)
- Parameter(s): email (string)
- Returns: The extracted part of the email (string)
- Hint: This should take 1 line of code. Look into the find('') method. 

You can start with this outline:
```python 
def extract_company(email):
    return
```

Example Usage: 
- extract_company("larhe@uber.com") should return "uber"
- extract_company("ds@cogs.edu") should return "cogs"



In [None]:
def extract_company(email):
    """Return the company part of an email address.

    The company is the text after the '@' sign and before the first '.' that
    follows it, e.g. "larhe@uber.com" -> "uber".

    Parameters:
        email (str): the email address to inspect.

    Returns:
        str: the extracted company. If the address has no '@', the result is
        the empty string; if there is no '.' after the '@', everything after
        the '@' is returned.
    """
    # Split on '@' first so that dots in the local part ("first.last@...")
    # are never mistaken for the end of the company name.
    domain = email.partition('@')[2]
    return domain.partition('.')[0]

### 2d) Load in employee data 

Load the JSON file into a pandas DataFrame. Call it `df_employee`.

In [19]:
# Placeholder: per the exercise text, replace the Ellipsis with code that
# loads the employee JSON file into a pandas DataFrame.
# NOTE(review): the JSON file name is not shown in this notebook — confirm.
df_employee = ...

Demonstrate multiple predicates for `.loc`

### 3e) Zip Codes

In [5]:
# Read the zip/population table, parsing 'zip' as text and 'population' as
# integers, then preview the first rows.
df_zip = pd.read_csv(
    'zip_pop.csv',
    dtype={'zip': str, 'population': int},
)
df_zip.head()

Unnamed: 0,zip,population
0,1001,16769
1,1002,29049
2,1003,10372
3,1005,5079
4,1007,14649


Instead of looping, try using a clever groupby. For example, replace A, B, C with the first three digits of a zip code:

![](groupby_overview.png)

In [31]:
# Load the enrollment data and keep only the three columns used below.
df = pd.read_csv('enrollments.csv').loc[:, ['Term', 'Number', 'Enrollment Cnt']]
df

Unnamed: 0,Term,Number,Enrollment Cnt
0,2018 Spring,9A,8
1,2018 Spring,9C,11
2,2018 Spring,9E,36
3,2018 Spring,9F,33
4,2018 Spring,9G,11
...,...,...,...
2519,2012 Spring,248,39
2520,2012 Spring,C261,8
2521,2012 Spring,272,7
2522,2012 Spring,278B,8


In [25]:
# Order the offerings from largest to smallest enrollment.
df.sort_values(by='Enrollment Cnt', ascending=False)

Unnamed: 0,Term,Number,Enrollment Cnt
14,2018 Spring,61B,1379
377,2017 Spring,61B,1328
12,2018 Spring,61A,1210
723,2016 Spring,61B,1156
309,2018 Spring,C8,1043
...,...,...,...
456,2017 Spring,294,1
457,2017 Spring,294,1
2250,2012 Spring,298,1
459,2017 Spring,298,1


In [26]:
# Keep the ten rows with the highest enrollment as a small working example.
small = df.sort_values(by='Enrollment Cnt', ascending=False).head(10)
small

Unnamed: 0,Term,Number,Enrollment Cnt
14,2018 Spring,61B,1379
377,2017 Spring,61B,1328
12,2018 Spring,61A,1210
723,2016 Spring,61B,1156
309,2018 Spring,C8,1043
1095,2015 Spring,61A,981
376,2017 Spring,61A,965
1468,2014 Spring,61B,944
721,2016 Spring,61A,877
109,2018 Spring,16A,825


In [27]:
# Total enrollment per term. numeric_only=True keeps the result to the numeric
# 'Enrollment Cnt' column; without it, newer pandas versions also "sum" the
# string column 'Number' by concatenating its values.
small.groupby('Term').sum(numeric_only=True)

Unnamed: 0_level_0,Enrollment Cnt
Term,Unnamed: 1_level_1
2014 Spring,944
2015 Spring,981
2016 Spring,2033
2017 Spring,2293
2018 Spring,4457


In [36]:
# Add a 'year' column holding the first four characters of 'Term'.
small.assign(year=small['Term'].str.slice(stop=4))

Unnamed: 0,Term,Number,Enrollment Cnt,year
14,2018 Spring,61B,1379,2018
377,2017 Spring,61B,1328,2017
12,2018 Spring,61A,1210,2018
723,2016 Spring,61B,1156,2016
309,2018 Spring,C8,1043,2018
1095,2015 Spring,61A,981,2015
376,2017 Spring,61A,965,2017
1468,2014 Spring,61B,944,2014
721,2016 Spring,61A,877,2016
109,2018 Spring,16A,825,2018


In [38]:
# Total enrollment per year, where the year is the first four characters of
# 'Term'. numeric_only=True restricts the sum to 'Enrollment Cnt'; without it,
# newer pandas versions concatenate the string columns 'Term' and 'Number'
# instead of dropping them.
(small
 .assign(year=small['Term'].str[:4])
 .groupby('year')
 .sum(numeric_only=True)
)

Unnamed: 0_level_0,Enrollment Cnt
year,Unnamed: 1_level_1
2014,944
2015,981
2016,2033
2017,2293
2018,4457


In [40]:
# Total enrollment per course-number prefix, where the prefix is 'Number'
# with its last character removed (e.g. '61B' -> '61', 'C8' -> 'C').
# numeric_only=True restricts the sum to 'Enrollment Cnt'; without it, newer
# pandas versions concatenate the string columns instead of dropping them.
counts = (small
 .assign(num=small['Number'].str[:-1])
 .groupby('num')
 .sum(numeric_only=True)
)
counts

Unnamed: 0_level_0,Enrollment Cnt
num,Unnamed: 1_level_1
16,825
61,8840
C,1043


In [41]:
# Convert the per-prefix totals to a plain dict. Series.to_dict() yields
# built-in Python ints as values, so the displayed dict does not depend on how
# NumPy scalars are printed.
counts['Enrollment Cnt'].to_dict()

{'16': 825, '61': 8840, 'C': 1043}