In [None]:
import sys
print("Python Version:", sys.version, '\n')

# Pickle: Saving Objects for Later

Often in data science, we'll create some model or some version of our data and want to use it later. We have many options - we can save the coefficients, or save the data to csv, or...

Actually, we don't have that many options. 

One way to overcome that is to save the python object to a file as a serialized object. That means we convert the entire object to a bunch of bytes, save those bytes into a file, and then have the ability to unpack those bytes back into their original format later. 

This is done by a module called `pickle`. Let's see it in action.

In [1]:
import pickle
import random

# Build a dict of random integer lists (values 0-65 inclusive), one list per
# state. The pairs give the state code and how many numbers to draw for it;
# they are processed in the order written, so key order is CA, IL, NY, WA.
lots_of_noise = {
    state: [random.randint(0, 65) for _ in range(sample_count)]
    for state, sample_count in [('CA', 100), ('IL', 50), ('NY', 90), ('WA', 33)]
}

In [2]:
print(lots_of_noise)

{'CA': [42, 35, 23, 26, 0, 65, 59, 12, 34, 34, 60, 62, 5, 32, 12, 15, 0, 62, 2, 27, 2, 3, 7, 46, 16, 14, 5, 14, 40, 57, 29, 53, 44, 7, 60, 44, 40, 42, 9, 5, 43, 35, 56, 10, 60, 12, 21, 23, 49, 55, 24, 7, 46, 5, 26, 10, 19, 47, 50, 54, 43, 33, 18, 36, 16, 40, 31, 63, 34, 14, 32, 38, 15, 25, 54, 57, 59, 52, 57, 25, 55, 36, 11, 22, 30, 34, 50, 52, 35, 8, 13, 38, 22, 8, 61, 25, 27, 53, 20, 29], 'IL': [13, 50, 32, 9, 34, 42, 9, 55, 37, 61, 20, 63, 61, 54, 60, 16, 25, 17, 40, 43, 7, 3, 19, 25, 12, 38, 8, 7, 10, 17, 0, 6, 28, 25, 31, 52, 2, 37, 29, 9, 52, 61, 41, 47, 32, 13, 31, 35, 9, 45], 'NY': [15, 50, 13, 17, 61, 5, 8, 50, 4, 5, 37, 12, 30, 21, 41, 28, 41, 35, 30, 38, 38, 21, 27, 7, 25, 29, 9, 23, 25, 0, 53, 64, 36, 27, 51, 29, 53, 48, 23, 57, 32, 17, 39, 48, 27, 20, 65, 38, 16, 41, 46, 25, 27, 40, 38, 48, 43, 16, 63, 16, 30, 23, 46, 27, 65, 63, 51, 2, 1, 9, 3, 42, 40, 41, 27, 31, 45, 35, 34, 29, 49, 23, 17, 47, 45, 1, 64, 56, 51, 25], 'WA': [55, 27, 57, 26, 39, 7, 0, 35, 32, 56, 48, 62, 

In [3]:
whos

Variable        Type      Data/Info
-----------------------------------
lots_of_noise   dict      n=4
pickle          module    <module 'pickle' from '/U<...>lib/python3.7/pickle.py'>
random          module    <module 'random' from '/U<...>lib/python3.7/random.py'>


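We can see in the `whos` output that the object `lots_of_noise` exists and is a `dict` with 4 keys. Nice. Now let's look at our file system and verify that there isn't a file called `noise.pickle` yet. (If you have run this notebook before, as in the listing below, the file will already be there; the next step simply overwrites it.)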

In [15]:
!ls

advanced_python_datatypes.ipynb       my_dataframe.pickle
deep_and_shallow_copy.ipynb           noise.pickle
[34mdeep_copy_demo[m[m                        pickle_saving_objects_for_later.ipynb
instructor_guide_week1_day4.md        readme.md


Okay, now we're ready to create a file and write the bytes to it. To do this with `pickle`, we use Python's built-in `open` function to create a file in write-binary (`wb`) mode. We'll then use `pickle.dump` to put an object into that file as a string of bytes.

In [5]:
# Open noise.pickle in write-binary mode (created if missing, overwritten if
# present); the with-block closes the file when it exits.
with open('noise.pickle', 'wb') as to_write:
    # Serialize the whole dict to bytes and write them into the open file.
    pickle.dump(lots_of_noise, to_write)

In [6]:
!ls

advanced_python_datatypes.ipynb       my_dataframe.pickle
deep_and_shallow_copy.ipynb           noise.pickle
[34mdeep_copy_demo[m[m                        pickle_saving_objects_for_later.ipynb
instructor_guide_week1_day4.md        readme.md


Now let's delete `lots_of_noise` and prove to ourselves it doesn't exist in Python's memory anymore.

In [7]:
del lots_of_noise

In [13]:
whos

Variable    Type              Data/Info
---------------------------------------
new_noise   dict              n=4
pickle      module            <module 'pickle' from '/U<...>lib/python3.7/pickle.py'>
random      module            <module 'random' from '/U<...>lib/python3.7/random.py'>
read_file   BufferedReader    <_io.BufferedReader name='noise.pickle'>
to_write    BufferedWriter    <_io.BufferedWriter name='noise.pickle'>


In [14]:
print(lots_of_noise)

NameError: name 'lots_of_noise' is not defined

Lovely. It's dead forever. Or is it?

Let's open that `noise.pickle` file in read-binary (`rb`) mode. Then we'll ask pickle to rebuild the object from the file with `pickle.load` and store it back in a variable.

In [10]:
# Re-open the file written earlier, this time in read-binary mode.
# NOTE: unpickling can execute arbitrary code, so only call pickle.load on
# files you created yourself or otherwise trust.
with open('noise.pickle','rb') as read_file:
    # Rebuild the Python object from the stored bytes.
    new_noise = pickle.load(read_file)

In [11]:
print(new_noise)

{'CA': [42, 35, 23, 26, 0, 65, 59, 12, 34, 34, 60, 62, 5, 32, 12, 15, 0, 62, 2, 27, 2, 3, 7, 46, 16, 14, 5, 14, 40, 57, 29, 53, 44, 7, 60, 44, 40, 42, 9, 5, 43, 35, 56, 10, 60, 12, 21, 23, 49, 55, 24, 7, 46, 5, 26, 10, 19, 47, 50, 54, 43, 33, 18, 36, 16, 40, 31, 63, 34, 14, 32, 38, 15, 25, 54, 57, 59, 52, 57, 25, 55, 36, 11, 22, 30, 34, 50, 52, 35, 8, 13, 38, 22, 8, 61, 25, 27, 53, 20, 29], 'IL': [13, 50, 32, 9, 34, 42, 9, 55, 37, 61, 20, 63, 61, 54, 60, 16, 25, 17, 40, 43, 7, 3, 19, 25, 12, 38, 8, 7, 10, 17, 0, 6, 28, 25, 31, 52, 2, 37, 29, 9, 52, 61, 41, 47, 32, 13, 31, 35, 9, 45], 'NY': [15, 50, 13, 17, 61, 5, 8, 50, 4, 5, 37, 12, 30, 21, 41, 28, 41, 35, 30, 38, 38, 21, 27, 7, 25, 29, 9, 23, 25, 0, 53, 64, 36, 27, 51, 29, 53, 48, 23, 57, 32, 17, 39, 48, 27, 20, 65, 38, 16, 41, 46, 25, 27, 40, 38, 48, 43, 16, 63, 16, 30, 23, 46, 27, 65, 63, 51, 2, 1, 9, 3, 42, 40, 41, 27, 31, 45, 35, 34, 29, 49, 23, 17, 47, 45, 1, 64, 56, 51, 25], 'WA': [55, 27, 57, 26, 39, 7, 0, 35, 32, 56, 48, 62, 

In [12]:
whos

Variable    Type              Data/Info
---------------------------------------
new_noise   dict              n=4
pickle      module            <module 'pickle' from '/U<...>lib/python3.7/pickle.py'>
random      module            <module 'random' from '/U<...>lib/python3.7/random.py'>
read_file   BufferedReader    <_io.BufferedReader name='noise.pickle'>
to_write    BufferedWriter    <_io.BufferedWriter name='noise.pickle'>


Random noise lives! We retrieved the entire structure from file. Nice.

### Okay, but I don't use dictionaries... I use pandas.

In [16]:
import pandas as pd
import numpy as np

# 100 rows x 4 named columns of uniform random floats drawn from [-10, 10).
df = pd.DataFrame(
    data=np.random.uniform(low=-10, high=10, size=(100, 4)),
    columns=['Yay', 'specific', 'column', 'names'],
)
# Last expression in the cell, so the notebook renders the first five rows.
df.head(5)

Unnamed: 0,Yay,specific,column,names
0,-1.469065,-7.372281,8.725529,-2.084415
1,-1.390369,1.929408,-6.002177,-5.791695
2,4.713555,5.680526,0.864767,2.10448
3,-9.439896,4.115723,-1.39459,-9.27577
4,8.364742,-8.518455,3.256643,-4.690516


In [17]:
# Same recipe as for the dict: open a file in write-binary mode and let pickle
# serialize the DataFrame into it.
with open('my_dataframe.pickle', 'wb') as to_write:
    pickle.dump(df, to_write)

In [18]:
# Remove the name `df` from the notebook namespace.
del df

# Intentionally fails with NameError: this is the demonstration that the
# DataFrame is no longer reachable under this name.
df.head(5)

NameError: name 'df' is not defined

In [19]:
# Read the pickled bytes back and rebuild the DataFrame under a new name.
# NOTE: unpickling can execute arbitrary code, so only call pickle.load on
# files you created yourself or otherwise trust.
with open('my_dataframe.pickle','rb') as read_file:
    new_df = pickle.load(read_file)
    
# Last expression in the cell, so the notebook renders the first five rows.
new_df.head(5)

Unnamed: 0,Yay,specific,column,names
0,-1.469065,-7.372281,8.725529,-2.084415
1,-1.390369,1.929408,-6.002177,-5.791695
2,4.713555,5.680526,0.864767,2.10448
3,-9.439896,4.115723,-1.39459,-9.27577
4,8.364742,-8.518455,3.256643,-4.690516


Pickle is a great tool. One recommended way of using it is to make it an end point of every step in your process. Example:

* I got my data! Nice. Pickle it and stop your "getting the data" notebook.
* Load your data from pickle. Clean it. Save your clean data to a new pickle.
* Load your cleaned_data pickle. Do analysis and visualize it.

This can provide natural "pick-up-where-I-left-off-but-before-I-broke-my-data" points. It's a great way to control the flow of your data.

#### Resources

https://docs.python.org/3.7/library/pickle.html

In [121]:
import csv
from collections import defaultdict
import re

def count_titles(csv_file_name):
    """Count how often each job title appears in a faculty CSV file.

    The title is read from the third column (index 2) of every row. Each
    title is normalised by deleting every character that is not an ASCII
    letter or the digit ``0``, so ``'Professor of Biostatistics'`` is
    counted under ``'ProfessorofBiostatistics'``.

    Args:
        csv_file_name: Path of the CSV file to read.

    Returns:
        A ``defaultdict(int)`` mapping each normalised title to the number
        of rows that carry it. Empty if the file has no data rows.
    """
    title_column = 2
    counts = defaultdict(int)

    # newline='' is what the csv module asks for when opening files.
    with open(csv_file_name, newline='') as csvfile:
        # csv.reader yields [] for blank lines; rows that are too short have
        # no title field at all. Skip both instead of raising IndexError.
        titles = [row[title_column] for row in csv.reader(csvfile)
                  if len(row) > title_column]

    # Drop the header label. Comparing the stripped, lower-cased text accepts
    # ' title' as well as 'title' and does not fail when no header is present.
    if titles and titles[0].strip().lower() == 'title':
        del titles[0]

    for raw_title in titles:
        cleaned = re.sub('[^a-zA-Z0]', "", raw_title)
        # Collapse misspelled variants of the assistant-professor title (for
        # example '...is Biostatistics') onto a single key.
        # NOTE(review): the replacement key omits 'of', unlike the other
        # titles ('AssociateProfessorofBiostatistics') - confirm that is intended.
        cleaned = re.sub('AssistantProfessor.*Biostatistics',
                         "AssistantProfessorBiostatistics", cleaned)
        counts[cleaned] += 1
    return counts
                       
           


degreecounts = count_titles('faculty.csv')

defaultdict(<class 'int'>, {'AssociateProfessorofBiostatistics': 12, 'ProfessorofBiostatistics': 13, 'AssistantProfessorofBiostatistics': 11, 'AssistantProfessorisBiostatistics': 1})


In [None]:
def count_degrees(csv_file_name):
    """Count how often each academic degree appears in a faculty CSV file.

    The degree field is read from the second column (index 1) of every row.
    A field may list several degrees separated by whitespace; each piece is
    counted separately after deleting every character that is not an ASCII
    letter or the digit ``0`` (so ``'Ph.D.'`` and ``'PhD'`` share one key).

    Args:
        csv_file_name: Path of the CSV file to read.

    Returns:
        A ``defaultdict(int)`` mapping each normalised degree to its number
        of occurrences. Empty if the file has no data rows.
    """
    degree_column = 1
    counts = defaultdict(int)

    # newline='' is what the csv module asks for when opening files.
    with open(csv_file_name, newline='') as csvfile:
        # csv.reader yields [] for blank lines; skip rows without a degree
        # field instead of raising IndexError.
        degree_fields = [row[degree_column] for row in csv.reader(csvfile)
                         if len(row) > degree_column]

    # Drop the header label, tolerating surrounding spaces and a missing header.
    if degree_fields and degree_fields[0].strip().lower() == 'degree':
        del degree_fields[0]

    for field in degree_fields:
        # Raw string: '\s' in a plain literal is an invalid escape sequence.
        for token in re.split(r'\s', field):
            cleaned = re.sub('[^a-zA-Z0]', "", token)
            # Splitting on single whitespace characters produces empty pieces,
            # and pure-punctuation tokens (e.g. '&') clean down to ''.
            # Neither is a degree, so neither is counted.
            if cleaned:
                counts[cleaned] += 1
    return counts


In [None]:
import csv
from collections import defaultdict
import re


def emails(csv_file_name):
    """Return the e-mail column of a faculty CSV file as a list.

    The address is read from the fourth column (index 3) of every row and
    returned exactly as stored in the file (no stripping or validation).

    Args:
        csv_file_name: Path of the CSV file to read.

    Returns:
        A list of e-mail strings in file order, without the header label.
        Empty if the file has no data rows.
    """
    email_column = 3

    # newline='' is what the csv module asks for when opening files.
    with open(csv_file_name, newline='') as csvfile:
        # csv.reader yields [] for blank lines; skip rows without an e-mail
        # field instead of raising IndexError.
        addresses = [row[email_column] for row in csv.reader(csvfile)
                     if len(row) > email_column]

    # Drop the header label, tolerating surrounding spaces and a missing header.
    if addresses and addresses[0].strip().lower() == 'email':
        del addresses[0]

    return addresses

In [145]:
import csv


def write_to_csv(list_of_emails, file_name="emails.csv"):
    """Write e-mail addresses to a one-column CSV file, one address per row.

    The file starts with the header row ``list_of_emails`` and is overwritten
    if it already exists.

    Args:
        list_of_emails: Iterable of e-mail strings to write.
        file_name: Destination path; defaults to ``"emails.csv"`` in the
            current working directory.
    """
    # newline='' lets the csv module control line endings itself; without it
    # Windows output gets an extra '\r' on every row.
    with open(file_name, 'w', newline='') as result_file:
        writer = csv.writer(result_file)
        writer.writerow(["list_of_emails"])
        # One single-field row per address. Using '\n' as the field delimiter
        # instead would store every address in a single CSV record and mix
        # '\n' and '\r\n' line endings in the file.
        writer.writerows([address] for address in list_of_emails)
    
write_to_csv(['a','b','c'])

In [257]:
import csv 
import re
from collections import defaultdict



def get_dict(csv_file_name='faculty.csv'):
    """Group the rows of a faculty CSV file by last name.

    The last name is whatever follows the final space in the first column.
    The first row of the file is treated as column labels and skipped.

    Note: a later cell in this notebook defines another ``get_dict`` (keyed
    by the full-name tuple); whichever cell runs last wins.

    Args:
        csv_file_name: Path of the CSV file to read; defaults to
            ``'faculty.csv'``.

    Returns:
        A ``defaultdict(list)`` mapping each last name to a list with one
        entry per matching row; each entry is that row's remaining columns
        (everything after the name) as a list of strings.
    """
    by_last_name = defaultdict(list)

    # newline='' is what the csv module asks for when opening files.
    with open(csv_file_name, newline='') as csvfile:
        rows = list(csv.reader(csvfile))

    for row in rows[1:]:  # skip the first row of labels
        if not row:
            # csv.reader yields [] for blank lines; there is no name to index.
            continue
        # Greedy '.* ' consumes everything up to and including the last
        # space, leaving only the final word of the name.
        last_name = re.sub(".* ", "", row[0])
        # Appending directly gives the same result as rebuilding a set of all
        # names seen so far and scanning it for this row's name.
        by_last_name[last_name].append(row[1:])
    return by_last_name
        
get_dict()

defaultdict(list,
            {'Bellamy': [[' Sc.D.',
               'Associate Professor of Biostatistics',
               'bellamys@mail.med.upenn.edu']],
             'Bilker': [['Ph.D.',
               'Professor of Biostatistics',
               'warren@upenn.edu']],
             'Bryan': [[' PhD',
               'Assistant Professor of Biostatistics',
               'bryanma@upenn.edu']],
             'Chen': [[' Ph.D.',
               'Associate Professor of Biostatistics',
               'jinboche@upenn.edu']],
             'Ellenberg': [[' Ph.D.',
               'Professor of Biostatistics',
               'sellenbe@upenn.edu'],
              [' Ph.D.',
               'Professor of Biostatistics',
               'jellenbe@mail.med.upenn.edu']],
             'Feng': [[' Ph.D',
               'Assistant Professor of Biostatistics',
               'ruifeng@upenn.edu']],
             'French': [[' PhD',
               'Associate Professor of Biostatistics',
               'bcfrenc

In [261]:
import csv 
import re
from collections import defaultdict



def get_dict(csv_file_name='faculty.csv'):
    """Map each faculty member's full name to the rest of their CSV row.

    The key is the first column split on single spaces and converted to a
    tuple, e.g. ``('Jinbo', 'Chen')``. The first row of the file is treated
    as column labels and skipped.

    Note: this definition replaces the earlier ``get_dict`` in this notebook
    (which keys by last name only) once its cell has run.

    Args:
        csv_file_name: Path of the CSV file to read; defaults to
            ``'faculty.csv'``.

    Returns:
        A ``defaultdict(list)`` mapping each name tuple to a flat list of the
        row's remaining columns. If two rows share exactly the same name,
        their columns are concatenated into one list.
    """
    by_full_name = defaultdict(list)

    # newline='' is what the csv module asks for when opening files.
    with open(csv_file_name, newline='') as csvfile:
        rows = list(csv.reader(csvfile))

    for row in rows[1:]:  # skip the first row of labels
        if not row:
            # csv.reader yields [] for blank lines; there is no name to index.
            continue
        # Lists are not hashable, so the split name is turned into a tuple
        # before it is used as a dict key.
        name_parts = tuple(re.split(" ", row[0]))
        # Extending directly gives the same result as rebuilding a set of all
        # names seen so far and scanning it for this row's name.
        by_full_name[name_parts] += row[1:]
    return by_full_name

        
get_dict()

defaultdict(list,
            {('Scarlett', 'L.', 'Bellamy'): [' Sc.D.',
              'Associate Professor of Biostatistics',
              'bellamys@mail.med.upenn.edu'],
             ('Warren', 'B.', 'Bilker'): ['Ph.D.',
              'Professor of Biostatistics',
              'warren@upenn.edu'],
             ('Matthew', 'W', 'Bryan'): [' PhD',
              'Assistant Professor of Biostatistics',
              'bryanma@upenn.edu'],
             ('Jinbo', 'Chen'): [' Ph.D.',
              'Associate Professor of Biostatistics',
              'jinboche@upenn.edu'],
             ('Susan', 'S', 'Ellenberg'): [' Ph.D.',
              'Professor of Biostatistics',
              'sellenbe@upenn.edu'],
             ('Jonas', 'H.', 'Ellenberg'): [' Ph.D.',
              'Professor of Biostatistics',
              'jellenbe@mail.med.upenn.edu'],
             ('Rui', 'Feng'): [' Ph.D',
              'Assistant Professor of Biostatistics',
              'ruifeng@upenn.edu'],
             ('B