Python Workshop 1: basic coding and exploratory data analysis

https://colab.research.google.com/

In [None]:
# With Python, you can do pretty much everything you can do in R -- and more!
# All in relatively few lines of code
# Today we will cover the Python code to achieve results similar to what previous R workshops have focused on
# and also explore some Python idiosyncrasies and useful functions, tips, and tricks

# In the notebook environment, output is displayed directly below the code block (cell) that produced it
print('Hello world!')

In [None]:
# We actually don't even need to use print() to see our output when using a notebook:
# the value of the last expression in a cell is displayed automatically
# (it is shown as its repr, so a string appears with its quotes)
'Hello world!'

In [None]:
# Just like R uses packages to extend its base functionality, Python has libraries 
# Let's import the libraries we will use today -- aliases make calling functions from the library simpler

# Importing libraries
# import [library] as [alias]
import numpy as np
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [None]:
# Let's save some simple variables
# In a notebook, any variable defined like this is accessible in the global environment
# (i.e., we can use the value of this variable in any other code block in our notebook,
# as long as this cell has been run first)
x = 'this is a string'  # a string (str)
y = 7  # an integer (int)
z = 3.14  # a floating-point number (float)

In [None]:
# The type() function tells us the type (i.e., class) of the object passed to it
# Only the value of the last line, type(z), is displayed here:
# the first two results are computed and then discarded
type(x)
type(y)
type(z)

In [None]:
# If we want to have multiple lines of output, we need to print each line
# The notebook will otherwise just give us the output of our final line of code
# Strings can only be concatenated (+) with other strings, so we convert the two numbers
# and the type objects with str() to print full sentences describing our variables
# \ is the escape character in Python -- put it before a quote character to include that quote in your string
# (it is strictly required only when the quote matches the string's own delimiters)

print('\"' + x + '\" is a ' + str(type(x)) + '.')
print('\"' + str(y) + '\" is a ' + str(type(y)) + '.')
print('\"' + str(z) + '\" is a ' + str(type(z)) + '.')

In [None]:
# Notice that we copied and pasted several times above
# Whenever we are copying and pasting, we are defeating one of the main points of coding: efficiency!

# Let's instead make a simple loop to achieve the same output as above in fewer lines
# First, we'll put our variables in a list (notice how multiple data types can be in a single list)
var_list = [x,y,z]
# The list itself is an object too, with its own class
type(var_list)

In [None]:
# Now we'll loop through our list to print the sentences equivalent to what we have above
# str(variable) is used for every element so that strings and numbers are handled the same way
for variable in var_list: 
    print('\"' + str(variable) + '\" is a ' + str(type(variable)) + '.')

In [None]:
# A pattern worth reusing belongs in a function, so let's wrap the sentence-building step
def var_type_phrase(var):
    """Return a sentence of the form '"<value>" is a <type>.' describing var.

    Both the value and its type are converted with str(), so any object
    that can be turned into a string is accepted.
    """
    quoted_value = '\"' + str(var) + '\"'
    type_text = str(type(var))
    return ''.join([quoted_value, ' is a ', type_text, '.'])

In [None]:
# Same loop as before, but the sentence is now built by our var_type_phrase() function
for variable in var_list:
    print(var_type_phrase(variable))

In [None]:
# Let's now take a step back and go over some key Python terms
# When we call the type() function, we are seeing what the <class> of an individual <object> is
# Individual objects can have <attributes> specific to the object's class
# Individual objects can also have <methods> which are defined by the object's class
# (think of methods like class-specific functions, called as object.method())

# Let's see an example of this
# First, make a list of numbers
num_list = [1,2,5,8]
print(num_list)

# Our num_list object is indeed of the list class
print(type(num_list))

# A max() function exists to find the maximum value of our list
# This is NOT a method of the list class -- just a regular (built-in) function
print(max(num_list))
# print('\n') outputs a blank gap to visually separate the list output from the array output
print('\n')

# We can change the object type by creating a numpy array out of our list
num_array = np.array(num_list)
print(num_array)

# Let's check the type
print(type(num_array))

# We can also use the max() function
print(max(num_array))

# But a max() method also exists for objects of the numpy array class that gives the same result!
print(num_array.max())

# Attributes are more like properties of the object (accessed without parentheses)
# shape is an attribute of numpy arrays
print(num_array.shape)
print('\n')

# Methods can give us objects whose attributes differ from the original's
# reshape() returns a new 2x2 array; num_array itself keeps its original shape
num_array_reshaped = num_array.reshape(2,2)
print(num_array_reshaped)
print(num_array_reshaped.shape)

In [None]:
# One more useful Python programming thing before we dive into some data!
# Suppose that you have a list of objects and you want to figure out how many even integers there are in that list
# The modulo operator % returns the remainder of y divided by x if given y % x, so if y % 2 == 0, y is even
# 4 is even, so the remainder is 0
4 % 2 

In [None]:
# 5 is odd, so dividing by 2 leaves a remainder of 1
5 % 2

In [None]:
# Sort each number into one of two lists depending on whether it is divisible by 2
potential_evens = [2,4,5,1,8,9,48]
true_evens = []
odds = []

for num in potential_evens:
    # remainder 0 after dividing by 2 means the number is even
    if num % 2 == 0:
        true_evens.append(num)
    else:
        odds.append(num)

# The even numbers, how many of them there are, and the leftover odd numbers
print(true_evens)
print(len(true_evens))
print(odds)

In [None]:
# But in Python, there is an even better, shorter, faster, more "pythonic" way to do this!
# It's called a list comprehension
# Read it as: "give me num, for every num in potential_evens, if num is even"
true_evens = [num for num in potential_evens if num % 2 == 0]
true_evens

In [None]:
# What if, however, our list of potential_evens contains objects that are not numbers at all?
# NOTE: this cell is EXPECTED to fail. For a string, % means string formatting rather than
# modulo, so 'bus' % 2 raises a TypeError and the loop stops at that element
potential_evens = [2,4,5,1,8,9,48,'bus']
true_evens = []
not_evens = []

for obj in potential_evens:
    if obj % 2 == 0:
        true_evens.append(obj)
    else:
        not_evens.append(obj)

In [None]:
# We can use a try-except statement to deal with an error if it comes up
# This is not really possible to do well with a list comprehension
potential_evens = [2,4,5,1,8,9,48,'bus']
true_evens = []
not_evens = []

for obj in potential_evens:
    # Keep the try block as small as possible: only the modulo can fail here
    try:
        is_even = obj % 2 == 0
    # Name the error we expect ('bus' % 2 raises a TypeError) instead of using a bare
    # "except:", which would also hide typos, KeyboardInterrupt, and any other surprise
    except TypeError:
        is_even = False

    if is_even:
        true_evens.append(obj)
    else:
        not_evens.append(obj)

print(true_evens)
print(not_evens)

In [None]:
# We will be looking at some Census data on healthcare trends
# We can turn this into a nice panel dataset -- there are cross-sectional observations
# at regularly spaced (yearly) intervals

# If running locally (skip this cell on Google Colab and use the next one instead):
# Importing data from a csv using the read_csv function from pandas (recall that pd is the pandas alias)
# The path is relative: All_years.csv must sit one folder above the notebook's working directory
df = pd.read_csv('../All_years.csv')

# Look at the first five rows with the head() method
df.head()

In [None]:
# If using Google Colab (skip this cell when running locally), add this file to your Google Drive:
# https://drive.google.com/file/d/1C4H8DUVjaFt2fzXIfOMaa2Pjmd0n5sLe/view

# Click through anything that pops up and follow instructions
from google.colab import drive

# Mount Google Drive, then build the file path from the same mount point so that
# reading the file does not depend on the notebook's current working directory
drive_mount_point = '/content/drive'
drive.mount(drive_mount_point, force_remount=True)

# Make the path after '/MyDrive/' the path to where you have the file stored
csv_path = drive_mount_point + '/MyDrive/Oeconomica/WorkshopMaterials2020-2021/PythonWorkshops/All_years.csv'
df = pd.read_csv(csv_path)

df.head()

In [None]:
# We can also look at the first 10 rows if we want
# (the number passed to head() is how many rows to show)
df.head(10)

In [None]:
# Or the last ten rows, with the tail() method
df.tail(10)

In [None]:
# The info() method lists the variables (columns), the datatype of each variable,
# and how many non-missing values each one has
df.info()

In [None]:
# The describe() method gives summary statistics (count, mean, std, min, quartiles, max)
# By default it only covers the numeric variables
df.describe()

In [None]:
# The columns attribute holds the column names of our dataframe
# df.columns is a pandas Index object, not a list; list() turns it into an actual list object
list(df.columns)

In [None]:
# Let's do something straightforward (conceptually)
# We want to know the share of people each year with health insurance
# perwt is a person sample weight that tells us how many individuals each "observation" represents

# Shift hcovany down by one so it can be used as a 0/1 indicator
# (assumes the raw coding is 1 = not covered, 2 = covered -- TODO confirm against the codebook)
# The shift must happen only once: hcovany_wt is created right after it, so if that column
# already exists this cell has been run before and we leave hcovany alone.
# Subtracting from the whole column at once is the vectorized (faster, simpler) form of
# applying "x - 1" to every element.
if 'hcovany_wt' not in df.columns:
    df['hcovany'] = df['hcovany'] - 1

# Each covered observation counts for perwt people; uncovered observations count for 0
df['hcovany_wt'] = df['hcovany'] * df['perwt']
df.head()

In [None]:
# Just like we would with dplyr in R, we can use pandas to groupby and summarize variables
# One groupby can sum several columns at once: selecting both columns gives a dataframe
# indexed by year, with one column of yearly totals per variable
yearly_totals = df.groupby('year')[['perwt', 'hcovany_wt']].sum()

# Give the totals descriptive names (rename returns a new dataframe)
df_covshare = yearly_totals.rename(columns={'perwt': 'population', 'hcovany_wt': 'num_covered'})

# Share covered = weighted number of covered people / weighted total population, per year
df_covshare['share_covered'] = df_covshare['num_covered'] / df_covshare['population']
df_covshare

In [None]:
# Since we have the year as the index, we can select the column we want to plot as a time series
# The plot() method of a pandas column puts the index on the x-axis and the values on the y-axis
df_covshare['share_covered'].plot()

In [None]:
# We can add more information: a plot style, a figure size, a line color, axis labels, and a title
# Note that the style applies to every plot made after this line, not just this one
plt.style.use('fivethirtyeight')

# Create the figure and its axes explicitly, then draw and label on the axes object
fig, ax = plt.subplots(figsize=(12, 6))
ax.plot('share_covered', data=df_covshare, color='navy')
ax.set_xlabel('Year')
ax.set_ylabel('Share of US population with healthcare coverage')
ax.set_title('Health insurance coverage')
plt.show()

In [None]:
# Exercise: 
# Try on your own to make a function that takes a single variable from the raw df, provides some meaningful 
# time series summary, and creates a plot, all in one!