# Intermediate Python
Run the hidden code cell below to import the data used in this course.

In [14]:
# Import the course packages
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# Import the two datasets
# Both paths are relative, so they resolve against the kernel's working directory.
# No index_col is passed, so each frame gets the default integer row index.
gapminder = pd.read_csv("datasets/gapminder.csv")
brics = pd.read_csv("datasets/brics.csv")

## Dictionaries

In [13]:
# A dictionary maps keys to values; here each country name is a key
world = {
    'afghanistan': '1.33',
    'russia': '2.83',
}
print(world['afghanistan'])
# .keys() gives a view of every key currently in the dictionary
print(world.keys())

# Assigning to a key adds it; assigning to an existing key overwrites its value
world['sealand'] = 0.00027

# Membership test on the keys. It evaluates to True here, but nothing is shown
# because only the last expression of a cell is displayed.
'sealand' in world

# `del` is a statement, so the key lookup needs no surrounding parentheses
del world['sealand']

world

1.33
dict_keys(['afghanistan', 'russia'])


{'afghanistan': '1.33', 'russia': '2.83'}

In [None]:
# Filtering with Boolean Operators
# Build the Boolean mask once: True for rows where 8 < area < 10
area_between_8_and_10 = np.logical_and(brics['area'] > 8, brics['area'] < 10)
# Index brics with the mask to keep only the rows where it is True
brics[area_between_8_and_10]

## Remember that dictionary keys must be immutable: the contents of a key cannot be changed once it is created. Strings, numbers, and tuples work as keys, but you cannot use a list as a key, because lists are mutable. (Lists are fine as values.)

## Dictionary of Dictionaries

In [15]:
# Nested dictionary: each country maps to its own dictionary of facts
europe = {
    'spain': {'capital': 'madrid', 'population': 46.77},
    'france': {'capital': 'paris', 'population': 66.03},
    'germany': {'capital': 'berlin', 'population': 80.62},
    'norway': {'capital': 'oslo', 'population': 5.084},
}

# Chain two key lookups to reach an inner value: this prints the population
# of France (look up 'capital' instead to get 'paris')
france = europe['france']
print(france['population'])

# Sub-dictionary describing Italy
data = {'capital': 'rome', 'population': 59.83}

# Attach it to europe under the key 'italy'
europe['italy'] = data

europe

66.03


{'spain': {'capital': 'madrid', 'population': 46.77},
 'france': {'capital': 'paris', 'population': 66.03},
 'germany': {'capital': 'berlin', 'population': 80.62},
 'norway': {'capital': 'oslo', 'population': 5.084},
 'italy': {'capital': 'rome', 'population': 59.83}}

# pandas

## Dictionary to Dataframe

In [None]:
# Build the cars DataFrame from plain Python lists.
# (pd comes from the imports cell at the top of the notebook.)
# The same kind of table can be loaded from disk instead with
#   pd.read_csv('cars.csv', index_col=0)
# where index_col=0 makes the first column the row labels.

# Pre-defined lists: one entry per country, in matching order
names = ['United States', 'Australia', 'Japan', 'India', 'Russia', 'Morocco', 'Egypt']
dr = [True, False, False, False, True, True, True]
cpc = [809, 731, 588, 18, 200, 70, 45]

# Each key becomes a column label and each list becomes that column's values
my_dict = {'country': names, 'drives_right': dr, 'cars_per_cap': cpc}

# Row labels, in the same order as the lists above
row_labels = ['US', 'AUS', 'JPN', 'IN', 'RU', 'MOR', 'EG']

# Build the DataFrame and attach the row labels in one step
cars = pd.DataFrame(my_dict, index=row_labels)

# Print cars
print(cars)

In [17]:
# brics was produced by pd.read_csv in the first cell, so it is already a
# DataFrame; wrapping it again leaves its contents unchanged.
brics = pd.DataFrame(data=brics)

## Indexing and Selecting Data with Pandas and DataFrames

### Square Brackets
Square brackets are pretty limited in terms of functionality. Ideally you want something closer to 2D NumPy array indexing, where rows and columns can be selected at the same time; the `loc` and `iloc` indexers give you that.

If you select a column using square brackets like with the code below, you get a pandas Series instead of a regular DataFrame.
A Series is like a labelled 1-dimensional array. If you paste together a bunch of Series, you get a DataFrame.

In [None]:
# Single square brackets with one column label: selects the 'country' column
# of the brics table and returns it as a pandas Series
brics['country']

If you want to get the column data but keep it in a DataFrame, you'll need double brackets as seen below, which creates a sub DataFrame.
You can also pull out rows using a slice. 

In [None]:
# Only the last expression of a cell is displayed when the cell runs.
# Double brackets (a list of labels inside the indexing brackets) keep the result a DataFrame
brics[['country']]
# You can enter a list of columns to pull multiple columns
brics[['country', 'capital']]
# A slice inside single brackets selects rows: rows 1 through 4 (excluding 4)
brics[1:4]

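### The pandas loc and iloc indexers
`loc` is label-based: it uses the row and column labels, and you can use it to pull rows, columns, or both at once.
`iloc` is integer-position based: it uses the numerical positions of rows and columns instead of their labels.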

In [None]:
# .loc selects by label. brics is loaded with the default integer row index
# (the country codes are stored in the 'country_ab' column), so looking up
# "RU" / "IN" directly on brics raises a KeyError. Work on a copy that uses
# the country codes as row labels instead.
brics_by_code = brics.set_index('country_ab')

brics_by_code.loc[["RU", "IN"]]  # row access
brics_by_code.loc[:, ['country', 'capital']]  # column access
brics_by_code.loc[  # row and column access
    ["RU", "IN"],
    ['country', 'capital']
]

# iloc uses the numerical positions of rows and columns
brics.iloc[[1, 2, 3]] # row access
brics.iloc[:, [1, 2]] # column access
brics.iloc[[1, 2, 3], [0, 1]] # row and column access

# Comparison Operators
You can use comparison operators to compare arrays

In [6]:
# Create arrays (np comes from the imports cell at the top of the notebook)
my_house = np.array([18.0, 20.0, 10.75, 9.50])
your_house = np.array([14.0, 24.0, 14.25, 9.0])

# Comparing an array with a scalar is element-wise: one Boolean per element
print(my_house >= 18)

# Comparing two same-length arrays is element-wise as well
print(my_house < your_house)

[ True  True False False]
[False  True  True False]


# Boolean Operators
To use and, or, and not with arrays, you need to use the following NumPy functions:
logical_and()
logical_or()
logical_not()

In [9]:
# Define variables
my_kitchen = 18.0
your_kitchen = 14.0

# Is my_kitchen strictly between 10 and 18? `and` needs both sides to be True;
# 18.0 < 18 is False, so the whole expression is False
is_mid_sized = my_kitchen > 10 and my_kitchen < 18
print(is_mid_sized)

False


In [8]:
# Create arrays (np comes from the imports cell at the top of the notebook)
my_house = np.array([18.0, 20.0, 10.75, 9.50])
your_house = np.array([14.0, 24.0, 14.25, 9.0])

# my_house greater than 18.5 or smaller than 10
print(np.logical_or(my_house > 18.5, my_house < 10))

# Both my_house and your_house smaller than 11
print(np.logical_and(my_house < 11, your_house < 11))

# A Boolean array used as an index keeps only the elements where it is True,
# i.e. the values that meet the condition
between_10_and_19 = np.logical_and(my_house > 10.0, my_house < 19.0)
print(my_house[between_10_and_19])

[False  True False  True]
[False False False  True]
[18.   10.75]


## if, else, and elif
Here's a general structure for this control flow:

In [12]:
#if condition :
#    expression
#elif condition :
#    expression
#else :
#    expression

In [11]:
area = 10.0
# Conditions are tested from top to bottom; only the first true branch runs
if area < 9:
    print("small")
elif area < 12:
    print("medium")
else:
    print("large")

medium


## Filtering pandas DataFrames
To select countries with an area greater than 8 million, you can use several methods for filtering.
One important point is that the filter must be a Boolean Series (for example `brics['area'] > 8`, selected with single brackets), not a DataFrame.

In [20]:
# Step 1: compare the 'area' column with 8 to get a Boolean Series
is_huge = brics['area'] > 8
# Step 2: hand that Series to .loc to keep only the rows where it is True
big_countries = brics.loc[is_huge]
print(big_countries)

  country_ab country   capital    area  population
0         BR  Brazil  Brasilia   8.516       200.4
1         RU  Russia    Moscow  17.100       143.5
3         CH   China   Beijing   9.597      1357.0


In [21]:
# Same filter in a single expression: the Boolean Series is built inline
big_countries = brics.loc[brics['area'] > 8]
print(big_countries)

  country_ab country   capital    area  population
0         BR  Brazil  Brasilia   8.516       200.4
1         RU  Russia    Moscow  17.100       143.5
3         CH   China   Beijing   9.597      1357.0


## While Loop
while condition :
	expression

## For Loop
for var in seq :
	expression
    
If you want to include the index number of the elements you're iterating over, you can use `enumerate()`, which returns the index of each value together with the value itself.

In [23]:
# areas list
areas = [11.25, 18.0, 20.0, 10.75, 9.50]

# enumerate() yields (position, element) pairs, so the loop gets the index for free
for index, a in enumerate(areas):
    print(f"room {index}: {a}")

room 0: 11.25
room 1: 18.0
room 2: 20.0
room 3: 10.75
room 4: 9.5


In [22]:
# A string is a sequence too: the loop visits it one character at a time
word = 'family'
for c in word:
    print(c.capitalize())

F
A
M
I
L
Y


## Loop Data Structures
Looping over dictionaries and NumPy arrays is different from looping over strings and lists. The way you define the sequence you're iterating over varies depending on the data structure.

If you're looping over a dictionary, you can use the .items() method on the dictionary to generate a key and value for each iteration. Dictionaries are not sorted: the pairs come out in insertion order (guaranteed since Python 3.7), not in alphabetical order.
for key, val in my_dict.items() :

For a NumPy array, you can use the np.nditer() function to iterate over every element of an n-dimensional array, one at a time.
for val in np.nditer(my_array) :

In [None]:
# Country -> capital lookup table
europe = {
    'spain': 'madrid',
    'france': 'paris',
    'germany': 'berlin',
    'norway': 'oslo',
    'italy': 'rome',
    'poland': 'warsaw',
    'austria': 'vienna',
}

# .items() hands back (key, value) pairs; any two names work here, e.g. k, v
for key, value in europe.items():
    print(f" the capital of {key} is {value}")

In [None]:
# np comes from the imports cell at the top of the notebook.
# np_height and np_baseball are not defined anywhere else in this notebook,
# so small stand-in samples are created here to make the cell runnable.
# NOTE(review): replace with the real course arrays if they are available.
np_height = np.array([74, 74, 72, 75])  # 1D sample
np_baseball = np.array([[74, 180], [74, 215], [72, 210]])  # 2D sample

# For loop over np_height
for x in np.nditer(np_height):
    print(str(x) + " inches")

# For loop over np_baseball: nditer visits every element of the 2D array,
# one value at a time, rather than one row at a time
for x in np.nditer(np_baseball):
    print(x)

For DataFrames, you can use `.iterrows()` to iterate over the rows. On each iteration it generates the label of the row and the data from the row as a pandas Series. You can use the techniques for subsetting a Series to get additional information.

In [None]:
# Walk through cars one row at a time: lab is the row label, row holds that
# row's values as a Series
for lab, row in cars.iterrows():
    print(lab, row, sep="\n")

## Generating a new column based on other columns
Use .apply()

In [None]:
# .apply() calls str.upper once for every value in the 'country' column
upper_names = cars['country'].apply(str.upper)
# Store the result as a new column named COUNTRY
cars["COUNTRY"] = upper_names
print(cars)

### There's another complicated method using a for loop

In [None]:
# Row-by-row version: upper-case each country name and write it into the
# COUNTRY column at that row's label
for lab, row in cars.iterrows():
    upper_name = row['country'].upper()
    cars.loc[lab, "COUNTRY"] = upper_name

## Explore Datasets
Use the DataFrames imported in the first cell to explore the data and practice your skills!
- Create a loop that iterates through the `brics` DataFrame and prints "The population of {country} is {population} million!". 
- Create a histogram of the life expectancies for countries in Africa in the `gapminder` DataFrame. Make sure your plot has a title, axis labels, and has an appropriate number of bins.
- Simulate 10 rolls of two six-sided dice. If the two dice add up to 7 or 11, print "A win!". If the two dice add up to 2, 3, or 12, print "A loss!". If the two dice add up to any other number, print "Roll again!". 