In [None]:
## Data Manipulation with Pandas

import pandas as pd

# Host-city table: one row per Olympic edition (tab-separated file).
file_path = 'Summer Olympic medallists 1896 to 2008 - EDITIONS.tsv'
editions = pd.read_csv(file_path, sep='\t')
editions = editions[['Edition', 'Grand Total', 'City', 'Country']]

# Country -> NOC lookup. It has to come from its own file: re-reading the
# tab-separated editions file with the default comma separator (as this cell
# used to do) cannot produce separate 'Country' / 'NOC' columns, so the column
# selection below would fail.
# NOTE(review): file name assumed from the dataset's naming scheme -- confirm
# it matches the file on disk.
ioc_codes_path = 'Summer Olympic medallists 1896 to 2008 - IOC COUNTRY CODES.csv'
ioc_codes = pd.read_csv(ioc_codes_path)
ioc_codes = ioc_codes[["Country", "NOC"]]

# Per-edition medal tables, keyed by year: medals_dict
medals_dict = {}

for year in editions['Edition']:
    # Each edition's medallists live in their own file, e.g. summer_1896.csv
    file_path = 'summer_{:d}.csv'.format(year)
    medals_dict[year] = (
        pd.read_csv(file_path)[['Athlete', 'NOC', 'Medal']]
        .assign(Edition=year)  # tag every row with the edition it came from
    )

# Stack all editions into one long table with a fresh 0..n-1 index: medals
medals = pd.concat(medals_dict, ignore_index=True)

# Count of 'Athlete' entries per country (columns) and edition (rows): medal_counts
medal_counts = pd.pivot_table(
    medals, values="Athlete", index="Edition", columns="NOC", aggfunc="count"
)

# 'Grand Total' of each edition, indexed by edition: totals
totals = editions.set_index("Edition")["Grand Total"]

# Divide every edition's row of counts by that edition's total: fractions
fractions = medal_counts.div(totals, axis="index")

# Attach an NOC code to each edition's host country. The left join keeps
# every edition even when its country name has no match in ioc_codes.
hosts = (
    editions
    .merge(ioc_codes, on="Country", how="left")
    .loc[:, ["Edition", "NOC"]]
    .set_index("Edition")
)

# Show the editions that came out of the join without an NOC code ...
print(hosts[hosts["NOC"].isnull()])

# ... and fill those codes in by hand.
for edition, noc in ((1972, 'FRG'), (1980, 'URS'), (1988, 'KOR')):
    hosts.loc[edition, 'NOC'] = noc

# Turn 'Edition' back into an ordinary column for the merge further down.
hosts = hosts.reset_index()
# `fractions_change` is consumed below but was never built in this notebook,
# so a fresh-kernel run stopped here with a NameError. Derive it from
# `fractions`: the edition-over-edition percentage change of each country's
# expanding-mean medal fraction, with 'Edition' restored as a column so that
# melt can use it as the id variable.
# NOTE(review): definition reconstructed from the plot label ("% Change ...")
# and the melt call below -- confirm it is the intended one.
mean_fractions = fractions.expanding().mean()
fractions_change = (mean_fractions.pct_change() * 100).reset_index()

# Long format: one row per (Edition, NOC) pair holding its 'Change' value.
reshaped = pd.melt(fractions_change, id_vars='Edition', value_name='Change')
print(reshaped.shape, fractions_change.shape)

# Rows for China only (kept for ad-hoc inspection; nothing below reads it).
chn = reshaped[reshaped["NOC"] == 'CHN']

# Inner merge on the columns the two frames share: only the rows where the
# country was that edition's host survive.
merged = pd.merge(reshaped, hosts, how = "inner")
influence = merged.set_index("Edition").sort_index()

print(influence.head())

# Import pyplot
import matplotlib.pyplot as plt

# Bar chart of the hosts' 'Change' values, one bar per edition.
change = influence["Change"]
ax = change.plot.bar()

# Label the chart and show host cities on the x axis.
ax.set(
    ylabel="% Change of Host Country Medal Count",
    title="Is there a Host Country Advantage?",
)
ax.set_xticklabels(editions['City'])

# Render the figure
plt.show()

In [None]:
## Intermediate Importing Data in Python

## Import Data From Web
## 1. Using urllib.request
# Method 1 
from urllib.request import urlretrieve
import pandas as pd

# Download the CSV next to the notebook, then load the local copy
# (fields in this file are separated by semicolons).
url = 'https://s3.amazonaws.com/assets.datacamp.com/production/course_1606/datasets/winequality-red.csv'
local_file = 'winequality-red.csv'
urlretrieve(url, local_file)

df = pd.read_csv(local_file, sep=';')
print(df.head())

# Method 2
from urllib.request import urlopen, Request
url = "http://www.datacamp.com/teach/documentation"

# Package the GET request, send it, and look at the type of the response.
request = Request(url)
# The context manager closes the connection even if the body raises, which
# the former manual response.close() did not guarantee.
with urlopen(request) as response:
    print(type(response))

## 2. Using BeautifulSoup
import requests
from bs4 import BeautifulSoup

# Fetch the page and keep its HTML as text.
url = 'https://www.python.org/~guido/'
r = requests.get(url)
html_doc = r.text

# Name the parser explicitly: without it Beautiful Soup uses whichever parser
# happens to be installed (so results can differ between machines) and emits
# a warning. 'html.parser' ships with Python.
soup = BeautifulSoup(html_doc, 'html.parser')

# Re-indented HTML, one tag per line
pretty_soup = soup.prettify()
print(pretty_soup)

## 3. Using BeautifulSoup to get text
import requests
from bs4 import BeautifulSoup

# Fetch the page and keep its HTML as text.
url = 'https://www.python.org/~guido/'
r = requests.get(url)
html_doc = r.text

# Name the parser explicitly: without it Beautiful Soup uses whichever parser
# happens to be installed (so results can differ between machines) and emits
# a warning. 'html.parser' ships with Python.
soup = BeautifulSoup(html_doc, 'html.parser')

# The page's <title> element: guido_title
guido_title = soup.title
print(guido_title)

# All text of the page with the markup stripped: guido_text
guido_text = soup.get_text()
print(guido_text)


## API
# Ask the Wikipedia API for the intro extract of the "pizza" article, as JSON.
url = 'https://en.wikipedia.org/w/api.php?action=query&prop=extracts&format=json&exintro=&titles=pizza'
r = requests.get(url)
json_data = r.json()

# The 'pages' mapping was previously indexed with the hard-coded key '24768'.
# Only one title is requested, so take the single page returned instead of
# depending on that key staying the same.
pages = json_data['query']['pages']
pizza_extract = next(iter(pages.values()))['extract']
print(pizza_extract)

## Twitter
# Import package
import tweepy, json

import os

# Read the OAuth credentials from the environment instead of embedding them
# in the notebook, where they leak to anyone who can read the file or its
# history. A missing variable raises KeyError right here rather than causing
# an authentication failure later.
# NOTE: the literal tokens that used to be stored in this cell should be
# treated as compromised and revoked.
access_token = os.environ["TWITTER_ACCESS_TOKEN"]
access_token_secret = os.environ["TWITTER_ACCESS_TOKEN_SECRET"]
consumer_key = os.environ["TWITTER_CONSUMER_KEY"]
consumer_secret = os.environ["TWITTER_CONSUMER_SECRET"]

# Pass OAuth details to tweepy's OAuth handler
auth = tweepy.OAuthHandler(consumer_key, consumer_secret)
auth.set_access_token(access_token, access_token_secret)

In [None]:
## Functions 

# Define count_entries()
def count_entries(df, col_name='lang'):
    """Tally how often each distinct value occurs in one DataFrame column.

    Args:
        df: DataFrame to inspect.
        col_name: Name of the column whose values are counted.

    Returns:
        dict mapping each value found in the column to its number of
        occurrences, in order of first appearance.

    Raises:
        ValueError: If ``col_name`` is not one of ``df``'s columns.
    """
    if col_name not in df.columns:
        raise ValueError ('The DataFrame does not have a ' + col_name + ' column.')

    tally = {}
    for value in df[col_name]:
        # First sighting starts from 0, later ones add to the stored count.
        tally[value] = tally.get(value, 0) + 1
    return tally

# Tally the 'lang' column of the tweets table: result1
# NOTE(review): tweets_df is not created anywhere in the visible notebook --
# it has to be loaded before this cell can run on a fresh kernel.
result1 = count_entries(tweets_df, col_name='lang')

# Show the resulting value -> count mapping
print(result1)

In [None]:
## Python Data Science Toolbox

# Pair the items of feature_names and row_vals position by position, then
# turn the pairs into a dict (first list supplies keys, second the values).
# NOTE(review): feature_names and row_vals are not defined in the visible
# notebook -- they must exist before this cell runs.
zipped_lists = zip(feature_names, row_vals)
# dict() consumes the zip iterator, so zipped_lists is exhausted afterwards.
rs_dict = dict(zipped_lists)

def lists2dict(list1, list2):
    """Build a dict mapping each item of ``list1`` to the item at the same
    position in ``list2``.

    Pairing stops at the end of the shorter input; if a key occurs more than
    once in ``list1``, its last pairing wins.
    """
    return dict(zip(list1, list2))

# Build the name -> value dict through the helper function: rs_fxn
rs_fxn = lists2dict(feature_names, row_vals)

import pandas as pd

# One dict per entry of row_lists, each keyed by feature_names.
# NOTE(review): row_lists is not defined in the visible notebook --
# presumably a list of rows shaped like row_vals; confirm.
list_of_dicts = [lists2dict(feature_names, sublist) for sublist in row_lists]
# A list of dicts becomes a DataFrame whose columns are the dict keys.
df = pd.DataFrame(list_of_dicts)
print(df.head())

def read_large_file(file_object):
    """Lazily yield the lines of an open file, one at a time.

    Args:
        file_object: Open file-like object exposing ``readline()``.

    Yields:
        Each value returned by ``readline()`` (line ending included), until
        ``readline()`` returns an empty value at end of file.
    """
    line = file_object.readline()
    while line:
        yield line
        line = file_object.readline()
        
# Print the first three lines of the file without loading the rest.
with open('world_dev_ind.csv') as file:
    gen_file = read_large_file(file)
    for _line_no in range(3):
        print(next(gen_file))
    
# Number of lines seen for each distinct first-column value: counts_dict
counts_dict = {}

with open('world_dev_ind.csv') as file:
    for line in read_large_file(file):
        # Plain comma split; only the first field is used as the key.
        row = line.split(',')
        first_col = row[0]
        counts_dict[first_col] = counts_dict.get(first_col, 0) + 1
print(counts_dict)

# Open the CSV as an iterator of 1000-row DataFrames: urb_pop_reader
urb_pop_reader = pd.read_csv('ind_pop_data.csv', chunksize = 1000)

# Pull the first chunk only and preview it
df_urb_pop = next(urb_pop_reader)
print(df_urb_pop.head())

# Keep the rows whose country code is 'CEB'
is_ceb = df_urb_pop['CountryCode'] == 'CEB'
df_pop_ceb = df_urb_pop.loc[is_ceb]

# Pair each row's total population with its urban percentage: pops
pops = zip(df_pop_ceb['Total Population'], df_pop_ceb['Urban population (% of total)'])
pops_list = [*pops]
print(pops_list)

# Code from previous exercise
urb_pop_reader = pd.read_csv('ind_pop_data.csv', chunksize=1000)
df_urb_pop = next(urb_pop_reader)
# .copy() makes the filtered rows an independent frame, so adding a column
# below is unambiguous and does not raise SettingWithCopyWarning.
df_pop_ceb = df_urb_pop[df_urb_pop['CountryCode'] == 'CEB'].copy()
pops = zip(df_pop_ceb['Total Population'], 
           df_pop_ceb['Urban population (% of total)'])
pops_list = list(pops)

# 'Total Urban Population' = total population * urban share; the share is a
# percentage (hence * 0.01) and the product is truncated to an integer.
df_pop_ceb['Total Urban Population'] = [int(tup[0] * tup[1] * 0.01) for tup in pops_list ]

# Plot urban population data
df_pop_ceb.plot(kind='scatter', x='Year', y='Total Urban Population')
plt.show()

# Initialize reader object: urb_pop_reader
urb_pop_reader = pd.read_csv('ind_pop_data.csv', chunksize=1000)

# Processed chunks are collected here and concatenated once at the end.
# DataFrame.append (used previously) was removed in pandas 2.0 and re-copied
# the accumulated rows on every iteration.
ceb_chunks = []

# Iterate over each DataFrame chunk
for df_urb_pop in urb_pop_reader:
    # .copy() detaches the filtered rows from the chunk so the new column can
    # be added without SettingWithCopyWarning.
    df_pop_ceb = df_urb_pop[df_urb_pop['CountryCode'] == 'CEB'].copy()
    pops = zip(df_pop_ceb['Total Population'],
                df_pop_ceb['Urban population (% of total)'])
    pops_list = list(pops)
    # total population * urban percentage * 0.01, truncated to an integer
    df_pop_ceb['Total Urban Population'] = [int(tup[0] * tup[1] * 0.01) for tup in pops_list]
    ceb_chunks.append(df_pop_ceb)

# All 'CEB' rows of the file: data (empty frame if no chunk was read)
data = pd.concat(ceb_chunks) if ceb_chunks else pd.DataFrame()

# Plot urban population data
data.plot(kind='scatter', x='Year', y='Total Urban Population')
plt.show()

def plot_pop(filename, country_code):
    """Scatter-plot one country's total urban population against year.

    The CSV is read in 1000-row chunks. Rows whose 'CountryCode' equals
    ``country_code`` are kept and given a 'Total Urban Population' column
    computed as 'Total Population' * 'Urban population (% of total)' * 0.01,
    truncated to an integer.

    Args:
        filename: Path of the CSV file to read.
        country_code: Value of the 'CountryCode' column to select.

    Returns:
        None. The figure is displayed with ``plt.show()``.
    """
    # Collect the per-chunk results and concatenate once: DataFrame.append
    # was removed in pandas 2.0 and re-copied all rows on every call.
    frames = []

    for df_urb_pop in pd.read_csv(filename, chunksize=1000):
        # .copy() detaches the selection from the chunk so the new column
        # can be added without SettingWithCopyWarning.
        country_rows = df_urb_pop[df_urb_pop['CountryCode'] == country_code].copy()
        country_rows['Total Urban Population'] = [
            int(total * pct * 0.01)
            for total, pct in zip(country_rows['Total Population'],
                                  country_rows['Urban population (% of total)'])
        ]
        frames.append(country_rows)

    # pd.concat rejects an empty list, so fall back to an empty frame.
    data = pd.concat(frames) if frames else pd.DataFrame()

    data.plot(kind='scatter', x='Year', y='Total Urban Population')
    plt.show()

# Data file shared by both plots: fn
fn = 'ind_pop_data.csv'

# One figure per country code, 'CEB' first and then 'ARB'.
for country_code in ('CEB', 'ARB'):
    plot_pop(fn, country_code)