# Week 16 Review

In [2]:
import numpy as np
import os 
import pandas as pd

## Chapter 6 Data Loading, Storage, and File Formats

- Read CSV files with `pd.read_csv()`
- Text format vs. binary format
- Interacting with databases

In [39]:
text = """# Information about some diamonds.
   carat      cut color clarity  depth  table  price     x     y     z
0   0.23    Ideal     E     SI2   61.5   55.0    326  3.95  3.98  2.43
1   0.21  Premium     E     SI1   59.8   61.0    326  3.89  3.84  2.31
2   0.23     Good     E     VS1   56.9   65.0    327  4.05  4.07  2.31
3   0.29  Premium     I     VS2   62.4   58.0    334  4.20  4.23  2.63
4   0.31     Good     I     SI2   63.3   58.0    335  4.34  4.35  2.75
"""
if not os.path.isdir("Data"):
    os.mkdir("Data")

with open('Data/Week16_01.txt', 'w') as file:
    file.write(text)

In [40]:
df = pd.read_csv('Data/Week16_01.txt', skiprows=[0, 2], sep='\s+')
df

Unnamed: 0,carat,cut,color,clarity,depth,table,price,x,y,z
1,0.21,Premium,E,SI1,59.8,61.0,326,3.89,3.84,2.31
2,0.23,Good,E,VS1,56.9,65.0,327,4.05,4.07,2.31
3,0.29,Premium,I,VS2,62.4,58.0,334,4.2,4.23,2.63
4,0.31,Good,I,SI2,63.3,58.0,335,4.34,4.35,2.75


In [42]:
df.groupby('color')['price'].max().to_frame("Highest Price")

Unnamed: 0_level_0,Highest Price
color,Unnamed: 1_level_1
E,327
I,335


## Chapter 7: Data Wrangling: Clean, Transform, Merge, Reshape

- Reshaping
    - Use `reshape` on a numpy array
    - Use `unstack` with hierarchical indexing
- Data Transformation
    - Removing duplicates
    - `map`
    - `apply`
- String Manipulation
    - `strip()`
    - `split()`
    - `join()`
    - `find()`, `replace()`
    - Regular expressions
- Handling Missing Values
    - Mean/median imputation
    - Hot-deck imputation
    - Turn missing values to an indicator
- Merging Data Sets
    - `merge()`
    - `concatenate()`
    - `vstack()`, `hstack()`

As a exericse, let's revisit the `ml-latest-small` data set from [MovieLens](https://grouplens.org/datasets/movielens/).

In [9]:
import os
import urllib.request
import zipfile

def get_movielens(file_path, file_name, delete_zip_file=False):
    url = "http://files.grouplens.org/datasets/movielens/" + file_name
    if not os.path.exists(file_path):
        os.mkdir(file_path)
    if not os.path.exists(file_path + file_name):
        urllib.request.urlretrieve(url, file_path + file_name)
        print("File", file_name, "downloaded.")
    with zipfile.ZipFile(file_path + file_name, "r") as f:
        f.extractall(file_path)
        print("Files extracted:")
        print(f.printdir())
    if delete_zip_file:
        os.remove(file_path + file_name)
        
get_movielens("Data/", "ml-latest-small.zip")

Files extracted:
File Name                                             Modified             Size
ml-latest-small/                               2018-09-26 15:50:12            0
ml-latest-small/links.csv                      2018-09-26 15:50:10       197979
ml-latest-small/tags.csv                       2018-09-26 15:49:40       118660
ml-latest-small/ratings.csv                    2018-09-26 15:49:38      2483723
ml-latest-small/README.txt                     2018-09-26 15:50:12         8342
ml-latest-small/movies.csv                     2018-09-26 15:49:56       494431
None


In [10]:
movies = pd.read_csv('Data/ml-latest-small/movies.csv')
movies.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [11]:
ratings = pd.read_csv('Data/ml-latest-small/ratings.csv')
ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931


In [16]:
# Ex: Merge movies.csv with ratings.csv

temp = pd.merge(ratings, movies, on="movieId", how="left")
temp.shape

(100836, 6)

In [17]:
temp = pd.merge(ratings, movies, on="movieId")
temp.shape

(100836, 6)

In [20]:
# Ex: Create a list of comedy movies

filter1 = movies['genres'].str.contains('Comedy')
movies[filter1]

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy
6,7,Sabrina (1995),Comedy|Romance
...,...,...,...
9732,193565,Gintama: The Movie (2010),Action|Animation|Comedy|Sci-Fi
9734,193571,Silver Spoon (2014),Comedy|Drama
9737,193581,Black Butler: Book of the Atlantic (2017),Action|Animation|Comedy|Fantasy
9738,193583,No Game No Life: Zero (2017),Animation|Comedy|Fantasy


## Chapter 9: Data Aggregation and Group Operations
- GroupBy Machanics
    - Grouping with columns
    - Iterating over groups
    - Grouping with dictionaries
    - Grouping with functions
- Data Aggregation
    - Built-in functions
    - User-defined functions

In [27]:
# Find the 10 movies with the largest amount of ratings.

# merge movies with ratings
combo = pd.merge(ratings, movies, on="movieId")
# combo.head()
num_ratings = combo.groupby('title').size().to_frame("Number of Ratings")

num_ratings.sort_values('Number of Ratings', ascending=False).head(10)

Unnamed: 0_level_0,Number of Ratings
title,Unnamed: 1_level_1
Forrest Gump (1994),329
"Shawshank Redemption, The (1994)",317
Pulp Fiction (1994),307
"Silence of the Lambs, The (1991)",279
"Matrix, The (1999)",278
Star Wars: Episode IV - A New Hope (1977),251
Jurassic Park (1993),238
Braveheart (1995),237
Terminator 2: Judgment Day (1991),224
Schindler's List (1993),220


In [31]:
# Find the 10 comedy movies with the highest average ratings.

combo_comedy = combo[combo['genres'].str.contains('Comedy')]

comedy_avg = combo_comedy.groupby('title')['rating'].mean().to_frame("Average Rating")

comedy_avg.sort_values('Average Rating', ascending=False)

Unnamed: 0_level_0,Average Rating
title,Unnamed: 1_level_1
George Carlin: Back in Town (1996),5.0
Hollywood Chainsaw Hookers (1988),5.0
Scooby-Doo! and the Loch Ness Monster (2004),5.0
Scooby-Doo! and the Samurai Sword (2009),5.0
What Men Talk About (2010),5.0
...,...
Hard Ticket to Hawaii (1987),0.5
The Emoji Movie (2017),0.5
Arthur Christmas (2011),0.5
"Follow Me, Boys! (1966)",0.5


In [38]:
# How to remove those movies with a small number of ratings 
# (because the average rating of these movies could be biased)

df = pd.merge(comedy_avg, num_ratings, how='left', left_index=True,
              right_index=True)
df = df.sort_values('Average Rating', ascending=False)
df[df['Number of Ratings'] > 20].head(10)

Unnamed: 0_level_0,Average Rating,Number of Ratings
title,Unnamed: 1_level_1,Unnamed: 2_level_1
"Philadelphia Story, The (1940)",4.310345,29
Harold and Maude (1971),4.288462,26
Dr. Strangelove or: How I Learned to Stop Worrying and Love the Bomb (1964),4.268041,97
"Princess Bride, The (1987)",4.232394,142
Pulp Fiction (1994),4.197068,307
"Amelie (Fabuleux destin d'Amélie Poulain, Le) (2001)",4.183333,120
Forrest Gump (1994),4.164134,329
Monty Python and the Holy Grail (1975),4.161765,136
In Bruges (2008),4.158537,41
Snatch (2000),4.155914,93


## Chapter 10: Time Series
- Converting Between Strings and `datetime`
- Handling Data with `datetime` Indexing
- Generating Date Ranges
- Shifting Data