In [None]:
# Run this cell to set up the notebook, but please don't change it.
import numpy as np
import math
import datascience
from datascience import *

# These lines set up the plotting functionality and formatting.
import matplotlib
%matplotlib inline
import matplotlib.pyplot as plots
plots.style.use('fivethirtyeight')
import warnings
warnings.simplefilter("ignore")

In [None]:
# Load the movie dataset from movies.csv. Later cells read its 'Title' and
# 'Genre' columns and the "water" / "feel" feature columns.
movies = Table.read_table('movies.csv')

In [None]:
# Index the rows of movies by title once; row_for_title reads from this index.
title_index = movies.index_by('Title')

def row_for_title(title):
    """Look up a movie by its title and return the first matching row.

    Gives the same row as the expression below, but faster because it
    reads from the prebuilt title_index instead of filtering the table:

    movies.where('Title', title).row(0)
    """
    matching_rows = title_index.get(title)
    return matching_rows[0]

row_for_title('toy story')

In [None]:
# Here we have defined the proportion of our data
# that we want to designate for training as 17/20ths
# of our total dataset.  3/20ths of the data is
# reserved for testing.
#
# The split is positional, not random: the first num_train rows of movies
# become the training set and the remaining rows become the test set.

training_proportion = 17/20

num_movies = movies.num_rows
# int() truncates, so a fractional row count is rounded down for training
# and the leftover rows go to the test set.
num_train = int(num_movies * training_proportion)
num_test = num_movies - num_train

train_movies = movies.take(np.arange(num_train))
test_movies = movies.take(np.arange(num_train, num_movies))

# Report both sizes; together they should add up to movies.num_rows.
print("Training: ",   train_movies.num_rows, ";",
      "Test: ",       test_movies.num_rows)

#### Use Horizontal Bar Chart to Visualize Proportion of Comedy in Training Set and Test Set

In [None]:
# Step 1 :
#  Create a function to return the proportion of comedy movies in a dataset
# Option 1
def comedy_proportion(table):
    """Compute the fraction of rows in table whose 'Genre' is 'comedy'."""
    comedy_rows = table.where('Genre', are.equal_to('comedy'))
    return comedy_rows.num_rows / table.num_rows

# Option 2
#def comedy_proportion(table):
#    """Return the proportion of movies in a table that have the comedy genre."""
#    total_movies = table.num_rows
#    comedy_movies = table.where('Genre', are.equal_to('comedy')).num_rows
#    return comedy_movies / total_movies

# Step 2:
#  Create a table with 2 columns (Dataset and Proportion of Comedy)
#  and draw one horizontal bar per dataset.
# Option 1
datasets = make_array('Training', 'Test')
# The proportions are listed in the same order as the dataset labels above.
prop_comedy = make_array(comedy_proportion(train_movies), comedy_proportion(test_movies))
Table().with_columns(
        'Dataset', datasets,
        'Proportion of Comedy', prop_comedy).barh('Dataset')

# Option 2
#train_comedy = comedy_proportion(train_movies)
#test_comedy = comedy_proportion(test_movies)
#comedy_prop_tbl = Table().with_columns(
#                    'Dataset', make_array('Training', 'Test'),
#                    'Proportion of Comedy', make_array(train_comedy, test_comedy)
#                    )
#comedy_prop_tbl.barh('Dataset')

#### Compute the distance between Monty Python and the Holy Grail and The Godfather, using the water and feel features only

In [None]:
# Fetch the rows of the two movies being compared.
python = row_for_title("monty python and the holy grail") 
godfather = row_for_title("the godfather")

# Option 1
# Euclidean distance using only the "water" (x) and "feel" (y) features:
#   ((x1 - x2)**2 + (y1 - y2)**2)**0.5
one_distance = ((python.item("water") - godfather.item("water"))**2 + (python.item("feel") - godfather.item("feel"))**2)**0.5

# Option 2 (same computation with named intermediate values)
#python_water = python.item("water")  # x1
#python_feel = python.item("feel")    # y1
#godfather_water = godfather.item("water")   # x2
#godfather_feel = godfather.item("feel")     # y2
#one_distance = ((python_water - godfather_water)**2 + (python_feel - godfather_feel)**2)**0.5

one_distance

#### Create a function that computes the distance between any two movies, using two features

In [None]:
# This function will take 2 movie names and 2 features for parameters.
#   It will call row_for_title() function to get the row object of a movie.
#   The row object contains the 5000 features of the movie.
#   row_for_title() will be called twice to get the row objects for the 2 movies.
#   Then the feature values can be retrieved from the row objects
#   for computing the distance of the 2 movies in the context of the 2 features.
def distance_two_features(title0, title1, x_feature, y_feature):
    """Return the Euclidean distance between the movies titled title0 and title1.

    Both rows are fetched with row_for_title. Only the values in the
    columns named x_feature and y_feature contribute to the distance.
    """
    first_movie = row_for_title(title0)
    second_movie = row_for_title(title1)
    x_gap = first_movie.item(x_feature) - second_movie.item(x_feature)
    y_gap = first_movie.item(y_feature) - second_movie.item(y_feature)
    return (x_gap**2 + y_gap**2)**0.5

#### Create a function that computes the distance between a movie and "monty python and the holy grail", using the "water" and "feel" features

In [None]:
# This function will take 1 movie title for parameter.
# It will call distance_two_features() by passing in a movie name, 
#   "monty python and the holy grail", "water" and "feel" as arguments.
# Then, return the distance between a movie and "monty python and the holy grail",
#   in context of "water" and "feel" features.
def distance_from_python(title):
    """Distance from the given movie to "monty python and the holy grail".

    Only the features "water" and "feel" are used, via distance_two_features.

    Argument:
      title: A string, the name of a movie.
    """
    reference_title = "monty python and the holy grail"
    return distance_two_features(title, reference_title, "water", "feel")

#### Find the names and genres of the 5 movies in the training set (train_movies table above) closest to "monty python and the holy grail", using "water" and "feel" features

In [None]:
# Option 1
# Step 1:
#  Select 4 columns from the train_movies table - "Title", "Genre", "water", "feel"
water_feel = train_movies.select("Title", "Genre", "water", "feel")

# Step 2:
#  Use the apply method to call distance_from_python() on every 'Title' value.
#  This returns an array of distances between "monty python and the holy grail"
#  and each movie in the train_movies table.
# NOTE(review): if "monty python and the holy grail" is itself a row of
#  train_movies, it will be its own nearest neighbour at distance 0 - confirm
#  whether it should be excluded.
distance_array = water_feel.apply(distance_from_python, 'Title')

# Step 3:
#  Append a new column holding distance_array to the water_feel table
water_feel_distance = water_feel.with_column('distance from python', distance_array)

# Step 4:
#  Sort water_feel_distance by column 'distance from python' in ascending order,
#  take the first 5 rows, and assign the result to close_movies
close_movies = water_feel_distance.sort('distance from python').take(np.arange(5))

# Option 2 (the same steps as one chained expression)
#water_feel = train_movies.select("Title", "Genre", "water", "feel")
#close_movies = water_feel.with_column('distance from python', water_feel.apply(distance_from_python, 'Title'))\
#                       .sort('distance from python')\
#                       .take(np.arange(5))

close_movies

#### Predict the genre of "monty python and the holy grail" based on the closest movies

In [None]:
# Create a function to look for the majority of genre in the close_movies table
# Step 1: group the table by class (genre in this case)
# Step 2: sort by count column in descending order
# Step 3: get the first item in the genre column

def most_common(label, table):
    """Find the value that occurs most often in one column of a table.

    Arguments:
      label: A string, the label of the column to inspect.
      table: A table.

    Returns the most frequent value in the label column. When several
    values are tied for most frequent, any one of them may be returned.
    """
    # One row per distinct value, with its frequency in the 'count' column.
    value_counts = table.group(label)
    # Highest count first, so the winner is the first entry.
    ranked = value_counts.sort('count', descending=True)
    return ranked.column(label).item(0)

In [None]:
# Predicted genre: the most frequent 'Genre' among the rows of close_movies.
most_common('Genre', close_movies)

In [None]:
# Just run this cell to define fast_distances.

def fast_distances(test_row, train_table):
    """Return an array of the distances between test_row and each row in train_table.

    Takes 2 arguments:
      test_row: A row of a table containing features of one
        test movie (e.g., test_my_features.row(0)).
      train_table: A table of features (for example, the whole
        table train_my_features).

    Returns a 1-D NumPy array holding one Euclidean distance per row of
    train_table, in row order. A tiny deterministic amount of noise
    (below 1e-10) is added to every distance so that exact ties are broken
    the same way on every call.

    Raises AssertionError if train_table has 50 or more columns, if test_row
    is a Table instead of a row, or if test_row and train_table do not have
    the same number of features.
    """
    assert train_table.num_columns < 50, "Make sure you're not using all the features of the movies table."
    assert not isinstance(test_row, datascience.tables.Table), "Make sure you are passing in a row object to fast_distances."
    # Compare against num_columns rather than row(0) so an empty train_table
    # does not fail before the length check can run.
    assert len(test_row) == train_table.num_columns, "Make sure the length of test row is the same as the length of a row in train_table."

    # One training movie per row, one feature per column. A plain ndarray is
    # used instead of np.matrix so the row-wise reduction below always yields
    # a 1-D array, even when train_table has a single row.
    train_features = np.array(train_table.columns).transpose()
    test_features = np.array(list(test_row))

    # Broadcasting subtracts every training row from the test movie at once.
    diff = test_features - train_features
    distances = np.sqrt(np.square(diff).sum(axis=1))

    # The tie-breaking noise comes from a private generator seeded with 0, so
    # the result is reproducible without resetting NumPy's global random
    # state as a side effect of calling this function.
    tie_breaker = np.random.RandomState(0)
    eps = tie_breaker.uniform(size=distances.shape) * 1e-10
    return distances + eps

#### Make an array of at least 10 common words that you think might let you distinguish between comedy and thriller movies and assign the array to my_features

In [None]:
# Starter feature list. "water" and "feel" are the two feature columns this
# notebook already uses for its distance examples.
# TODO: extend this array to at least 10 words that you think distinguish
# comedy from thriller movies; every word must be a column label of movies.
my_features = make_array("water", "feel")


# Keep only the chosen feature columns in each split.
train_my_features = train_movies.select(my_features)
test_my_features = test_movies.select(my_features)

#### How to use fast_distances() function

The fast_distances() function:
1. takes 2 arguments - a row object from the test_my_features table and the train_my_features table
2. computes the distance between that row and each row of the train_my_features table
3. returns an array of the resulting distances

In [None]:
# Example of using fast_distances() function:
# distances from the first test movie (row 0 of test_my_features)
# to every row of train_my_features.

distances = fast_distances(test_my_features.row(0), train_my_features)
distances