# STAT29000 Project 4 Examples

In [1]:
# Important Note
# Make sure that the 'media' folder/package that was provided to you is
# in the same location as this examples notebook.

from media.rottentomatoes import datasets as ds
from media.rottentomatoes.utilities import search

# Every function can carry a docstring; printing it is a quick way to see
# what `search` is meant to do.
print(search.__doc__)

Successfully loaded the media library.

    Return dataframe where all of the provided words 
    are found in the movie title in the pre-loaded movies 
    dataset. Case insensitive.
    


In [4]:
# Let's write a function that takes a pandas Series and returns any potential websites in the Series.
# Let's say our method involves splitting each entry on ".". Note that this isn't a good method.
# We just want to demonstrate how functions can contain other functions.
def get_websites(series):
    """
    Return the entries of a pandas Series that might be websites.

    An entry is kept when it is a string and splitting it on "." yields
    more than one piece, i.e. when it contains at least one ".". This is a
    deliberately crude heuristic (it also keeps things like "I.E. Weekly").

    Args:
        series: A pandas Series (anything whose .items() yields
            (index, value) pairs works). Values that are not strings,
            such as NaN or None, are skipped.

    Returns:
        list: The matching entries, in their original order. Duplicates
        are kept.

    About docstrings
    ----------------
    This is called a docstring. It's highly recommended that you write
    docstrings for your functions to help describe what the function does.

    You can access the docstring of a function with print(function.__doc__).

    Sometimes docstrings are used to automatically create great documentation
    websites. Take a look:

    https://github.com/scikit-learn/scikit-learn/blob/master/sklearn/feature_selection/_from_model.py

    and the associated documentation

    https://scikit-learn.org/stable/modules/generated/sklearn.feature_selection.SelectFromModel.html

    Google even published what it believes the "best" docstring style is:

    https://google.github.io/styleguide/pyguide.html
    """

    # Sometimes an internal "helper" function can be useful and make a function easier to read
    # ... I'm not claiming this is the case here, but it could be useful to see one in action

    # when a function starts with a single leading _, it is an "internal use" indicator,
    # and if you import functions from a module using the *, these functions aren't imported.
    def _my_helper_function(publication_name: str):
        return publication_name.split(".")

    # Here is another use of the _ character. Normally, if we loop through a pandas
    # Series using the items() method, we get both an index AND the element. For example,
    # the loop commented out below never uses the index (idx).
    #
    #     for idx, item in series.items():
    #         our_list = _my_helper_function(item)
    #         if len(our_list) > 1:
    #             print(item)
    #
    # Below, the _ behaves just like (idx) does in the code above (you can even print(_)
    # in the loop if you wanted to). The only difference is that it is a clear indicator
    # to the reader that whatever is usually there, we don't really use.
    #
    # Note: older examples use series.iteritems(); that method was removed in
    # pandas 2.0, while items() works on both old and new versions.
    list_to_return = []
    for _, item in series.items():
        # Missing values (NaN/None) are not strings and have no .split(),
        # so they can never be websites -- skip them instead of crashing.
        if not isinstance(item, str):
            continue
        our_list = _my_helper_function(item)
        if len(our_list) > 1:
            list_to_return.append(item)

    return list_to_return

# Apply the heuristic to the 'critic_publication' column of the reviews
# dataset and peek at the first ten matches.
websites = get_websites(ds.reviews['critic_publication'])
print(websites[:10])

['Cinemaphile.org', 'ComingSoon.net', 'Movies.com', 'TheMovieReport.com', 'rec.arts.movies.reviews', 'I.E. Weekly', 'MovieCrypt.com', 'jackiekcooper.com', 'smh.com.au', 'Movies.com']


In [5]:
import string

# string.punctuation is a ready-made str holding the ASCII punctuation characters
print(string.punctuation)

# a str is iterable, so it unpacks straight into a list of single characters
print([*string.punctuation])

!"#$%&'()*+,-./:;<=>?@[\]^_`{|}~
['!', '"', '#', '$', '%', '&', "'", '(', ')', '*', '+', ',', '-', '.', '/', ':', ';', '<', '=', '>', '?', '@', '[', '\\', ']', '^', '_', '`', '{', '|', '}', '~']


In [21]:
# zip pairs up the elements of several iterables, position by position
# https://docs.python.org/3/library/functions.html#zip

# strings are iterable, so they can be zipped character by character
print(list(zip('999', '123')))

example = [int(left) * int(right) for left, right in zip('999', '123')]

print(example)

[('9', '1'), ('9', '2'), ('9', '3')]
[9, 18, 27]


In [11]:
# Here is an example using the `search` function that was provided to you.
from media.rottentomatoes.utilities import search

# .head(2) limits the printout to the first two matching rows.
print(search("harry").head(2))

# Here is an example using the updated `get_reviews` function provided to you.
from media.rottentomatoes.reviews import get_reviews

# The result is unpacked into two values; only the first is used, so the
# second is bound to the throwaway name `_`.
# NOTE(review): the meaning of the second argument (50) and of the discarded
# value is not visible in this notebook -- check get_reviews.__doc__.
my_corpus, _ = get_reviews("harry_potter_and_the_order_of_the_phoenix", 50)
my_document = my_corpus[0]  # the first element of the corpus

print(my_corpus[:10])
print(my_document)

                                       rt_id  \
5240                             dirty_harry   
5510  dumb_and_dumberer_when_harry_met_lloyd   

                                  movie_title  \
5240                              Dirty Harry   
5510  Dumb and Dumberer: When Harry Met Lloyd   

                                             movie_info  \
5240  "You've got to ask yourself a question: 'do I ...   
5510  Set in 1986, when mentally challenged best fri...   

                                      critics_consensus  \
5240  As tough and taciturn as its no-nonsense hero,...   
5510  This lame prequel induces more groans than lau...   

                                       poster_image_url rating  \
5240  https://resizing.flixster.com/PoID1a1hMYzCTrnc...      R   
5510  https://resizing.flixster.com/VNT_d4bM66IkCo4N...  PG-13   

                                    genre    directors  \
5240  Classics, Drama, Mystery & Suspense   Don Siegel   
5510                               C

In [13]:
# Don't forget enumerate: it hands back (position, element) pairs while you loop.
for index, value in enumerate(string.ascii_lowercase):
    # even positions are printed as-is, odd positions are upper-cased
    print(value if index % 2 == 0 else value.upper())

# str.join is another one to remember: it glues an iterable of strings
# together, placing the separator between neighbouring elements.
print('-'.join(string.ascii_lowercase))
print('-'.join(
    char.lower() if position % 2 else char.upper()
    for position, char in enumerate(string.ascii_lowercase)
))

a
B
c
D
e
F
g
H
i
J
k
L
m
N
o
P
q
R
s
T
u
V
w
X
y
Z
a-b-c-d-e-f-g-h-i-j-k-l-m-n-o-p-q-r-s-t-u-v-w-x-y-z
A-b-C-d-E-f-G-h-I-j-K-l-M-n-O-p-Q-r-S-t-U-v-W-x-Y-z


In [27]:
document = "This is an example of a document. blah blah blah"

# A tour of popular str methods; sep="\n" puts each result on its own line.
print(
    document.upper(),
    document.lower(),
    document.split(),        # no argument: split on whitespace
    document.split("of"),    # you can split on anything
    document.count("blah"),
    sep="\n",
)

# Methods can be "chained": each call works on the result of the previous one.
print(
    document.split().count("blah"),
    document.upper().split(),
    sep="\n",
)

THIS IS AN EXAMPLE OF A DOCUMENT. BLAH BLAH BLAH
this is an example of a document. blah blah blah
['This', 'is', 'an', 'example', 'of', 'a', 'document.', 'blah', 'blah', 'blah']
['This is an example ', ' a document. blah blah blah']
3
3
['THIS', 'IS', 'AN', 'EXAMPLE', 'OF', 'A', 'DOCUMENT.', 'BLAH', 'BLAH', 'BLAH']
