### Prepping Data Challenge:  Trilogy (week 38)

### Requirements
- Input the data
- Split out the Number in Series field into Film Order and Total Films in Series
- Work out the average rating for each trilogy
- Work out the highest ranking for each trilogy
- Rank the trilogies based on the average rating and use the highest ranking metric to break ties (make sure you haven't rounded the numeric fields yet!)
- Remove the word trilogy from the Trilogy field
- Bring the 2 datasets together by the ranking fields
- Output the data

In [1]:
import pandas as pd
import numpy as np

In [2]:
#Input the data
with pd.ExcelFile('WK38-input.xlsx') as xl:
    top = pd.read_excel(xl,'Top 30 Trilogies')
    film = pd.read_excel(xl, 'Films')

In [3]:
top.head()

Unnamed: 0,Trilogy Ranking,Trilogy
0,1,Lord of the Rings trilogy
1,2,The Godfather trilogy
2,4,Star Wars trilogy
3,3,The Dark Knight trilogy
4,5,Dollars trilogy


In [4]:
film.head(7)

Unnamed: 0,Number in Series,Trilogy Grouping,Title,Rating
0,2/3,06d49632c9dc9bcb62aeaef99612ba6b,The Lord of the Rings: The Two Towers,8.7
1,1/3,06d49632c9dc9bcb62aeaef99612ba6b,The Lord of the Rings: The Fellowship of the Ring,8.8
2,3/3,06d49632c9dc9bcb62aeaef99612ba6b,The Lord of the Rings: The Return of the King,8.9
3,3/3,08985faab9f27113eef8adfc2200ac27,Babel,7.4
4,2/3,08985faab9f27113eef8adfc2200ac27,21 Grams,7.6
5,1/3,08985faab9f27113eef8adfc2200ac27,Love's a Bitch,8.1
6,1/9,13a893bc3f0877d224af0d73de3f0359,Star Wars: Episode I - The Phantom Menace,6.5


In [5]:
#Split out the Number in Series field into Film Order and Total Films in Series
film[['Film Order','Total Films']] = film['Number in Series'].str.split('/', expand=True).astype(int)

In [6]:
#Work out the average rating for each trilogy
film['Trilogy Average'] = film.groupby('Trilogy Grouping')['Rating'].transform('mean')

In [7]:
#Work out the highest ranking for each trilogy
film['Highest rank'] = film.groupby('Trilogy Grouping')['Rating'].transform('max')

In [8]:
#Rank the trilogies based on the average rating and use the highest ranking metric to break ties 
#(make sure you haven't rounded the numeric fields yet!)
film['Trilogy Ranking'] = film.sort_values(['Trilogy Average','Highest rank'], ascending=False)\
                              .groupby(['Trilogy Average','Highest rank'], sort=False)\
                              .ngroup() + 1

In [9]:
film['Trilogy Average'] = film['Trilogy Average'].round(1)

In [10]:
#Remove the word trilogy from the Trilogy field
top['Trilogy'] = top['Trilogy'].str.replace(('trilogy'),'')

In [11]:
#Bring the 2 datasets together by the ranking fields
output = film.merge(top, on = 'Trilogy Ranking', how= 'left').sort_values('Trilogy Ranking')

In [12]:
output = output[['Trilogy Ranking','Trilogy','Trilogy Average','Film Order','Title','Rating','Total Films']]

In [13]:
output.head(10)

Unnamed: 0,Trilogy Ranking,Trilogy,Trilogy Average,Film Order,Title,Rating,Total Films
0,1,Lord of the Rings,8.8,2,The Lord of the Rings: The Two Towers,8.7,3
1,1,Lord of the Rings,8.8,1,The Lord of the Rings: The Fellowship of the Ring,8.8,3
2,1,Lord of the Rings,8.8,3,The Lord of the Rings: The Return of the King,8.9,3
33,2,The Godfather,8.6,3,The Godfather Part III,7.6,3
34,2,The Godfather,8.6,2,The Godfather: Part II,9.0,3
35,2,The Godfather,8.6,1,The Godfather,9.2,3
74,3,The Dark Knight,8.5,2,The Dark Knight,9.0,3
73,3,The Dark Knight,8.5,3,The Dark Knight Rises,8.4,3
72,3,The Dark Knight,8.5,1,Batman Begins,8.2,3
17,4,Star Wars,8.5,5,Star Wars: Episode V - The Empire Strikes Back,8.7,9


In [14]:
#output the data
output.to_csv('wk38-output.csv', index=False)