# Plots by Sebastian
## Plotting genres' share of movies per year

### Importing libraries

In [45]:
import matplotlib.pyplot as plt 
from data_gen import gen_df
import numpy as np
import ipywidgets as widgets
import requests
from bs4 import BeautifulSoup as bs
import pandas as pd

### Generating dataframe from data-file

In [46]:
# Data file handed to the project's dataframe generator
filename = 'imdb.csv'

# Build the frame and cast the release year to integers in one step
df = gen_df(filename).astype({'year': int})
df.head()

Unnamed: 0,index,tid,title,imdbRating,ratingCount,duration,year,nrOfWins,nrOfNominations,nrOfPhotos,...,RealityTV,Romance,SciFi,Short,Sport,TalkShow,Thriller,War,Western,decade
0,2200,tt0011565,The Penalty (1920),7.6,1095.0,1.5,1920,0.0,0.0,2.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1920s
1,502,tt0011841,MÌ_dchenlos (1920),8.1,3134.0,2.416667,1920,0.0,0.0,18.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1920s
2,12832,tt0011865,Irrwege einer Ehe (1920),8.2,1042.0,1.5,1920,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1920s
3,12148,tt0010323,Das Cabinet des Dr. Caligari (1920),8.1,29379.0,1.3,1920,0.0,0.0,40.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1920s
4,12368,tt0011071,Buster Keaton als StrÌ_fling (1920),7.1,1474.0,0.333333,1920,0.0,0.0,2.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1920s


### Plotting genres as a share of total movies in each year
These are generated as:
$$
\text{Genre share} = \frac{\text{Total movies in genre in year}}{\text{Total movies in year}}
$$

In [49]:
# Genre columns are picked by position (columns 13 up to, not including, 41);
# NOTE(review): this assumes gen_df always yields the same column order -- confirm.
genre_list = df.columns[13:41].tolist()

In [50]:
def _plot_1(df,genre):
    
    df['count'] = 1
    df['movies_year'] = df.groupby('year')['count'].transform(lambda x: x.sum())  

    # Plot year sum of different genres
    for i in genre:
    #i = genre

        df[f'{i}_year'] = df.groupby('year')[i].transform(lambda x: x.sum())
        df[f'{i}_share'] = df[f'{i}_year']/df['movies_year']

        y_share = df.groupby('year')[f'{i}_share'].first()

        #ax = plt.subplot(2,2,j)
        y_share.plot(kind='line', sharex='col', sharey='row')

        plt.xlabel('year')
        plt.ylabel('share of movies')
        plt.title(i)
        
    #plt.show()
    
def plot_1(df):
    """Wire a multi-select genre widget to ``_plot_1`` for the given frame.

    ``df`` is passed through unchanged on every redraw; only the genre
    selection is exposed as an interactive control.
    """
    genre_selector = widgets.SelectMultiple(
        options=genre_list,
        description='Genres',
        disabled=False,
    )
    widgets.interact(_plot_1, df=widgets.fixed(df), genre=genre_selector)


plot_1(df)


interactive(children=(SelectMultiple(description='Genres', options=('Action', 'Adult', 'Adventure', 'Animation…

## Listing top/bottom 10 movies and finding their original titles from IMDb
To do this we sort the dataset on imdbRating and select the top 10 and bottom 10.
From these we pull out the ids and use them to find the original titles on IMDb. This is done using the requests module, which accesses the IMDb website and finds the title.

In [53]:

# Condition on minimum number of ratings
I = df['ratingCount'] >= 10000

# Filter first, then sort: indexing an already-sorted frame with the unsorted
# boolean mask makes pandas warn that the key will be reindexed.
rated = df[I]
top = rated.sort_values('imdbRating', ascending=False)
bottom = rated.sort_values('imdbRating', ascending=True)

# IMDb ids of the 10 highest and 10 lowest rated movies
top_id = top['tid'].head(10).tolist()
bottom_id = bottom['tid'].head(10).tolist()
id_list = top_id + bottom_id

# Look up every id on IMDb, preferring the page's original title when present
eng_names = []

for tid in id_list:
    response = requests.get('https://www.imdb.com/title/' + tid, timeout=10)
    # Fail loudly on an HTTP error instead of scraping an error page as a title
    response.raise_for_status()
    soup = bs(response.text, 'html.parser')

    title_tag = soup.find('div', class_='originalTitle')
    if title_tag is None:
        # No separate original title on the page: fall back to the main heading
        title_tag = soup.find('h1')
    if title_tag is None:
        raise ValueError(f'No title found on the IMDb page for {tid}')

    eng_names.append(title_tag.text)



  """
  


In [58]:
# Pair every scraped title with the IMDb id it was fetched for
df_merge = pd.DataFrame({'eng_title': eng_names, 'tid': id_list})

# Keep only the looked-up movies, best rated first, with a fresh 0..n-1 index
top_bottom = (
    df.merge(df_merge, how='right', on='tid')
    .sort_values('imdbRating', ascending=False)
    .loc[:, ['title', 'eng_title', 'imdbRating']]
    .reset_index(drop=True)
)

# Show the ten best rated movies
top_bottom.head(10)

Unnamed: 0,title,eng_title,imdbRating
0,Die Verurteilten (1994),The Shawshank Redemption (original title),9.3
1,Der Pate (1972),The Godfather (original title),9.2
2,Der Pate 2 (1974),The Godfather: Part II (original title),9.1
3,Zwei glorreiche Halunken (1966),"Il buono, il brutto, il cattivo (original title)",9.0
4,The Godfather Trilogy: 1901-1980 (Video 1992),The Godfather Trilogy: 1901-1980 (1992),9.0
5,Pulp Fiction (1994),Pulp Fiction (1994),9.0
6,The Dark Knight (2008),The Dark Knight (2008),9.0
7,Fight Club (1999),Fight Club (1999),8.9
8,Der Herr der Ringe - Die GefÌ_hrten (2001),The Lord of the Rings: The Fellowship of the R...,8.9
9,Der Herr der Ringe - Die RÌ_ckkehr des KÌ¦nigs...,The Lord of the Rings: The Return of the King ...,8.9


In [72]:
# Titles and ratings, already ordered from best to worst rated
name_list = top_bottom.eng_title.tolist()
rating_list = top_bottom.imdbRating.tolist()

# One line per movie for the first ten entries, numbered from 1
ranked = zip(name_list[:10], rating_list[:10])
lines = [
    f'{name:6} is number {rank} with a rating of {rating}'
    for rank, (name, rating) in enumerate(ranked, start=1)
]
text = '\n'.join(['The top movies are'] + lines)
print(text)
    


The top movies are
The Shawshank Redemption (original title) is number 1 with a rating of 9.3
The Godfather (original title) is number 2 with a rating of 9.2
The Godfather: Part II (original title) is number 3 with a rating of 9.1
Il buono, il brutto, il cattivo (original title) is number 4 with a rating of 9.0
The Godfather Trilogy: 1901-1980 (1992)  is number 5 with a rating of 9.0
Pulp Fiction (1994)  is number 6 with a rating of 9.0
The Dark Knight (2008)  is number 7 with a rating of 9.0
Fight Club (1999)  is number 8 with a rating of 8.9
The Lord of the Rings: The Fellowship of the Ring (original title) is number 9 with a rating of 8.9
The Lord of the Rings: The Return of the King (original title) is number 10 with a rating of 8.9


In [17]:
# Two tiny frames whose key columns only partly overlap
d = dict(col1=[1, 2, 3])
e = dict(col2=[1, 2])

test_1 = pd.DataFrame(d)
test_2 = pd.DataFrame(e)

# A left merge keeps every row of test_1; keys missing from test_2 become NaN
pd.merge(test_1, test_2, how='left', left_on='col1', right_on='col2')

Unnamed: 0,col1,col2
0,1,1.0
1,2,2.0
2,3,
