In [2]:
import pandas as pd
import numpy as np

In [3]:
movies = pd.read_csv('imdb-top-1000.csv')

In [4]:
movies.head()

Unnamed: 0,Series_Title,Released_Year,Runtime,Genre,IMDB_Rating,Director,Star1,No_of_Votes,Gross,Metascore
0,The Shawshank Redemption,1994,142,Drama,9.3,Frank Darabont,Tim Robbins,2343110,28341469.0,80.0
1,The Godfather,1972,175,Crime,9.2,Francis Ford Coppola,Marlon Brando,1620367,134966411.0,100.0
2,The Dark Knight,2008,152,Action,9.0,Christopher Nolan,Christian Bale,2303232,534858444.0,84.0
3,The Godfather: Part II,1974,202,Crime,9.0,Francis Ford Coppola,Al Pacino,1129952,57300000.0,90.0
4,12 Angry Men,1957,96,Crime,9.0,Sidney Lumet,Henry Fonda,689845,4360000.0,96.0


In [5]:
genres = movies.groupby('Genre')

In [6]:
# Applying builtin aggregation fuctions on groupby objects
genres.std()

ValueError: could not convert string to float: 'The Shawshank Redemption'

In [None]:
# find the top 3 genres by total earning
movies.groupby('Genre').sum()['Gross'].sort_values(ascending=False).head(3)

In [None]:
movies.groupby('Genre')['Gross'].sum().sort_values(ascending=False).head(3)

In [None]:
# find the genre with highest avg IMDB rating
movies.groupby('Genre')['IMDB_Rating'].mean().sort_values(ascending=False).head(1)

In [None]:
# find director with most popularity
movies.groupby('Director')['No_of_Votes'].sum().sort_values(ascending=False).head(1)

In [None]:
# find the highest rated movie of each genre
# movies.groupby('Genre')['IMDB_Rating'].max()

In [None]:
# find number of movies done by each actor
# movies['Star1'].value_counts()

movies.groupby('Star1')['Series_Title'].count().sort_values(ascending=False)

In [None]:
# GroupBy Attributes and Methods
# find total number of groups -> len
# find items in each group -> size
# first()/last() -> nth item
# get_group -> vs filtering
# groups
# describe
# sample
# nunique

In [None]:
len(movies.groupby('Genre'))

In [None]:
movies['Genre'].nunique()

In [None]:
movies.groupby('Genre').size()

In [None]:
genres = movies.groupby('Genre')
# genres.first()
# genres.last()
genres.nth(6)

In [None]:
movies['Genre'].value_counts()

In [None]:
genres.get_group('Fantasy')

movies[movies['Genre'] == 'Fantasy']

In [None]:
genres.groups

In [None]:
genres.describe()

In [None]:
genres.sample(2, replace=True)

In [None]:
genres.nunique()

In [None]:
# agg method
# passing dict
genres.agg(
    {
        'Runtime': 'mean',
        'IMDB_Rating': 'mean',
        'No_of_Votes': 'sum',
        'Gross': 'sum',
        'Metascore': 'min'
    }
)

In [None]:
# passing list
genres.agg(['min', 'max', 'mean', 'sum'])

In [None]:
# Adding both the syntax
genres.agg(
    {
        'Runtime': ['min', 'mean'],
        'IMDB_Rating': 'mean',
        'No_of_Votes': ['sum', 'max'],
        'Gross': 'sum',
        'Metascore': 'min'
    }
)

In [None]:
# looping on groups
df = pd.DataFrame(columns=movies.columns)
for group, data in genres:
    df = df.append(data[data['IMDB_Rating'] == data['IMDB_Rating'].max()])

df

In [None]:
# split (apply) combine
# apply -> builtin function

genres.apply(min)

In [None]:
# find number of movies starting with A for each group

def foo(group):
    return group['Series_Title'].str.startswith('A').sum()


In [None]:
genres.apply(foo)

In [None]:
# find ranking of each movie in the group according to IMDB score

def rank_movie(group):
    group['genre_rank'] = group['IMDB_Rating'].rank(ascending=False)
    return group

In [None]:
genres.apply(rank_movie)

In [None]:
# find normalized IMDB rating group wise

def normal(group):
    group['norm_rating'] = (group['IMDB_Rating'] - group['IMDB_Rating'].min()) / (
                group['IMDB_Rating'].max() - group['IMDB_Rating'].min())
    return group


genres.apply(normal)

In [None]:
# groupby on multiple cols
duo = movies.groupby(['Director', 'Star1'])
duo
# size
duo.size()
# get_group
duo.get_group(('Aamir Khan', 'Amole Gupte'))

In [None]:
# find the most earning actor->director combo
duo['Gross'].sum().sort_values(ascending=False).head(1)

In [7]:
# find the best(in-terms of metascore(avg)) actor->genre combo
movies.groupby(['Star1', 'Genre'])['Metascore'].mean().reset_index().sort_values('Metascore', ascending=False).head(1)

Unnamed: 0,Star1,Genre,Metascore
230,Ellar Coltrane,Drama,100.0


In [8]:
# agg on multiple groupby
duo.agg(['min', 'max', 'mean'])

NameError: name 'duo' is not defined

### Excercise

In [9]:
ipl = pd.read_csv('/content/deliveries.csv')
ipl.head()

FileNotFoundError: [Errno 2] No such file or directory: '/content/deliveries.csv'

In [None]:
ipl.shape

In [10]:
# find the top 10 batsman in terms of runs
ipl.groupby('batsman')['batsman_runs'].sum().sort_values(ascending=False).head(10)

NameError: name 'ipl' is not defined

In [11]:
# find the batsman with max no of sixes
six = ipl[ipl['batsman_runs'] == 6]

six.groupby('batsman')['batsman'].count().sort_values(ascending=False).head(1).index[0]

NameError: name 'ipl' is not defined

In [12]:
# find batsman with most number of 4's and 6's in last 5 overs
temp_df = ipl[ipl['over'] > 15]
temp_df = temp_df[(temp_df['batsman_runs'] == 4) | (temp_df['batsman_runs'] == 6)]
temp_df.groupby('batsman')['batsman'].count().sort_values(ascending=False).head(1).index[0]

NameError: name 'ipl' is not defined

In [13]:
# find V Kohli's record against all teams
temp_df = ipl[ipl['batsman'] == 'V Kohli']

temp_df.groupby('bowling_team')['batsman_runs'].sum().reset_index()

NameError: name 'ipl' is not defined

In [14]:
# Create a function that can return the highest score of any batsman
temp_df = ipl[ipl['batsman'] == 'V Kohli']
temp_df.groupby('match_id')['batsman_runs'].sum().sort_values(ascending=False).head(1).values[0]

NameError: name 'ipl' is not defined

In [15]:
def highest(batsman):
    temp_df = ipl[ipl['batsman'] == batsman]
    return temp_df.groupby('match_id')['batsman_runs'].sum().sort_values(ascending=False).head(1).values[0]


In [16]:
highest('DA Warner')

NameError: name 'ipl' is not defined