### Things to analyze:
* Which genres have the most movies rated >= 7?
* Which genres are most profitable?
* Average duration / genre
* Thrillers and Mysteries produced / year after 2005

In [None]:
import pandas as pd
import numpy as np
from pandas import Series, DataFrame
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline


In [None]:
# Read the movie metadata CSV into a DataFrame and preview its first rows.
data = pd.read_csv(filepath_or_buffer='../input/movie_metadata.csv')
data.head(n=5)

In [None]:
# Derived column: profit is gross minus budget, computed row by row.
data['profit'] = data['gross'].sub(data['budget'])

#### Step 1: Finding all the columns

In [None]:
# List every column name available in the loaded dataset.
data.columns

#### Step 2: Selecting the columns that are relevant for analysis (movie title is kept as a regular column to identify each movie)

In [None]:
# Narrow the dataset to the columns the four queries need, then preview it.
dt = data[[
    'genres',
    'movie_title',
    'language',
    'title_year',
    'imdb_score',
    'duration',
    'profit',
]]
dt.head(n=5)

In [None]:
# (rows, columns) of the trimmed frame.
dt.shape

#### Step 3: Splitting the 'genres' column for analysis

In [None]:
# Turn the pipe-delimited 'genres' string into one row per (movie, genre).
# Series.explode repeats the original row label for every list element, so
# the result can be joined straight back onto the remaining columns. This
# replaces `.apply(Series, 1).stack()`, which built one Series per row and
# passed `1` positionally into Series.apply's deprecated `convert_dtype` slot.
s = dt['genres'].str.split('|').explode()
s.name = 'genres'
# Drop the combined column so the join does not collide on the name 'genres'.
del dt['genres']
# Left join on the (now repeated) row labels: a movie with k genres yields k rows.
df = dt.join(s)

In [None]:
# Preview the one-row-per-genre frame.
df.head(n=5)

In [None]:
# (rows, columns) after the genre split.
df.shape

In [None]:
# Distinct genre labels present after the split.
df['genres'].unique()

In [None]:
# Number of distinct genre labels; dropna=False counts a missing value as one
# label, matching the length of .unique().
df['genres'].nunique(dropna=False)

## Query 1: Which genres have the most movies rated >= 7?

#### Selecting movies with rating >= 7 

In [None]:
# Keep only the (movie, genre) rows whose IMDB score is at least 7.
df1 = df[df['imdb_score'].ge(7)]

In [None]:
# Preview the highly rated subset.
df1.head(n=5)

In [None]:
# Count distinct titles per genre among the highly rated rows, as a
# single-column frame ordered from the largest count down.
df2 = (
    df1.groupby('genres')['movie_title']
    .nunique()
    .to_frame()
    .sort_values(by='movie_title', ascending=False)
)

In [None]:
# Per-genre count of distinct titles rated >= 7, largest first.
df2

In [None]:
# Horizontal bar chart of the per-genre counts; the trailing semicolon
# suppresses the notebook's text echo of the returned Axes.
df2[['movie_title']].plot.barh(
    figsize=(8, 8),
    title='Genres with >= 7 ratings',
    stacked=True,
);

## Query 2: Which genres are most profitable?

In [None]:
# Columns needed for the profitability query.
df3 = df[[
    'movie_title',
    'profit',
    'genres',
]]

In [None]:
# Preview the title / profit / genre subset.
df3.head(n=5)

In [None]:
# Show the rows tagged 'News' for a manual look at their values
# (presumably to check whether profit is missing for this genre -- confirm).
df3[df3['genres'].eq('News')]

In [None]:
# Mean profit per genre. Only 'profit' is averaged: df3 also carries the
# string column 'movie_title', and GroupBy.mean() on pandas >= 2.0 raises a
# TypeError on non-numeric columns instead of silently dropping them.
# The double brackets keep the result a DataFrame with a 'profit' column.
df4 = df3.groupby('genres')[['profit']].mean()

In [None]:
# Re-express mean profit in millions; pop() removes the raw 'profit' column
# and hands it back for the division in a single step.
df4['profit_million'] = df4.pop('profit') / 1000000

In [None]:
# Order genres from most to least profitable.
df4 = df4.sort_values(by='profit_million', ascending=False)

In [None]:
# Horizontal bar chart of mean profit per genre; the trailing semicolon
# suppresses the notebook's text echo of the returned Axes.
df4[['profit_million']].plot.barh(
    figsize=(8, 8),
    title='Genres by profit (US$ million)',
    stacked=True,
);

## Query 3: Average duration / genre

In [None]:
# Columns needed for the duration query.
df5 = df[[
    'movie_title',
    'duration',
    'genres',
]]

In [None]:
# Preview the title / duration / genre subset.
df5.head(n=5)

In [None]:
# Mean duration per genre, rounded to 2 decimals and ordered longest first.
# Only 'duration' is averaged: df5 also carries the string column
# 'movie_title', and GroupBy.mean() on pandas >= 2.0 raises a TypeError on
# non-numeric columns instead of silently dropping them.
df6 = (
    df5.groupby('genres')['duration']
    .mean()
    .round(2)
    .to_frame('average_duration')
    .sort_values('average_duration', ascending=False)
)

In [None]:
# Mean duration per genre (rounded to 2 decimals), longest first.
df6

In [None]:
# Horizontal bar chart of average duration per genre; the trailing semicolon
# suppresses the notebook's text echo of the returned Axes.
df6[['average_duration']].plot.barh(
    figsize=(8, 8),
    title='Average Duration by Genre (minutes)',
    stacked=True,
);

## Query 4: Thrillers and Mysteries produced / year after 2005

In [None]:
# Columns needed for the per-year genre counts.
df7 = df[[
    'title_year',
    'genres',
]]

In [None]:
# Preview the year / genre subset.
df7.head(n=5)

In [None]:
# (rows, columns) of the year / genre subset before filtering.
df7.shape

#### Selecting movies, released after 2005

In [None]:
# Keep rows with a release year strictly later than 2005.
df8 = df7[df7['title_year'].gt(2005)]

In [None]:
# (rows, columns) remaining after the year filter.
df8.shape

### Selecting only thrillers

In [None]:
# Restrict the post-2005 rows to the 'Thriller' genre.
df9 = df8[df8['genres'].eq('Thriller')]

In [None]:
# (rows, columns) of the thriller-only subset.
df9.shape

In [None]:
# Count thriller rows per release year; the count lands in the 'genres' column.
df10 = df9.groupby('title_year').count()

In [None]:
# Thriller rows counted per release year (years after 2005).
df10

In [None]:
# Horizontal bar chart of thriller counts by year; the trailing semicolon
# suppresses the notebook's text echo of the returned Axes.
df10[['genres']].plot.barh(
    figsize=(8, 8),
    title='Thrillers Released (By Year)',
    stacked=True,
);

### Selecting only mysteries

In [None]:
# Restrict the post-2005 rows to the 'Mystery' genre.
df11 = df8[df8['genres'].eq('Mystery')]

In [None]:
# (rows, columns) of the mystery-only subset.
df11.shape

In [None]:
# Count mystery rows per release year; the count lands in the 'genres' column.
df12 = df11.groupby('title_year').count()

In [None]:
# Mystery rows counted per release year (years after 2005).
df12

In [None]:
# Horizontal bar chart of mystery counts by year. This plots df12 (the
# mystery counts); the earlier version re-plotted df10, the thriller counts,
# under the 'Mysteries' title. The trailing semicolon suppresses the
# notebook's text echo of the returned Axes.
df12[['genres']].plot.barh(stacked=True, title = 'Mysteries Released (By Year)', figsize=(8, 8));

##### *— Abhishek Chhibber*