#### 1. Import libraries and load dataset

In [1]:
import pandas as pd
dataset_path = 'IMDB-Movie-Data.csv'

# Default load: rows are labelled by pandas' positional integer index.
data = pd.read_csv(filepath_or_buffer=dataset_path)

# Second load of the same file, this time using the "Title" column as the
# row index. Kept as a separate frame for the later analysis steps.
data_indexed = pd.read_csv(filepath_or_buffer=dataset_path, index_col="Title")

In [None]:
# Show the frame that was loaded with "Title" as its row index.
data_indexed

#### 2. View the data

In [None]:
# Preview the first five rows of the dataset.
data.head(n=5)

#### 3. Understand some basic information about the data

In [None]:
# Print a concise summary of the frame: columns, non-null counts and dtypes.
data.info()

In [None]:
# Descriptive statistics (count, mean, std, min, quartiles, max) per column;
# with default arguments pandas reports the numeric columns of a mixed frame.
data.describe()

#### 4. Data Selection – Indexing and Slicing data

In [None]:
# Selecting a single column label yields a one-dimensional Series.
genre = data.loc[:, 'Genre']
genre

In [None]:
# Selecting with a list of labels (even a single one) yields a DataFrame.
genre = data.loc[:, ['Genre']]
genre

In [25]:
# Keep only the listed columns, in the order given.
columns = data.loc[:, ['Title', 'Genre', 'Actors', 'Director', 'Rating']]
columns

Unnamed: 0,Title,Genre,Actors,Director,Rating
0,Guardians of the Galaxy,"Action,Adventure,Sci-Fi","Chris Pratt, Vin Diesel, Bradley Cooper, Zoe S...",James Gunn,8.1
1,Prometheus,"Adventure,Mystery,Sci-Fi","Noomi Rapace, Logan Marshall-Green, Michael Fa...",Ridley Scott,7.0
2,Split,"Horror,Thriller","James McAvoy, Anya Taylor-Joy, Haley Lu Richar...",M. Night Shyamalan,7.3
3,Sing,"Animation,Comedy,Family","Matthew McConaughey,Reese Witherspoon, Seth Ma...",Christophe Lourdelet,7.2
4,Suicide Squad,"Action,Adventure,Fantasy","Will Smith, Jared Leto, Margot Robbie, Viola D...",David Ayer,6.2
...,...,...,...,...,...
995,Secret in Their Eyes,"Crime,Drama,Mystery","Chiwetel Ejiofor, Nicole Kidman, Julia Roberts...",Billy Ray,6.2
996,Hostel: Part II,Horror,"Lauren German, Heather Matarazzo, Bijou Philli...",Eli Roth,5.5
997,Step Up 2: The Streets,"Drama,Music,Romance","Robert Hoffman, Briana Evigan, Cassie Ventura,...",Jon M. Chu,6.2
998,Search Party,"Adventure,Comedy","Adam Pally, T.J. Miller, Thomas Middleditch,Sh...",Scot Armstrong,5.6


In [None]:
# Rows at positions 10 through 14, restricted to three columns.
data[['Title', 'Rating', 'Revenue (Millions)']].iloc[10:15]

#### 5. Data Selection – Based on Conditional filtering

In [None]:
# Movies released from 2010 to 2015 (both inclusive) that are rated below 6.0
# yet earned more than the 95th-percentile revenue of the whole dataset.
data.loc[
    data['Year'].between(2010, 2015)
    & data['Rating'].lt(6.0)
    & data['Revenue (Millions)'].gt(data['Revenue (Millions)'].quantile(0.95))
]

#### 6. Groupby operations

In [None]:
# Average rating per director; show the first five groups.
data.groupby(by='Director')[['Rating']].mean().head(n=5)

#### 7. Sorting operations

In [None]:
# Average rating per director, then the five highest averages.
x = data.groupby(by='Director')[['Rating']].mean()
x.sort_values(by='Rating', ascending=False).head(n=5)

#### 8. View missing values

In [None]:
# Number of missing values in each column.
data.isnull().sum(axis=0)

#### 9. Dealing with missing values - Deleting

In [None]:
# Preview the data without the 'Metascore' column. drop() returns a new
# frame, so `data` itself still has the column afterwards.
data.drop(columns=['Metascore']).head(n=5)

In [None]:
# Drop every row that has at least one missing value. The result is a new
# frame; `data` is left unchanged.
data.dropna(axis=0, how='any')

#### 10. Dealing with missing values - Filling

In [None]:
# Mean revenue over the rows that have a value (Series.mean skips NaN by default).
revenue_mean = data_indexed['Revenue (Millions)'].mean()
print("The mean revenue is: ", revenue_mean)

# Fill the missing revenues with that mean. The filled Series is assigned back
# to the column rather than calling fillna(..., inplace=True) on the selection:
# the in-place form operates on an intermediate object (chained assignment),
# which raises a FutureWarning in recent pandas 2.x and leaves `data_indexed`
# unchanged once Copy-on-Write is enabled.
data_indexed['Revenue (Millions)'] = data_indexed['Revenue (Millions)'].fillna(revenue_mean)

#### 11. Applying functions

In [23]:
# Classify movies based on ratings
def rating_group(rating):
    """Map a numeric rating to a coarse quality label.

    Returns 'Good' for ratings of 7.5 and above, 'Average' for ratings from
    6.0 up to (but not including) 7.5, and 'Bad' for everything else. A value
    for which both comparisons are false (for example NaN) is labelled 'Bad'.
    """
    # Thresholds are checked from highest to lowest; the first match wins.
    if rating >= 7.5:
        return 'Good'
    if rating >= 6.0:
        return 'Average'
    return 'Bad'


# Label each movie by passing its rating through rating_group element-wise,
# and store the labels in a new 'Rating_category' column on `data`.
data['Rating_category'] = data['Rating'].map(rating_group)

# Preview the new column next to the fields it was derived from.
data.loc[:, ['Title', 'Director', 'Rating', 'Rating_category']].head(n=5)

Unnamed: 0,Title,Director,Rating,Rating_category
0,Guardians of the Galaxy,James Gunn,8.1,Good
1,Prometheus,Ridley Scott,7.0,Average
2,Split,M. Night Shyamalan,7.3,Average
3,Sing,Christophe Lourdelet,7.2,Average
4,Suicide Squad,David Ayer,6.2,Average
