<a href="https://colab.research.google.com/github/RishabKr15/CBTC/blob/main/Data_Manipulation_with_Pandas_%5BComplete%5D.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

### Topics
1. Series and dataframes
2. Setting up the directory : (OS Package)
3. Loading different file formats
4. Basic Inspection of dataset
5. Indexing & Slicing
6. Creating new columns, renaming & droping
6. Handling Missing Values
7. Sorting & Handling Duplicates
8. Aggregating Data
9. Joining Datasets
10. .replace(),.apply()
11. Dealing with Strings, Datetime & Json data in column
12. Reshaping data
13. Caching & Parallelization
14. Writing data

In [None]:
#Import package
import pandas as pd

### Series and Dataframe

In [None]:
# Series: A series is essentially a column (1-D data)
# Data Frame: A DataFrame is a two-dimensional data structure, i.e., data is aligned in a tabular fashion in rows and columns.
# Pandas DataFrame consists of three principal components, the data, rows, and columns.

In [None]:
# Series
s1 = pd.Series([28, 20, 32, 40])
s2 = pd.Series(['John','David','Reva','Grahm'])
print(s1)
print(type(s1))


0    28
1    20
2    32
3    40
dtype: int64
<class 'pandas.core.series.Series'>


In [None]:
# Dataframe
df = pd.DataFrame({'age':s1,'name':s2})
print(df)
print(type(df))

   age   name
0   28   John
1   20  David
2   32   Reva
3   40  Grahm
<class 'pandas.core.frame.DataFrame'>


In [None]:
# Different Ways of creating a dataframe
# List of list
list_of_lists = [['John', 20,'NY'], ['David', 20, 'CL'], ['Reva', 32],['Grahm',40,'NZ']]
df = pd.DataFrame(list_of_lists, columns = ['name','age','city'])
print(df)

    name  age  city
0   John   20    NY
1  David   20    CL
2   Reva   32  None
3  Grahm   40    NZ


In [None]:
# Use Dictionaries --> Most Common way to create dataframe
name_dict = {'name':['John','David','Reva','Grahm'], 'age':[28, 20, 32, 40]}
# Name of column as key and value as records for that specific column
# Create DataFrame
df = pd.DataFrame(name_dict)
print(df)

    name  age
0   John   28
1  David   20
2   Reva   32
3  Grahm   40


In [None]:
# list of dictionaries
list_dict = [{'name':'John','age':28,'city':'NY'},{'name':'David','age':20,'city':'CA'},{'name':'Reva','age':32,'city':'AUS'},{'name':'Grahm','age':40,'city':'NZ'}]

# Create DataFrame
df = pd.DataFrame(list_dict)
print(df)


    name  age city
0   John   28   NY
1  David   20   CA
2   Reva   32  AUS
3  Grahm   40   NZ


### Setting up the directory

In [None]:
# Project data files - https://drive.google.com/drive/folders/1vLMCY81mZ2l_zNRsiK2Trs1DtumITSla?usp=sharing


In [None]:
import os
os.getcwd() # Get the current directory

'/content/sample_data'

In [None]:
!ls # Command to see the folder available

anscombe.json		     california_housing_train.csv  mnist_test.csv	  README.md
california_housing_test.csv  dataframe.xlsx		   mnist_train_small.csv


In [None]:
# Setting up directory helps access the files easily.
os.chdir('/content/sample_data')
os.getcwd()

'/content/sample_data'

In [None]:
# Example of loading the file
# df = pd.read_csv('mnist_test.csv')

In [None]:
# df.head(1)

In [None]:
# Note: Google colab can't access the files or folder present in the google drive unless it it mounted

In [None]:
# Mounting the drive
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
os.getcwd()

'/content/sample_data'

In [None]:
# Change the working directory to project folder
os.chdir('/content/drive/MyDrive/M2_Project')
os.getcwd()

'/content/drive/MyDrive/M2_Project'

In [None]:
!ls # List the files

bash.xml  employee.json  Products.xlsx	       read_imdb_data_w.xlsx
cache	  imdb_data.csv  read_imdb_data_w.csv  Superstore.xlsx


In [None]:
os.getcwd()

'/content/drive/MyDrive/M2_Project'

In [None]:
# os.chdir()

In [None]:
# pd.read_csv('/content/drive/MyDrive/A0Z5AGRAGTQK (1).csv')

### Loading different file formats

#### Different types of file
1. CSV
2. Excel
3. JSON (JavaScript Object Notation)
4. XML (Extensible Markup Language)

In [None]:
!ls

sample_data


In [None]:
# Reading CSV file
read_imdb_data = pd.read_csv('imdb_data.csv')
read_imdb_data.head(1)

Unnamed: 0,id,belongs_to_collection,budget,genres,homepage,imdb_id,original_language,original_title,overview,popularity,...,release_date,runtime,spoken_languages,status,tagline,title,Keywords,cast,crew,revenue
0,1,"[{'id': 313576, 'name': 'Hot Tub Time Machine ...",14000000,"[{'id': 35, 'name': 'Comedy'}]",,tt2637294,en,Hot Tub Time Machine 2,"When Lou, who has become the ""father of the In...",6.575393,...,2/20/15,93.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,The Laws of Space and Time are About to be Vio...,Hot Tub Time Machine 2,"[{'id': 4379, 'name': 'time travel'}, {'id': 9...","[{'cast_id': 4, 'character': 'Lou', 'credit_id...","[{'credit_id': '59ac067c92514107af02c8c8', 'de...",12314651


In [None]:
!ls

In [None]:
# Read Excel file
# Default first sheet
df = pd.read_excel("Products.xlsx")
print(df)

# Mention Sheet name
df = pd.read_excel("Products.xlsx",sheet_name = 'grocery')
print(df)

In [None]:
# JSON (JavaScript Object Notation)
df = pd.read_json('employee.json')
df

In [None]:
# Reading XML File
df = pd.read_xml('bash.xml')
df

### Basic Inspection of dataset

In [None]:
# shape,head(n),tail(n),columns,describe(),info()

In [None]:
read_imdb_data.shape

(3000, 23)

In [None]:
read_imdb_data.head(1) # First n rows : default 5

Unnamed: 0,id,belongs_to_collection,budget,genres,homepage,imdb_id,original_language,original_title,overview,popularity,...,release_date,runtime,spoken_languages,status,tagline,title,Keywords,cast,crew,revenue
0,1,"[{'id': 313576, 'name': 'Hot Tub Time Machine ...",14000000,"[{'id': 35, 'name': 'Comedy'}]",,tt2637294,en,Hot Tub Time Machine 2,"When Lou, who has become the ""father of the In...",6.575393,...,2/20/15,93.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,The Laws of Space and Time are About to be Vio...,Hot Tub Time Machine 2,"[{'id': 4379, 'name': 'time travel'}, {'id': 9...","[{'cast_id': 4, 'character': 'Lou', 'credit_id...","[{'credit_id': '59ac067c92514107af02c8c8', 'de...",12314651


In [None]:
read_imdb_data.tail(1) # last n rows : default 5

Unnamed: 0,id,belongs_to_collection,budget,genres,homepage,imdb_id,original_language,original_title,overview,popularity,...,release_date,runtime,spoken_languages,status,tagline,title,Keywords,cast,crew,revenue
2999,3000,,35000000,"[{'id': 53, 'name': 'Thriller'}, {'id': 28, 'n...",http://www.abductionthefilm.com/,tt1600195,en,Abduction,A young man sets out to uncover the truth abou...,10.512109,...,9/22/11,106.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,They stole his life. He's taking it back.,Abduction,"[{'id': 591, 'name': 'cia'}, {'id': 822, 'name...","[{'cast_id': 2, 'character': 'Nathan Harper', ...","[{'credit_id': '5391990d0e0a260fb5001629', 'de...",82087155


In [None]:
read_imdb_data.columns # name of columns

Index(['id', 'belongs_to_collection', 'budget', 'genres', 'homepage',
       'imdb_id', 'original_language', 'original_title', 'overview',
       'popularity', 'poster_path', 'production_companies',
       'production_countries', 'release_date', 'runtime', 'spoken_languages',
       'status', 'tagline', 'title', 'Keywords', 'cast', 'crew', 'revenue'],
      dtype='object')

In [None]:
read_imdb_data.describe() # Inspection of continous data

Unnamed: 0,id,budget,popularity,runtime,revenue
count,3000.0,3000.0,3000.0,2998.0,3000.0
mean,1500.5,22531330.0,8.463274,107.856571,66725850.0
std,866.169729,37026090.0,12.104,22.086434,137532300.0
min,1.0,0.0,1e-06,0.0,1.0
25%,750.75,0.0,4.018053,94.0,2379808.0
50%,1500.5,8000000.0,7.374861,104.0,16807070.0
75%,2250.25,29000000.0,10.890983,118.0,68919200.0
max,3000.0,380000000.0,294.337037,338.0,1519558000.0


In [None]:
read_imdb_data.describe(include= 'all') # Analysis of continous & Categorical data

Unnamed: 0,id,belongs_to_collection,budget,genres,homepage,imdb_id,original_language,original_title,overview,popularity,...,release_date,runtime,spoken_languages,status,tagline,title,Keywords,cast,crew,revenue
count,3000.0,604,3000.0,2993,946,3000,3000,3000,2992,3000.0,...,3000,2998.0,2980,3000,2403,3000,2724,2987,2984,3000.0
unique,,422,,872,941,3000,36,2975,2992,,...,2398,,401,2,2400,2969,2648,2975,2984,
top,,"[{'id': 645, 'name': 'James Bond Collection', ...",,"[{'id': 18, 'name': 'Drama'}]",http://www.transformersmovie.com/,tt2637294,en,Joshua,"When Lou, who has become the ""father of the In...",,...,9/10/10,,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Based on a true story.,The Magnificent Seven,"[{'id': 10183, 'name': 'independent film'}]",[],"[{'credit_id': '59ac067c92514107af02c8c8', 'de...",
freq,,16,,266,4,1,2575,2,1,,...,5,,1817,2996,3,2,27,13,1,
mean,1500.5,,22531330.0,,,,,,,8.463274,...,,107.856571,,,,,,,,66725850.0
std,866.169729,,37026090.0,,,,,,,12.104,...,,22.086434,,,,,,,,137532300.0
min,1.0,,0.0,,,,,,,1e-06,...,,0.0,,,,,,,,1.0
25%,750.75,,0.0,,,,,,,4.018053,...,,94.0,,,,,,,,2379808.0
50%,1500.5,,8000000.0,,,,,,,7.374861,...,,104.0,,,,,,,,16807070.0
75%,2250.25,,29000000.0,,,,,,,10.890983,...,,118.0,,,,,,,,68919200.0


In [None]:
read_imdb_data.info() # Information about dataset

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3000 entries, 0 to 2999
Data columns (total 23 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   id                     3000 non-null   int64  
 1   belongs_to_collection  604 non-null    object 
 2   budget                 3000 non-null   int64  
 3   genres                 2993 non-null   object 
 4   homepage               946 non-null    object 
 5   imdb_id                3000 non-null   object 
 6   original_language      3000 non-null   object 
 7   original_title         3000 non-null   object 
 8   overview               2992 non-null   object 
 9   popularity             3000 non-null   float64
 10  poster_path            2999 non-null   object 
 11  production_companies   2844 non-null   object 
 12  production_countries   2945 non-null   object 
 13  release_date           3000 non-null   object 
 14  runtime                2998 non-null   float64
 15  spok

In [None]:
# Inspection of select few columns
read_imdb_data[['id', 'original_title','runtime','revenue', 'budget', 'genres','imdb_id', 'original_language' ]]

Unnamed: 0,id,original_title,runtime,revenue,budget,genres,imdb_id,original_language
0,1,Hot Tub Time Machine 2,93.0,12314651,14000000,"[{'id': 35, 'name': 'Comedy'}]",tt2637294,en
1,2,The Princess Diaries 2: Royal Engagement,113.0,95149435,40000000,"[{'id': 35, 'name': 'Comedy'}, {'id': 18, 'nam...",tt0368933,en
2,3,Whiplash,105.0,13092000,3300000,"[{'id': 18, 'name': 'Drama'}]",tt2582802,en
3,4,Kahaani,122.0,16000000,1200000,"[{'id': 53, 'name': 'Thriller'}, {'id': 18, 'n...",tt1821480,hi
4,5,마린보이,118.0,3923970,0,"[{'id': 28, 'name': 'Action'}, {'id': 53, 'nam...",tt1380152,ko
...,...,...,...,...,...,...,...,...
2995,2996,Chasers,102.0,1596687,0,"[{'id': 35, 'name': 'Comedy'}, {'id': 10749, '...",tt0109403,en
2996,2997,Vi är bäst!,102.0,180590,0,"[{'id': 18, 'name': 'Drama'}, {'id': 10402, 'n...",tt2364975,sv
2997,2998,The Long Kiss Goodnight,120.0,89456761,65000000,"[{'id': 80, 'name': 'Crime'}, {'id': 28, 'name...",tt0116908,en
2998,2999,Along Came Polly,90.0,171963386,42000000,"[{'id': 35, 'name': 'Comedy'}, {'id': 10749, '...",tt0343135,en


In [None]:
# Important Points to remember :


### Indexing and Slicing()

In [None]:
# loc(), iloc(), Boolean indexing

In [None]:
# Understanding the index
# list of dictionaries
list_dict = [{'name':'John','age':28},{'name':'David','age':20},{'name':'Reva','age':32},{'name':'Grahm','age':40}]

# Create DataFrame
df = pd.DataFrame(list_dict,index = ['A','B','C','D'])
df

Unnamed: 0,name,age
A,John,28
B,David,20
C,Reva,32
D,Grahm,40


In [None]:
df.loc[['A','C'],['name','age']] # 1st part row subset, 2nd part column

Unnamed: 0,name,age
A,John,28
C,Reva,32


In [None]:
df.columns

Index(['name', 'age'], dtype='object')

In [None]:
df

Unnamed: 0,name,age
A,John,28
B,David,20
C,Reva,32
D,Grahm,40


In [None]:
df

Unnamed: 0,name,age
A,John,28
B,David,20
C,Reva,32
D,Grahm,40


In [None]:
df.iloc[[1,3],[0]] # Column until 2nd index

Unnamed: 0,name
B,David
D,Grahm


In [None]:
# loc(): When Subsetting is done by index and column names
# iloc(): When Subsetting is done by index and column number

# df.loc[10:40, ['name']]
# df.iloc[:4,:1]

In [None]:
# .loc[] --> Column names defined and you want to deal few column
# .iloc[] -->

In [None]:
# loc(): When Subsetting is done by index and column name
read_imdb_data.loc[50:55,['budget','genres','runtime']] # rows and columns

Unnamed: 0,budget,genres,runtime
50,560000,"[{'id': 10749, 'name': 'Romance'}, {'id': 18, ...",102.0
51,12000000,"[{'id': 12, 'name': 'Adventure'}, {'id': 35, '...",160.0
52,0,"[{'id': 12, 'name': 'Adventure'}, {'id': 10751...",97.0
53,0,"[{'id': 18, 'name': 'Drama'}, {'id': 10751, 'n...",98.0
54,20000000,"[{'id': 28, 'name': 'Action'}]",107.0
55,0,"[{'id': 35, 'name': 'Comedy'}, {'id': 18, 'nam...",95.0


In [None]:
read_imdb_data.loc[:5,]

Unnamed: 0,id,belongs_to_collection,budget,genres,homepage,imdb_id,original_language,original_title,overview,popularity,...,release_date,runtime,spoken_languages,status,tagline,title,Keywords,cast,crew,revenue
0,1,"[{'id': 313576, 'name': 'Hot Tub Time Machine ...",14000000,"[{'id': 35, 'name': 'Comedy'}]",,tt2637294,en,Hot Tub Time Machine 2,"When Lou, who has become the ""father of the In...",6.575393,...,2/20/15,93.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,The Laws of Space and Time are About to be Vio...,Hot Tub Time Machine 2,"[{'id': 4379, 'name': 'time travel'}, {'id': 9...","[{'cast_id': 4, 'character': 'Lou', 'credit_id...","[{'credit_id': '59ac067c92514107af02c8c8', 'de...",12314651
1,2,"[{'id': 107674, 'name': 'The Princess Diaries ...",40000000,"[{'id': 35, 'name': 'Comedy'}, {'id': 18, 'nam...",,tt0368933,en,The Princess Diaries 2: Royal Engagement,Mia Thermopolis is now a college graduate and ...,8.248895,...,8/6/04,113.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,It can take a lifetime to find true love; she'...,The Princess Diaries 2: Royal Engagement,"[{'id': 2505, 'name': 'coronation'}, {'id': 42...","[{'cast_id': 1, 'character': 'Mia Thermopolis'...","[{'credit_id': '52fe43fe9251416c7502563d', 'de...",95149435
2,3,,3300000,"[{'id': 18, 'name': 'Drama'}]",http://sonyclassics.com/whiplash/,tt2582802,en,Whiplash,"Under the direction of a ruthless instructor, ...",64.29999,...,10/10/14,105.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,The road to greatness can take you to the edge.,Whiplash,"[{'id': 1416, 'name': 'jazz'}, {'id': 1523, 'n...","[{'cast_id': 5, 'character': 'Andrew Neimann',...","[{'credit_id': '54d5356ec3a3683ba0000039', 'de...",13092000
3,4,,1200000,"[{'id': 53, 'name': 'Thriller'}, {'id': 18, 'n...",http://kahaanithefilm.com/,tt1821480,hi,Kahaani,Vidya Bagchi (Vidya Balan) arrives in Kolkata ...,3.174936,...,3/9/12,122.0,"[{'iso_639_1': 'en', 'name': 'English'}, {'iso...",Released,,Kahaani,"[{'id': 10092, 'name': 'mystery'}, {'id': 1054...","[{'cast_id': 1, 'character': 'Vidya Bagchi', '...","[{'credit_id': '52fe48779251416c9108d6eb', 'de...",16000000
4,5,,0,"[{'id': 28, 'name': 'Action'}, {'id': 53, 'nam...",,tt1380152,ko,마린보이,Marine Boy is the story of a former national s...,1.14807,...,2/5/09,118.0,"[{'iso_639_1': 'ko', 'name': '한국어/조선말'}]",Released,,Marine Boy,,"[{'cast_id': 3, 'character': 'Chun-soo', 'cred...","[{'credit_id': '52fe464b9251416c75073b43', 'de...",3923970
5,6,,8000000,"[{'id': 16, 'name': 'Animation'}, {'id': 12, '...",,tt0093743,en,Pinocchio and the Emperor of the Night,"Pinocchio and his friends, a glow worm and a m...",0.743274,...,8/6/87,83.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,,Pinocchio and the Emperor of the Night,,"[{'cast_id': 6, 'character': 'Pinocchio (voice...","[{'credit_id': '52fe46f49251416c9106558b', 'de...",3261638


In [None]:
read_imdb_data.loc[3:5,] # till 5 all columns, 3 & 5 is included

Unnamed: 0,id,belongs_to_collection,budget,genres,homepage,imdb_id,original_language,original_title,overview,popularity,...,release_date,runtime,spoken_languages,status,tagline,title,Keywords,cast,crew,revenue
3,4,,1200000,"[{'id': 53, 'name': 'Thriller'}, {'id': 18, 'n...",http://kahaanithefilm.com/,tt1821480,hi,Kahaani,Vidya Bagchi (Vidya Balan) arrives in Kolkata ...,3.174936,...,3/9/12,122.0,"[{'iso_639_1': 'en', 'name': 'English'}, {'iso...",Released,,Kahaani,"[{'id': 10092, 'name': 'mystery'}, {'id': 1054...","[{'cast_id': 1, 'character': 'Vidya Bagchi', '...","[{'credit_id': '52fe48779251416c9108d6eb', 'de...",16000000
4,5,,0,"[{'id': 28, 'name': 'Action'}, {'id': 53, 'nam...",,tt1380152,ko,마린보이,Marine Boy is the story of a former national s...,1.14807,...,2/5/09,118.0,"[{'iso_639_1': 'ko', 'name': '한국어/조선말'}]",Released,,Marine Boy,,"[{'cast_id': 3, 'character': 'Chun-soo', 'cred...","[{'credit_id': '52fe464b9251416c75073b43', 'de...",3923970
5,6,,8000000,"[{'id': 16, 'name': 'Animation'}, {'id': 12, '...",,tt0093743,en,Pinocchio and the Emperor of the Night,"Pinocchio and his friends, a glow worm and a m...",0.743274,...,8/6/87,83.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,,Pinocchio and the Emperor of the Night,,"[{'cast_id': 6, 'character': 'Pinocchio (voice...","[{'credit_id': '52fe46f49251416c9106558b', 'de...",3261638


In [None]:
df

Unnamed: 0,name,age
A,John,28
B,David,20
C,Reva,32
D,Grahm,40


In [None]:
df.index[-3:-1]

Index(['B', 'C'], dtype='object')

In [None]:
df.loc[df.index[-3:-1],['name']]

Unnamed: 0,name
B,David
C,Reva


In [None]:
read_imdb_data.index[-5:]

RangeIndex(start=2995, stop=3000, step=1)

In [None]:
read_imdb_data.loc[read_imdb_data.index[-5:] , ] # last 5 data using loc :; Use .index to use the negative index

Unnamed: 0,id,belongs_to_collection,budget,genres,homepage,imdb_id,original_language,original_title,overview,popularity,...,release_date,runtime,spoken_languages,status,tagline,title,Keywords,cast,crew,revenue
2995,2996,,0,"[{'id': 35, 'name': 'Comedy'}, {'id': 10749, '...",,tt0109403,en,Chasers,Military men Rock Reilly and Eddie Devane are ...,9.85327,...,4/22/94,102.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,It was supposed to be a routine prisoner trans...,Chasers,"[{'id': 378, 'name': 'prison'}, {'id': 572, 'n...","[{'cast_id': 2, 'character': 'Rock Reilly', 'c...","[{'credit_id': '52fe4494c3a368484e02ac7d', 'de...",1596687
2996,2997,,0,"[{'id': 18, 'name': 'Drama'}, {'id': 10402, 'n...",,tt2364975,sv,Vi är bäst!,Three girls in 1980s Stockholm decide to form ...,3.727996,...,3/28/13,102.0,"[{'iso_639_1': 'sv', 'name': 'svenska'}]",Released,,We Are the Best!,"[{'id': 1192, 'name': 'sweden'}, {'id': 4470, ...","[{'cast_id': 5, 'character': 'Bobo', 'credit_i...","[{'credit_id': '5716b72ac3a3686678012c84', 'de...",180590
2997,2998,,65000000,"[{'id': 80, 'name': 'Crime'}, {'id': 28, 'name...",,tt0116908,en,The Long Kiss Goodnight,"Samantha Caine, suburban homemaker, is the ide...",14.482345,...,10/11/96,120.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,What's forgotten is not always gone.,The Long Kiss Goodnight,"[{'id': 441, 'name': 'assassination'}, {'id': ...","[{'cast_id': 10, 'character': 'Samantha Caine ...","[{'credit_id': '52fe443a9251416c7502d579', 'de...",89456761
2998,2999,,42000000,"[{'id': 35, 'name': 'Comedy'}, {'id': 10749, '...",http://www.alongcamepolly.com/,tt0343135,en,Along Came Polly,Reuben Feffer is a guy who's spent his entire ...,15.725542,...,1/16/04,90.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,"For the most cautious man on Earth, life is ab...",Along Came Polly,"[{'id': 966, 'name': 'beach'}, {'id': 2676, 'n...","[{'cast_id': 8, 'character': 'Reuben Feffer', ...","[{'credit_id': '556f817b9251410866000a63', 'de...",171963386
2999,3000,,35000000,"[{'id': 53, 'name': 'Thriller'}, {'id': 28, 'n...",http://www.abductionthefilm.com/,tt1600195,en,Abduction,A young man sets out to uncover the truth abou...,10.512109,...,9/22/11,106.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,They stole his life. He's taking it back.,Abduction,"[{'id': 591, 'name': 'cia'}, {'id': 822, 'name...","[{'cast_id': 2, 'character': 'Nathan Harper', ...","[{'credit_id': '5391990d0e0a260fb5001629', 'de...",82087155


In [None]:
# -50 to -40 rows using loc for read_imdb_data
# read_imdb_data.loc[read_imdb_data.index[-50:-40] , 'genres' ]

In [None]:
read_imdb_data.loc[[34,56,78] , :] # Only specific rows

Unnamed: 0,id,belongs_to_collection,budget,genres,homepage,imdb_id,original_language,original_title,overview,popularity,...,release_date,runtime,spoken_languages,status,tagline,title,Keywords,cast,crew,revenue
34,35,"[{'id': 90863, 'name': 'Rush Hour Collection',...",140000000,"[{'id': 28, 'name': 'Action'}, {'id': 35, 'nam...",,tt0293564,en,Rush Hour 3,After an attempted assassination on Ambassador...,9.718111,...,8/8/07,91.0,"[{'iso_639_1': 'la', 'name': 'Latin'}, {'iso_6...",Released,The Rush Is On!,Rush Hour 3,"[{'id': 1704, 'name': 'ambassador'}]","[{'cast_id': 2, 'character': 'Det. James Carte...","[{'credit_id': '52fe43fac3a36847f807b5bd', 'de...",258022233
56,57,,0,"[{'id': 35, 'name': 'Comedy'}, {'id': 18, 'nam...",http://www.joueuse-lefilm.com/,tt1082009,fr,Joueuse,"In a small Corsican village, the life of Hélèn...",2.94798,...,8/5/09,97.0,"[{'iso_639_1': 'en', 'name': 'English'}, {'iso...",Released,,Queen to Play,"[{'id': 316, 'name': 'chess'}, {'id': 3797, 'n...","[{'cast_id': 1, 'character': 'H√©l√®ne', 'cred...","[{'credit_id': '52fe4624c3a36847f80eefcf', 'de...",1056938
78,79,,0,"[{'id': 80, 'name': 'Crime'}, {'id': 18, 'name...",,tt0083064,en,Sharky's Machine,"Sharky gets busted back to working vice, where...",2.633399,...,12/18/81,122.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,"If you mess with a cop like Sharky, you better...",Sharky's Machine,"[{'id': 108, 'name': 'transvestism'}, {'id': 6...","[{'cast_id': 3, 'character': 'Tom Sharky', 'cr...","[{'credit_id': '555507869251411e620004c9', 'de...",35610100


In [None]:
read_imdb_data.loc[read_imdb_data.index[[-3,-1,-2]] , :] # Only specific rows if negative index is used.

Unnamed: 0,id,belongs_to_collection,budget,genres,homepage,imdb_id,original_language,original_title,overview,popularity,...,release_date,runtime,spoken_languages,status,tagline,title,Keywords,cast,crew,revenue
2997,2998,,65000000,"[{'id': 80, 'name': 'Crime'}, {'id': 28, 'name...",,tt0116908,en,The Long Kiss Goodnight,"Samantha Caine, suburban homemaker, is the ide...",14.482345,...,10/11/96,120.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,What's forgotten is not always gone.,The Long Kiss Goodnight,"[{'id': 441, 'name': 'assassination'}, {'id': ...","[{'cast_id': 10, 'character': 'Samantha Caine ...","[{'credit_id': '52fe443a9251416c7502d579', 'de...",89456761
2999,3000,,35000000,"[{'id': 53, 'name': 'Thriller'}, {'id': 28, 'n...",http://www.abductionthefilm.com/,tt1600195,en,Abduction,A young man sets out to uncover the truth abou...,10.512109,...,9/22/11,106.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,They stole his life. He's taking it back.,Abduction,"[{'id': 591, 'name': 'cia'}, {'id': 822, 'name...","[{'cast_id': 2, 'character': 'Nathan Harper', ...","[{'credit_id': '5391990d0e0a260fb5001629', 'de...",82087155
2998,2999,,42000000,"[{'id': 35, 'name': 'Comedy'}, {'id': 10749, '...",http://www.alongcamepolly.com/,tt0343135,en,Along Came Polly,Reuben Feffer is a guy who's spent his entire ...,15.725542,...,1/16/04,90.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,"For the most cautious man on Earth, life is ab...",Along Came Polly,"[{'id': 966, 'name': 'beach'}, {'id': 2676, 'n...","[{'cast_id': 8, 'character': 'Reuben Feffer', ...","[{'credit_id': '556f817b9251410866000a63', 'de...",171963386


In [None]:
read_imdb_data.loc[:10,]

Unnamed: 0,id,belongs_to_collection,budget,genres,homepage,imdb_id,original_language,original_title,overview,popularity,...,release_date,runtime,spoken_languages,status,tagline,title,Keywords,cast,crew,revenue
0,1,"[{'id': 313576, 'name': 'Hot Tub Time Machine ...",14000000,"[{'id': 35, 'name': 'Comedy'}]",,tt2637294,en,Hot Tub Time Machine 2,"When Lou, who has become the ""father of the In...",6.575393,...,2/20/15,93.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,The Laws of Space and Time are About to be Vio...,Hot Tub Time Machine 2,"[{'id': 4379, 'name': 'time travel'}, {'id': 9...","[{'cast_id': 4, 'character': 'Lou', 'credit_id...","[{'credit_id': '59ac067c92514107af02c8c8', 'de...",12314651
1,2,"[{'id': 107674, 'name': 'The Princess Diaries ...",40000000,"[{'id': 35, 'name': 'Comedy'}, {'id': 18, 'nam...",,tt0368933,en,The Princess Diaries 2: Royal Engagement,Mia Thermopolis is now a college graduate and ...,8.248895,...,8/6/04,113.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,It can take a lifetime to find true love; she'...,The Princess Diaries 2: Royal Engagement,"[{'id': 2505, 'name': 'coronation'}, {'id': 42...","[{'cast_id': 1, 'character': 'Mia Thermopolis'...","[{'credit_id': '52fe43fe9251416c7502563d', 'de...",95149435
2,3,,3300000,"[{'id': 18, 'name': 'Drama'}]",http://sonyclassics.com/whiplash/,tt2582802,en,Whiplash,"Under the direction of a ruthless instructor, ...",64.29999,...,10/10/14,105.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,The road to greatness can take you to the edge.,Whiplash,"[{'id': 1416, 'name': 'jazz'}, {'id': 1523, 'n...","[{'cast_id': 5, 'character': 'Andrew Neimann',...","[{'credit_id': '54d5356ec3a3683ba0000039', 'de...",13092000
3,4,,1200000,"[{'id': 53, 'name': 'Thriller'}, {'id': 18, 'n...",http://kahaanithefilm.com/,tt1821480,hi,Kahaani,Vidya Bagchi (Vidya Balan) arrives in Kolkata ...,3.174936,...,3/9/12,122.0,"[{'iso_639_1': 'en', 'name': 'English'}, {'iso...",Released,,Kahaani,"[{'id': 10092, 'name': 'mystery'}, {'id': 1054...","[{'cast_id': 1, 'character': 'Vidya Bagchi', '...","[{'credit_id': '52fe48779251416c9108d6eb', 'de...",16000000
4,5,,0,"[{'id': 28, 'name': 'Action'}, {'id': 53, 'nam...",,tt1380152,ko,마린보이,Marine Boy is the story of a former national s...,1.14807,...,2/5/09,118.0,"[{'iso_639_1': 'ko', 'name': '한국어/조선말'}]",Released,,Marine Boy,,"[{'cast_id': 3, 'character': 'Chun-soo', 'cred...","[{'credit_id': '52fe464b9251416c75073b43', 'de...",3923970
5,6,,8000000,"[{'id': 16, 'name': 'Animation'}, {'id': 12, '...",,tt0093743,en,Pinocchio and the Emperor of the Night,"Pinocchio and his friends, a glow worm and a m...",0.743274,...,8/6/87,83.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,,Pinocchio and the Emperor of the Night,,"[{'cast_id': 6, 'character': 'Pinocchio (voice...","[{'credit_id': '52fe46f49251416c9106558b', 'de...",3261638
6,7,,14000000,"[{'id': 27, 'name': 'Horror'}, {'id': 53, 'nam...",http://www.thepossessionmovie.com/,tt0431021,en,The Possession,A young girl buys an antique box at a yard sal...,7.286477,...,8/30/12,92.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Fear The Demon That Doesn't Fear God,The Possession,,"[{'cast_id': 23, 'character': 'Clyde', 'credit...","[{'credit_id': '52fe4981c3a368484e12ee29', 'de...",85446075
7,8,,0,"[{'id': 99, 'name': 'Documentary'}]",,tt0391024,en,Control Room,A chronicle which provides a rare window into ...,1.949044,...,1/15/04,84.0,"[{'iso_639_1': 'ar', 'name': 'العربية'}, {'iso...",Released,Different channels. Different truths.,Control Room,"[{'id': 917, 'name': 'journalism'}, {'id': 163...","[{'cast_id': 2, 'character': 'Himself', 'credi...","[{'credit_id': '52fe47a69251416c750a0daf', 'de...",2586511
8,9,"[{'id': 256377, 'name': 'The Muppet Collection...",0,"[{'id': 28, 'name': 'Action'}, {'id': 35, 'nam...",,tt0117110,en,Muppet Treasure Island,After telling the story of Flint's last journe...,6.902423,...,2/16/96,100.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Set sail for Muppet mayhem!,Muppet Treasure Island,"[{'id': 2041, 'name': 'island'}, {'id': 4418, ...","[{'cast_id': 1, 'character': 'Long John Silver...","[{'credit_id': '52fe43c89251416c7501deb3', 'de...",34327391
9,10,,6000000,"[{'id': 35, 'name': 'Comedy'}, {'id': 10402, '...",,tt0310281,en,A Mighty Wind,"In ""A Mighty Wind"", director Christopher Guest...",4.672036,...,4/16/03,91.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,"Back together for the first time, again.",A Mighty Wind,"[{'id': 11800, 'name': 'mockumentary'}, {'id':...","[{'cast_id': 24, 'character': 'Jonathan Steinb...","[{'credit_id': '52fe45609251416c750545b3', 'de...",18750246


In [None]:
# iloc : When Subsetting is done by row and column number
read_imdb_data.iloc[ : 10,  : 4] # index 10 & 4 is excluded

Unnamed: 0,id,belongs_to_collection,budget,genres
0,1,"[{'id': 313576, 'name': 'Hot Tub Time Machine ...",14000000,"[{'id': 35, 'name': 'Comedy'}]"
1,2,"[{'id': 107674, 'name': 'The Princess Diaries ...",40000000,"[{'id': 35, 'name': 'Comedy'}, {'id': 18, 'nam..."
2,3,,3300000,"[{'id': 18, 'name': 'Drama'}]"
3,4,,1200000,"[{'id': 53, 'name': 'Thriller'}, {'id': 18, 'n..."
4,5,,0,"[{'id': 28, 'name': 'Action'}, {'id': 53, 'nam..."
5,6,,8000000,"[{'id': 16, 'name': 'Animation'}, {'id': 12, '..."
6,7,,14000000,"[{'id': 27, 'name': 'Horror'}, {'id': 53, 'nam..."
7,8,,0,"[{'id': 99, 'name': 'Documentary'}]"
8,9,"[{'id': 256377, 'name': 'The Muppet Collection...",0,"[{'id': 28, 'name': 'Action'}, {'id': 35, 'nam..."
9,10,,6000000,"[{'id': 35, 'name': 'Comedy'}, {'id': 10402, '..."


In [None]:
read_imdb_data.columns

Index(['id', 'belongs_to_collection', 'budget', 'genres', 'homepage',
       'imdb_id', 'original_language', 'original_title', 'overview',
       'popularity', 'poster_path', 'production_companies',
       'production_countries', 'release_date', 'runtime', 'spoken_languages',
       'status', 'tagline', 'title', 'Keywords', 'cast', 'crew', 'revenue'],
      dtype='object')

In [None]:
read_imdb_data.iloc[ 3: 5,  1: 4] # index 5 & 4 is excluded

Unnamed: 0,belongs_to_collection,budget,genres
3,,1200000,"[{'id': 53, 'name': 'Thriller'}, {'id': 18, 'n..."
4,,0,"[{'id': 28, 'name': 'Action'}, {'id': 53, 'nam..."


In [None]:
read_imdb_data.loc[ read_imdb_data.index[-50:-40],  ]

Unnamed: 0,id,belongs_to_collection,budget,genres,homepage,imdb_id,original_language,original_title,overview,popularity,...,release_date,runtime,spoken_languages,status,tagline,title,Keywords,cast,crew,revenue
2950,2951,"[{'id': 9735, 'name': 'Friday the 13th Collect...",30000000,"[{'id': 27, 'name': 'Horror'}]",,tt0329101,en,Freddy vs. Jason,Evil dream-demon Freddy Krueger devises a plan...,14.015739,...,8/15/03,97.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Evil Will Battle Evil,Freddy vs. Jason,"[{'id': 9826, 'name': 'murder'}, {'id': 10776,...","[{'cast_id': 1, 'character': 'Freddy Krueger',...","[{'credit_id': '52fe4453c3a36847f808f4a5', 'de...",114908830
2951,2952,,13000000,"[{'id': 18, 'name': 'Drama'}, {'id': 53, 'name...",,tt2057392,en,Eye in the Sky,A military officer in command of a drone opera...,10.564701,...,9/11/15,102.0,"[{'iso_639_1': 'so', 'name': 'Somali'}, {'iso_...",Released,Welcome to the new front line,Eye in the Sky,"[{'id': 949, 'name': 'terrorist'}, {'id': 1193...","[{'cast_id': 4, 'character': 'Colonel Katherin...","[{'credit_id': '55186708c3a36862f6004623', 'de...",18704595
2952,2953,,25000000,"[{'id': 14, 'name': 'Fantasy'}, {'id': 18, 'na...",http://theageofadalinemovie.com,tt1655441,en,The Age of Adaline,After 29-year-old Adaline recovers from a near...,10.542026,...,4/16/15,112.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Love is timeless.,The Age of Adaline,"[{'id': 582, 'name': 'san francisco'}, {'id': ...","[{'cast_id': 0, 'character': 'Adaline Bowman',...","[{'credit_id': '54721093c3a3686792002bc5', 'de...",65663276
2953,2954,,8500000,"[{'id': 28, 'name': 'Action'}, {'id': 80, 'nam...",,tt0202677,en,The Way of the Gun,Parker and Longbaugh are a pair of low-level p...,6.03633,...,9/8/00,119.0,"[{'iso_639_1': 'en', 'name': 'English'}, {'iso...",Released,,The Way of the Gun,"[{'id': 75, 'name': 'gunslinger'}, {'id': 534,...","[{'cast_id': 1, 'character': 'Mr. Longbaugh', ...","[{'credit_id': '52fe4305c3a36847f8034691', 'de...",19125401
2954,2955,,60000000,"[{'id': 35, 'name': 'Comedy'}]",,tt0375173,en,Alfie,"In Manhattan, the British limousine driver Alf...",10.411608,...,10/22/04,103.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Meet a man who never met a woman he didn't love.,Alfie,"[{'id': 242, 'name': 'new york'}, {'id': 1332,...","[{'cast_id': 1, 'character': 'Alfie', 'credit_...","[{'credit_id': '562643e89251413ded006a07', 'de...",13395939
2955,2956,,17000000,"[{'id': 80, 'name': 'Crime'}, {'id': 18, 'name...",,tt1210042,en,Brooklyn's Finest,Enforcing the law within the notoriously rough...,12.50422,...,1/16/09,133.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,This is War. This is Brooklyn.,Brooklyn's Finest,"[{'id': 255, 'name': 'male nudity'}, {'id': 29...","[{'cast_id': 3, 'character': 'Eddie Dugan', 'c...","[{'credit_id': '52fe4500c3a368484e042e7b', 'de...",29536299
2956,2957,,0,"[{'id': 28, 'name': 'Action'}, {'id': 12, 'nam...",,tt0091055,en,Firewalker,A pair of adventurers try to track down an anc...,3.150899,...,11/21/86,104.0,"[{'iso_639_1': 'hu', 'name': 'Magyar'}, {'iso_...",Released,A pair of down-and-out fortune hunters cash in...,Firewalker,"[{'id': 352, 'name': 'secret passage'}, {'id':...","[{'cast_id': 1, 'character': 'Max Donigan', 'c...","[{'credit_id': '52fe45199251416c7504bbf7', 'de...",11949484
2957,2958,,60000000,"[{'id': 28, 'name': 'Action'}, {'id': 35, 'nam...",,tt0113845,en,Money Train,A vengeful New York transit cop decides to ste...,7.337906,...,11/21/95,103.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,"Get on, or GET OUT THE WAY!",Money Train,"[{'id': 380, 'name': 'brother brother relation...","[{'cast_id': 1, 'character': 'John', 'credit_i...","[{'credit_id': '52fe44509251416c750305a1', 'de...",35431113
2958,2959,,15000000,"[{'id': 10402, 'name': 'Music'}, {'id': 99, 'n...",http://www.u23dmovie.com/,tt0892375,en,U2 3D,"A 3-D presentation of U2's global ""Vertigo"" to...",0.900864,...,5/19/07,85.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Hear See Experience,U2 3D,,"[{'cast_id': 0, 'character': 'Himself', 'credi...","[{'credit_id': '57bfd2089251414b1500001e', 'de...",22730842
2959,2960,,2300000,"[{'id': 18, 'name': 'Drama'}, {'id': 35, 'name...",,tt1093370,hi,Jab We Met,"Depressed after the passing of his father, Dha...",3.959411,...,10/26/07,145.0,"[{'iso_639_1': 'hi', 'name': 'हिन्दी'}, {'iso_...",Released,,Jab We Met,"[{'id': 2038, 'name': ""love of one's life""}, {...","[{'cast_id': 9, 'character': 'Aditya Kashyap',...","[{'credit_id': '537ef0d8c3a36805a1002844', 'de...",4600000


In [None]:
read_imdb_data.iloc[ -50:-40,  ] # last three rows and last 2 columns # No need to use .index here.

Unnamed: 0,id,belongs_to_collection,budget,genres,homepage,imdb_id,original_language,original_title,overview,popularity,...,release_date,runtime,spoken_languages,status,tagline,title,Keywords,cast,crew,revenue
2950,2951,"[{'id': 9735, 'name': 'Friday the 13th Collect...",30000000,"[{'id': 27, 'name': 'Horror'}]",,tt0329101,en,Freddy vs. Jason,Evil dream-demon Freddy Krueger devises a plan...,14.015739,...,8/15/03,97.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Evil Will Battle Evil,Freddy vs. Jason,"[{'id': 9826, 'name': 'murder'}, {'id': 10776,...","[{'cast_id': 1, 'character': 'Freddy Krueger',...","[{'credit_id': '52fe4453c3a36847f808f4a5', 'de...",114908830
2951,2952,,13000000,"[{'id': 18, 'name': 'Drama'}, {'id': 53, 'name...",,tt2057392,en,Eye in the Sky,A military officer in command of a drone opera...,10.564701,...,9/11/15,102.0,"[{'iso_639_1': 'so', 'name': 'Somali'}, {'iso_...",Released,Welcome to the new front line,Eye in the Sky,"[{'id': 949, 'name': 'terrorist'}, {'id': 1193...","[{'cast_id': 4, 'character': 'Colonel Katherin...","[{'credit_id': '55186708c3a36862f6004623', 'de...",18704595
2952,2953,,25000000,"[{'id': 14, 'name': 'Fantasy'}, {'id': 18, 'na...",http://theageofadalinemovie.com,tt1655441,en,The Age of Adaline,After 29-year-old Adaline recovers from a near...,10.542026,...,4/16/15,112.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Love is timeless.,The Age of Adaline,"[{'id': 582, 'name': 'san francisco'}, {'id': ...","[{'cast_id': 0, 'character': 'Adaline Bowman',...","[{'credit_id': '54721093c3a3686792002bc5', 'de...",65663276
2953,2954,,8500000,"[{'id': 28, 'name': 'Action'}, {'id': 80, 'nam...",,tt0202677,en,The Way of the Gun,Parker and Longbaugh are a pair of low-level p...,6.03633,...,9/8/00,119.0,"[{'iso_639_1': 'en', 'name': 'English'}, {'iso...",Released,,The Way of the Gun,"[{'id': 75, 'name': 'gunslinger'}, {'id': 534,...","[{'cast_id': 1, 'character': 'Mr. Longbaugh', ...","[{'credit_id': '52fe4305c3a36847f8034691', 'de...",19125401
2954,2955,,60000000,"[{'id': 35, 'name': 'Comedy'}]",,tt0375173,en,Alfie,"In Manhattan, the British limousine driver Alf...",10.411608,...,10/22/04,103.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Meet a man who never met a woman he didn't love.,Alfie,"[{'id': 242, 'name': 'new york'}, {'id': 1332,...","[{'cast_id': 1, 'character': 'Alfie', 'credit_...","[{'credit_id': '562643e89251413ded006a07', 'de...",13395939
2955,2956,,17000000,"[{'id': 80, 'name': 'Crime'}, {'id': 18, 'name...",,tt1210042,en,Brooklyn's Finest,Enforcing the law within the notoriously rough...,12.50422,...,1/16/09,133.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,This is War. This is Brooklyn.,Brooklyn's Finest,"[{'id': 255, 'name': 'male nudity'}, {'id': 29...","[{'cast_id': 3, 'character': 'Eddie Dugan', 'c...","[{'credit_id': '52fe4500c3a368484e042e7b', 'de...",29536299
2956,2957,,0,"[{'id': 28, 'name': 'Action'}, {'id': 12, 'nam...",,tt0091055,en,Firewalker,A pair of adventurers try to track down an anc...,3.150899,...,11/21/86,104.0,"[{'iso_639_1': 'hu', 'name': 'Magyar'}, {'iso_...",Released,A pair of down-and-out fortune hunters cash in...,Firewalker,"[{'id': 352, 'name': 'secret passage'}, {'id':...","[{'cast_id': 1, 'character': 'Max Donigan', 'c...","[{'credit_id': '52fe45199251416c7504bbf7', 'de...",11949484
2957,2958,,60000000,"[{'id': 28, 'name': 'Action'}, {'id': 35, 'nam...",,tt0113845,en,Money Train,A vengeful New York transit cop decides to ste...,7.337906,...,11/21/95,103.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,"Get on, or GET OUT THE WAY!",Money Train,"[{'id': 380, 'name': 'brother brother relation...","[{'cast_id': 1, 'character': 'John', 'credit_i...","[{'credit_id': '52fe44509251416c750305a1', 'de...",35431113
2958,2959,,15000000,"[{'id': 10402, 'name': 'Music'}, {'id': 99, 'n...",http://www.u23dmovie.com/,tt0892375,en,U2 3D,"A 3-D presentation of U2's global ""Vertigo"" to...",0.900864,...,5/19/07,85.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Hear See Experience,U2 3D,,"[{'cast_id': 0, 'character': 'Himself', 'credi...","[{'credit_id': '57bfd2089251414b1500001e', 'de...",22730842
2959,2960,,2300000,"[{'id': 18, 'name': 'Drama'}, {'id': 35, 'name...",,tt1093370,hi,Jab We Met,"Depressed after the passing of his father, Dha...",3.959411,...,10/26/07,145.0,"[{'iso_639_1': 'hi', 'name': 'हिन्दी'}, {'iso_...",Released,,Jab We Met,"[{'id': 2038, 'name': ""love of one's life""}, {...","[{'cast_id': 9, 'character': 'Aditya Kashyap',...","[{'credit_id': '537ef0d8c3a36805a1002844', 'de...",4600000


In [None]:
read_imdb_data.iloc[ [3,6,9],  [3,6,7] ] # Custom Indexes

Unnamed: 0,genres,original_language,original_title
3,"[{'id': 53, 'name': 'Thriller'}, {'id': 18, 'n...",hi,Kahaani
6,"[{'id': 27, 'name': 'Horror'}, {'id': 53, 'nam...",en,The Possession
9,"[{'id': 35, 'name': 'Comedy'}, {'id': 10402, '...",en,A Mighty Wind


In [None]:
# [True,True]+[False]*2998 # 3000 rows

In [None]:
read_imdb_data.shape

(3000, 23)

In [None]:
## Boolean Index
# Use of conditional and logical operator
list_boolean = [True,False,True]+[False]*2997
read_imdb_data.loc[list_boolean,] # Understand this interesting concept

Unnamed: 0,id,belongs_to_collection,budget,genres,homepage,imdb_id,original_language,original_title,overview,popularity,...,release_date,runtime,spoken_languages,status,tagline,title,Keywords,cast,crew,revenue
0,1,"[{'id': 313576, 'name': 'Hot Tub Time Machine ...",14000000,"[{'id': 35, 'name': 'Comedy'}]",,tt2637294,en,Hot Tub Time Machine 2,"When Lou, who has become the ""father of the In...",6.575393,...,2/20/15,93.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,The Laws of Space and Time are About to be Vio...,Hot Tub Time Machine 2,"[{'id': 4379, 'name': 'time travel'}, {'id': 9...","[{'cast_id': 4, 'character': 'Lou', 'credit_id...","[{'credit_id': '59ac067c92514107af02c8c8', 'de...",12314651
2,3,,3300000,"[{'id': 18, 'name': 'Drama'}]",http://sonyclassics.com/whiplash/,tt2582802,en,Whiplash,"Under the direction of a ruthless instructor, ...",64.29999,...,10/10/14,105.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,The road to greatness can take you to the edge.,Whiplash,"[{'id': 1416, 'name': 'jazz'}, {'id': 1523, 'n...","[{'cast_id': 5, 'character': 'Andrew Neimann',...","[{'credit_id': '54d5356ec3a3683ba0000039', 'de...",13092000


In [None]:
read_imdb_data.head(5)

Unnamed: 0,id,belongs_to_collection,budget,genres,homepage,imdb_id,original_language,original_title,overview,popularity,...,release_date,runtime,spoken_languages,status,tagline,title,Keywords,cast,crew,revenue
0,1,"[{'id': 313576, 'name': 'Hot Tub Time Machine ...",14000000,"[{'id': 35, 'name': 'Comedy'}]",,tt2637294,en,Hot Tub Time Machine 2,"When Lou, who has become the ""father of the In...",6.575393,...,2/20/15,93.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,The Laws of Space and Time are About to be Vio...,Hot Tub Time Machine 2,"[{'id': 4379, 'name': 'time travel'}, {'id': 9...","[{'cast_id': 4, 'character': 'Lou', 'credit_id...","[{'credit_id': '59ac067c92514107af02c8c8', 'de...",12314651
1,2,"[{'id': 107674, 'name': 'The Princess Diaries ...",40000000,"[{'id': 35, 'name': 'Comedy'}, {'id': 18, 'nam...",,tt0368933,en,The Princess Diaries 2: Royal Engagement,Mia Thermopolis is now a college graduate and ...,8.248895,...,8/6/04,113.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,It can take a lifetime to find true love; she'...,The Princess Diaries 2: Royal Engagement,"[{'id': 2505, 'name': 'coronation'}, {'id': 42...","[{'cast_id': 1, 'character': 'Mia Thermopolis'...","[{'credit_id': '52fe43fe9251416c7502563d', 'de...",95149435
2,3,,3300000,"[{'id': 18, 'name': 'Drama'}]",http://sonyclassics.com/whiplash/,tt2582802,en,Whiplash,"Under the direction of a ruthless instructor, ...",64.29999,...,10/10/14,105.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,The road to greatness can take you to the edge.,Whiplash,"[{'id': 1416, 'name': 'jazz'}, {'id': 1523, 'n...","[{'cast_id': 5, 'character': 'Andrew Neimann',...","[{'credit_id': '54d5356ec3a3683ba0000039', 'de...",13092000
3,4,,1200000,"[{'id': 53, 'name': 'Thriller'}, {'id': 18, 'n...",http://kahaanithefilm.com/,tt1821480,hi,Kahaani,Vidya Bagchi (Vidya Balan) arrives in Kolkata ...,3.174936,...,3/9/12,122.0,"[{'iso_639_1': 'en', 'name': 'English'}, {'iso...",Released,,Kahaani,"[{'id': 10092, 'name': 'mystery'}, {'id': 1054...","[{'cast_id': 1, 'character': 'Vidya Bagchi', '...","[{'credit_id': '52fe48779251416c9108d6eb', 'de...",16000000
4,5,,0,"[{'id': 28, 'name': 'Action'}, {'id': 53, 'nam...",,tt1380152,ko,마린보이,Marine Boy is the story of a former national s...,1.14807,...,2/5/09,118.0,"[{'iso_639_1': 'ko', 'name': '한국어/조선말'}]",Released,,Marine Boy,,"[{'cast_id': 3, 'character': 'Chun-soo', 'cred...","[{'credit_id': '52fe464b9251416c75073b43', 'de...",3923970


In [None]:
df

Unnamed: 0,name,age
A,John,28
B,David,20
C,Reva,32
D,Grahm,40


In [None]:
df.loc[ [True,False, False, True] , ]

Unnamed: 0,name,age
A,John,28
D,Grahm,40


In [None]:
df.iloc[ [True,False,True,True] , ]

Unnamed: 0,name,age
A,John,28
C,Reva,32
D,Grahm,40


In [None]:
filter_boolean = read_imdb_data.loc[:,['runtime']] > 120
print(filter_boolean)

      runtime
0       False
1       False
2       False
3        True
4       False
...       ...
2995    False
2996    False
2997    False
2998    False
2999    False

[3000 rows x 1 columns]


In [None]:
filter_boolean['runtime']

0       False
1       False
2       False
3        True
4       False
        ...  
2995    False
2996    False
2997    False
2998    False
2999    False
Name: runtime, Length: 3000, dtype: bool

In [None]:
filter_boolean = read_imdb_data.loc[:,['runtime']] > 120
read_imdb_data_120plus = read_imdb_data.loc[filter_boolean['runtime'],] # Conditional Subsetting

In [None]:
# read_imdb_data_120plus.head()

In [None]:
# ~[True, False, True] --> [False, True, False]

In [None]:
read_imdb_data.loc[~filter_boolean['runtime'],] # Reverse Conditional Subsetting

Unnamed: 0,id,belongs_to_collection,budget,genres,homepage,imdb_id,original_language,original_title,overview,popularity,...,release_date,runtime,spoken_languages,status,tagline,title,Keywords,cast,crew,revenue
0,1,"[{'id': 313576, 'name': 'Hot Tub Time Machine ...",14000000,"[{'id': 35, 'name': 'Comedy'}]",,tt2637294,en,Hot Tub Time Machine 2,"When Lou, who has become the ""father of the In...",6.575393,...,2/20/15,93.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,The Laws of Space and Time are About to be Vio...,Hot Tub Time Machine 2,"[{'id': 4379, 'name': 'time travel'}, {'id': 9...","[{'cast_id': 4, 'character': 'Lou', 'credit_id...","[{'credit_id': '59ac067c92514107af02c8c8', 'de...",12314651
1,2,"[{'id': 107674, 'name': 'The Princess Diaries ...",40000000,"[{'id': 35, 'name': 'Comedy'}, {'id': 18, 'nam...",,tt0368933,en,The Princess Diaries 2: Royal Engagement,Mia Thermopolis is now a college graduate and ...,8.248895,...,8/6/04,113.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,It can take a lifetime to find true love; she'...,The Princess Diaries 2: Royal Engagement,"[{'id': 2505, 'name': 'coronation'}, {'id': 42...","[{'cast_id': 1, 'character': 'Mia Thermopolis'...","[{'credit_id': '52fe43fe9251416c7502563d', 'de...",95149435
2,3,,3300000,"[{'id': 18, 'name': 'Drama'}]",http://sonyclassics.com/whiplash/,tt2582802,en,Whiplash,"Under the direction of a ruthless instructor, ...",64.299990,...,10/10/14,105.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,The road to greatness can take you to the edge.,Whiplash,"[{'id': 1416, 'name': 'jazz'}, {'id': 1523, 'n...","[{'cast_id': 5, 'character': 'Andrew Neimann',...","[{'credit_id': '54d5356ec3a3683ba0000039', 'de...",13092000
4,5,,0,"[{'id': 28, 'name': 'Action'}, {'id': 53, 'nam...",,tt1380152,ko,마린보이,Marine Boy is the story of a former national s...,1.148070,...,2/5/09,118.0,"[{'iso_639_1': 'ko', 'name': '한국어/조선말'}]",Released,,Marine Boy,,"[{'cast_id': 3, 'character': 'Chun-soo', 'cred...","[{'credit_id': '52fe464b9251416c75073b43', 'de...",3923970
5,6,,8000000,"[{'id': 16, 'name': 'Animation'}, {'id': 12, '...",,tt0093743,en,Pinocchio and the Emperor of the Night,"Pinocchio and his friends, a glow worm and a m...",0.743274,...,8/6/87,83.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,,Pinocchio and the Emperor of the Night,,"[{'cast_id': 6, 'character': 'Pinocchio (voice...","[{'credit_id': '52fe46f49251416c9106558b', 'de...",3261638
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2995,2996,,0,"[{'id': 35, 'name': 'Comedy'}, {'id': 10749, '...",,tt0109403,en,Chasers,Military men Rock Reilly and Eddie Devane are ...,9.853270,...,4/22/94,102.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,It was supposed to be a routine prisoner trans...,Chasers,"[{'id': 378, 'name': 'prison'}, {'id': 572, 'n...","[{'cast_id': 2, 'character': 'Rock Reilly', 'c...","[{'credit_id': '52fe4494c3a368484e02ac7d', 'de...",1596687
2996,2997,,0,"[{'id': 18, 'name': 'Drama'}, {'id': 10402, 'n...",,tt2364975,sv,Vi är bäst!,Three girls in 1980s Stockholm decide to form ...,3.727996,...,3/28/13,102.0,"[{'iso_639_1': 'sv', 'name': 'svenska'}]",Released,,We Are the Best!,"[{'id': 1192, 'name': 'sweden'}, {'id': 4470, ...","[{'cast_id': 5, 'character': 'Bobo', 'credit_i...","[{'credit_id': '5716b72ac3a3686678012c84', 'de...",180590
2997,2998,,65000000,"[{'id': 80, 'name': 'Crime'}, {'id': 28, 'name...",,tt0116908,en,The Long Kiss Goodnight,"Samantha Caine, suburban homemaker, is the ide...",14.482345,...,10/11/96,120.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,What's forgotten is not always gone.,The Long Kiss Goodnight,"[{'id': 441, 'name': 'assassination'}, {'id': ...","[{'cast_id': 10, 'character': 'Samantha Caine ...","[{'credit_id': '52fe443a9251416c7502d579', 'de...",89456761
2998,2999,,42000000,"[{'id': 35, 'name': 'Comedy'}, {'id': 10749, '...",http://www.alongcamepolly.com/,tt0343135,en,Along Came Polly,Reuben Feffer is a guy who's spent his entire ...,15.725542,...,1/16/04,90.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,"For the most cautious man on Earth, life is ab...",Along Came Polly,"[{'id': 966, 'name': 'beach'}, {'id': 2676, 'n...","[{'cast_id': 8, 'character': 'Reuben Feffer', ...","[{'credit_id': '556f817b9251410866000a63', 'de...",171963386


In [None]:
# Multiple conditions
condition1 = read_imdb_data.loc[:,['runtime']] > 120
condition2 = read_imdb_data.loc[:,['original_language']] =='en'
# read_imdb_data.loc[condition1['runtime'] & condition2['original_language'],] # and condition - Note : here & is used not 'and'
read_imdb_data.loc[condition1['runtime'] | condition2['original_language'],] # or condition - Note : here | is used not 'or'

Unnamed: 0,id,belongs_to_collection,budget,genres,homepage,imdb_id,original_language,original_title,overview,popularity,...,release_date,runtime,spoken_languages,status,tagline,title,Keywords,cast,crew,revenue
0,1,"[{'id': 313576, 'name': 'Hot Tub Time Machine ...",14000000,"[{'id': 35, 'name': 'Comedy'}]",,tt2637294,en,Hot Tub Time Machine 2,"When Lou, who has become the ""father of the In...",6.575393,...,2/20/15,93.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,The Laws of Space and Time are About to be Vio...,Hot Tub Time Machine 2,"[{'id': 4379, 'name': 'time travel'}, {'id': 9...","[{'cast_id': 4, 'character': 'Lou', 'credit_id...","[{'credit_id': '59ac067c92514107af02c8c8', 'de...",12314651
1,2,"[{'id': 107674, 'name': 'The Princess Diaries ...",40000000,"[{'id': 35, 'name': 'Comedy'}, {'id': 18, 'nam...",,tt0368933,en,The Princess Diaries 2: Royal Engagement,Mia Thermopolis is now a college graduate and ...,8.248895,...,8/6/04,113.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,It can take a lifetime to find true love; she'...,The Princess Diaries 2: Royal Engagement,"[{'id': 2505, 'name': 'coronation'}, {'id': 42...","[{'cast_id': 1, 'character': 'Mia Thermopolis'...","[{'credit_id': '52fe43fe9251416c7502563d', 'de...",95149435
2,3,,3300000,"[{'id': 18, 'name': 'Drama'}]",http://sonyclassics.com/whiplash/,tt2582802,en,Whiplash,"Under the direction of a ruthless instructor, ...",64.299990,...,10/10/14,105.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,The road to greatness can take you to the edge.,Whiplash,"[{'id': 1416, 'name': 'jazz'}, {'id': 1523, 'n...","[{'cast_id': 5, 'character': 'Andrew Neimann',...","[{'credit_id': '54d5356ec3a3683ba0000039', 'de...",13092000
3,4,,1200000,"[{'id': 53, 'name': 'Thriller'}, {'id': 18, 'n...",http://kahaanithefilm.com/,tt1821480,hi,Kahaani,Vidya Bagchi (Vidya Balan) arrives in Kolkata ...,3.174936,...,3/9/12,122.0,"[{'iso_639_1': 'en', 'name': 'English'}, {'iso...",Released,,Kahaani,"[{'id': 10092, 'name': 'mystery'}, {'id': 1054...","[{'cast_id': 1, 'character': 'Vidya Bagchi', '...","[{'credit_id': '52fe48779251416c9108d6eb', 'de...",16000000
5,6,,8000000,"[{'id': 16, 'name': 'Animation'}, {'id': 12, '...",,tt0093743,en,Pinocchio and the Emperor of the Night,"Pinocchio and his friends, a glow worm and a m...",0.743274,...,8/6/87,83.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,,Pinocchio and the Emperor of the Night,,"[{'cast_id': 6, 'character': 'Pinocchio (voice...","[{'credit_id': '52fe46f49251416c9106558b', 'de...",3261638
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2994,2995,,0,"[{'id': 18, 'name': 'Drama'}]",,tt0105327,en,School Ties,When David Greene receives a football scholars...,7.438381,...,9/18/92,106.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Just because you’re accepted doesn’t mean you ...,School Ties,"[{'id': 6075, 'name': 'sport'}, {'id': 10144, ...","[{'cast_id': 2, 'character': 'David Greene', '...","[{'credit_id': '5637777ac3a3681b4d01f9f5', 'de...",14715067
2995,2996,,0,"[{'id': 35, 'name': 'Comedy'}, {'id': 10749, '...",,tt0109403,en,Chasers,Military men Rock Reilly and Eddie Devane are ...,9.853270,...,4/22/94,102.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,It was supposed to be a routine prisoner trans...,Chasers,"[{'id': 378, 'name': 'prison'}, {'id': 572, 'n...","[{'cast_id': 2, 'character': 'Rock Reilly', 'c...","[{'credit_id': '52fe4494c3a368484e02ac7d', 'de...",1596687
2997,2998,,65000000,"[{'id': 80, 'name': 'Crime'}, {'id': 28, 'name...",,tt0116908,en,The Long Kiss Goodnight,"Samantha Caine, suburban homemaker, is the ide...",14.482345,...,10/11/96,120.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,What's forgotten is not always gone.,The Long Kiss Goodnight,"[{'id': 441, 'name': 'assassination'}, {'id': ...","[{'cast_id': 10, 'character': 'Samantha Caine ...","[{'credit_id': '52fe443a9251416c7502d579', 'de...",89456761
2998,2999,,42000000,"[{'id': 35, 'name': 'Comedy'}, {'id': 10749, '...",http://www.alongcamepolly.com/,tt0343135,en,Along Came Polly,Reuben Feffer is a guy who's spent his entire ...,15.725542,...,1/16/04,90.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,"For the most cautious man on Earth, life is ab...",Along Came Polly,"[{'id': 966, 'name': 'beach'}, {'id': 2676, 'n...","[{'cast_id': 8, 'character': 'Reuben Feffer', ...","[{'credit_id': '556f817b9251410866000a63', 'de...",171963386


In [None]:
# Important Points to remember :
# iloc vs loc
# --> loc : Index& columns names , iloc : Index number and Column number is considered
# --> loc : Range (a:b) , b is included, iloc:Range (a:b) , b is excluded

### Creating new columns, renaming & droping

#### Rename

In [None]:
# dataframe.rename(columns={'column_name':'new_column_name'},inplace=True) # Syntax

In [None]:
read_imdb_data.columns

Index(['id', 'belongs_to_collection', 'budget', 'genres', 'homepage',
       'imdb_id', 'original_language', 'original_title', 'overview',
       'popularity', 'poster_path', 'production_companies',
       'production_countries', 'release_date', 'runtime', 'spoken_languages',
       'status', 'tagline', 'title', 'Keywords', 'cast', 'crew', 'revenue'],
      dtype='object')

In [None]:
# read_imdb_data.rename(columns = {'budget':'movie_budget','cast':'movie_cast'})
read_imdb_data.rename(columns = {'budget':'movie_budget','cast':'movie_cast'}, inplace= True)

In [None]:
read_imdb_data.head(1)

Unnamed: 0,id,belongs_to_collection,movie_budget,genres,homepage,imdb_id,original_language,original_title,overview,popularity,...,release_date,runtime,spoken_languages,status,tagline,title,Keywords,movie_cast,crew,revenue
0,1,"[{'id': 313576, 'name': 'Hot Tub Time Machine ...",14000000,"[{'id': 35, 'name': 'Comedy'}]",,tt2637294,en,Hot Tub Time Machine 2,"When Lou, who has become the ""father of the In...",6.575393,...,2/20/15,93.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,The Laws of Space and Time are About to be Vio...,Hot Tub Time Machine 2,"[{'id': 4379, 'name': 'time travel'}, {'id': 9...","[{'cast_id': 4, 'character': 'Lou', 'credit_id...","[{'credit_id': '59ac067c92514107af02c8c8', 'de...",12314651


#### New Columns

In [None]:
# read_imdb_data['runtime'] #-- This feature is going to be removed

In [None]:
read_imdb_data.loc[:,'half_runtime'] = read_imdb_data.loc[:, 'runtime'] * 0.5
read_imdb_data

Unnamed: 0,id,belongs_to_collection,movie_budget,genres,homepage,imdb_id,original_language,original_title,overview,popularity,...,runtime,spoken_languages,status,tagline,title,Keywords,movie_cast,crew,revenue,half_runtime
0,1,"[{'id': 313576, 'name': 'Hot Tub Time Machine ...",14000000,"[{'id': 35, 'name': 'Comedy'}]",,tt2637294,en,Hot Tub Time Machine 2,"When Lou, who has become the ""father of the In...",6.575393,...,93.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,The Laws of Space and Time are About to be Vio...,Hot Tub Time Machine 2,"[{'id': 4379, 'name': 'time travel'}, {'id': 9...","[{'cast_id': 4, 'character': 'Lou', 'credit_id...","[{'credit_id': '59ac067c92514107af02c8c8', 'de...",12314651,46.5
1,2,"[{'id': 107674, 'name': 'The Princess Diaries ...",40000000,"[{'id': 35, 'name': 'Comedy'}, {'id': 18, 'nam...",,tt0368933,en,The Princess Diaries 2: Royal Engagement,Mia Thermopolis is now a college graduate and ...,8.248895,...,113.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,It can take a lifetime to find true love; she'...,The Princess Diaries 2: Royal Engagement,"[{'id': 2505, 'name': 'coronation'}, {'id': 42...","[{'cast_id': 1, 'character': 'Mia Thermopolis'...","[{'credit_id': '52fe43fe9251416c7502563d', 'de...",95149435,56.5
2,3,,3300000,"[{'id': 18, 'name': 'Drama'}]",http://sonyclassics.com/whiplash/,tt2582802,en,Whiplash,"Under the direction of a ruthless instructor, ...",64.299990,...,105.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,The road to greatness can take you to the edge.,Whiplash,"[{'id': 1416, 'name': 'jazz'}, {'id': 1523, 'n...","[{'cast_id': 5, 'character': 'Andrew Neimann',...","[{'credit_id': '54d5356ec3a3683ba0000039', 'de...",13092000,52.5
3,4,,1200000,"[{'id': 53, 'name': 'Thriller'}, {'id': 18, 'n...",http://kahaanithefilm.com/,tt1821480,hi,Kahaani,Vidya Bagchi (Vidya Balan) arrives in Kolkata ...,3.174936,...,122.0,"[{'iso_639_1': 'en', 'name': 'English'}, {'iso...",Released,,Kahaani,"[{'id': 10092, 'name': 'mystery'}, {'id': 1054...","[{'cast_id': 1, 'character': 'Vidya Bagchi', '...","[{'credit_id': '52fe48779251416c9108d6eb', 'de...",16000000,61.0
4,5,,0,"[{'id': 28, 'name': 'Action'}, {'id': 53, 'nam...",,tt1380152,ko,마린보이,Marine Boy is the story of a former national s...,1.148070,...,118.0,"[{'iso_639_1': 'ko', 'name': '한국어/조선말'}]",Released,,Marine Boy,,"[{'cast_id': 3, 'character': 'Chun-soo', 'cred...","[{'credit_id': '52fe464b9251416c75073b43', 'de...",3923970,59.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2995,2996,,0,"[{'id': 35, 'name': 'Comedy'}, {'id': 10749, '...",,tt0109403,en,Chasers,Military men Rock Reilly and Eddie Devane are ...,9.853270,...,102.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,It was supposed to be a routine prisoner trans...,Chasers,"[{'id': 378, 'name': 'prison'}, {'id': 572, 'n...","[{'cast_id': 2, 'character': 'Rock Reilly', 'c...","[{'credit_id': '52fe4494c3a368484e02ac7d', 'de...",1596687,51.0
2996,2997,,0,"[{'id': 18, 'name': 'Drama'}, {'id': 10402, 'n...",,tt2364975,sv,Vi är bäst!,Three girls in 1980s Stockholm decide to form ...,3.727996,...,102.0,"[{'iso_639_1': 'sv', 'name': 'svenska'}]",Released,,We Are the Best!,"[{'id': 1192, 'name': 'sweden'}, {'id': 4470, ...","[{'cast_id': 5, 'character': 'Bobo', 'credit_i...","[{'credit_id': '5716b72ac3a3686678012c84', 'de...",180590,51.0
2997,2998,,65000000,"[{'id': 80, 'name': 'Crime'}, {'id': 28, 'name...",,tt0116908,en,The Long Kiss Goodnight,"Samantha Caine, suburban homemaker, is the ide...",14.482345,...,120.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,What's forgotten is not always gone.,The Long Kiss Goodnight,"[{'id': 441, 'name': 'assassination'}, {'id': ...","[{'cast_id': 10, 'character': 'Samantha Caine ...","[{'credit_id': '52fe443a9251416c7502d579', 'de...",89456761,60.0
2998,2999,,42000000,"[{'id': 35, 'name': 'Comedy'}, {'id': 10749, '...",http://www.alongcamepolly.com/,tt0343135,en,Along Came Polly,Reuben Feffer is a guy who's spent his entire ...,15.725542,...,90.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,"For the most cautious man on Earth, life is ab...",Along Came Polly,"[{'id': 966, 'name': 'beach'}, {'id': 2676, 'n...","[{'cast_id': 8, 'character': 'Reuben Feffer', ...","[{'credit_id': '556f817b9251410866000a63', 'de...",171963386,45.0


In [None]:
read_imdb_data.loc[:,'revenue_runtime_ratio'] = (read_imdb_data.loc[:, 'revenue'] / read_imdb_data.loc[:, 'runtime'])
read_imdb_data

Unnamed: 0,id,belongs_to_collection,movie_budget,genres,homepage,imdb_id,original_language,original_title,overview,popularity,...,spoken_languages,status,tagline,title,Keywords,movie_cast,crew,revenue,half_runtime,revenue_runtime_ratio
0,1,"[{'id': 313576, 'name': 'Hot Tub Time Machine ...",14000000,"[{'id': 35, 'name': 'Comedy'}]",,tt2637294,en,Hot Tub Time Machine 2,"When Lou, who has become the ""father of the In...",6.575393,...,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,The Laws of Space and Time are About to be Vio...,Hot Tub Time Machine 2,"[{'id': 4379, 'name': 'time travel'}, {'id': 9...","[{'cast_id': 4, 'character': 'Lou', 'credit_id...","[{'credit_id': '59ac067c92514107af02c8c8', 'de...",12314651,46.5,1.324156e+05
1,2,"[{'id': 107674, 'name': 'The Princess Diaries ...",40000000,"[{'id': 35, 'name': 'Comedy'}, {'id': 18, 'nam...",,tt0368933,en,The Princess Diaries 2: Royal Engagement,Mia Thermopolis is now a college graduate and ...,8.248895,...,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,It can take a lifetime to find true love; she'...,The Princess Diaries 2: Royal Engagement,"[{'id': 2505, 'name': 'coronation'}, {'id': 42...","[{'cast_id': 1, 'character': 'Mia Thermopolis'...","[{'credit_id': '52fe43fe9251416c7502563d', 'de...",95149435,56.5,8.420304e+05
2,3,,3300000,"[{'id': 18, 'name': 'Drama'}]",http://sonyclassics.com/whiplash/,tt2582802,en,Whiplash,"Under the direction of a ruthless instructor, ...",64.299990,...,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,The road to greatness can take you to the edge.,Whiplash,"[{'id': 1416, 'name': 'jazz'}, {'id': 1523, 'n...","[{'cast_id': 5, 'character': 'Andrew Neimann',...","[{'credit_id': '54d5356ec3a3683ba0000039', 'de...",13092000,52.5,1.246857e+05
3,4,,1200000,"[{'id': 53, 'name': 'Thriller'}, {'id': 18, 'n...",http://kahaanithefilm.com/,tt1821480,hi,Kahaani,Vidya Bagchi (Vidya Balan) arrives in Kolkata ...,3.174936,...,"[{'iso_639_1': 'en', 'name': 'English'}, {'iso...",Released,,Kahaani,"[{'id': 10092, 'name': 'mystery'}, {'id': 1054...","[{'cast_id': 1, 'character': 'Vidya Bagchi', '...","[{'credit_id': '52fe48779251416c9108d6eb', 'de...",16000000,61.0,1.311475e+05
4,5,,0,"[{'id': 28, 'name': 'Action'}, {'id': 53, 'nam...",,tt1380152,ko,마린보이,Marine Boy is the story of a former national s...,1.148070,...,"[{'iso_639_1': 'ko', 'name': '한국어/조선말'}]",Released,,Marine Boy,,"[{'cast_id': 3, 'character': 'Chun-soo', 'cred...","[{'credit_id': '52fe464b9251416c75073b43', 'de...",3923970,59.0,3.325398e+04
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2995,2996,,0,"[{'id': 35, 'name': 'Comedy'}, {'id': 10749, '...",,tt0109403,en,Chasers,Military men Rock Reilly and Eddie Devane are ...,9.853270,...,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,It was supposed to be a routine prisoner trans...,Chasers,"[{'id': 378, 'name': 'prison'}, {'id': 572, 'n...","[{'cast_id': 2, 'character': 'Rock Reilly', 'c...","[{'credit_id': '52fe4494c3a368484e02ac7d', 'de...",1596687,51.0,1.565379e+04
2996,2997,,0,"[{'id': 18, 'name': 'Drama'}, {'id': 10402, 'n...",,tt2364975,sv,Vi är bäst!,Three girls in 1980s Stockholm decide to form ...,3.727996,...,"[{'iso_639_1': 'sv', 'name': 'svenska'}]",Released,,We Are the Best!,"[{'id': 1192, 'name': 'sweden'}, {'id': 4470, ...","[{'cast_id': 5, 'character': 'Bobo', 'credit_i...","[{'credit_id': '5716b72ac3a3686678012c84', 'de...",180590,51.0,1.770490e+03
2997,2998,,65000000,"[{'id': 80, 'name': 'Crime'}, {'id': 28, 'name...",,tt0116908,en,The Long Kiss Goodnight,"Samantha Caine, suburban homemaker, is the ide...",14.482345,...,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,What's forgotten is not always gone.,The Long Kiss Goodnight,"[{'id': 441, 'name': 'assassination'}, {'id': ...","[{'cast_id': 10, 'character': 'Samantha Caine ...","[{'credit_id': '52fe443a9251416c7502d579', 'de...",89456761,60.0,7.454730e+05
2998,2999,,42000000,"[{'id': 35, 'name': 'Comedy'}, {'id': 10749, '...",http://www.alongcamepolly.com/,tt0343135,en,Along Came Polly,Reuben Feffer is a guy who's spent his entire ...,15.725542,...,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,"For the most cautious man on Earth, life is ab...",Along Came Polly,"[{'id': 966, 'name': 'beach'}, {'id': 2676, 'n...","[{'cast_id': 8, 'character': 'Reuben Feffer', ...","[{'credit_id': '556f817b9251410866000a63', 'de...",171963386,45.0,1.910704e+06


In [None]:
read_imdb_data['check'] = list(range(1,3001))

In [None]:
# len(list(range(1,3001)))

In [None]:
read_imdb_data.head()

Unnamed: 0,id,belongs_to_collection,movie_budget,genres,homepage,imdb_id,original_language,original_title,overview,popularity,...,status,tagline,title,Keywords,movie_cast,crew,revenue,half_runtime,revenue_runtime_ratio,check
0,1,"[{'id': 313576, 'name': 'Hot Tub Time Machine ...",14000000,"[{'id': 35, 'name': 'Comedy'}]",,tt2637294,en,Hot Tub Time Machine 2,"When Lou, who has become the ""father of the In...",6.575393,...,Released,The Laws of Space and Time are About to be Vio...,Hot Tub Time Machine 2,"[{'id': 4379, 'name': 'time travel'}, {'id': 9...","[{'cast_id': 4, 'character': 'Lou', 'credit_id...","[{'credit_id': '59ac067c92514107af02c8c8', 'de...",12314651,46.5,132415.602151,1
1,2,"[{'id': 107674, 'name': 'The Princess Diaries ...",40000000,"[{'id': 35, 'name': 'Comedy'}, {'id': 18, 'nam...",,tt0368933,en,The Princess Diaries 2: Royal Engagement,Mia Thermopolis is now a college graduate and ...,8.248895,...,Released,It can take a lifetime to find true love; she'...,The Princess Diaries 2: Royal Engagement,"[{'id': 2505, 'name': 'coronation'}, {'id': 42...","[{'cast_id': 1, 'character': 'Mia Thermopolis'...","[{'credit_id': '52fe43fe9251416c7502563d', 'de...",95149435,56.5,842030.39823,2
2,3,,3300000,"[{'id': 18, 'name': 'Drama'}]",http://sonyclassics.com/whiplash/,tt2582802,en,Whiplash,"Under the direction of a ruthless instructor, ...",64.29999,...,Released,The road to greatness can take you to the edge.,Whiplash,"[{'id': 1416, 'name': 'jazz'}, {'id': 1523, 'n...","[{'cast_id': 5, 'character': 'Andrew Neimann',...","[{'credit_id': '54d5356ec3a3683ba0000039', 'de...",13092000,52.5,124685.714286,3
3,4,,1200000,"[{'id': 53, 'name': 'Thriller'}, {'id': 18, 'n...",http://kahaanithefilm.com/,tt1821480,hi,Kahaani,Vidya Bagchi (Vidya Balan) arrives in Kolkata ...,3.174936,...,Released,,Kahaani,"[{'id': 10092, 'name': 'mystery'}, {'id': 1054...","[{'cast_id': 1, 'character': 'Vidya Bagchi', '...","[{'credit_id': '52fe48779251416c9108d6eb', 'de...",16000000,61.0,131147.540984,4
4,5,,0,"[{'id': 28, 'name': 'Action'}, {'id': 53, 'nam...",,tt1380152,ko,마린보이,Marine Boy is the story of a former national s...,1.14807,...,Released,,Marine Boy,,"[{'cast_id': 3, 'character': 'Chun-soo', 'cred...","[{'credit_id': '52fe464b9251416c75073b43', 'de...",3923970,59.0,33253.983051,5


#### Drop columns & rows

In [None]:
read_imdb_data.drop(['check'],axis = 1, inplace = True) # Drop column
read_imdb_data.head(1)

Unnamed: 0,id,belongs_to_collection,movie_budget,genres,homepage,imdb_id,original_language,original_title,overview,popularity,...,spoken_languages,status,tagline,title,Keywords,movie_cast,crew,revenue,half_runtime,revenue_runtime_ratio
0,1,"[{'id': 313576, 'name': 'Hot Tub Time Machine ...",14000000,"[{'id': 35, 'name': 'Comedy'}]",,tt2637294,en,Hot Tub Time Machine 2,"When Lou, who has become the ""father of the In...",6.575393,...,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,The Laws of Space and Time are About to be Vio...,Hot Tub Time Machine 2,"[{'id': 4379, 'name': 'time travel'}, {'id': 9...","[{'cast_id': 4, 'character': 'Lou', 'credit_id...","[{'credit_id': '59ac067c92514107af02c8c8', 'de...",12314651,46.5,132415.602151


In [None]:
read_imdb_data.drop([0,1,2], axis = 0) # Drop rows

Unnamed: 0,id,belongs_to_collection,movie_budget,genres,homepage,imdb_id,original_language,original_title,overview,popularity,...,spoken_languages,status,tagline,title,Keywords,movie_cast,crew,revenue,half_runtime,revenue_runtime_ratio
3,4,,1200000,"[{'id': 53, 'name': 'Thriller'}, {'id': 18, 'n...",http://kahaanithefilm.com/,tt1821480,hi,Kahaani,Vidya Bagchi (Vidya Balan) arrives in Kolkata ...,3.174936,...,"[{'iso_639_1': 'en', 'name': 'English'}, {'iso...",Released,,Kahaani,"[{'id': 10092, 'name': 'mystery'}, {'id': 1054...","[{'cast_id': 1, 'character': 'Vidya Bagchi', '...","[{'credit_id': '52fe48779251416c9108d6eb', 'de...",16000000,61.0,1.311475e+05
4,5,,0,"[{'id': 28, 'name': 'Action'}, {'id': 53, 'nam...",,tt1380152,ko,마린보이,Marine Boy is the story of a former national s...,1.148070,...,"[{'iso_639_1': 'ko', 'name': '한국어/조선말'}]",Released,,Marine Boy,,"[{'cast_id': 3, 'character': 'Chun-soo', 'cred...","[{'credit_id': '52fe464b9251416c75073b43', 'de...",3923970,59.0,3.325398e+04
5,6,,8000000,"[{'id': 16, 'name': 'Animation'}, {'id': 12, '...",,tt0093743,en,Pinocchio and the Emperor of the Night,"Pinocchio and his friends, a glow worm and a m...",0.743274,...,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,,Pinocchio and the Emperor of the Night,,"[{'cast_id': 6, 'character': 'Pinocchio (voice...","[{'credit_id': '52fe46f49251416c9106558b', 'de...",3261638,41.5,3.929684e+04
6,7,,14000000,"[{'id': 27, 'name': 'Horror'}, {'id': 53, 'nam...",http://www.thepossessionmovie.com/,tt0431021,en,The Possession,A young girl buys an antique box at a yard sal...,7.286477,...,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Fear The Demon That Doesn't Fear God,The Possession,,"[{'cast_id': 23, 'character': 'Clyde', 'credit...","[{'credit_id': '52fe4981c3a368484e12ee29', 'de...",85446075,46.0,9.287617e+05
7,8,,0,"[{'id': 99, 'name': 'Documentary'}]",,tt0391024,en,Control Room,A chronicle which provides a rare window into ...,1.949044,...,"[{'iso_639_1': 'ar', 'name': 'العربية'}, {'iso...",Released,Different channels. Different truths.,Control Room,"[{'id': 917, 'name': 'journalism'}, {'id': 163...","[{'cast_id': 2, 'character': 'Himself', 'credi...","[{'credit_id': '52fe47a69251416c750a0daf', 'de...",2586511,42.0,3.079180e+04
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2995,2996,,0,"[{'id': 35, 'name': 'Comedy'}, {'id': 10749, '...",,tt0109403,en,Chasers,Military men Rock Reilly and Eddie Devane are ...,9.853270,...,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,It was supposed to be a routine prisoner trans...,Chasers,"[{'id': 378, 'name': 'prison'}, {'id': 572, 'n...","[{'cast_id': 2, 'character': 'Rock Reilly', 'c...","[{'credit_id': '52fe4494c3a368484e02ac7d', 'de...",1596687,51.0,1.565379e+04
2996,2997,,0,"[{'id': 18, 'name': 'Drama'}, {'id': 10402, 'n...",,tt2364975,sv,Vi är bäst!,Three girls in 1980s Stockholm decide to form ...,3.727996,...,"[{'iso_639_1': 'sv', 'name': 'svenska'}]",Released,,We Are the Best!,"[{'id': 1192, 'name': 'sweden'}, {'id': 4470, ...","[{'cast_id': 5, 'character': 'Bobo', 'credit_i...","[{'credit_id': '5716b72ac3a3686678012c84', 'de...",180590,51.0,1.770490e+03
2997,2998,,65000000,"[{'id': 80, 'name': 'Crime'}, {'id': 28, 'name...",,tt0116908,en,The Long Kiss Goodnight,"Samantha Caine, suburban homemaker, is the ide...",14.482345,...,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,What's forgotten is not always gone.,The Long Kiss Goodnight,"[{'id': 441, 'name': 'assassination'}, {'id': ...","[{'cast_id': 10, 'character': 'Samantha Caine ...","[{'credit_id': '52fe443a9251416c7502d579', 'de...",89456761,60.0,7.454730e+05
2998,2999,,42000000,"[{'id': 35, 'name': 'Comedy'}, {'id': 10749, '...",http://www.alongcamepolly.com/,tt0343135,en,Along Came Polly,Reuben Feffer is a guy who's spent his entire ...,15.725542,...,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,"For the most cautious man on Earth, life is ab...",Along Came Polly,"[{'id': 966, 'name': 'beach'}, {'id': 2676, 'n...","[{'cast_id': 8, 'character': 'Reuben Feffer', ...","[{'credit_id': '556f817b9251410866000a63', 'de...",171963386,45.0,1.910704e+06


In [None]:
# .drop : names of index or column, axis default is 0

### Handling Missing Values

In [None]:
read_imdb_data.head(1)

Unnamed: 0,id,belongs_to_collection,budget,genres,homepage,imdb_id,original_language,original_title,overview,popularity,...,release_date,runtime,spoken_languages,status,tagline,title,Keywords,cast,crew,revenue
0,1,"[{'id': 313576, 'name': 'Hot Tub Time Machine ...",14000000,"[{'id': 35, 'name': 'Comedy'}]",,tt2637294,en,Hot Tub Time Machine 2,"When Lou, who has become the ""father of the In...",6.575393,...,2/20/15,93.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,The Laws of Space and Time are About to be Vio...,Hot Tub Time Machine 2,"[{'id': 4379, 'name': 'time travel'}, {'id': 9...","[{'cast_id': 4, 'character': 'Lou', 'credit_id...","[{'credit_id': '59ac067c92514107af02c8c8', 'de...",12314651


In [None]:
# Identify, Drop or fill
# isna(),dropna(), fillna()

In [None]:
3000-604

2396

In [None]:
read_imdb_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3000 entries, 0 to 2999
Data columns (total 23 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   id                     3000 non-null   int64  
 1   belongs_to_collection  604 non-null    object 
 2   budget                 3000 non-null   int64  
 3   genres                 2993 non-null   object 
 4   homepage               946 non-null    object 
 5   imdb_id                3000 non-null   object 
 6   original_language      3000 non-null   object 
 7   original_title         3000 non-null   object 
 8   overview               2992 non-null   object 
 9   popularity             3000 non-null   float64
 10  poster_path            2999 non-null   object 
 11  production_companies   2844 non-null   object 
 12  production_countries   2945 non-null   object 
 13  release_date           3000 non-null   object 
 14  runtime                2998 non-null   float64
 15  spok

In [None]:
read_imdb_data.loc[:,'belongs_to_collection'].head(10)

0    [{'id': 313576, 'name': 'Hot Tub Time Machine ...
1    [{'id': 107674, 'name': 'The Princess Diaries ...
2                                                  NaN
3                                                  NaN
4                                                  NaN
5                                                  NaN
6                                                  NaN
7                                                  NaN
8    [{'id': 256377, 'name': 'The Muppet Collection...
9                                                  NaN
Name: belongs_to_collection, dtype: object

In [None]:
sum(read_imdb_data.loc[:,'belongs_to_collection'].isna())/read_imdb_data.shape[0]

0.7986666666666666

In [None]:
read_imdb_data.loc[:,'belongs_to_collection'].isna()

0       False
1       False
2        True
3        True
4        True
        ...  
2995     True
2996     True
2997     True
2998     True
2999     True
Name: belongs_to_collection, Length: 3000, dtype: bool

In [None]:
read_imdb_data.belongs_to_collection.isna() #Method 1

0       False
1       False
2        True
3        True
4        True
        ...  
2995     True
2996     True
2997     True
2998     True
2999     True
Name: belongs_to_collection, Length: 3000, dtype: bool

In [None]:
read_imdb_data['belongs_to_collection'].isna() # Method 2

0       False
1       False
2        True
3        True
4        True
        ...  
2995     True
2996     True
2997     True
2998     True
2999     True
Name: belongs_to_collection, Length: 3000, dtype: bool

In [None]:
read_imdb_data.loc[:,'belongs_to_collection'].isna()

0       False
1       False
2        True
3        True
4        True
        ...  
2995     True
2996     True
2997     True
2998     True
2999     True
Name: belongs_to_collection, Length: 3000, dtype: bool

In [None]:
# Identify - subset the data as per the null values
read_imdb_data.loc[read_imdb_data.belongs_to_collection.isna(),].head(2)

Unnamed: 0,id,belongs_to_collection,budget,genres,homepage,imdb_id,original_language,original_title,overview,popularity,...,release_date,runtime,spoken_languages,status,tagline,title,Keywords,cast,crew,revenue
2,3,,3300000,"[{'id': 18, 'name': 'Drama'}]",http://sonyclassics.com/whiplash/,tt2582802,en,Whiplash,"Under the direction of a ruthless instructor, ...",64.29999,...,10/10/14,105.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,The road to greatness can take you to the edge.,Whiplash,"[{'id': 1416, 'name': 'jazz'}, {'id': 1523, 'n...","[{'cast_id': 5, 'character': 'Andrew Neimann',...","[{'credit_id': '54d5356ec3a3683ba0000039', 'de...",13092000
3,4,,1200000,"[{'id': 53, 'name': 'Thriller'}, {'id': 18, 'n...",http://kahaanithefilm.com/,tt1821480,hi,Kahaani,Vidya Bagchi (Vidya Balan) arrives in Kolkata ...,3.174936,...,3/9/12,122.0,"[{'iso_639_1': 'en', 'name': 'English'}, {'iso...",Released,,Kahaani,"[{'id': 10092, 'name': 'mystery'}, {'id': 1054...","[{'cast_id': 1, 'character': 'Vidya Bagchi', '...","[{'credit_id': '52fe48779251416c9108d6eb', 'de...",16000000


In [None]:
read_imdb_data.dropna() # Remove all rows having Na anywhere

Unnamed: 0,id,belongs_to_collection,budget,genres,homepage,imdb_id,original_language,original_title,overview,popularity,...,release_date,runtime,spoken_languages,status,tagline,title,Keywords,cast,crew,revenue
22,23,"[{'id': 207621, 'name': 'V/H/S Collection', 'p...",0,"[{'id': 53, 'name': 'Thriller'}, {'id': 27, 'n...",http://www.magnetreleasing.com/vhs/,tt2105044,en,V/H/S,When a group of misfits is hired by an unknown...,7.820787,...,7/28/12,116.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,This collection is killer.,V/H/S,"[{'id': 6435, 'name': 'webcam'}, {'id': 9706, ...","[{'cast_id': 1, 'character': 'Gary', 'credit_i...","[{'credit_id': '52fe48ee9251416c9109d113', 'de...",100345
40,41,"[{'id': 376970, 'name': 'Cocaine Cowboys', 'po...",0,"[{'id': 28, 'name': 'Action'}, {'id': 99, 'nam...",http://www.magpictures.com/profile.aspx?id=983...,tt0380268,en,Cocaine Cowboys,"In the 1980s, ruthless Colombian cocaine baron...",2.941626,...,4/26/06,118.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,How Miami became the cocaine capital of the Un...,Cocaine Cowboys,"[{'id': 2150, 'name': 'cocaine'}, {'id': 2231,...","[{'cast_id': 2, 'character': 'Himself', 'credi...","[{'credit_id': '52fe46179251416c7506cc15', 'de...",163000
46,47,"[{'id': 8354, 'name': 'Ice Age Collection', 'p...",80000000,"[{'id': 16, 'name': 'Animation'}, {'id': 10751...",http://www.iceagemovies.com/films/ice-age-the-...,tt0438097,en,Ice Age: The Meltdown,"Diego, Manny and Sid return in this sequel to ...",16.646029,...,3/23/06,91.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,The Ice age is melting away.,Ice Age: The Meltdown,"[{'id': 2078, 'name': 'mammoth'}, {'id': 2079,...","[{'cast_id': 1, 'character': 'Manny (voice)', ...","[{'credit_id': '52fe4292c3a36847f80292c9', 'de...",660940780
60,61,"[{'id': 645, 'name': 'James Bond Collection', ...",150000000,"[{'id': 12, 'name': 'Adventure'}, {'id': 28, '...",http://www.mgm.com/#/our-titles/233/Casino-Roy...,tt0381061,en,Casino Royale,"Le Chiffre, a banker to the world's terrorists...",23.065078,...,11/14/06,144.0,"[{'iso_639_1': 'en', 'name': 'English'}, {'iso...",Released,Everyone has a past. Every legend has a beginn...,Casino Royale,"[{'id': 131, 'name': 'italy'}, {'id': 383, 'na...","[{'cast_id': 1, 'character': 'James Bond', 'cr...","[{'credit_id': '53ac1fd7c3a3684bc8001908', 'de...",599045960
69,70,"[{'id': 9485, 'name': 'The Fast and the Furiou...",85000000,"[{'id': 28, 'name': 'Action'}, {'id': 80, 'nam...",http://www.thefastandthefurious3.com/,tt0463985,en,The Fast and the Furious: Tokyo Drift,"In order to avoid a jail sentence, Sean Boswel...",2.238808,...,6/3/06,104.0,"[{'iso_639_1': 'en', 'name': 'English'}, {'iso...",Released,"On the streets of Tokyo, speed needs no transl...",The Fast and the Furious: Tokyo Drift,"[{'id': 830, 'name': 'car race'}, {'id': 1926,...","[{'cast_id': 12, 'character': 'Sean Boswell', ...","[{'credit_id': '56784ed792514111900002c8', 'de...",158468292
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2904,2905,"[{'id': 424202, 'name': 'Trainspotting Collect...",4000000,"[{'id': 18, 'name': 'Drama'}, {'id': 80, 'name...",http://www.miramax.com/movie/trainspotting/,tt0117951,en,Trainspotting,"Renton, deeply immersed in the Edinburgh drug ...",19.348466,...,2/23/96,93.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Choose life.,Trainspotting,"[{'id': 212, 'name': 'london england'}, {'id':...","[{'cast_id': 17, 'character': 'Mark Renton', '...","[{'credit_id': '52fe4260c3a36847f80198c1', 'de...",16491080
2927,2928,"[{'id': 645, 'name': 'James Bond Collection', ...",140000000,"[{'id': 12, 'name': 'Adventure'}, {'id': 28, '...",http://www.mgm.com/view/movie/232/Die-Another-...,tt0246460,en,Die Another Day,Bond takes on a North Korean leader who underg...,12.996474,...,11/17/02,133.0,"[{'iso_639_1': 'en', 'name': 'English'}, {'iso...",Released,He’s never been cooler.,Die Another Day,"[{'id': 3290, 'name': 'laser'}, {'id': 156095,...","[{'cast_id': 20, 'character': 'James Bond', 'c...","[{'credit_id': '52fe45ff9251416c91045a7f', 'de...",431971116
2938,2939,"[{'id': 135416, 'name': 'Prometheus Collection...",130000000,"[{'id': 878, 'name': 'Science Fiction'}, {'id'...",http://www.projectprometheus.com/,tt1446714,en,Prometheus,A team of explorers discover a clue to the ori...,16.624854,...,5/30/12,124.0,"[{'iso_639_1': 'gd', 'name': ''}, {'iso_639_1'...",Released,The Search for Our Beginning Could Lead to Our...,Prometheus,"[{'id': 803, 'name': 'android'}, {'id': 4565, ...","[{'cast_id': 5, 'character': 'Dr. Elizabeth Sh...","[{'credit_id': '52fe481dc3a368484e0e9db7', 'de...",403170142
2968,2969,"[{'id': 97307, 'name': 'BloodRayne Collection'...",25000000,"[{'id': 28, 'name': 'Action'}, {'id': 12, 'nam...",http://www.bloodrayne-themovie.com/main/index....,tt0383222,en,BloodRayne,"In eighteenth century Romania, Rayne, a dhampi...",6.514132,...,10/22/05,95.0,"[{'iso_639_1': 'en', 'name': 'English'}, {'iso...",Released,Revenge never tasted so sweet.,BloodRayne,"[{'id': 3133, 'name': 'vampire'}, {'id': 9259,...","[{'cast_id': 3, 'character': 'Rayne', 'credit_...","[{'credit_id': '52fe4cd4c3a36847f82407f7', 'de...",2405420


In [None]:
read_imdb_data.shape

(3000, 23)

In [None]:
# read_imdb_data.loc[~read_imdb_data.loc[:,'belongs_to_collection'].isna(),]

In [None]:
# fillna()
read_imdb_data_copy = read_imdb_data.copy()

In [None]:
# Full data impute with some value
read_imdb_data_copy.fillna('Value not available')

Unnamed: 0,id,belongs_to_collection,budget,genres,homepage,imdb_id,original_language,original_title,overview,popularity,...,release_date,runtime,spoken_languages,status,tagline,title,Keywords,cast,crew,revenue
0,1,"[{'id': 313576, 'name': 'Hot Tub Time Machine ...",14000000,"[{'id': 35, 'name': 'Comedy'}]",Value not available,tt2637294,en,Hot Tub Time Machine 2,"When Lou, who has become the ""father of the In...",6.575393,...,2/20/15,93.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,The Laws of Space and Time are About to be Vio...,Hot Tub Time Machine 2,"[{'id': 4379, 'name': 'time travel'}, {'id': 9...","[{'cast_id': 4, 'character': 'Lou', 'credit_id...","[{'credit_id': '59ac067c92514107af02c8c8', 'de...",12314651
1,2,"[{'id': 107674, 'name': 'The Princess Diaries ...",40000000,"[{'id': 35, 'name': 'Comedy'}, {'id': 18, 'nam...",Value not available,tt0368933,en,The Princess Diaries 2: Royal Engagement,Mia Thermopolis is now a college graduate and ...,8.248895,...,8/6/04,113.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,It can take a lifetime to find true love; she'...,The Princess Diaries 2: Royal Engagement,"[{'id': 2505, 'name': 'coronation'}, {'id': 42...","[{'cast_id': 1, 'character': 'Mia Thermopolis'...","[{'credit_id': '52fe43fe9251416c7502563d', 'de...",95149435
2,3,Value not available,3300000,"[{'id': 18, 'name': 'Drama'}]",http://sonyclassics.com/whiplash/,tt2582802,en,Whiplash,"Under the direction of a ruthless instructor, ...",64.299990,...,10/10/14,105.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,The road to greatness can take you to the edge.,Whiplash,"[{'id': 1416, 'name': 'jazz'}, {'id': 1523, 'n...","[{'cast_id': 5, 'character': 'Andrew Neimann',...","[{'credit_id': '54d5356ec3a3683ba0000039', 'de...",13092000
3,4,Value not available,1200000,"[{'id': 53, 'name': 'Thriller'}, {'id': 18, 'n...",http://kahaanithefilm.com/,tt1821480,hi,Kahaani,Vidya Bagchi (Vidya Balan) arrives in Kolkata ...,3.174936,...,3/9/12,122.0,"[{'iso_639_1': 'en', 'name': 'English'}, {'iso...",Released,Value not available,Kahaani,"[{'id': 10092, 'name': 'mystery'}, {'id': 1054...","[{'cast_id': 1, 'character': 'Vidya Bagchi', '...","[{'credit_id': '52fe48779251416c9108d6eb', 'de...",16000000
4,5,Value not available,0,"[{'id': 28, 'name': 'Action'}, {'id': 53, 'nam...",Value not available,tt1380152,ko,마린보이,Marine Boy is the story of a former national s...,1.148070,...,2/5/09,118.0,"[{'iso_639_1': 'ko', 'name': '한국어/조선말'}]",Released,Value not available,Marine Boy,Value not available,"[{'cast_id': 3, 'character': 'Chun-soo', 'cred...","[{'credit_id': '52fe464b9251416c75073b43', 'de...",3923970
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2995,2996,Value not available,0,"[{'id': 35, 'name': 'Comedy'}, {'id': 10749, '...",Value not available,tt0109403,en,Chasers,Military men Rock Reilly and Eddie Devane are ...,9.853270,...,4/22/94,102.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,It was supposed to be a routine prisoner trans...,Chasers,"[{'id': 378, 'name': 'prison'}, {'id': 572, 'n...","[{'cast_id': 2, 'character': 'Rock Reilly', 'c...","[{'credit_id': '52fe4494c3a368484e02ac7d', 'de...",1596687
2996,2997,Value not available,0,"[{'id': 18, 'name': 'Drama'}, {'id': 10402, 'n...",Value not available,tt2364975,sv,Vi är bäst!,Three girls in 1980s Stockholm decide to form ...,3.727996,...,3/28/13,102.0,"[{'iso_639_1': 'sv', 'name': 'svenska'}]",Released,Value not available,We Are the Best!,"[{'id': 1192, 'name': 'sweden'}, {'id': 4470, ...","[{'cast_id': 5, 'character': 'Bobo', 'credit_i...","[{'credit_id': '5716b72ac3a3686678012c84', 'de...",180590
2997,2998,Value not available,65000000,"[{'id': 80, 'name': 'Crime'}, {'id': 28, 'name...",Value not available,tt0116908,en,The Long Kiss Goodnight,"Samantha Caine, suburban homemaker, is the ide...",14.482345,...,10/11/96,120.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,What's forgotten is not always gone.,The Long Kiss Goodnight,"[{'id': 441, 'name': 'assassination'}, {'id': ...","[{'cast_id': 10, 'character': 'Samantha Caine ...","[{'credit_id': '52fe443a9251416c7502d579', 'de...",89456761
2998,2999,Value not available,42000000,"[{'id': 35, 'name': 'Comedy'}, {'id': 10749, '...",http://www.alongcamepolly.com/,tt0343135,en,Along Came Polly,Reuben Feffer is a guy who's spent his entire ...,15.725542,...,1/16/04,90.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,"For the most cautious man on Earth, life is ab...",Along Came Polly,"[{'id': 966, 'name': 'beach'}, {'id': 2676, 'n...","[{'cast_id': 8, 'character': 'Reuben Feffer', ...","[{'credit_id': '556f817b9251410866000a63', 'de...",171963386


In [None]:
# Fillna on specific column
read_imdb_data_copy.belongs_to_collection = read_imdb_data_copy.belongs_to_collection.fillna(' Missing Value')

In [None]:
read_imdb_data_copy.head(5)


Unnamed: 0,id,belongs_to_collection,budget,genres,homepage,imdb_id,original_language,original_title,overview,popularity,...,release_date,runtime,spoken_languages,status,tagline,title,Keywords,cast,crew,revenue
0,1,"[{'id': 313576, 'name': 'Hot Tub Time Machine ...",14000000,"[{'id': 35, 'name': 'Comedy'}]",,tt2637294,en,Hot Tub Time Machine 2,"When Lou, who has become the ""father of the In...",6.575393,...,2/20/15,93.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,The Laws of Space and Time are About to be Vio...,Hot Tub Time Machine 2,"[{'id': 4379, 'name': 'time travel'}, {'id': 9...","[{'cast_id': 4, 'character': 'Lou', 'credit_id...","[{'credit_id': '59ac067c92514107af02c8c8', 'de...",12314651
1,2,"[{'id': 107674, 'name': 'The Princess Diaries ...",40000000,"[{'id': 35, 'name': 'Comedy'}, {'id': 18, 'nam...",,tt0368933,en,The Princess Diaries 2: Royal Engagement,Mia Thermopolis is now a college graduate and ...,8.248895,...,8/6/04,113.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,It can take a lifetime to find true love; she'...,The Princess Diaries 2: Royal Engagement,"[{'id': 2505, 'name': 'coronation'}, {'id': 42...","[{'cast_id': 1, 'character': 'Mia Thermopolis'...","[{'credit_id': '52fe43fe9251416c7502563d', 'de...",95149435
2,3,Missing Value,3300000,"[{'id': 18, 'name': 'Drama'}]",http://sonyclassics.com/whiplash/,tt2582802,en,Whiplash,"Under the direction of a ruthless instructor, ...",64.29999,...,10/10/14,105.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,The road to greatness can take you to the edge.,Whiplash,"[{'id': 1416, 'name': 'jazz'}, {'id': 1523, 'n...","[{'cast_id': 5, 'character': 'Andrew Neimann',...","[{'credit_id': '54d5356ec3a3683ba0000039', 'de...",13092000
3,4,Missing Value,1200000,"[{'id': 53, 'name': 'Thriller'}, {'id': 18, 'n...",http://kahaanithefilm.com/,tt1821480,hi,Kahaani,Vidya Bagchi (Vidya Balan) arrives in Kolkata ...,3.174936,...,3/9/12,122.0,"[{'iso_639_1': 'en', 'name': 'English'}, {'iso...",Released,,Kahaani,"[{'id': 10092, 'name': 'mystery'}, {'id': 1054...","[{'cast_id': 1, 'character': 'Vidya Bagchi', '...","[{'credit_id': '52fe48779251416c9108d6eb', 'de...",16000000
4,5,Missing Value,0,"[{'id': 28, 'name': 'Action'}, {'id': 53, 'nam...",,tt1380152,ko,마린보이,Marine Boy is the story of a former national s...,1.14807,...,2/5/09,118.0,"[{'iso_639_1': 'ko', 'name': '한국어/조선말'}]",Released,,Marine Boy,,"[{'cast_id': 3, 'character': 'Chun-soo', 'cred...","[{'credit_id': '52fe464b9251416c75073b43', 'de...",3923970


In [None]:
(60+62+98)/3

73.33333333333333

In [None]:
# Fill the null value
# Categorical - 'Missing'/'Others'/Most frequent value
# Continous - mean / median

In [None]:
# Visualisation --: Outlier

### Sorting & Handling Duplicates

In [None]:
# Sorting : sort_values

In [None]:
# Creating a DataFrame
data = {'A': [1, 3, 2, 1, 2],
        'B': ['a', 'b', 'c', 'a', 'b'],
        'C':[0, 1, 9, 0, 2]}
df = pd.DataFrame(data)
df



Unnamed: 0,A,B,C
0,1,a,0
1,3,b,1
2,2,c,9
3,1,a,0
4,2,b,2


In [None]:
# Sorting by column A
df_sorted = df.sort_values(by='A') # default ascending

print(df_sorted)

   A  B  C
0  1  a  0
3  1  a  0
2  2  c  9
4  2  b  2
1  3  b  1


In [None]:
# Sorting by column A -  Descending
df.sort_values(by='A', ascending = False)

Unnamed: 0,A,B,C
1,3,b,1
2,2,c,9
4,2,b,2
0,1,a,0
3,1,a,0


In [None]:
# Sorting by column A & Column B
df.sort_values(by= ['A','B'], ascending = [False,False])

Unnamed: 0,A,B,C
1,3,b,1
2,2,c,9
4,2,b,2
0,1,a,0
3,1,a,0


In [None]:
df.sort_values(by= ['A','B'], ascending = [False,True])

Unnamed: 0,A,B,C
1,3,b,1
4,2,b,2
2,2,c,9
0,1,a,0
3,1,a,0


In [None]:
df.sort_values(by= ['B','A'], ascending = [False,True])

Unnamed: 0,A,B,C
2,2,c,9
4,2,b,2
1,3,b,1
0,1,a,0
3,1,a,0


In [None]:
# Sorting by column A & Column B : A Ascending & B descending
df.sort_values(by=['A','B'], ascending = [True,False])

Unnamed: 0,A,B,C
0,1,a,0
3,1,a,0
2,2,c,9
4,2,b,2
1,3,b,1


In [None]:
# Implement it on IMDB data
read_imdb_data.head(1)

Unnamed: 0,id,belongs_to_collection,budget,genres,homepage,imdb_id,original_language,original_title,overview,popularity,...,release_date,runtime,spoken_languages,status,tagline,title,Keywords,cast,crew,revenue
0,1,"[{'id': 313576, 'name': 'Hot Tub Time Machine ...",14000000,"[{'id': 35, 'name': 'Comedy'}]",,tt2637294,en,Hot Tub Time Machine 2,"When Lou, who has become the ""father of the In...",6.575393,...,2/20/15,93.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,The Laws of Space and Time are About to be Vio...,Hot Tub Time Machine 2,"[{'id': 4379, 'name': 'time travel'}, {'id': 9...","[{'cast_id': 4, 'character': 'Lou', 'credit_id...","[{'credit_id': '59ac067c92514107af02c8c8', 'de...",12314651


In [None]:
# read_imdb_data.sort_values(by = 'runtime',ascending  = False)

In [None]:
df

Unnamed: 0,A,B,C
0,1,a,0
1,3,b,1
2,2,c,9
3,1,a,0
4,2,b,2


In [None]:
# groupby([all columns])['id'].count()

In [None]:
df.loc[:,['A','B']].value_counts().reset_index()

Unnamed: 0,A,B,count
0,1,a,2
1,2,b,1
2,2,c,1
3,3,b,1


In [None]:
sum(df.value_counts().reset_index()['count']>1)

1

In [None]:
## value_counts() : Count the value # and create the column count.
sum(df.value_counts().reset_index()['count']>1) # How many rows are there having duplicated values?

1

In [None]:
# Convert it dataframe format
df.value_counts().reset_index()

Unnamed: 0,A,B,C,count
0,1,a,0,2
1,2,b,2,1
2,2,c,9,1
3,3,b,1,1


In [None]:
read_imdb_data_dup = pd.concat([read_imdb_data,read_imdb_data.iloc[40:50,:]]) # Created the duplicate records

In [None]:
read_imdb_data_dup.value_counts().reset_index().sort_values('count',ascending = False)

Unnamed: 0,id,belongs_to_collection,budget,genres,homepage,imdb_id,original_language,original_title,overview,popularity,...,runtime,spoken_languages,status,tagline,title,Keywords,cast,crew,revenue,count
0,41,"[{'id': 376970, 'name': 'Cocaine Cowboys', 'po...",0,"[{'id': 28, 'name': 'Action'}, {'id': 99, 'nam...",http://www.magpictures.com/profile.aspx?id=983...,tt0380268,en,Cocaine Cowboys,"In the 1980s, ruthless Colombian cocaine baron...",2.941626,...,118.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,How Miami became the cocaine capital of the Un...,Cocaine Cowboys,"[{'id': 2150, 'name': 'cocaine'}, {'id': 2231,...","[{'cast_id': 2, 'character': 'Himself', 'credi...","[{'credit_id': '52fe46179251416c7506cc15', 'de...",163000,2
1,47,"[{'id': 8354, 'name': 'Ice Age Collection', 'p...",80000000,"[{'id': 16, 'name': 'Animation'}, {'id': 10751...",http://www.iceagemovies.com/films/ice-age-the-...,tt0438097,en,Ice Age: The Meltdown,"Diego, Manny and Sid return in this sequel to ...",16.646029,...,91.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,The Ice age is melting away.,Ice Age: The Meltdown,"[{'id': 2078, 'name': 'mammoth'}, {'id': 2079,...","[{'cast_id': 1, 'character': 'Manny (voice)', ...","[{'credit_id': '52fe4292c3a36847f80292c9', 'de...",660940780,2
201,1116,"[{'id': 645, 'name': 'James Bond Collection', ...",14000000,"[{'id': 12, 'name': 'Adventure'}, {'id': 28, '...",http://www.mgm.com/view/movie/1891/The-Spy-Who...,tt0076752,en,The Spy Who Loved Me,Russian and British submarines with nuclear mi...,9.781451,...,125.0,"[{'iso_639_1': 'en', 'name': 'English'}, {'iso...",Released,It's the BIGGEST. It's the BEST. It's BOND. An...,The Spy Who Loved Me,"[{'id': 212, 'name': 'london england'}, {'id':...","[{'cast_id': 1, 'character': 'James Bond', 'cr...","[{'credit_id': '52fe426bc3a36847f801d187', 'de...",185438673,1
138,208,"[{'id': 286162, 'name': 'Power Rangers Collect...",15000000,"[{'id': 28, 'name': 'Action'}, {'id': 12, 'nam...",http://www.powerrangers.com/,tt0113820,en,Mighty Morphin Power Rangers: The Movie,Power up with six incredible teens who out-man...,7.024227,...,92.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,The Power Is On!,Mighty Morphin Power Rangers: The Movie,"[{'id': 10988, 'name': 'based on tv series'}, ...","[{'cast_id': 2, 'character': 'Kimberly Hart / ...","[{'credit_id': '52fe44d8c3a36847f80ad707', 'de...",66000000,1
128,539,"[{'id': 84, 'name': 'Indiana Jones Collection'...",48000000,"[{'id': 12, 'name': 'Adventure'}, {'id': 28, '...",http://www.indianajones.com/crusade,tt0097576,en,Indiana Jones and the Last Crusade,When Dr. Henry Jones Sr. suddenly goes missing...,14.788987,...,127.0,"[{'iso_639_1': 'de', 'name': 'Deutsch'}, {'iso...",Released,"The man with the hat is back. And this time, h...",Indiana Jones and the Last Crusade,"[{'id': 83, 'name': 'saving the world'}, {'id'...","[{'cast_id': 8, 'character': 'Indiana Jones', ...","[{'credit_id': '52fe4216c3a36847f8002e69', 'de...",474171806,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
70,2891,"[{'id': 14563, 'name': 'The Ring Collection', ...",25000000,"[{'id': 27, 'name': 'Horror'}]",http://www.ringsmovie.com/,tt0498381,en,Rings,"Julia becomes worried about her boyfriend, Hol...",24.535733,...,102.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Evil is reborn.,Rings,"[{'id': 3298, 'name': 'hallucination'}, {'id':...","[{'cast_id': 14, 'character': 'Julia', 'credit...","[{'credit_id': '52fe46019251416c75069e25', 'de...",83080890,1
71,2892,"[{'id': 479888, 'name': 'The Thing Collection'...",35000000,"[{'id': 27, 'name': 'Horror'}, {'id': 878, 'na...",http://www.uphe.com/movies/the-thing-2011,tt0905372,en,The Thing,When paleontologist Kate Lloyd travels to an i...,10.169411,...,103.0,"[{'iso_639_1': 'en', 'name': 'English'}, {'iso...",Released,It's Not Human. Yet.,The Thing,"[{'id': 2340, 'name': 'paranoia'}, {'id': 4713...","[{'cast_id': 2, 'character': 'Kate Lloyd', 'cr...","[{'credit_id': '537b713fc3a3682d3c00000e', 'de...",28128670,1
72,2905,"[{'id': 424202, 'name': 'Trainspotting Collect...",4000000,"[{'id': 18, 'name': 'Drama'}, {'id': 80, 'name...",http://www.miramax.com/movie/trainspotting/,tt0117951,en,Trainspotting,"Renton, deeply immersed in the Edinburgh drug ...",19.348466,...,93.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Choose life.,Trainspotting,"[{'id': 212, 'name': 'london england'}, {'id':...","[{'cast_id': 17, 'character': 'Mark Renton', '...","[{'credit_id': '52fe4260c3a36847f80198c1', 'de...",16491080,1
73,2928,"[{'id': 645, 'name': 'James Bond Collection', ...",140000000,"[{'id': 12, 'name': 'Adventure'}, {'id': 28, '...",http://www.mgm.com/view/movie/232/Die-Another-...,tt0246460,en,Die Another Day,Bond takes on a North Korean leader who underg...,12.996474,...,133.0,"[{'iso_639_1': 'en', 'name': 'English'}, {'iso...",Released,He’s never been cooler.,Die Another Day,"[{'id': 3290, 'name': 'laser'}, {'id': 156095,...","[{'cast_id': 20, 'character': 'James Bond', 'c...","[{'credit_id': '52fe45ff9251416c91045a7f', 'de...",431971116,1


In [None]:
# Convert it dataframe format
df.value_counts().reset_index().sort_values('count')

Unnamed: 0,A,B,C,count
1,2,b,2,1
2,2,c,9,1
3,3,b,1,1
0,1,a,0,2


In [None]:
df['B'].value_counts().reset_index()

Unnamed: 0,B,count
0,a,2
1,b,2
2,c,1


In [None]:
df

Unnamed: 0,A,B,C
0,1,a,0
1,3,b,1
2,2,c,9
3,1,a,0
4,2,b,2


In [None]:
# Drop duplicates
# .drop_duplicates() , subset & keep parameters
df.drop_duplicates() # Drop duplicates for the dataframe # Load the data in dataframe.

Unnamed: 0,A,B,C
0,1,a,0
1,3,b,1
2,2,c,9
4,2,b,2


In [None]:
df.sort_values(by = 'A') # Column A as customer

Unnamed: 0,A,B,C
0,1,a,0
3,1,a,0
2,2,c,9
4,2,b,2
1,3,b,1


In [None]:
df

Unnamed: 0,A,B,C
0,1,a,0
1,3,b,1
2,2,c,9
3,1,a,0
4,2,b,2


In [None]:
df.drop_duplicates(subset = ['A','B'], keep = 'last')

Unnamed: 0,A,B,C
1,3,b,1
2,2,c,9
3,1,a,0
4,2,b,2


In [None]:
# Last : latest
# First : first punched :: issue with API, Database

### Aggregating Data

In [None]:
# .groupby(), .agg()

# sum(): Computes the sum of all values in a series or dataframe.
# mean(): Computes the mean (average) of all values in a series or dataframe.
# median(): Computes the median (middle value) of all values in a series or dataframe.
# min(): Computes the minimum value in a series or dataframe.
# max(): Computes the maximum value in a series or dataframe.
# count(): Computes the number of non-null values in a series or dataframe.

In [None]:
read_imdb_data.head(1)

Unnamed: 0,id,belongs_to_collection,budget,genres,homepage,imdb_id,original_language,original_title,overview,popularity,...,release_date,runtime,spoken_languages,status,tagline,title,Keywords,cast,crew,revenue
0,1,"[{'id': 313576, 'name': 'Hot Tub Time Machine ...",14000000,"[{'id': 35, 'name': 'Comedy'}]",,tt2637294,en,Hot Tub Time Machine 2,"When Lou, who has become the ""father of the In...",6.575393,...,2/20/15,93.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,The Laws of Space and Time are About to be Vio...,Hot Tub Time Machine 2,"[{'id': 4379, 'name': 'time travel'}, {'id': 9...","[{'cast_id': 4, 'character': 'Lou', 'credit_id...","[{'credit_id': '59ac067c92514107af02c8c8', 'de...",12314651


In [None]:
# read_imdb_data.loc[:,'runtime'].sum()
# read_imdb_data['mean_runtime'] = read_imdb_data.runtime.mean()

In [None]:
# pd.mean(read_imdb_data['runtime'])

AttributeError: module 'pandas' has no attribute 'mean'

In [None]:
sum(list(read_imdb_data['runtime'].fillna(0)))

323354.0

#### .groupby()

In [None]:
read_imdb_data.groupby('original_language')['id'].count().reset_index().sort_values(by = 'id', ascending =False)

Unnamed: 0,original_language,id
7,en,2575
11,fr,78
27,ru,47
8,es,43
13,hi,42
17,ja,37
16,it,24
18,ko,20
2,cn,20
35,zh,19


In [None]:
read_imdb_data.groupby('original_language')['runtime'].mean().reset_index().sort_values(by='runtime',ascending = False)

Unnamed: 0,original_language,runtime
20,mr,162.0
30,ta,159.875
15,id,150.0
13,hi,148.880952
31,te,146.0
19,ml,137.0
34,vi,135.0
14,hu,126.0
1,bn,125.0
29,sv,122.375


In [None]:
read_imdb_data.groupby('original_language')['revenue'].mean().reset_index().sort_values(by='revenue',ascending = False)

Unnamed: 0,original_language,revenue
7,en,74665910.0
35,zh,70376370.0
32,tr,51663410.0
17,ja,30651800.0
2,cn,29772890.0
13,hi,25346370.0
5,de,20530900.0
11,fr,17132570.0
16,it,16415130.0
24,pl,15010830.0


In [None]:
read_imdb_data.groupby('status')['id'].count()

status
Released    2996
Rumored        4
Name: id, dtype: int64

In [None]:
# Rows where status = Rumored
# read_imdb_data.loc[read_imdb_data.status =='Rumored',]

In [None]:
read_imdb_data.groupby(['original_language','status'])['id'].count().reset_index()

#### .agg()

In [None]:
# .agg()
read_imdb_data.groupby(['original_language','status'])['runtime'].agg(['count','mean','sum']).reset_index()

Unnamed: 0,original_language,status,count,mean,sum
0,ar,Released,1,98.0,98.0
1,bn,Released,1,125.0,125.0
2,cn,Released,20,105.45,2109.0
3,cs,Released,1,90.0,90.0
4,da,Released,5,105.4,527.0
5,de,Released,17,122.294118,2079.0
6,el,Released,1,91.0,91.0
7,en,Released,2571,107.030338,275175.0
8,en,Rumored,4,77.25,309.0
9,es,Released,43,100.372093,4316.0


In [None]:
read_imdb_data.groupby(['status', 'original_language']).agg({'runtime':"mean",'revenue': "max",'imdb_id':"count"}).reset_index()

Unnamed: 0,status,original_language,runtime,revenue,imdb_id
0,Released,ar,98.0,1347747,1
1,Released,bn,125.0,536364,1
2,Released,cn,105.45,156844753,20
3,Released,cs,90.0,17393,1
4,Released,da,105.4,42070000,5
5,Released,de,122.294118,92180910,18
6,Released,el,91.0,8000000,1
7,Released,en,107.030338,1519557910,2571
8,Released,es,100.372093,67872296,43
9,Released,fa,96.4,2402067,5


### Joining Dataframes

In [None]:
# .merge(), .concat()
# Inner join, left join, right join, outer join
# pd.concat( [df] , axis)

# pd.merge(df1, df2 ,on,how)

#### .concat()

In [None]:
# read_imdb_data.original_language.value_counts()

In [None]:
read_imdb_data_en = read_imdb_data.loc[read_imdb_data.original_language == 'en',]
read_imdb_data_other = read_imdb_data.loc[read_imdb_data.original_language != 'en',]

In [None]:
read_imdb_data_en.shape

(2575, 24)

In [None]:
read_imdb_data_other.shape

(425, 24)

In [None]:
df_final = pd.concat([read_imdb_data_en,read_imdb_data_other],axis = 0)

In [None]:
df_final.shape

(3000, 24)

In [None]:
data1 = {'ID': [1, 2, 3],
         'Name': ['John', 'Alice', 'Bob'],
         'Age': [30, 25, 35]}
df1 = pd.DataFrame(data1)
df1


Unnamed: 0,ID,Name,Age
0,1,John,30
1,2,Alice,25
2,3,Bob,35


In [None]:
data2 = {'Salary': [50000, 60000, 70000]}
df2 = pd.DataFrame(data2)
df2

Unnamed: 0,Salary
0,50000
1,60000
2,70000


In [None]:
pd.concat([df1,df2],axis = 1)

Unnamed: 0,ID,Name,Age,Salary
0,1,John,30,50000
1,2,Alice,25,60000
2,3,Bob,35,70000


#### .merge()

In [None]:
data1 = {'ID': [1, 2, 3, 4],
         'Name': ['John', 'Alice', 'Bob', 'Charlie'],
         'Age': [30, 25, 35, 40]}
df1 = pd.DataFrame(data1)
data2 = {'salary_ID': [2, 3, 5, 6],
         'Salary': [50000, 60000, 70000, 80000]}
df2 = pd.DataFrame(data2)
print(df1)
print(df2)

   ID     Name  Age
0   1     John   30
1   2    Alice   25
2   3      Bob   35
3   4  Charlie   40
   salary_ID  Salary
0          2   50000
1          3   60000
2          5   70000
3          6   80000


In [None]:
# Inner
pd.merge(df1,df2, left_on = 'ID', right_on = 'salary_ID', how ='inner')

Unnamed: 0,ID,Name,Age,salary_ID,Salary
0,2,Alice,25,2,50000
1,3,Bob,35,3,60000


In [None]:
# left
pd.merge(df1,df2, left_on = 'ID', right_on = 'salary_ID', how ='left')

Unnamed: 0,ID,Name,Age,salary_ID,Salary
0,1,John,30,,
1,2,Alice,25,2.0,50000.0
2,3,Bob,35,3.0,60000.0
3,4,Charlie,40,,


In [None]:
# right
pd.merge(df1,df2, left_on = 'ID', right_on = 'salary_ID', how ='right')

Unnamed: 0,ID,Name,Age,salary_ID,Salary
0,2.0,Alice,25.0,2,50000
1,3.0,Bob,35.0,3,60000
2,,,,5,70000
3,,,,6,80000


In [None]:
# outer
pd.merge(df1,df2, left_on = 'ID', right_on = 'salary_ID', how ='outer')

Unnamed: 0,ID,Name,Age,salary_ID,Salary
0,1.0,John,30.0,,
1,2.0,Alice,25.0,2.0,50000.0
2,3.0,Bob,35.0,3.0,60000.0
3,4.0,Charlie,40.0,,
4,,,,5.0,70000.0
5,,,,6.0,80000.0


In [None]:
# Cross join :: Try to search for it.
# Key
df1['key'] = 1
df2['key'] = 1
pd.merge(df1,df2, on = 'key', how = 'outer')

Unnamed: 0,ID,Name,Age,key,salary_ID,Salary
0,1,John,30,1,2,50000
1,1,John,30,1,3,60000
2,1,John,30,1,5,70000
3,1,John,30,1,6,80000
4,2,Alice,25,1,2,50000
5,2,Alice,25,1,3,60000
6,2,Alice,25,1,5,70000
7,2,Alice,25,1,6,80000
8,3,Bob,35,1,2,50000
9,3,Bob,35,1,3,60000


### .replace(),.apply()

In [None]:
read_imdb_data.head(1)

Unnamed: 0,id,belongs_to_collection,budget,genres,homepage,imdb_id,original_language,original_title,overview,popularity,...,release_date,runtime,spoken_languages,status,tagline,title,Keywords,cast,crew,revenue
0,1,"[{'id': 313576, 'name': 'Hot Tub Time Machine ...",14000000,"[{'id': 35, 'name': 'Comedy'}]",,tt2637294,en,Hot Tub Time Machine 2,"When Lou, who has become the ""father of the In...",6.575393,...,2/20/15,93.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,The Laws of Space and Time are About to be Vio...,Hot Tub Time Machine 2,"[{'id': 4379, 'name': 'time travel'}, {'id': 9...","[{'cast_id': 4, 'character': 'Lou', 'credit_id...","[{'credit_id': '59ac067c92514107af02c8c8', 'de...",12314651


In [None]:
read_imdb_data.original_language.value_counts().reset_index()

Unnamed: 0,original_language,count
0,en,2575
1,fr,78
2,ru,47
3,es,43
4,hi,42
5,ja,37
6,it,24
7,cn,20
8,ko,20
9,zh,19


In [None]:
read_imdb_data.original_language.replace({'en':'english','hi':'hindi'})

0       english
1       english
2       english
3         hindi
4            ko
         ...   
2995    english
2996         sv
2997    english
2998    english
2999    english
Name: original_language, Length: 3000, dtype: object

In [None]:
### how do we replace rest to others? --> We use apply with custom function

#### .apply()

In [None]:
read_imdb_data.original_language

0       en
1       en
2       en
3       hi
4       ko
        ..
2995    en
2996    sv
2997    en
2998    en
2999    en
Name: original_language, Length: 3000, dtype: object

In [None]:
def lang_map(l): # custom function
  if l == 'en':
    return 'english'
  elif l == 'hi':
    return 'hindi'
  else:
    return 'others'

lang_map(l = 'en')

'english'

In [None]:
read_imdb_data.original_language = read_imdb_data.original_language.apply(lang_map) # Applied to each element

In [None]:
def first_letter(s):
  return s[0]

In [None]:
read_imdb_data.original_language.apply(first_letter)

0       e
1       e
2       e
3       h
4       k
       ..
2995    e
2996    s
2997    e
2998    e
2999    e
Name: original_language, Length: 3000, dtype: object

In [None]:
def lang_chang(l):
  if l == 'en':
    return 'english'
  elif l == 'hi':
    return 'hindi'
  else:
    return l

In [None]:
read_imdb_data.original_language.apply(lang_chang)

0       english
1       english
2       english
3         hindi
4            ko
         ...   
2995    english
2996         sv
2997    english
2998    english
2999    english
Name: original_language, Length: 3000, dtype: object

### Dealing with Strings, Datetime & Json data in column

##### Strings

In [None]:
# Important functions
# str.contains(), str.find(''),str.replace(old,new), str.split('delimiter'), str.strip(), str.extract() and str.extractall()

In [None]:
read_imdb_data.head(1)

Unnamed: 0,id,belongs_to_collection,budget,genres,homepage,imdb_id,original_language,original_title,overview,popularity,...,release_date,runtime,spoken_languages,status,tagline,title,Keywords,cast,crew,revenue
0,1,"[{'id': 313576, 'name': 'Hot Tub Time Machine ...",14000000,"[{'id': 35, 'name': 'Comedy'}]",,tt2637294,en,Hot Tub Time Machine 2,"When Lou, who has become the ""father of the In...",6.575393,...,2/20/15,93.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,The Laws of Space and Time are About to be Vio...,Hot Tub Time Machine 2,"[{'id': 4379, 'name': 'time travel'}, {'id': 9...","[{'cast_id': 4, 'character': 'Lou', 'credit_id...","[{'credit_id': '59ac067c92514107af02c8c8', 'de...",12314651


In [None]:
read_imdb_data.title

0                         Hot Tub Time Machine 2
1       The Princess Diaries 2: Royal Engagement
2                                       Whiplash
3                                        Kahaani
4                                     Marine Boy
                          ...                   
2995                                     Chasers
2996                            We Are the Best!
2997                     The Long Kiss Goodnight
2998                            Along Came Polly
2999                                   Abduction
Name: title, Length: 3000, dtype: object

In [None]:
# str.contains()
# read_imdb_data.title.str.contains('Time') # Exact match as substring
# read_imdb_data.title.str.contains(r'[0-9]{1,}') # Match as regex


In [None]:
# .loc[row_name,column_name]
# .iloc[row_index,column_index]

In [None]:
# read_imdb_data.loc[read_imdb_data.title.str.contains('Time'),]

In [None]:
read_imdb_data.title

0                         Hot Tub Time Machine 2
1       The Princess Diaries 2: Royal Engagement
2                                       Whiplash
3                                        Kahaani
4                                     Marine Boy
                          ...                   
2995                                     Chasers
2996                            We Are the Best!
2997                     The Long Kiss Goodnight
2998                            Along Came Polly
2999                                   Abduction
Name: title, Length: 3000, dtype: object

In [None]:
# str.find('')
# read_imdb_data.title.str.find('h') # -1 if not found , otherwise first occurance.

0       16
1        1
2        1
3        2
4       -1
        ..
2995     1
2996     8
2997     1
2998    -1
2999    -1
Name: title, Length: 3000, dtype: int64

In [None]:
# str.replace(old,new)
# read_imdb_data.title.str.replace('Time','TIME') # Replace Exact match as substring
# read_imdb_data.title.str.replace(r'[0-9]{1,}','&', regex=True) # Match as regex

0                         Hot Tub Time Machine &
1       The Princess Diaries &: Royal Engagement
2                                       Whiplash
3                                        Kahaani
4                                     Marine Boy
                          ...                   
2995                                     Chasers
2996                            We Are the Best!
2997                     The Long Kiss Goodnight
2998                            Along Came Polly
2999                                   Abduction
Name: title, Length: 3000, dtype: object

In [None]:
read_imdb_data.title

0                         Hot Tub Time Machine 2
1       The Princess Diaries 2: Royal Engagement
2                                       Whiplash
3                                        Kahaani
4                                     Marine Boy
                          ...                   
2995                                     Chasers
2996                            We Are the Best!
2997                     The Long Kiss Goodnight
2998                            Along Came Polly
2999                                   Abduction
Name: title, Length: 3000, dtype: object

In [None]:
def get_3_value(x):
  if len(x) < 3:
    return x
  else:
    return x[2]

In [None]:
read_imdb_data.title.str.split(' ')

In [None]:
# str.split('delimiter')
read_imdb_data.title.str.split(' ').apply(get_3_value) # Splits the string and converts it to list

0                Time
1             Diaries
2          [Whiplash]
3           [Kahaani]
4       [Marine, Boy]
            ...      
2995        [Chasers]
2996              the
2997             Kiss
2998            Polly
2999      [Abduction]
Name: title, Length: 3000, dtype: object

In [None]:
df = pd.read_csv()
# df.drop_duplicates()
# run .strip for each categorical column

In [None]:
# str.strip() --> Strips the extra space
read_imdb_data.title.str.strip()

0                         Hot Tub Time Machine 2
1       The Princess Diaries 2: Royal Engagement
2                                       Whiplash
3                                        Kahaani
4                                     Marine Boy
                          ...                   
2995                                     Chasers
2996                            We Are the Best!
2997                     The Long Kiss Goodnight
2998                            Along Came Polly
2999                                   Abduction
Name: title, Length: 3000, dtype: object

In [None]:
# My email id is alok@febfwjf.com
# ravi@jefrjnfre;f.com is my email id.

In [None]:
read_imdb_data.title

0                         Hot Tub Time Machine 2
1       The Princess Diaries 2: Royal Engagement
2                                       Whiplash
3                                        Kahaani
4                                     Marine Boy
                          ...                   
2995                                     Chasers
2996                            We Are the Best!
2997                     The Long Kiss Goodnight
2998                            Along Came Polly
2999                                   Abduction
Name: title, Length: 3000, dtype: object

In [None]:
# str.contains('Time') and str.contains('Machine')

In [None]:
read_imdb_data.title.str.contains(r'([0-9]{1,})')

  read_imdb_data.title.str.contains(r'([0-9]{1,})')


0        True
1        True
2       False
3       False
4       False
        ...  
2995    False
2996    False
2997    False
2998    False
2999    False
Name: title, Length: 3000, dtype: bool

In [None]:
df1 = read_imdb_data.title.str.extract(r'([0-9]{1,})').reset_index()
df1.loc[df1.index ==2960,]

Unnamed: 0,index,0
2960,2960,4


In [None]:
read_imdb_data.loc[21,]

id                                                                      22
belongs_to_collection    [{'id': 9735, 'name': 'Friday the 13th Collect...
budget                                                             4000000
genres                   [{'id': 27, 'name': 'Horror'}, {'id': 53, 'nam...
homepage                                                               NaN
imdb_id                                                          tt0083972
original_language                                                       en
original_title                                    Friday the 13th Part III
overview                 An idyllic summer turns into a nightmare of un...
popularity                                                         7.99229
poster_path                               /5wg2NZyIhcMbIBAahBODXHyJ54S.jpg
production_companies     [{'name': 'Paramount Pictures', 'id': 4}, {'na...
production_countries     [{'iso_3166_1': 'US', 'name': 'United States o...
release_date             

In [None]:
data = [{'name':'John', 'about':'My contact detail is 9876543210, Call me for the house repair'},
        {'name':'Albert','about':'4567890123 is my contact detail, you can contact me for the problem with electricty, my alternate number is 6789012343'}]

df = pd.DataFrame(data)
df

Unnamed: 0,name,about
0,John,"My contact detail is 9876543210, Call me for t..."
1,Albert,"4567890123 is my contact detail, you can conta..."


In [None]:
df.about.str.extract(r'([0-9]{10})')

Unnamed: 0,0
0,9876543210
1,4567890123


In [None]:
df.about.str.extractall(r'([0-9]{10})')

Unnamed: 0_level_0,Unnamed: 1_level_0,0
Unnamed: 0_level_1,match,Unnamed: 2_level_1
0,0,9876543210
1,0,4567890123
1,1,6789012343


In [None]:
# str.extract() and str.extractall() --> Extracts the substring based on regex pattern
# read_imdb_data.title.str.extract(r'([0-9]{1,})') # Extracts first match
read_imdb_data.title.str.extractall(r'([0-9]{1,})').reset_index() # Extracts all matches

Unnamed: 0,level_0,match,0
0,0,0,2
1,1,0,2
2,19,0,2
3,21,0,13
4,33,0,2
...,...,...,...
202,2958,1,3
203,2960,0,4
204,2960,1,3
205,2960,2,2


In [None]:
read_imdb_data.loc[2960,'title']

'4.3.2.1'

##### Datetime

In [None]:
# Converting String to Datetime
# Extracting components from the datetime
# Create a timedelta object representing a duration : Adding or substracting some year,months, days, hours,minutes,etc
# Subtracting two datetime
# Putting filter on datetime


In [None]:
read_imdb_data.head(1)

Unnamed: 0,id,belongs_to_collection,budget,genres,homepage,imdb_id,original_language,original_title,overview,popularity,...,release_date,runtime,spoken_languages,status,tagline,title,Keywords,cast,crew,revenue
0,1,"[{'id': 313576, 'name': 'Hot Tub Time Machine ...",14000000,"[{'id': 35, 'name': 'Comedy'}]",,tt2637294,english,Hot Tub Time Machine 2,"When Lou, who has become the ""father of the In...",6.575393,...,2/20/15,93.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,The Laws of Space and Time are About to be Vio...,Hot Tub Time Machine 2,"[{'id': 4379, 'name': 'time travel'}, {'id': 9...","[{'cast_id': 4, 'character': 'Lou', 'credit_id...","[{'credit_id': '59ac067c92514107af02c8c8', 'de...",12314651


In [None]:
read_imdb_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3000 entries, 0 to 2999
Data columns (total 23 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   id                     3000 non-null   int64  
 1   belongs_to_collection  604 non-null    object 
 2   budget                 3000 non-null   int64  
 3   genres                 2993 non-null   object 
 4   homepage               946 non-null    object 
 5   imdb_id                3000 non-null   object 
 6   original_language      3000 non-null   object 
 7   original_title         3000 non-null   object 
 8   overview               2992 non-null   object 
 9   popularity             3000 non-null   float64
 10  poster_path            2999 non-null   object 
 11  production_companies   2844 non-null   object 
 12  production_countries   2945 non-null   object 
 13  release_date           3000 non-null   object 
 14  runtime                2998 non-null   float64
 15  spok

In [None]:
read_imdb_data['release_date'].head(5)

0     2/20/15
1      8/6/04
2    10/10/14
3      3/9/12
4      2/5/09
Name: release_date, dtype: object

In [None]:
data = [{'index':0,'date':'01-09-24'},{'index':1,'date':'01-11-22'}]
df = pd.DataFrame(data)
df

Unnamed: 0,index,date
0,0,01-09-24
1,1,01-11-22


In [None]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2 entries, 0 to 1
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   index   2 non-null      int64 
 1   date    2 non-null      object
dtypes: int64(1), object(1)
memory usage: 160.0+ bytes


In [None]:
df.date = pd.to_datetime(df.date,format= '%d-%m-%y')

In [None]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2 entries, 0 to 1
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype         
---  ------  --------------  -----         
 0   index   2 non-null      int64         
 1   date    2 non-null      datetime64[ns]
dtypes: datetime64[ns](1), int64(1)
memory usage: 160.0 bytes


In [None]:
# Converting String to Datetime
# .to_datetime
# Ref : https://www.geeksforgeeks.org/python-datetime-strptime-function/

read_imdb_data.release_date = pd.to_datetime(read_imdb_data.release_date,format= '%m/%d/%y')


In [None]:
read_imdb_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3000 entries, 0 to 2999
Data columns (total 23 columns):
 #   Column                 Non-Null Count  Dtype         
---  ------                 --------------  -----         
 0   id                     3000 non-null   int64         
 1   belongs_to_collection  604 non-null    object        
 2   budget                 3000 non-null   int64         
 3   genres                 2993 non-null   object        
 4   homepage               946 non-null    object        
 5   imdb_id                3000 non-null   object        
 6   original_language      3000 non-null   object        
 7   original_title         3000 non-null   object        
 8   overview               2992 non-null   object        
 9   popularity             3000 non-null   float64       
 10  poster_path            2999 non-null   object        
 11  production_companies   2844 non-null   object        
 12  production_countries   2945 non-null   object        
 13  rel

In [None]:
read_imdb_data.head(1)

Unnamed: 0,id,belongs_to_collection,budget,genres,homepage,imdb_id,original_language,original_title,overview,popularity,...,release_date,runtime,spoken_languages,status,tagline,title,Keywords,cast,crew,revenue
0,1,"[{'id': 313576, 'name': 'Hot Tub Time Machine ...",14000000,"[{'id': 35, 'name': 'Comedy'}]",,tt2637294,en,Hot Tub Time Machine 2,"When Lou, who has become the ""father of the In...",6.575393,...,2015-02-20,93.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,The Laws of Space and Time are About to be Vio...,Hot Tub Time Machine 2,"[{'id': 4379, 'name': 'time travel'}, {'id': 9...","[{'cast_id': 4, 'character': 'Lou', 'credit_id...","[{'credit_id': '59ac067c92514107af02c8c8', 'de...",12314651


In [None]:
read_imdb_data.release_date

0      2015-02-20
1      2004-08-06
2      2014-10-10
3      2012-03-09
4      2009-02-05
          ...    
2995   1994-04-22
2996   2013-03-28
2997   1996-10-11
2998   2004-01-16
2999   2011-09-22
Name: release_date, Length: 3000, dtype: datetime64[ns]

In [None]:
# Extracting components from the datetime
# dt. --> day,date, year, month,week, hour, day_name(), dayofyear,weekday(Monday=0, Sunday=6)
read_imdb_data['release_day'] = read_imdb_data.release_date.dt.day_name()

In [None]:
read_imdb_data['release_day'].head()

0      Friday
1      Friday
2      Friday
3      Friday
4    Thursday
Name: release_day, dtype: object

In [None]:
read_imdb_data.columns

Index(['id', 'belongs_to_collection', 'budget', 'genres', 'homepage',
       'imdb_id', 'original_language', 'original_title', 'overview',
       'popularity', 'poster_path', 'production_companies',
       'production_countries', 'release_date', 'runtime', 'spoken_languages',
       'status', 'tagline', 'title', 'Keywords', 'cast', 'crew', 'revenue',
       'release_day'],
      dtype='object')

In [None]:
# read_imdb_data.loc[:,['belongs_to_collection','id',  'budget', 'genres', 'homepage',
#        'imdb_id', 'original_language', 'original_title', 'overview',
#        'popularity', 'poster_path', 'production_companies',
#        'production_countries', 'release_date', 'runtime', 'spoken_languages',
#        'status', 'tagline', 'title', 'Keywords', 'cast', 'crew', 'revenue',
#        'release_day']]

In [None]:
read_imdb_data.head(1)

Unnamed: 0,id,belongs_to_collection,budget,genres,homepage,imdb_id,original_language,original_title,overview,popularity,...,runtime,spoken_languages,status,tagline,title,Keywords,cast,crew,revenue,release_day
0,1,"[{'id': 313576, 'name': 'Hot Tub Time Machine ...",14000000,"[{'id': 35, 'name': 'Comedy'}]",,tt2637294,en,Hot Tub Time Machine 2,"When Lou, who has become the ""father of the In...",6.575393,...,93.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,The Laws of Space and Time are About to be Vio...,Hot Tub Time Machine 2,"[{'id': 4379, 'name': 'time travel'}, {'id': 9...","[{'cast_id': 4, 'character': 'Lou', 'credit_id...","[{'credit_id': '59ac067c92514107af02c8c8', 'de...",12314651,Friday


In [None]:
read_imdb_data.release_date.head(1)

0   2015-02-20
Name: release_date, dtype: datetime64[ns]

In [None]:
read_imdb_data.release_date

0      2015-02-20
1      2004-08-06
2      2014-10-10
3      2012-03-09
4      2009-02-05
          ...    
2995   1994-04-22
2996   2013-03-28
2997   1996-10-11
2998   2004-01-16
2999   2011-09-22
Name: release_date, Length: 3000, dtype: datetime64[ns]

In [None]:
# Create a timedelta object representing a duration : Adding or substracting some [weeks, days, hours, minutes, seconds, milliseconds, microseconds, nanoseconds]
# pd.Timedelta(days=2)
read_imdb_data.release_date - pd.Timedelta(weeks = 1)

0      2015-02-13
1      2004-07-30
2      2014-10-03
3      2012-03-02
4      2009-01-29
          ...    
2995   1994-04-15
2996   2013-03-21
2997   1996-10-04
2998   2004-01-09
2999   2011-09-15
Name: release_date, Length: 3000, dtype: datetime64[ns]

In [None]:
read_imdb_data.loc[:,'release_date_plus2d'] = read_imdb_data.release_date + pd.Timedelta(days = 2)

In [None]:
(read_imdb_data.release_date_plus2d - read_imdb_data.release_date).dt.days

0       2
1       2
2       2
3       2
4       2
       ..
2995    2
2996    2
2997    2
2998    2
2999    2
Length: 3000, dtype: int64

In [None]:
# Subtracting two datetime
read_imdb_data.loc[:,'date_diff'] = (read_imdb_data.release_date_plus2d - read_imdb_data.release_date).dt.days

In [None]:
read_imdb_data.head(1)

Unnamed: 0,id,belongs_to_collection,budget,genres,homepage,imdb_id,original_language,original_title,overview,popularity,...,spoken_languages,status,tagline,title,Keywords,cast,crew,revenue,release_date_plus2d,date_diff
0,1,"[{'id': 313576, 'name': 'Hot Tub Time Machine ...",14000000,"[{'id': 35, 'name': 'Comedy'}]",,tt2637294,en,Hot Tub Time Machine 2,"When Lou, who has become the ""father of the In...",6.575393,...,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,The Laws of Space and Time are About to be Vio...,Hot Tub Time Machine 2,"[{'id': 4379, 'name': 'time travel'}, {'id': 9...","[{'cast_id': 4, 'character': 'Lou', 'credit_id...","[{'credit_id': '59ac067c92514107af02c8c8', 'de...",12314651,2015-02-22,2


In [None]:
# Putting filter on datetime
read_imdb_data.loc[read_imdb_data.release_date > '2019-01-01',].sort_values(by='release_date', ascending = True)

Unnamed: 0,id,belongs_to_collection,budget,genres,homepage,imdb_id,original_language,original_title,overview,popularity,...,spoken_languages,status,tagline,title,Keywords,cast,crew,revenue,release_day,release_date_plus2d
1763,1764,,250000,"[{'id': 35, 'name': 'Comedy'}, {'id': 18, 'nam...",,tt0012349,en,The Kid,Considered one of Charlie Chaplin's best films...,8.168456,...,"[{'iso_639_1': 'xx', 'name': 'No Language'}]",Released,6 reels of Joy.,The Kid,"[{'id': 290, 'name': 'angel'}, {'id': 1252, 'n...","[{'cast_id': 10, 'character': 'A Tramp', 'cred...","[{'credit_id': '52fe43269251416c75005605', 'de...",2500000,Thursday,2021-01-23
2992,2993,,1135654,"[{'id': 28, 'name': 'Action'}, {'id': 12, 'nam...",,tt0015400,en,The Thief of Bagdad,A recalcitrant thief vies with a duplicitous M...,3.878515,...,"[{'iso_639_1': 'xx', 'name': 'No Language'}]",Released,"""Happiness Must Be Earned""",The Thief of Bagdad,"[{'id': 255, 'name': 'male nudity'}, {'id': 14...","[{'cast_id': 3, 'character': 'The Thief of Bag...","[{'credit_id': '52fe45bec3a368484e06c70b', 'de...",1213880,Monday,2024-03-20
1917,1918,,592,"[{'id': 10749, 'name': 'Romance'}, {'id': 18, ...",,tt0016104,en,The Merry Widow,Prince Danilo falls in love with dancer Sally ...,0.286719,...,,Released,,The Merry Widow,"[{'id': 1691, 'name': 'dance'}, {'id': 10181, ...","[{'cast_id': 2, 'character': ""Sally O'Hara"", '...","[{'credit_id': '57351170c3a36802410000d5', 'de...",1,Tuesday,2025-08-28
1361,1362,,0,"[{'id': 18, 'name': 'Drama'}, {'id': 36, 'name...",,tt0015648,ru,Броненосец «Потёмкин»,A dramatized account of a great Russian naval ...,12.912845,...,"[{'iso_639_1': 'ru', 'name': 'Pусский'}]",Released,"Revolution is the only lawful, equal, effectua...",Battleship Potemkin,"[{'id': 1816, 'name': 'staircase'}, {'id': 323...","[{'cast_id': 8, 'character': 'Grigory Vakulinc...","[{'credit_id': '52fe4263c3a36847f801a8cf', 'de...",45100,Wednesday,2025-12-26
2680,2681,,463455,"[{'id': 18, 'name': 'Drama'}]",,tt0017423,en,Sparrows,Evil Mr.Grimes keeps a rag-tag bunch orphans o...,0.445526,...,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,,Sparrows,"[{'id': 1930, 'name': 'kidnapping'}, {'id': 49...","[{'cast_id': 2, 'character': 'Molly', 'credit_...","[{'credit_id': '52fe48a1c3a368484e102bd9', 'de...",966878,Thursday,2026-05-16
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1034,1035,,0,"[{'id': 18, 'name': 'Drama'}, {'id': 10749, 'n...",,tt0063518,en,Romeo and Juliet,Director Franco Zeffirelli's beloved version o...,6.990990,...,"[{'iso_639_1': 'en', 'name': 'English'}, {'iso...",Released,No ordinary love story...,Romeo and Juliet,"[{'id': 255, 'name': 'male nudity'}, {'id': 16...","[{'cast_id': 6, 'character': 'Romeo', 'credit_...","[{'credit_id': '52fe4435c3a36847f80889c3', 'de...",38901218,Monday,2068-04-04
2147,2148,"[{'id': 264338, 'name': ""Rosemary's Baby Colle...",3200000,"[{'id': 27, 'name': 'Horror'}, {'id': 18, 'nam...",,tt0063522,en,Rosemary's Baby,A young couple moves into an infamous New York...,11.504558,...,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Pray for Rosemary's Baby,Rosemary's Baby,"[{'id': 8685, 'name': 'anti-christ'}, {'id': 1...","[{'cast_id': 15, 'character': 'Rosemary Woodho...","[{'credit_id': '52fe4279c3a36847f8021573', 'de...",33395426,Tuesday,2068-06-14
140,141,,14320000,"[{'id': 18, 'name': 'Drama'}, {'id': 10402, 'n...",,tt0063642,en,Star!,Gertrude Lawrence rises to stage stardom at th...,0.585397,...,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Happiness is a girl called Julie!,Star!,"[{'id': 4344, 'name': 'musical'}, {'id': 12990...","[{'cast_id': 3, 'character': 'Gertrude Lawrenc...","[{'credit_id': '52fe484bc3a36847f81602c9', 'de...",14000000,Wednesday,2068-07-20
647,648,,9000000,"[{'id': 878, 'name': 'Science Fiction'}]",,tt0062711,en,Barbarella,"In the far future, a highly sexual woman is ta...",14.304205,...,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,See Barbarella do her thing!,Barbarella,"[{'id': 9835, 'name': 'sexual fantasy'}, {'id'...","[{'cast_id': 2, 'character': 'Barbarella', 'cr...","[{'credit_id': '52fe4491c3a36847f809d5ab', 'de...",2500000,Wednesday,2068-10-12


##### Dealing with JSON

In [None]:
# pd.json_normalize()

In [None]:
pd.json_normalize([{'a':1,'b':4},{'a':9,'b':7}])

Unnamed: 0,a,b
0,1,4
1,9,7


In [None]:
# pd.DataFrame([{'a':1,'b':4},{'a':9,'b':7}])

Unnamed: 0,a,b
0,1,4
1,9,7


In [None]:
pd.json_normalize([{'a':1,'b':{'c':2,'d':3}},{'a':9,'b':{'c':10,'d':11,'e':12}}])

Unnamed: 0,a,b.c,b.d,b.e
0,1,2,3,
1,9,10,11,12.0


In [None]:
# pd.json_normalize([{'a':1,'b':{'c':2,'d':3}},{'a':9,'b':{'c':10,'d':11,'e':12}}])

In [None]:
read_imdb_data.head(1)

Unnamed: 0,id,belongs_to_collection,budget,genres,homepage,imdb_id,original_language,original_title,overview,popularity,...,spoken_languages,status,tagline,title,Keywords,cast,crew,revenue,release_date_plus2d,date_diff
0,1,"[{'id': 313576, 'name': 'Hot Tub Time Machine ...",14000000,"[{'id': 35, 'name': 'Comedy'}]",,tt2637294,en,Hot Tub Time Machine 2,"When Lou, who has become the ""father of the In...",6.575393,...,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,The Laws of Space and Time are About to be Vio...,Hot Tub Time Machine 2,"[{'id': 4379, 'name': 'time travel'}, {'id': 9...","[{'cast_id': 4, 'character': 'Lou', 'credit_id...","[{'credit_id': '59ac067c92514107af02c8c8', 'de...",12314651,2015-02-22,2


In [None]:
read_imdb_data.genres[0]

"[{'id': 35, 'name': 'Comedy'}]"

In [None]:
pd.json_normalize(eval(read_imdb_data.genres[0]))

Unnamed: 0,id,name
0,35,Comedy


In [None]:
# Steps to take :
# 1. First implement eval to convert from str to Json (list of dict)
# 2. pd.json_normalize to make it dataframe
# 3. get the name column as series, convert to list and join comma separated

In [None]:
read_imdb_data.genres[0]

"[{'id': 35, 'name': 'Comedy'}]"

In [None]:
# extract Genre for the movies
# Custom function
def parse_json(s,key):
  try:
    j = eval(s) # list of dictionaries
    return ','.join(list(pd.json_normalize(j)[key]))
  except:
    return s

In [None]:
read_imdb_data.head(2)

Unnamed: 0,id,belongs_to_collection,budget,genres,homepage,imdb_id,original_language,original_title,overview,popularity,...,release_date,runtime,spoken_languages,status,tagline,title,Keywords,cast,crew,revenue
0,1,"[{'id': 313576, 'name': 'Hot Tub Time Machine ...",14000000,"[{'id': 35, 'name': 'Comedy'}]",,tt2637294,en,Hot Tub Time Machine 2,"When Lou, who has become the ""father of the In...",6.575393,...,2/20/15,93.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,The Laws of Space and Time are About to be Vio...,Hot Tub Time Machine 2,"[{'id': 4379, 'name': 'time travel'}, {'id': 9...","[{'cast_id': 4, 'character': 'Lou', 'credit_id...","[{'credit_id': '59ac067c92514107af02c8c8', 'de...",12314651
1,2,"[{'id': 107674, 'name': 'The Princess Diaries ...",40000000,"[{'id': 35, 'name': 'Comedy'}, {'id': 18, 'nam...",,tt0368933,en,The Princess Diaries 2: Royal Engagement,Mia Thermopolis is now a college graduate and ...,8.248895,...,8/6/04,113.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,It can take a lifetime to find true love; she'...,The Princess Diaries 2: Royal Engagement,"[{'id': 2505, 'name': 'coronation'}, {'id': 42...","[{'cast_id': 1, 'character': 'Mia Thermopolis'...","[{'credit_id': '52fe43fe9251416c7502563d', 'de...",95149435


In [None]:
# list(read_imdb_data.genres[1])

In [None]:
j = eval(read_imdb_data.genres[1])

In [None]:
pd.json_normalize(j)

Unnamed: 0,id,name
0,35,Comedy
1,18,Drama
2,10751,Family
3,10749,Romance


In [None]:
list(pd.json_normalize(j)['name'])

['Comedy', 'Drama', 'Family', 'Romance']

In [None]:
# Comedy,Drama,Family,Romance
','.join(list(pd.json_normalize(j)['name']))

'Comedy,Drama,Family,Romance'

In [None]:
# ','.join(list(pd.json_normalize(j)['name']))

'Comedy,Drama,Family,Romance'

In [None]:
# ','.join(list(pd.json_normalize(j)[key]))

In [None]:
# parse_json(s = read_imdb_data.genres[0],key = 'name')

'Comedy'

In [None]:
read_imdb_data.genres.apply(parse_json,key = 'name')

0                              Comedy
1         Comedy,Drama,Family,Romance
2                               Drama
3                      Thriller,Drama
4                     Action,Thriller
                    ...              
2995                   Comedy,Romance
2996                      Drama,Music
2997    Crime,Action,Mystery,Thriller
2998                   Comedy,Romance
2999          Thriller,Action,Mystery
Name: genres, Length: 3000, dtype: object

In [None]:
read_imdb_data.head(1)

Unnamed: 0,id,belongs_to_collection,budget,genres,homepage,imdb_id,original_language,original_title,overview,popularity,...,release_date,runtime,spoken_languages,status,tagline,title,Keywords,cast,crew,revenue
0,1,"[{'id': 313576, 'name': 'Hot Tub Time Machine ...",14000000,"[{'id': 35, 'name': 'Comedy'}]",,tt2637294,en,Hot Tub Time Machine 2,"When Lou, who has become the ""father of the In...",6.575393,...,2/20/15,93.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,The Laws of Space and Time are About to be Vio...,Hot Tub Time Machine 2,"[{'id': 4379, 'name': 'time travel'}, {'id': 9...","[{'cast_id': 4, 'character': 'Lou', 'credit_id...","[{'credit_id': '59ac067c92514107af02c8c8', 'de...",12314651


In [None]:
read_imdb_data.spoken_languages.head(10)

0             [{'iso_639_1': 'en', 'name': 'English'}]
1             [{'iso_639_1': 'en', 'name': 'English'}]
2             [{'iso_639_1': 'en', 'name': 'English'}]
3    [{'iso_639_1': 'en', 'name': 'English'}, {'iso...
4             [{'iso_639_1': 'ko', 'name': '한국어/조선말'}]
5             [{'iso_639_1': 'en', 'name': 'English'}]
6             [{'iso_639_1': 'en', 'name': 'English'}]
7    [{'iso_639_1': 'ar', 'name': 'العربية'}, {'iso...
8             [{'iso_639_1': 'en', 'name': 'English'}]
9             [{'iso_639_1': 'en', 'name': 'English'}]
Name: spoken_languages, dtype: object

In [None]:
read_imdb_data.spoken_languages.apply(parse_json,key = 'iso_639_1')

0          en
1          en
2          en
3       en,hi
4          ko
        ...  
2995       en
2996       sv
2997       en
2998       en
2999       en
Name: spoken_languages, Length: 3000, dtype: object

In [None]:
# read_imdb_data.cast.apply(parse_json,key = 'name')

0       Rob Corddry,Craig Robinson,Clark Duke,Adam Sco...
1       Anne Hathaway,Julie Andrews,H√©ctor Elizondo,J...
2       Miles Teller,J.K. Simmons,Melissa Benoist,Aust...
3       Vidya Balan,Nawazuddin Siddiqui,Parambrata Cha...
4       Kim Kang-woo,Jo Jae-hyeon,Park Si-yeon,Kim Joo...
                              ...                        
2995    Tom Berenger,William McNamara,–≠—Ä–∏–∫–∞ –≠–ª–...
2996    Mira Barkhammar,Mira Grosin,Liv LeMoyne,David ...
2997    Geena Davis,Samuel L. Jackson,Yvonne Zima,Crai...
2998    Ben Stiller,Jennifer Aniston,Philip Seymour Ho...
2999    Taylor Lautner,Lily Collins,Alfred Molina,Jaso...
Name: cast, Length: 3000, dtype: object

### Reshaping data

In [None]:
# .pivot(), .crosstab(), .astype()

In [None]:
read_imdb_data.head(1)

Unnamed: 0,id,belongs_to_collection,budget,genres,homepage,imdb_id,original_language,original_title,overview,popularity,...,release_date,runtime,spoken_languages,status,tagline,title,Keywords,cast,crew,revenue
0,1,"[{'id': 313576, 'name': 'Hot Tub Time Machine ...",14000000,"[{'id': 35, 'name': 'Comedy'}]",,tt2637294,en,Hot Tub Time Machine 2,"When Lou, who has become the ""father of the In...",6.575393,...,2/20/15,93.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,The Laws of Space and Time are About to be Vio...,Hot Tub Time Machine 2,"[{'id': 4379, 'name': 'time travel'}, {'id': 9...","[{'cast_id': 4, 'character': 'Lou', 'credit_id...","[{'credit_id': '59ac067c92514107af02c8c8', 'de...",12314651


In [None]:
read_imdb_data.columns

Index(['id', 'belongs_to_collection', 'budget', 'genres', 'homepage',
       'imdb_id', 'original_language', 'original_title', 'overview',
       'popularity', 'poster_path', 'production_companies',
       'production_countries', 'release_date', 'runtime', 'spoken_languages',
       'status', 'tagline', 'title', 'Keywords', 'cast', 'crew', 'revenue'],
      dtype='object')

In [None]:
read_imdb_data.groupby(['original_language','release_date']).agg({'runtime':'mean','revenue':'mean'}).reset_index().sort_values('original_language')

Unnamed: 0,original_language,release_date,runtime,revenue
0,english,1969-01-01,149.0,20000000.0
1422,english,2010-04-23,101.5,38750433.5
1421,english,2010-04-02,92.0,39340177.0
1420,english,2010-04-01,118.0,3878993.0
1419,english,2010-03-22,117.0,96188903.0
...,...,...,...,...
2278,others,2005-05-02,117.0,36000000.0
2277,others,2005-03-05,126.0,8262833.0
2276,others,2004-12-23,72.0,1730000.0
2285,others,2005-08-04,133.0,33579813.0


In [None]:
# df.pivot(index='col1', columns='col2', values=['col']) # note: index & column must be unique
# Function used in combination with group by

In [None]:
read_imdb_data.groupby(['original_language','release_date']).agg({'revenue':'mean'}).reset_index()

Unnamed: 0,original_language,release_date,revenue
0,english,1969-01-01,20000000.0
1,english,1969-06-17,638641.0
2,english,1969-09-23,102308889.0
3,english,1969-12-12,33208099.0
4,english,1969-12-18,6000000.0
...,...,...,...
2524,others,2060-05-24,8000000.0
2525,others,2064-08-05,182857.0
2526,others,2066-04-05,17277.0
2527,others,2066-09-08,921548.0


In [None]:
read_imdb_data_grpd_pivot

Unnamed: 0_level_0,release_date,revenue,revenue,revenue
original_language,Unnamed: 1_level_1,english,hindi,others
0,1969-01-01,20000000.0,,
1,1969-06-17,638641.0,,
2,1969-09-10,,,741766.0
3,1969-09-23,102308889.0,,
4,1969-12-12,33208099.0,,
...,...,...,...,...
2393,2068-04-02,38901218.0,,
2394,2068-06-12,33395426.0,,
2395,2068-07-18,14000000.0,,
2396,2068-10-10,2500000.0,,


In [None]:
read_imdb_data.groupby(['original_language','release_date']).agg({'revenue':'mean'})

Unnamed: 0_level_0,Unnamed: 1_level_0,revenue
original_language,release_date,Unnamed: 2_level_1
english,1969-01-01,20000000.0
english,1969-06-17,638641.0
english,1969-09-23,102308889.0
english,1969-12-12,33208099.0
english,1969-12-18,6000000.0
...,...,...
others,2060-05-24,8000000.0
others,2064-08-05,182857.0
others,2066-04-05,17277.0
others,2066-09-08,921548.0


In [None]:
read_imdb_data_grpd = read_imdb_data.groupby(['original_language','release_date']).agg({'revenue':'mean'}).reset_index()

In [None]:
read_imdb_data_grpd_pivot = read_imdb_data_grpd.pivot(index = 'release_date',columns = 'original_language', values = [	'revenue']).reset_index()

In [None]:
read_imdb_data_grpd_pivot

Unnamed: 0_level_0,release_date,revenue,revenue,revenue
original_language,Unnamed: 1_level_1,english,hindi,others
0,1969-01-01,20000000.0,,
1,1969-06-17,638641.0,,
2,1969-09-10,,,741766.0
3,1969-09-23,102308889.0,,
4,1969-12-12,33208099.0,,
...,...,...,...,...
2393,2068-04-02,38901218.0,,
2394,2068-06-12,33395426.0,,
2395,2068-07-18,14000000.0,,
2396,2068-10-10,2500000.0,,


In [None]:
read_imdb_data_grpd_pivot.columns

MultiIndex([('release_date',        ''),
            (     'runtime', 'english'),
            (     'runtime',   'hindi'),
            (     'runtime',  'others'),
            (     'revenue', 'english'),
            (     'revenue',   'hindi'),
            (     'revenue',  'others')],
           names=[None, 'original_language'])

In [None]:
read_imdb_data_grpd_pivot.head()

Unnamed: 0_level_0,release_date,runtime,runtime,runtime,revenue,revenue,revenue
original_language,Unnamed: 1_level_1,english,hindi,others,english,hindi,others
0,1969-01-01,149.0,,,20000000.0,,
1,1969-06-17,145.0,,,638641.0,,
2,1969-09-10,,,145.0,,,741766.0
3,1969-09-23,110.0,,,102308889.0,,
4,1969-12-12,146.0,,,33208099.0,,


In [None]:
read_imdb_data_grpd_pivot.loc[:30,[('runtime', 'others'),('revenue','hindi')]]

Unnamed: 0_level_0,runtime,revenue
original_language,others,hindi
0,,
1,,
2,145.0,
3,,
4,,
5,,
6,,
7,,
8,,
9,126.0,


In [None]:
# Accuracy --> Actual vs Predicted
# heatmap --> For categorical vs categorical Variable

In [None]:
read_imdb_data.head()

Unnamed: 0,id,belongs_to_collection,budget,genres,homepage,imdb_id,original_language,original_title,overview,popularity,...,release_date,runtime,spoken_languages,status,tagline,title,Keywords,cast,crew,revenue
0,1,"[{'id': 313576, 'name': 'Hot Tub Time Machine ...",14000000,"[{'id': 35, 'name': 'Comedy'}]",,tt2637294,english,Hot Tub Time Machine 2,"When Lou, who has become the ""father of the In...",6.575393,...,2015-02-20,93.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,The Laws of Space and Time are About to be Vio...,Hot Tub Time Machine 2,"[{'id': 4379, 'name': 'time travel'}, {'id': 9...","[{'cast_id': 4, 'character': 'Lou', 'credit_id...","[{'credit_id': '59ac067c92514107af02c8c8', 'de...",12314651
1,2,"[{'id': 107674, 'name': 'The Princess Diaries ...",40000000,"[{'id': 35, 'name': 'Comedy'}, {'id': 18, 'nam...",,tt0368933,english,The Princess Diaries 2: Royal Engagement,Mia Thermopolis is now a college graduate and ...,8.248895,...,2004-08-06,113.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,It can take a lifetime to find true love; she'...,The Princess Diaries 2: Royal Engagement,"[{'id': 2505, 'name': 'coronation'}, {'id': 42...","[{'cast_id': 1, 'character': 'Mia Thermopolis'...","[{'credit_id': '52fe43fe9251416c7502563d', 'de...",95149435
2,3,,3300000,"[{'id': 18, 'name': 'Drama'}]",http://sonyclassics.com/whiplash/,tt2582802,english,Whiplash,"Under the direction of a ruthless instructor, ...",64.29999,...,2014-10-10,105.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,The road to greatness can take you to the edge.,Whiplash,"[{'id': 1416, 'name': 'jazz'}, {'id': 1523, 'n...","[{'cast_id': 5, 'character': 'Andrew Neimann',...","[{'credit_id': '54d5356ec3a3683ba0000039', 'de...",13092000
3,4,,1200000,"[{'id': 53, 'name': 'Thriller'}, {'id': 18, 'n...",http://kahaanithefilm.com/,tt1821480,hindi,Kahaani,Vidya Bagchi (Vidya Balan) arrives in Kolkata ...,3.174936,...,2012-03-09,122.0,"[{'iso_639_1': 'en', 'name': 'English'}, {'iso...",Released,,Kahaani,"[{'id': 10092, 'name': 'mystery'}, {'id': 1054...","[{'cast_id': 1, 'character': 'Vidya Bagchi', '...","[{'credit_id': '52fe48779251416c9108d6eb', 'de...",16000000
4,5,,0,"[{'id': 28, 'name': 'Action'}, {'id': 53, 'nam...",,tt1380152,others,마린보이,Marine Boy is the story of a former national s...,1.14807,...,2009-02-05,118.0,"[{'iso_639_1': 'ko', 'name': '한국어/조선말'}]",Released,,Marine Boy,,"[{'cast_id': 3, 'character': 'Chun-soo', 'cred...","[{'credit_id': '52fe464b9251416c75073b43', 'de...",3923970


In [None]:
read_imdb_data.groupby(['release_date','original_language'])['id'].count().reset_index()

Unnamed: 0,release_date,original_language,id
0,1969-01-01,english,1
1,1969-06-17,english,1
2,1969-09-10,others,1
3,1969-09-23,english,1
4,1969-12-12,english,1
...,...,...,...
2524,2068-04-02,english,1
2525,2068-06-12,english,1
2526,2068-07-18,english,1
2527,2068-10-10,english,1


In [None]:
# .crosstab() -->
pd.crosstab(read_imdb_data.release_date,read_imdb_data.original_language ).reset_index()

original_language,release_date,english,hindi,others
0,1969-01-01,1,0,0
1,1969-06-17,1,0,0
2,1969-09-10,0,0,1
3,1969-09-23,1,0,0
4,1969-12-12,1,0,0
...,...,...,...,...
2393,2068-04-02,1,0,0
2394,2068-06-12,1,0,0
2395,2068-07-18,1,0,0
2396,2068-10-10,1,0,0


In [None]:
read_imdb_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3000 entries, 0 to 2999
Data columns (total 23 columns):
 #   Column                 Non-Null Count  Dtype         
---  ------                 --------------  -----         
 0   id                     3000 non-null   int64         
 1   belongs_to_collection  604 non-null    object        
 2   budget                 3000 non-null   int64         
 3   genres                 2993 non-null   object        
 4   homepage               946 non-null    object        
 5   imdb_id                3000 non-null   object        
 6   original_language      3000 non-null   object        
 7   original_title         3000 non-null   object        
 8   overview               2992 non-null   object        
 9   popularity             3000 non-null   float64       
 10  poster_path            2999 non-null   object        
 11  production_companies   2844 non-null   object        
 12  production_countries   2945 non-null   object        
 13  rel

In [None]:
# .astype() --> Typecasting
# Int to str
read_imdb_data.id.astype('str').astype('int')

0          1
1          2
2          3
3          4
4          5
        ... 
2995    2996
2996    2997
2997    2998
2998    2999
2999    3000
Name: id, Length: 3000, dtype: int64

In [None]:
read_imdb_data['date_concat'] = read_imdb_data.release_date.dt.year.astype('str') + '-' + read_imdb_data.release_date.dt.month.astype('str')

In [None]:
read_imdb_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3000 entries, 0 to 2999
Data columns (total 24 columns):
 #   Column                 Non-Null Count  Dtype         
---  ------                 --------------  -----         
 0   id                     3000 non-null   int64         
 1   belongs_to_collection  604 non-null    object        
 2   budget                 3000 non-null   int64         
 3   genres                 2993 non-null   object        
 4   homepage               946 non-null    object        
 5   imdb_id                3000 non-null   object        
 6   original_language      3000 non-null   object        
 7   original_title         3000 non-null   object        
 8   overview               2992 non-null   object        
 9   popularity             3000 non-null   float64       
 10  poster_path            2999 non-null   object        
 11  production_companies   2844 non-null   object        
 12  production_countries   2945 non-null   object        
 13  rel

In [None]:
read_imdb_data.date_concat[0]

'2015-2'

### Caching & Parallelization

In [None]:
#  Parallelization & Caching

#####  Parallelization

In [None]:
# Parallelization
!pip install pandarallel
from pandarallel import pandarallel

Collecting pandarallel
  Downloading pandarallel-1.6.5.tar.gz (14 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting dill>=0.3.1 (from pandarallel)
  Downloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m3.3 MB/s[0m eta [36m0:00:00[0m
Building wheels for collected packages: pandarallel
  Building wheel for pandarallel (setup.py) ... [?25l[?25hdone
  Created wheel for pandarallel: filename=pandarallel-1.6.5-py3-none-any.whl size=16673 sha256=965f1d064002ecfea78fb7d4139444b5d6d4681187e0e5121098f4340fc1be89
  Stored in directory: /root/.cache/pip/wheels/50/4f/1e/34e057bb868842209f1623f195b74fd7eda229308a7352d47f
Successfully built pandarallel
Installing collected packages: dill, pandarallel
Successfully installed dill-0.3.8 pandarallel-1.6.5


In [None]:
# pandarallel.initialize(nb_workers=1)

In [None]:
# %time  --> 1 execution and time taken
# %timeit --> 1000 executions and avg time taken

In [None]:
read_imdb_data = pd.concat([read_imdb_data]* 5)
read_imdb_data.shape

(15000, 23)

In [None]:
# Column with all links.parallel_apply(funciton)

In [None]:
pandarallel.initialize(nb_workers=1)
%time read_imdb_data.cast.parallel_apply(parse_json,key = 'name')

INFO: Pandarallel will run on 1 workers.
INFO: Pandarallel will use Memory file system to transfer data between the main process and workers.
CPU times: user 527 ms, sys: 139 ms, total: 666 ms
Wall time: 1min 36s


0       Rob Corddry,Craig Robinson,Clark Duke,Adam Sco...
1       Anne Hathaway,Julie Andrews,H√©ctor Elizondo,J...
2       Miles Teller,J.K. Simmons,Melissa Benoist,Aust...
3       Vidya Balan,Nawazuddin Siddiqui,Parambrata Cha...
4       Kim Kang-woo,Jo Jae-hyeon,Park Si-yeon,Kim Joo...
                              ...                        
2995    Tom Berenger,William McNamara,–≠—Ä–∏–∫–∞ –≠–ª–...
2996    Mira Barkhammar,Mira Grosin,Liv LeMoyne,David ...
2997    Geena Davis,Samuel L. Jackson,Yvonne Zima,Crai...
2998    Ben Stiller,Jennifer Aniston,Philip Seymour Ho...
2999    Taylor Lautner,Lily Collins,Alfred Molina,Jaso...
Name: cast, Length: 15000, dtype: object

In [None]:
pandarallel.initialize(nb_workers = 2)
%time read_imdb_data.cast.parallel_apply(parse_json,key = 'name')

INFO: Pandarallel will run on 2 workers.
INFO: Pandarallel will use Memory file system to transfer data between the main process and workers.
CPU times: user 316 ms, sys: 138 ms, total: 455 ms
Wall time: 57.6 s


0       Rob Corddry,Craig Robinson,Clark Duke,Adam Sco...
1       Anne Hathaway,Julie Andrews,H√©ctor Elizondo,J...
2       Miles Teller,J.K. Simmons,Melissa Benoist,Aust...
3       Vidya Balan,Nawazuddin Siddiqui,Parambrata Cha...
4       Kim Kang-woo,Jo Jae-hyeon,Park Si-yeon,Kim Joo...
                              ...                        
2995    Tom Berenger,William McNamara,–≠—Ä–∏–∫–∞ –≠–ª–...
2996    Mira Barkhammar,Mira Grosin,Liv LeMoyne,David ...
2997    Geena Davis,Samuel L. Jackson,Yvonne Zima,Crai...
2998    Ben Stiller,Jennifer Aniston,Philip Seymour Ho...
2999    Taylor Lautner,Lily Collins,Alfred Molina,Jaso...
Name: cast, Length: 15000, dtype: object

In [None]:
# pandarallel.initialize(nb_workers = 4)
# %time read_imdb_data.cast.parallel_apply(parse_json,key = 'name')

INFO: Pandarallel will run on 4 workers.
INFO: Pandarallel will use Memory file system to transfer data between the main process and workers.
CPU times: user 60.6 ms, sys: 80.1 ms, total: 141 ms
Wall time: 5.86 s


0       Rob Corddry,Craig Robinson,Clark Duke,Adam Sco...
1       Anne Hathaway,Julie Andrews,H√©ctor Elizondo,J...
2       Miles Teller,J.K. Simmons,Melissa Benoist,Aust...
3       Vidya Balan,Nawazuddin Siddiqui,Parambrata Cha...
4       Kim Kang-woo,Jo Jae-hyeon,Park Si-yeon,Kim Joo...
                              ...                        
2995    Tom Berenger,William McNamara,–≠—Ä–∏–∫–∞ –≠–ª–...
2996    Mira Barkhammar,Mira Grosin,Liv LeMoyne,David ...
2997    Geena Davis,Samuel L. Jackson,Yvonne Zima,Crai...
2998    Ben Stiller,Jennifer Aniston,Philip Seymour Ho...
2999    Taylor Lautner,Lily Collins,Alfred Molina,Jaso...
Name: cast, Length: 3000, dtype: object

In [None]:
%time read_imdb_data.cast.apply(parse_json,key = 'name')

CPU times: user 45.8 s, sys: 304 ms, total: 46.1 s
Wall time: 46.5 s


0       Rob Corddry,Craig Robinson,Clark Duke,Adam Sco...
1       Anne Hathaway,Julie Andrews,H√©ctor Elizondo,J...
2       Miles Teller,J.K. Simmons,Melissa Benoist,Aust...
3       Vidya Balan,Nawazuddin Siddiqui,Parambrata Cha...
4       Kim Kang-woo,Jo Jae-hyeon,Park Si-yeon,Kim Joo...
                              ...                        
2995    Tom Berenger,William McNamara,–≠—Ä–∏–∫–∞ –≠–ª–...
2996    Mira Barkhammar,Mira Grosin,Liv LeMoyne,David ...
2997    Geena Davis,Samuel L. Jackson,Yvonne Zima,Crai...
2998    Ben Stiller,Jennifer Aniston,Philip Seymour Ho...
2999    Taylor Lautner,Lily Collins,Alfred Molina,Jaso...
Name: cast, Length: 30000, dtype: object

##### Caching

In [None]:
# joblib
from joblib import Memory
mem = Memory(location='cache')

In [None]:
@mem.cache
def parse_json_cache(s,key):
  try:
    j= eval(s)
    return ','.join(list(pd.json_normalize(j)[key]))
  except:
    return s

In [None]:
%time read_imdb_data.cast.apply(parse_json_cache,key = 'name') #First time run

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
_________________________________________________parse_json_cache - 0.0s, 0.0min
________________________________________________________________________________
[Memory] Calling __main__--content-drive-MyDrive-M2_Project-<ipython-input-7067ccf78c42>.parse_json_cache...
parse_json_cache(("[{'cast_id': 11, 'character': 'Caesar', 'credit_id': "
 "'52fe430ec3a36847f8037199', 'gender': 2, 'id': 7505, 'name': 'Roddy "
 "McDowall', 'order': 0, 'profile_path': '/e0OZn5SUXHghdmAgJbF3uLHD5gf.jpg'}, "
 "{'cast_id': 21, 'character': 'Lisa', 'credit_id': "
 "'52fe430ec3a36847f80371bf', 'gender': 1, 'id': 18648, 'name': 'Natalie "
 "Trundy', 'order': 1, 'profile_path': '/50ljrtCFeFQRFURoSgf1hrgdCPw.jpg'}, "
 "{'cast_id': 23, 'character': 'MacDonald', 'credit_id': "
 "'52fe430ec3a36847f80371c7', 'gender': 2, 'id': 103707, 'name': 'Austin "
 "Stoker', 'order': 2, 'profile_path': '/ukRtXDLGlmaSE4lAruhAqG7IsLJ.jpg'}, "
 "{'cast_id': 26, '

  return self._cached_call(args, kwargs, shelving=False)[0]


[1;30;43mStreaming output truncated to the last 5000 lines.[0m
parse_json_cache(("[{'cast_id': 1, 'character': 'Blaire Lilly', 'credit_id': "
 "'53d8e6ec0e0a261c840023c5', 'gender': 1, 'id': 444211, 'name': 'Shelley "
 "Hennig', 'order': 0, 'profile_path': '/7o9Cw2YO63zsQBlFkdbKn3YcUvQ.jpg'}, "
 "{'cast_id': 17, 'character': 'Mitch Roussel', 'credit_id': "
 "'5464324ac3a36804b4000b8b', 'gender': 0, 'id': 1385058, 'name': 'Moses Jacob "
 "Storm', 'order': 1, 'profile_path': '/mFnBuVTT6CjQ4GYAOJpsSeBN8KS.jpg'}, "
 '{\'cast_id\': 2, \'character\': \'Jessica "Jess" Felton\', \'credit_id\': '
 "'53d8e6f60e0a261c7e0024b2', 'gender': 1, 'id': 155621, 'name': 'Renee "
 "Olstead', 'order': 2, 'profile_path': '/Alru3bM9sSE0Q6yeN5fLwNxIy8v.jpg'}, "
 "{'cast_id': 14, 'character': '..., key='name')
_________________________________________________parse_json_cache - 0.0s, 0.0min
________________________________________________________________________________
[Memory] Calling __main__--content-driv

0       Rob Corddry,Craig Robinson,Clark Duke,Adam Sco...
1       Anne Hathaway,Julie Andrews,H√©ctor Elizondo,J...
2       Miles Teller,J.K. Simmons,Melissa Benoist,Aust...
3       Vidya Balan,Nawazuddin Siddiqui,Parambrata Cha...
4       Kim Kang-woo,Jo Jae-hyeon,Park Si-yeon,Kim Joo...
                              ...                        
2995    Tom Berenger,William McNamara,–≠—Ä–∏–∫–∞ –≠–ª–...
2996    Mira Barkhammar,Mira Grosin,Liv LeMoyne,David ...
2997    Geena Davis,Samuel L. Jackson,Yvonne Zima,Crai...
2998    Ben Stiller,Jennifer Aniston,Philip Seymour Ho...
2999    Taylor Lautner,Lily Collins,Alfred Molina,Jaso...
Name: cast, Length: 15000, dtype: object

In [None]:
%time read_imdb_data.cast.apply(parse_json_cache,key = 'name') # load from the saved cache

CPU times: user 17.1 s, sys: 6.09 s, total: 23.2 s
Wall time: 1min 58s


0       Rob Corddry,Craig Robinson,Clark Duke,Adam Sco...
1       Anne Hathaway,Julie Andrews,H√©ctor Elizondo,J...
2       Miles Teller,J.K. Simmons,Melissa Benoist,Aust...
3       Vidya Balan,Nawazuddin Siddiqui,Parambrata Cha...
4       Kim Kang-woo,Jo Jae-hyeon,Park Si-yeon,Kim Joo...
                              ...                        
2995    Tom Berenger,William McNamara,–≠—Ä–∏–∫–∞ –≠–ª–...
2996    Mira Barkhammar,Mira Grosin,Liv LeMoyne,David ...
2997    Geena Davis,Samuel L. Jackson,Yvonne Zima,Crai...
2998    Ben Stiller,Jennifer Aniston,Philip Seymour Ho...
2999    Taylor Lautner,Lily Collins,Alfred Molina,Jaso...
Name: cast, Length: 15000, dtype: object

### Writing data

In [None]:
os.getcwd()

'/content/drive/MyDrive/M2_Project'

In [None]:
# Writing to csv
read_imdb_data.to_csv('read_imdb_data_w.csv', index=False) # Index = False to avoid the index.

In [None]:
# Writing to excel

In [None]:
# df
data = {'Name': ['Alice', 'Bob', 'Charlie'],
        'Age': [25, 30, 35]}
df = pd.DataFrame(data)
df

Unnamed: 0,Name,Age
0,Alice,25
1,Bob,30
2,Charlie,35


In [None]:
# df2
data = {'Name': ['John', 'Hop', 'Titan'],
        'Age': [40, 35, 55]}
df2 = pd.DataFrame(data)
df2

Unnamed: 0,Name,Age
0,John,40
1,Hop,35
2,Titan,55


In [None]:
with pd.ExcelWriter('dataframe.xlsx') as writer:
    df.to_excel(writer, sheet_name='df')
    df2.to_excel(writer, sheet_name='df2', index = False)

In [None]:
df

Unnamed: 0,Name,Age
0,Alice,25
1,Bob,30
2,Charlie,35


In [None]:
# Writing to Json

# Convert DataFrame to JSON with 'records' orient
json_str = df.to_json(orient='records')

print(json_str)

[{"Name":"Alice","Age":25},{"Name":"Bob","Age":30},{"Name":"Charlie","Age":35}]


In [None]:
json_str = df.to_json(orient='columns')

print(json_str)

{"Name":{"0":"Alice","1":"Bob","2":"Charlie"},"Age":{"0":25,"1":30,"2":35}}


In [None]:
# Convert DataFrame to HTML
html_table = df.to_html(index=False)

# Print or save the HTML table
print(html_table)

<table border="1" class="dataframe">
  <thead>
    <tr style="text-align: right;">
      <th>Name</th>
      <th>Age</th>
    </tr>
  </thead>
  <tbody>
    <tr>
      <td>Alice</td>
      <td>25</td>
    </tr>
    <tr>
      <td>Bob</td>
      <td>30</td>
    </tr>
    <tr>
      <td>Charlie</td>
      <td>35</td>
    </tr>
  </tbody>
</table>
