# Pandas

Install Pandas using `pip install pandas` on terminal. 

import the library as pd (this is the standard)

In [1]:
import pandas as pd

- reading a csv file

In [2]:
df = pd.read_csv('movies.csv')

df - dataframe is a data structure provided by pandas to store 2d tabels with labels

- Display first five rows

In [4]:
df.head()

df.head(2) # displays the first 2 rows of the DataFrame 

Unnamed: 0,title,industry,release_year,imdb_rating,studio,budget,revenue,unit,currency,language
0,Pather Panchali,Bollywood,1955,8.3,Government of West Bengal,70000.0,100000.0,Thousands,INR,Bengali
1,Doctor Strange in the Multiverse of Madness,Hollywood,2022,7.0,Marvel Studios,200.0,954.8,Millions,USD,English


- Display last five rows

In [5]:
df.tail() # displays the last 5 rows of the DataFrame

df.tail(3) # displays the last 3 rows of the DataFrame

Unnamed: 0,title,industry,release_year,imdb_rating,studio,budget,revenue,unit,currency,language
34,Pushpa: The Rise - Part 1,Bollywood,2021,7.6,Mythri Movie Makers,2.0,3.6,Billions,INR,Telugu
35,RRR,Bollywood,2022,8.0,DVV Entertainment,5.5,12.0,Billions,INR,Telugu
36,Baahubali: The Beginning,Bollywood,2015,8.0,Arka Media Works,1.8,6.5,Billions,INR,Telugu


- Display random(sample) rows

In [7]:
df.sample()

df.sample(3) # displays 3 random rows from the DataFrame

Unnamed: 0,title,industry,release_year,imdb_rating,studio,budget,revenue,unit,currency,language
1,Doctor Strange in the Multiverse of Madness,Hollywood,2022,7.0,Marvel Studios,200.0,954.8,Millions,USD,English
31,Race 3,Bollywood,2018,1.9,Salman Khan Films,1.8,3.1,Billions,INR,Hindi
4,Thor: Love and Thunder,Hollywood,2022,6.8,Marvel Studios,250.0,670.0,Millions,USD,English


- Get rows and cols (shape)

In [13]:
df.shape # returns the number of rows and columns in the DataFrame as a tuple (rows, columns)

(37, 10)

- Display all cols

In [16]:
df.columns # returns the column names of the DataFrame as an Index object

Index(['title', 'industry', 'release_year', 'imdb_rating', 'studio', 'budget',
       'revenue', 'unit', 'currency', 'language'],
      dtype='str')

In [None]:
df.count() # returns the number of non-null values in each column of the DataFrame

title           37
industry        37
release_year    37
imdb_rating     36
studio          34
budget          37
revenue         37
unit            37
currency        37
language        37
dtype: int64

- find how many different types of industries we have in our movies df

In [10]:
df.industry.unique() # returns the unique values in the 'industry' column of the DataFrame

<StringArray>
['Bollywood', 'Hollywood']
Length: 2, dtype: str

- find how many movies in each

In [17]:
df.industry.value_counts() # returns the count of unique values in the 'industry' column of the DataFrame

industry
Hollywood    20
Bollywood    17
Name: count, dtype: int64

- Find min, max, mean, etc

In [14]:
df.imdb_rating.min() # returns the minimum value in the 'imdb_rating' column of the DataFrame
df.imdb_rating.max() # returns the maximum value in the 'imdb_rating' column of the DataFrame
df.imdb_rating.mean() # returns the mean (average) of the 'imdb_rating' column in the DataFrame
df.imdb_rating.median() # returns the median of the 'imdb_rating' column in the DataFrame

np.float64(8.1)

- Display movies released between 2000 and 2010

In [20]:
df[(df.release_year > 2000) & (df.release_year < 2010)] # filters the DataFrame to include only rows where the 'release_year' column is greater than 2000 and less than 2010

Unnamed: 0,title,industry,release_year,imdb_rating,studio,budget,revenue,unit,currency,language
7,The Pursuit of Happyness,Hollywood,2006,8.0,Columbia Pictures,55.0,307.1,Millions,USD,English
11,Avatar,Hollywood,2009,7.8,20th Century Fox,237.0,2847.0,Millions,USD,English
13,The Dark Knight,Hollywood,2008,9.0,Syncopy,185.0,1006.0,Millions,USD,English
22,3 Idiots,Bollywood,2009,8.4,Vinod Chopra Films,550.0,4000.0,Millions,INR,Hindi
23,Kabhi Khushi Kabhie Gham,Bollywood,2001,7.4,Dharma Productions,390.0,1360.0,Millions,INR,Hindi
25,Taare Zameen Par,Bollywood,2007,8.3,,120.0,1350.0,Millions,INR,Hindi
26,Munna Bhai M.B.B.S.,Bollywood,2003,8.1,Vinod Chopra Productions,100.0,410.0,Millions,INR,Hindi


- Display all marvel movies

In [22]:
df[df.studio == 'Marvel Studios'] # filters the DataFrame to include only rows where the 'studio' column is equal to 'Marvel Studios'

Unnamed: 0,title,industry,release_year,imdb_rating,studio,budget,revenue,unit,currency,language
1,Doctor Strange in the Multiverse of Madness,Hollywood,2022,7.0,Marvel Studios,200.0,954.8,Millions,USD,English
2,Thor: The Dark World,Hollywood,2013,6.8,Marvel Studios,165.0,644.8,Millions,USD,English
3,Thor: Ragnarok,Hollywood,2017,7.9,Marvel Studios,180.0,854.0,Millions,USD,English
4,Thor: Love and Thunder,Hollywood,2022,6.8,Marvel Studios,250.0,670.0,Millions,USD,English
17,Avengers: Endgame,Hollywood,2019,8.4,Marvel Studios,400.0,2798.0,Millions,USD,English
18,Avengers: Infinity War,Hollywood,2018,8.4,Marvel Studios,400.0,2048.0,Millions,USD,English
19,Captain America: The First Avenger,Hollywood,2011,6.9,Marvel Studios,216.7,370.6,Millions,USD,English
20,Captain America: The Winter Soldier,Hollywood,2014,7.8,Marvel Studios,177.0,714.4,Millions,USD,English


- Add a new col called profit(difference between  revenue and budget)

In [26]:
df["profit"] = df.revenue - df.budget # creates a new column 'profit' by subtracting the 'budget' column from the 'revenue' column in the DataFrame
df.head()

Unnamed: 0,title,industry,release_year,imdb_rating,studio,budget,revenue,unit,currency,language,profit
0,Pather Panchali,Bollywood,1955,8.3,Government of West Bengal,70000.0,100000.0,Thousands,INR,Bengali,30000.0
1,Doctor Strange in the Multiverse of Madness,Hollywood,2022,7.0,Marvel Studios,200.0,954.8,Millions,USD,English,754.8
2,Thor: The Dark World,Hollywood,2013,6.8,Marvel Studios,165.0,644.8,Millions,USD,English,479.8
3,Thor: Ragnarok,Hollywood,2017,7.9,Marvel Studios,180.0,854.0,Millions,USD,English,674.0
4,Thor: Love and Thunder,Hollywood,2022,6.8,Marvel Studios,250.0,670.0,Millions,USD,English,420.0


Unnamed: 0,title,industry,release_year,imdb_rating,studio,budget,revenue,unit,currency,language,profit
0,Pather Panchali,Bollywood,1955,8.3,Government of West Bengal,70000.0,100000.0,Thousands,INR,Bengali,30000.0
1,Doctor Strange in the Multiverse of Madness,Hollywood,2022,7.0,Marvel Studios,200.0,954.8,Millions,USD,English,754.8
2,Thor: The Dark World,Hollywood,2013,6.8,Marvel Studios,165.0,644.8,Millions,USD,English,479.8
3,Thor: Ragnarok,Hollywood,2017,7.9,Marvel Studios,180.0,854.0,Millions,USD,English,674.0
4,Thor: Love and Thunder,Hollywood,2022,6.8,Marvel Studios,250.0,670.0,Millions,USD,English,420.0


- Add a col called age (current year - release year)

In [31]:
df["age"] = 2026 - df.release_year # creates a new column 'age' by subtracting the 'release_year' column from the current year (2024) in the DataFrame
df.head()

Unnamed: 0,title,industry,release_year,imdb_rating,studio,budget,revenue,unit,currency,language,profit,age
0,Pather Panchali,Bollywood,1955,8.3,Government of West Bengal,70000.0,100000.0,Thousands,INR,Bengali,30000.0,71
1,Doctor Strange in the Multiverse of Madness,Hollywood,2022,7.0,Marvel Studios,200.0,954.8,Millions,USD,English,754.8,4
2,Thor: The Dark World,Hollywood,2013,6.8,Marvel Studios,165.0,644.8,Millions,USD,English,479.8,13
3,Thor: Ragnarok,Hollywood,2017,7.9,Marvel Studios,180.0,854.0,Millions,USD,English,674.0,9
4,Thor: Love and Thunder,Hollywood,2022,6.8,Marvel Studios,250.0,670.0,Millions,USD,English,420.0,4
