# Pandas Intro

**pandas** is a fast, powerful, flexible and easy to use open source data analysis and manipulation tool, built on top of the Python programming language. (https://pandas.pydata.org/)

In [1]:
import pandas as pd

## Series
![series](series.png)

The two primary building blocks of **pandas** are the `Series` and the `DataFrame`.

A `Series` is essentially a column of data with
* a name
* a row index
* a datatype

In [5]:
# Build a named Series from a list of values plus explicit row labels.
# One float in the list (75.5) is enough for the whole column to become float64.
data = [42, 53, 64, 75.5]
ser = pd.Series(
    data=data,
    index=["john", "paul", "george", "ringo"],
    name="sales",
)
ser

john      42.0
paul      53.0
george    64.0
ringo     75.5
Name: sales, dtype: float64

In [8]:
# Build a Series from a dict: the keys become the index labels, the values the data.
data_dict = dict(XBX=1989, EP=1912, OX=2022)
club_series = pd.Series(data=data_dict)
club_series

XBX    1989
EP     1912
OX     2022
dtype: int64

## DataFrame

A `DataFrame` is a collection of one or more `Series`, i.e. a 2-dimensional table of data with
* a Series per column
* a shared index for all the columns
* a name for each column (the column labels)

![](series-and-dataframe.png)

In [11]:
# Build a DataFrame from a dict of equal-length lists:
# each key becomes a column name and each list becomes that column's values.
data = {
    "apples": [3, 2, 0, 1],
    "oranges": [0, 3, 7, 2],
}
fruit_df = pd.DataFrame(data=data)
fruit_df

Unnamed: 0,apples,oranges
0,3,0
1,2,3
2,0,7
3,1,2


### Components of a `DataFrame`

In [12]:
# index: the row labels of the DataFrame (a default RangeIndex here, since none was given)
fruit_df.index

RangeIndex(start=0, stop=4, step=1)

In [13]:
# custom index: rebuild the frame with explicit row labels instead of the default 0..n-1 range
fruit_df = pd.DataFrame(
    data=data,
    index=[4, 8, 16, 32],
)
fruit_df

Unnamed: 0,apples,oranges
4,3,0
8,2,3
16,0,7
32,1,2


In [14]:
# data: the underlying values as a 2-D NumPy array (one array row per DataFrame row)
fruit_df.to_numpy()

array([[3, 0],
       [2, 3],
       [0, 7],
       [1, 2]], dtype=int64)

In [15]:
# datatypes: a Series holding the dtype of each column, indexed by column name
fruit_df.dtypes

apples     int64
oranges    int64
dtype: object

## Reading in a csv

In [17]:
# Load the movie dataset from a CSV file; the path is relative to the current working directory.
movies_df = pd.read_csv('movie.csv')
# Displaying a large frame shows a truncated preview (first/last rows, middle columns elided).
movies_df

Unnamed: 0,color,director_name,num_critic_for_reviews,duration,director_facebook_likes,actor_3_facebook_likes,actor_2_name,actor_1_facebook_likes,gross,genres,...,num_user_for_reviews,language,country,content_rating,budget,title_year,actor_2_facebook_likes,imdb_score,aspect_ratio,movie_facebook_likes
0,Color,James Cameron,723.0,178.0,0.0,855.0,Joel David Moore,1000.0,760505847.0,Action|Adventure|Fantasy|Sci-Fi,...,3054.0,English,USA,PG-13,237000000.0,2009.0,936.0,7.9,1.78,33000
1,Color,Gore Verbinski,302.0,169.0,563.0,1000.0,Orlando Bloom,40000.0,309404152.0,Action|Adventure|Fantasy,...,1238.0,English,USA,PG-13,300000000.0,2007.0,5000.0,7.1,2.35,0
2,Color,Sam Mendes,602.0,148.0,0.0,161.0,Rory Kinnear,11000.0,200074175.0,Action|Adventure|Thriller,...,994.0,English,UK,PG-13,245000000.0,2015.0,393.0,6.8,2.35,85000
3,Color,Christopher Nolan,813.0,164.0,22000.0,23000.0,Christian Bale,27000.0,448130642.0,Action|Thriller,...,2701.0,English,USA,PG-13,250000000.0,2012.0,23000.0,8.5,2.35,164000
4,,Doug Walker,,,131.0,,Rob Walker,131.0,,Documentary,...,,,,,,,12.0,7.1,,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4911,Color,Scott Smith,1.0,87.0,2.0,318.0,Daphne Zuniga,637.0,,Comedy|Drama,...,6.0,English,Canada,,,2013.0,470.0,7.7,,84
4912,Color,,43.0,43.0,,319.0,Valorie Curry,841.0,,Crime|Drama|Mystery|Thriller,...,359.0,English,USA,TV-14,,,593.0,7.5,16.00,32000
4913,Color,Benjamin Roberds,13.0,76.0,0.0,0.0,Maxwell Moody,0.0,,Drama|Horror|Thriller,...,3.0,English,USA,,1400.0,2013.0,0.0,6.3,,16
4914,Color,Daniel Hsia,14.0,100.0,0.0,489.0,Daniel Henney,946.0,10443.0,Comedy|Drama|Romance,...,9.0,English,USA,PG-13,,2012.0,719.0,6.3,2.35,660


In [18]:
# head: returns the first rows of the DataFrame as a quick preview.
# The number passed in the parentheses is how many rows to return (here 10).

movies_df.head(10)

Unnamed: 0,color,director_name,num_critic_for_reviews,duration,director_facebook_likes,actor_3_facebook_likes,actor_2_name,actor_1_facebook_likes,gross,genres,...,num_user_for_reviews,language,country,content_rating,budget,title_year,actor_2_facebook_likes,imdb_score,aspect_ratio,movie_facebook_likes
0,Color,James Cameron,723.0,178.0,0.0,855.0,Joel David Moore,1000.0,760505847.0,Action|Adventure|Fantasy|Sci-Fi,...,3054.0,English,USA,PG-13,237000000.0,2009.0,936.0,7.9,1.78,33000
1,Color,Gore Verbinski,302.0,169.0,563.0,1000.0,Orlando Bloom,40000.0,309404152.0,Action|Adventure|Fantasy,...,1238.0,English,USA,PG-13,300000000.0,2007.0,5000.0,7.1,2.35,0
2,Color,Sam Mendes,602.0,148.0,0.0,161.0,Rory Kinnear,11000.0,200074175.0,Action|Adventure|Thriller,...,994.0,English,UK,PG-13,245000000.0,2015.0,393.0,6.8,2.35,85000
3,Color,Christopher Nolan,813.0,164.0,22000.0,23000.0,Christian Bale,27000.0,448130642.0,Action|Thriller,...,2701.0,English,USA,PG-13,250000000.0,2012.0,23000.0,8.5,2.35,164000
4,,Doug Walker,,,131.0,,Rob Walker,131.0,,Documentary,...,,,,,,,12.0,7.1,,0


In [None]:
# sample (gets randomly chosen rows; random_state fixes the draw so re-runs show the same rows)

movies_df.sample(5, random_state=42)


In [19]:
# shape: a (number of rows, number of columns) tuple

movies_df.shape

(4916, 28)

In [20]:
# columns: the column labels of the DataFrame, returned as an Index
movies_df.columns

Index(['color', 'director_name', 'num_critic_for_reviews', 'duration',
       'director_facebook_likes', 'actor_3_facebook_likes', 'actor_2_name',
       'actor_1_facebook_likes', 'gross', 'genres', 'actor_1_name',
       'movie_title', 'num_voted_users', 'cast_total_facebook_likes',
       'actor_3_name', 'facenumber_in_poster', 'plot_keywords',
       'movie_imdb_link', 'num_user_for_reviews', 'language', 'country',
       'content_rating', 'budget', 'title_year', 'actor_2_facebook_likes',
       'imdb_score', 'aspect_ratio', 'movie_facebook_likes'],
      dtype='object')

## Selecting a column

In [23]:
# Two equivalent ways to pull one column out of a DataFrame as a Series.
# Only the last expression of a cell is displayed, so the result is shown once at the end.

# index operator ([] notation) -- works for any column name
directors = movies_df["director_name"]

# attribute access (dot notation) -- same Series; requires the column name to be a valid
# Python identifier that does not clash with an existing DataFrame attribute or method
directors = movies_df.director_name
directors


0           James Cameron
1          Gore Verbinski
2              Sam Mendes
3       Christopher Nolan
4             Doug Walker
              ...        
4911          Scott Smith
4912                  NaN
4913     Benjamin Roberds
4914          Daniel Hsia
4915             Jon Gunn
Name: director_name, Length: 4916, dtype: object

## Selecting rows and columns

In [45]:
# select row with index 0 (.loc looks rows up by index label and returns the row as a Series)

movies_df.loc[0]

color                                                                    Color
director_name                                                    James Cameron
num_critic_for_reviews                                                     723
duration                                                                   178
director_facebook_likes                                                      0
actor_3_facebook_likes                                                     855
actor_2_name                                                  Joel David Moore
actor_1_facebook_likes                                                    1000
gross                                                              7.60506e+08
genres                                         Action|Adventure|Fantasy|Sci-Fi
actor_1_name                                                       CCH Pounder
movie_title                                                             Avatar
num_voted_users                                     

In [46]:
# select row at location 3 (.iloc looks rows up by integer position, counting from 0)

movies_df.iloc[3]

color                                                                    Color
director_name                                                Christopher Nolan
num_critic_for_reviews                                                     813
duration                                                                   164
director_facebook_likes                                                  22000
actor_3_facebook_likes                                                   23000
actor_2_name                                                    Christian Bale
actor_1_facebook_likes                                                   27000
gross                                                              4.48131e+08
genres                                                         Action|Thriller
actor_1_name                                                         Tom Hardy
movie_title                                              The Dark Knight Rises
num_voted_users                                     

In [47]:
# select a single value: row with index label 0, column 'director_name'

movies_df.loc[0, 'director_name']

'James Cameron'

In [48]:
# select a single value: row with index label 0, column 'movie_title'

movies_df.loc[0, 'movie_title']

'Avatar'

In [49]:
# select all rows and column director name (a bare ':' slice means "every row")

movies_df.loc[:, 'director_name']

0           James Cameron
1          Gore Verbinski
2              Sam Mendes
3       Christopher Nolan
4             Doug Walker
              ...        
4911          Scott Smith
4912                  NaN
4913     Benjamin Roberds
4914          Daniel Hsia
4915             Jon Gunn
Name: director_name, Length: 4916, dtype: object

### `.value_counts()`

In [24]:
# get a count of how many times each director appears, most frequent first
directors.value_counts()


Steven Spielberg    26
Woody Allen         22
Martin Scorsese     20
Clint Eastwood      20
Ridley Scott        16
                    ..
John Putch           1
Luca Guadagnino      1
Sam Fell             1
Dan Fogelman         1
Daniel Hsia          1
Name: director_name, Length: 2397, dtype: int64

In [25]:
# size: total number of elements in the Series
directors.size

4916

In [26]:
# shape: for a Series this is a one-element tuple, (number of elements,)
directors.shape

(4916,)

In [59]:
# len(): number of rows; for a Series this matches .size
len(directors)

4916

How many distinct directors are in the dataset?

In [62]:
# unique(): array of the distinct values in the column; len() counts them.
# Note: the missing value (NaN) is included as one distinct value, which is why this is
# one more than the number of entries reported by value_counts().
len(directors.unique())

2398