# 1. Foundations

## 1.1 DataFrame anatomy

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [2]:
# Location of the movie dataset CSV.
# NOTE(review): this is an absolute, machine-specific path, so the notebook only
# runs as-is on the original author's machine; point MOVIE_CSV_PATH at your own
# copy of data/movie.csv (ideally a path relative to the notebook) to re-run it.
MOVIE_CSV_PATH = "C:/Users/Divya/OneDrive/Desktop/Study/Pandas-Cookbook-master/data/movie.csv"
movie = pd.read_csv(MOVIE_CSV_PATH)

In [3]:
movie.head()

Unnamed: 0,color,director_name,num_critic_for_reviews,duration,director_facebook_likes,actor_3_facebook_likes,actor_2_name,actor_1_facebook_likes,gross,genres,...,num_user_for_reviews,language,country,content_rating,budget,title_year,actor_2_facebook_likes,imdb_score,aspect_ratio,movie_facebook_likes
0,Color,James Cameron,723.0,178.0,0.0,855.0,Joel David Moore,1000.0,760505847.0,Action|Adventure|Fantasy|Sci-Fi,...,3054.0,English,USA,PG-13,237000000.0,2009.0,936.0,7.9,1.78,33000
1,Color,Gore Verbinski,302.0,169.0,563.0,1000.0,Orlando Bloom,40000.0,309404152.0,Action|Adventure|Fantasy,...,1238.0,English,USA,PG-13,300000000.0,2007.0,5000.0,7.1,2.35,0
2,Color,Sam Mendes,602.0,148.0,0.0,161.0,Rory Kinnear,11000.0,200074175.0,Action|Adventure|Thriller,...,994.0,English,UK,PG-13,245000000.0,2015.0,393.0,6.8,2.35,85000
3,Color,Christopher Nolan,813.0,164.0,22000.0,23000.0,Christian Bale,27000.0,448130642.0,Action|Thriller,...,2701.0,English,USA,PG-13,250000000.0,2012.0,23000.0,8.5,2.35,164000
4,,Doug Walker,,,131.0,,Rob Walker,131.0,,Documentary,...,,,,,,,12.0,7.1,,0


## 1.2 Use the dataframe attributes index, columns and values to assign the index, columns and data to their own variables

In [4]:
# Pull the three components of the DataFrame into their own variables:
# the row labels, the column labels, and the underlying values.
index, columns, data = movie.index, movie.columns, movie.values

## Display each component's values

In [5]:
index

RangeIndex(start=0, stop=4916, step=1)

In [6]:
columns

Index(['color', 'director_name', 'num_critic_for_reviews', 'duration',
       'director_facebook_likes', 'actor_3_facebook_likes', 'actor_2_name',
       'actor_1_facebook_likes', 'gross', 'genres', 'actor_1_name',
       'movie_title', 'num_voted_users', 'cast_total_facebook_likes',
       'actor_3_name', 'facenumber_in_poster', 'plot_keywords',
       'movie_imdb_link', 'num_user_for_reviews', 'language', 'country',
       'content_rating', 'budget', 'title_year', 'actor_2_facebook_likes',
       'imdb_score', 'aspect_ratio', 'movie_facebook_likes'],
      dtype='object')

In [7]:
data

array([['Color', 'James Cameron', 723.0, ..., 7.9, 1.78, 33000],
       ['Color', 'Gore Verbinski', 302.0, ..., 7.1, 2.35, 0],
       ['Color', 'Sam Mendes', 602.0, ..., 6.8, 2.35, 85000],
       ...,
       ['Color', 'Benjamin Roberds', 13.0, ..., 6.3, nan, 16],
       ['Color', 'Daniel Hsia', 14.0, ..., 6.3, 2.35, 660],
       ['Color', 'Jon Gunn', 43.0, ..., 6.6, 1.85, 456]], dtype=object)

## Output the type of each DataFrame component

In [8]:
type(index)

pandas.core.indexes.range.RangeIndex

In [9]:
type(columns)

pandas.core.indexes.base.Index

In [10]:
type(data)

numpy.ndarray

## The types of `index` and `columns` are closely related: `RangeIndex` is a subclass of `Index`. We can confirm this with `issubclass`, which returns True

In [11]:
issubclass(pd.RangeIndex, pd.Index)

True

## 1.3 check the datatype of DataFrame

In [12]:
movie.dtypes

color                         object
director_name                 object
num_critic_for_reviews       float64
duration                     float64
director_facebook_likes      float64
actor_3_facebook_likes       float64
actor_2_name                  object
actor_1_facebook_likes       float64
gross                        float64
genres                        object
actor_1_name                  object
movie_title                   object
num_voted_users                int64
cast_total_facebook_likes      int64
actor_3_name                  object
facenumber_in_poster         float64
plot_keywords                 object
movie_imdb_link               object
num_user_for_reviews         float64
language                      object
country                       object
content_rating                object
budget                       float64
title_year                   float64
actor_2_facebook_likes       float64
imdb_score                   float64
aspect_ratio                 float64
movie_facebook_likes           int64
dtype: object

## get the count of each data type in a dataframe

In [13]:
movie.dtypes.value_counts()

float64    13
object     12
int64       3
dtype: int64

## 1.4 Selecting a single column of data as a Series

In [14]:
movie['title_year']

0       2009.0
1       2007.0
2       2015.0
3       2012.0
4          NaN
         ...  
4911    2013.0
4912       NaN
4913    2013.0
4914    2012.0
4915    2004.0
Name: title_year, Length: 4916, dtype: float64

The attribute (dot) access below is error-prone: it does not work when the column name collides with a pandas method or attribute name (e.g. `count`) or when the column name contains spaces. Prefer the bracket notation used above.

In [15]:
movie.title_year

0       2009.0
1       2007.0
2       2015.0
3       2012.0
4          NaN
         ...  
4911    2013.0
4912       NaN
4913    2013.0
4914    2012.0
4915    2004.0
Name: title_year, Length: 4916, dtype: float64

 Verify that the output is the series

In [16]:
type(movie['title_year'])

pandas.core.series.Series

In [17]:
# NOTE(review): the variable is called `title`, but it holds the 'color' column
# (its .name is 'color'). The name is left as-is because the next cell uses it.
title = movie['color']
title.name

'color'

#### Convert the `title` Series (which holds the `color` column) into a one-column DataFrame

In [18]:
title.to_frame()

Unnamed: 0,color
0,Color
1,Color
2,Color
3,Color
4,
...,...
4911,Color
4912,Color
4913,Color
4914,Color


## 1.5 CALLING SERIES METHODS

Check the number of attributes and methods of a pandas Series

In [19]:
# Gather every attribute and method name exposed by the Series class into a set,
# then display how many there are.
s_attr_methods = {*dir(pd.Series)}
len(s_attr_methods)

422

Check the number of attributes and methods of a pandas DataFrame

In [20]:
# Gather every attribute and method name exposed by the DataFrame class into a set,
# then display how many there are.
df_attr_methods = {*dir(pd.DataFrame)}
len(df_attr_methods)

437

Check the number of attributes and methods that pandas Series and DataFrame have in common

In [21]:
# Size of the set intersection: names available on both Series and DataFrame.
len(s_attr_methods.intersection(df_attr_methods))

367

### 1. Select 2 series with different data types director_name and actor_1_facebook_likes

In [22]:
director = movie['director_name']
actor_1_fb_likes = movie['actor_1_facebook_likes']

### 2. Inspect head of each series

In [23]:
director.head()

0        James Cameron
1       Gore Verbinski
2           Sam Mendes
3    Christopher Nolan
4          Doug Walker
Name: director_name, dtype: object

In [24]:
actor_1_fb_likes.head()

0     1000.0
1    40000.0
2    11000.0
3    27000.0
4      131.0
Name: actor_1_facebook_likes, dtype: float64

### 3. Count all the occurrences of each unique value in a series, most useful method for object data type is value_counts()

In [25]:
director.value_counts()

Steven Spielberg    26
Woody Allen         22
Martin Scorsese     20
Clint Eastwood      20
Ridley Scott        16
                    ..
John Putch           1
Luca Guadagnino      1
Sam Fell             1
Dan Fogelman         1
Daniel Hsia          1
Name: director_name, Length: 2397, dtype: int64

### 4. The value_counts method on a numeric series can occasionally provide useful insights too

In [26]:
actor_1_fb_likes.value_counts()

1000.0     436
11000.0    206
2000.0     189
3000.0     150
12000.0    131
          ... 
703.0        1
208.0        1
79.0         1
269.0        1
291.0        1
Name: actor_1_facebook_likes, Length: 877, dtype: int64

### 5. Counting the number of elements in a series

In [27]:
director.size

4916

In [28]:
director.shape

(4916,)

In [29]:
len(director)

4916

In [30]:
director.size == len(director)

True

### 6. Count the number of non missing value with count method

In [31]:
director.count()

4814

In [32]:
actor_1_fb_likes.count()

4909

### 7. Basic summary statistics with min, max, mean, median, std and sum methods

In [33]:
# Basic summary statistics displayed together as one tuple, in the order
# (min, max, mean, median, std, sum).
(
    actor_1_fb_likes.min(),
    actor_1_fb_likes.max(),
    actor_1_fb_likes.mean(),
    actor_1_fb_likes.median(),
    actor_1_fb_likes.std(),
    actor_1_fb_likes.sum(),
)

(0.0, 640000.0, 6494.488490527602, 982.0, 15106.986883848309, 31881444.0)

### 8. summary statistics with describe method

In [34]:
actor_1_fb_likes.describe()

count      4909.000000
mean       6494.488491
std       15106.986884
min           0.000000
25%         607.000000
50%         982.000000
75%       11000.000000
max      640000.000000
Name: actor_1_facebook_likes, dtype: float64

 when describe method used in object data type column

In [35]:
director.describe()

count                 4814
unique                2397
top       Steven Spielberg
freq                    26
Name: director_name, dtype: object

### 9. Calculate Quantile

In [36]:
actor_1_fb_likes.quantile(0.2)

510.0

In [37]:
actor_1_fb_likes.quantile([0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9])

0.1      240.0
0.2      510.0
0.3      694.0
0.4      854.0
0.5      982.0
0.6     1000.0
0.7     8000.0
0.8    13000.0
0.9    18000.0
Name: actor_1_facebook_likes, dtype: float64

In [38]:
actor_1_fb_likes.sum()

31881444.0

### 10. check individual value if it null or not with .isnull() method

In [39]:
director.isnull()

0       False
1       False
2       False
3       False
4       False
        ...  
4911    False
4912     True
4913    False
4914    False
4915    False
Name: director_name, Length: 4916, dtype: bool

In [40]:
actor_1_fb_likes.isnull()

0       False
1       False
2       False
3       False
4       False
        ...  
4911    False
4912    False
4913    False
4914    False
4915    False
Name: actor_1_facebook_likes, Length: 4916, dtype: bool

In [41]:
actor_1_fb_likes.count() #there are 7 null values in it (4916 rows - 4909 non-null values)

4909

### 11. Replace missing values with 0 by passing 0 as the argument to the .fillna() method

In [42]:
actor_1_fb_likes_filled = actor_1_fb_likes.fillna(0)

In [43]:
actor_1_fb_likes_filled.count() #notice that there are 0 null values now

4916

### 12. remove missing values with .dropna() method

In [44]:
actor_1_fb_likes_dropped = actor_1_fb_likes.dropna()

In [45]:
actor_1_fb_likes_dropped.size

4909

In [46]:
actor_1_fb_likes_dropped.count()

4909

### 13. value_counts() returns a series of counts; with the normalize parameter set to True it returns relative frequencies instead

In [47]:
director.value_counts(normalize=True)

Steven Spielberg    0.005401
Woody Allen         0.004570
Martin Scorsese     0.004155
Clint Eastwood      0.004155
Ridley Scott        0.003324
                      ...   
John Putch          0.000208
Luca Guadagnino     0.000208
Sam Fell            0.000208
Dan Fogelman        0.000208
Daniel Hsia         0.000208
Name: director_name, Length: 2397, dtype: float64

In [48]:
director.value_counts()

Steven Spielberg    26
Woody Allen         22
Martin Scorsese     20
Clint Eastwood      20
Ridley Scott        16
                    ..
John Putch           1
Luca Guadagnino      1
Sam Fell             1
Dan Fogelman         1
Daniel Hsia          1
Name: director_name, Length: 2397, dtype: int64

In [49]:
26/4814, 22/4814 #each value's count divided by the number of non-null values (4814) gives its relative frequency

(0.00540091400083091, 0.004570004154549232)

In [50]:
director.count()

4814

####  we determined that there were missing values in the Series by observing that the result from the count method did not match the size attribute

we can also check with .hasnans attribute, it will check the series and returns True if it contains missing values and False if it has no null values

In [51]:
director.hasnans

True

complement of .isnull() is .notnull(), it will return True if individual value is not null and False if it is null

In [52]:
director.notnull()

0        True
1        True
2        True
3        True
4        True
        ...  
4911     True
4912    False
4913     True
4914     True
4915     True
Name: director_name, Length: 4916, dtype: bool

In [53]:
director.isnull()

0       False
1       False
2       False
3       False
4       False
        ...  
4911    False
4912     True
4913    False
4914    False
4915    False
Name: director_name, Length: 4916, dtype: bool

## 1.6 working with operators in a series

In [54]:
5 in [1, 2, 5]

True

In [55]:
7 in [1, 2, 5]

False

In [56]:
set([1, 2, 4]) & set([2, 4, 5])

{2, 4}

Below code produces error

In [57]:
# Python lists do not support element-wise arithmetic, so subtracting an int
# from a list raises TypeError. The exception is the point of this cell, so it
# is caught and displayed rather than left to abort "Restart Kernel -> Run All".
try:
    [1, 2, 3] - 1
except TypeError as err:
    print(f"TypeError: {err}")

TypeError: unsupported operand type(s) for -: 'list' and 'int'

In [58]:
# Sets cannot be indexed by position, so a[0] raises TypeError. The exception is
# the point of this cell, so it is caught and displayed rather than left to
# abort "Restart Kernel -> Run All"; `a` stays defined for the cell that follows.
a = set([1, 2, 3])
try:
    a[0]
except TypeError as err:
    print(f"TypeError: {err}")

TypeError: 'set' object is not subscriptable

In [59]:
a

{1, 2, 3}

### 1. Select imdb_score column as series

In [60]:
imdb_score = movie["imdb_score"]
imdb_score

0       7.9
1       7.1
2       6.8
3       8.5
4       7.1
       ... 
4911    7.7
4912    7.5
4913    6.3
4914    6.3
4915    6.6
Name: imdb_score, Length: 4916, dtype: float64

### 2. Add integer 1 to each series element

In [61]:
imdb_score + 1

0       8.9
1       8.1
2       7.8
3       9.5
4       8.1
       ... 
4911    8.7
4912    8.5
4913    7.3
4914    7.3
4915    7.6
Name: imdb_score, Length: 4916, dtype: float64

In [62]:
imdb_score.add(1)

0       8.9
1       8.1
2       7.8
3       9.5
4       8.1
       ... 
4911    8.7
4912    8.5
4913    7.3
4914    7.3
4915    7.6
Name: imdb_score, Length: 4916, dtype: float64

### 3. multiply the series by 2.5

In [63]:
imdb_score * 2.5

0       19.75
1       17.75
2       17.00
3       21.25
4       17.75
        ...  
4911    19.25
4912    18.75
4913    15.75
4914    15.75
4915    16.50
Name: imdb_score, Length: 4916, dtype: float64

In [64]:
imdb_score.mul(2.5)

0       19.75
1       17.75
2       17.00
3       21.25
4       17.75
        ...  
4911    19.25
4912    18.75
4913    15.75
4914    15.75
4915    16.50
Name: imdb_score, Length: 4916, dtype: float64

### 4. floor division with //

In [65]:
imdb_score // 7

0       1.0
1       1.0
2       0.0
3       1.0
4       1.0
       ... 
4911    1.0
4912    1.0
4913    0.0
4914    0.0
4915    0.0
Name: imdb_score, Length: 4916, dtype: float64

In [66]:
imdb_score.floordiv(7)

0       1.0
1       1.0
2       0.0
3       1.0
4       1.0
       ... 
4911    1.0
4912    1.0
4913    0.0
4914    0.0
4915    0.0
Name: imdb_score, Length: 4916, dtype: float64

### 5. using comparison operator on a series

In [67]:
imdb_score > 7

0        True
1        True
2       False
3        True
4        True
        ...  
4911     True
4912     True
4913    False
4914    False
4915    False
Name: imdb_score, Length: 4916, dtype: bool

The equivalent method form is `imdb_score.gt(7)`.

In [68]:
director == 'James Cameron'

0        True
1       False
2       False
3       False
4       False
        ...  
4911    False
4912    False
4913    False
4914    False
4915    False
Name: director_name, Length: 4916, dtype: bool

In [69]:
director.eq('James Cameron')

0        True
1       False
2       False
3       False
4       False
        ...  
4911    False
4912    False
4913    False
4914    False
4915    False
Name: director_name, Length: 4916, dtype: bool

| Operator group | Operator | Series method name |
| --- | --- | --- |
| Arithmetic | `+`, `-`, `*`, `/`, `//`, `%`, `**` | `add`, `sub`, `mul`, `div`, `floordiv`, `mod`, `pow` |
| Comparison | `<`, `>`, `<=`, `>=`, `==`, `!=` | `lt`, `gt`, `le`, `ge`, `eq`, `ne` |

## 1.7 Chaining Series methods together