In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

## Dissecting the anatomy of a DataFrame

In [3]:
# .read_csv() est l'equivalent pandas de .csv() 
movie = pd.read_csv('master/data/movie.csv')
movie.head()

Unnamed: 0,color,director_name,num_critic_for_reviews,duration,director_facebook_likes,actor_3_facebook_likes,actor_2_name,actor_1_facebook_likes,gross,genres,...,num_user_for_reviews,language,country,content_rating,budget,title_year,actor_2_facebook_likes,imdb_score,aspect_ratio,movie_facebook_likes
0,Color,James Cameron,723.0,178.0,0.0,855.0,Joel David Moore,1000.0,760505847.0,Action|Adventure|Fantasy|Sci-Fi,...,3054.0,English,USA,PG-13,237000000.0,2009.0,936.0,7.9,1.78,33000
1,Color,Gore Verbinski,302.0,169.0,563.0,1000.0,Orlando Bloom,40000.0,309404152.0,Action|Adventure|Fantasy,...,1238.0,English,USA,PG-13,300000000.0,2007.0,5000.0,7.1,2.35,0
2,Color,Sam Mendes,602.0,148.0,0.0,161.0,Rory Kinnear,11000.0,200074175.0,Action|Adventure|Thriller,...,994.0,English,UK,PG-13,245000000.0,2015.0,393.0,6.8,2.35,85000
3,Color,Christopher Nolan,813.0,164.0,22000.0,23000.0,Christian Bale,27000.0,448130642.0,Action|Thriller,...,2701.0,English,USA,PG-13,250000000.0,2012.0,23000.0,8.5,2.35,164000
4,,Doug Walker,,,131.0,,Rob Walker,131.0,,Documentary,...,,,,,,,12.0,7.1,,0


## Accessing the main DataFrame components

In [4]:
index = movie.index
columns = movie.columns
data = movie.values

In [5]:
index

RangeIndex(start=0, stop=4916, step=1)

In [6]:
columns

Index(['color', 'director_name', 'num_critic_for_reviews', 'duration',
       'director_facebook_likes', 'actor_3_facebook_likes', 'actor_2_name',
       'actor_1_facebook_likes', 'gross', 'genres', 'actor_1_name',
       'movie_title', 'num_voted_users', 'cast_total_facebook_likes',
       'actor_3_name', 'facenumber_in_poster', 'plot_keywords',
       'movie_imdb_link', 'num_user_for_reviews', 'language', 'country',
       'content_rating', 'budget', 'title_year', 'actor_2_facebook_likes',
       'imdb_score', 'aspect_ratio', 'movie_facebook_likes'],
      dtype='object')

In [7]:
data

array([['Color', 'James Cameron', 723.0, ..., 7.9, 1.78, 33000],
       ['Color', 'Gore Verbinski', 302.0, ..., 7.1, 2.35, 0],
       ['Color', 'Sam Mendes', 602.0, ..., 6.8, 2.35, 85000],
       ...,
       ['Color', 'Benjamin Roberds', 13.0, ..., 6.3, nan, 16],
       ['Color', 'Daniel Hsia', 14.0, ..., 6.3, 2.35, 660],
       ['Color', 'Jon Gunn', 43.0, ..., 6.6, 1.85, 456]], dtype=object)

In [8]:
type(index)

pandas.core.indexes.range.RangeIndex

In [9]:
type(columns)

pandas.core.indexes.base.Index

In [10]:
type(data)

numpy.ndarray

In [11]:
issubclass(pd.RangeIndex, pd.Index)

True

In [12]:
# Les index pandas sont implémentés en utilisant des hash tables pour etre super rapides.

In [13]:
# Sous index, columns et data il y a des numpy array qui est l'objet de base à partir duquel sont formés
# de nombreux objets pandas, comme nous pouvons le constater:
index.values

array([   0,    1,    2, ..., 4913, 4914, 4915], dtype=int64)

In [14]:
columns.values

array(['color', 'director_name', 'num_critic_for_reviews', 'duration',
       'director_facebook_likes', 'actor_3_facebook_likes',
       'actor_2_name', 'actor_1_facebook_likes', 'gross', 'genres',
       'actor_1_name', 'movie_title', 'num_voted_users',
       'cast_total_facebook_likes', 'actor_3_name',
       'facenumber_in_poster', 'plot_keywords', 'movie_imdb_link',
       'num_user_for_reviews', 'language', 'country', 'content_rating',
       'budget', 'title_year', 'actor_2_facebook_likes', 'imdb_score',
       'aspect_ratio', 'movie_facebook_likes'], dtype=object)

 ## Understanding data types

In [15]:
# Affichons le type de données de chaque colonne
movie.dtypes
# apparement le dtype à la fin nous donne le type des éléments de la serie retournée.
# ici comme la serie est hétérogène(object, float64, int64), le type de la serie est object

color                         object
director_name                 object
num_critic_for_reviews       float64
duration                     float64
director_facebook_likes      float64
actor_3_facebook_likes       float64
actor_2_name                  object
actor_1_facebook_likes       float64
gross                        float64
genres                        object
actor_1_name                  object
movie_title                   object
num_voted_users                int64
cast_total_facebook_likes      int64
actor_3_name                  object
facenumber_in_poster         float64
plot_keywords                 object
movie_imdb_link               object
num_user_for_reviews         float64
language                      object
country                       object
content_rating                object
budget                       float64
title_year                   float64
actor_2_facebook_likes       float64
imdb_score                   float64
aspect_ratio                 float64
m

In [16]:
movie.dtypes.index

Index(['color', 'director_name', 'num_critic_for_reviews', 'duration',
       'director_facebook_likes', 'actor_3_facebook_likes', 'actor_2_name',
       'actor_1_facebook_likes', 'gross', 'genres', 'actor_1_name',
       'movie_title', 'num_voted_users', 'cast_total_facebook_likes',
       'actor_3_name', 'facenumber_in_poster', 'plot_keywords',
       'movie_imdb_link', 'num_user_for_reviews', 'language', 'country',
       'content_rating', 'budget', 'title_year', 'actor_2_facebook_likes',
       'imdb_score', 'aspect_ratio', 'movie_facebook_likes'],
      dtype='object')

In [17]:
movie.dtypes.values

array([dtype('O'), dtype('O'), dtype('float64'), dtype('float64'),
       dtype('float64'), dtype('float64'), dtype('O'), dtype('float64'),
       dtype('float64'), dtype('O'), dtype('O'), dtype('O'),
       dtype('int64'), dtype('int64'), dtype('O'), dtype('float64'),
       dtype('O'), dtype('O'), dtype('float64'), dtype('O'), dtype('O'),
       dtype('O'), dtype('float64'), dtype('float64'), dtype('float64'),
       dtype('float64'), dtype('float64'), dtype('int64')], dtype=object)

In [18]:
movie.get_dtype_counts()
# ici la serie n'est composée que d'int64

float64    13
int64       3
object     12
dtype: int64

## Selecting a single column of data as a Series

In [19]:
movie['director_name']

0            James Cameron
1           Gore Verbinski
2               Sam Mendes
3        Christopher Nolan
4              Doug Walker
5           Andrew Stanton
6                Sam Raimi
7             Nathan Greno
8              Joss Whedon
9              David Yates
10             Zack Snyder
11            Bryan Singer
12            Marc Forster
13          Gore Verbinski
14          Gore Verbinski
15             Zack Snyder
16          Andrew Adamson
17             Joss Whedon
18            Rob Marshall
19        Barry Sonnenfeld
20           Peter Jackson
21               Marc Webb
22            Ridley Scott
23           Peter Jackson
24             Chris Weitz
25           Peter Jackson
26           James Cameron
27           Anthony Russo
28              Peter Berg
29         Colin Trevorrow
               ...        
4886            Eric Eason
4887              Uwe Boll
4888     Richard Linklater
4889       Joseph Mazzella
4890          Travis Legge
4891         Alex Kendrick
4

In [20]:
# or:
movie.director_name

0            James Cameron
1           Gore Verbinski
2               Sam Mendes
3        Christopher Nolan
4              Doug Walker
5           Andrew Stanton
6                Sam Raimi
7             Nathan Greno
8              Joss Whedon
9              David Yates
10             Zack Snyder
11            Bryan Singer
12            Marc Forster
13          Gore Verbinski
14          Gore Verbinski
15             Zack Snyder
16          Andrew Adamson
17             Joss Whedon
18            Rob Marshall
19        Barry Sonnenfeld
20           Peter Jackson
21               Marc Webb
22            Ridley Scott
23           Peter Jackson
24             Chris Weitz
25           Peter Jackson
26           James Cameron
27           Anthony Russo
28              Peter Berg
29         Colin Trevorrow
               ...        
4886            Eric Eason
4887              Uwe Boll
4888     Richard Linklater
4889       Joseph Mazzella
4890          Travis Legge
4891         Alex Kendrick
4

In [21]:
type(movie['director_name'])

pandas.core.series.Series

In [22]:
# notice that the old column name is now the name of the Series and has actually become an attribute:
director = movie['director_name']
director.name

'director_name'

In [23]:
# Turn the Series into a one-column DF.
# Here the method return a DF (which is not assigned to anything here) but director stays a Series
director.to_frame().head()

Unnamed: 0,director_name
0,James Cameron
1,Gore Verbinski
2,Sam Mendes
3,Christopher Nolan
4,Doug Walker


## Calling Series methods

In [24]:
# dir function list all the attributes and methods of a class

# NOTE ABOUT SETs: a set is a collection of items not in any particular order.No duplicates.
# The elements in the set are immutable(cannot be modified) but the set as a whole is mutable.
# There is no index attached to any element in a python set. So they do not support any indexing or slicing operation.
# We cannot access individual values in a set. We can only access all the elements together.
# But we can also get a list of individual elements by looping through the set.

s_attr_methods = set(dir(pd.Series))
len(s_attr_methods)

464

In [25]:
type(s_attr_methods)

set

In [26]:
df_attr_methods = set(dir(pd.DataFrame))
len(df_attr_methods)

460

In [27]:
len(s_attr_methods & df_attr_methods)

399

In [28]:
actor_1_fb_likes = movie['actor_1_facebook_likes']

In [29]:
director.head()

0        James Cameron
1       Gore Verbinski
2           Sam Mendes
3    Christopher Nolan
4          Doug Walker
Name: director_name, dtype: object

In [30]:
actor_1_fb_likes.head()

0     1000.0
1    40000.0
2    11000.0
3    27000.0
4      131.0
Name: actor_1_facebook_likes, dtype: float64

In [31]:
director.value_counts()
# To print the full Serie:
# print(director.value_counts().to_string())

Steven Spielberg        26
Woody Allen             22
Clint Eastwood          20
Martin Scorsese         20
Ridley Scott            16
Spike Lee               16
Steven Soderbergh       15
Renny Harlin            15
Tim Burton              14
Oliver Stone            14
Robert Zemeckis         13
Ron Howard              13
Barry Levinson          13
Joel Schumacher         13
Robert Rodriguez        13
Tony Scott              12
Michael Bay             12
Brian De Palma          12
Kevin Smith             12
Rob Reiner              11
Richard Linklater       11
Sam Raimi               11
Chris Columbus          11
Shawn Levy              11
Richard Donner          11
Francis Ford Coppola    11
John McTiernan          10
Wes Craven              10
John Carpenter          10
Stephen Frears          10
                        ..
Robert Cary              1
Christopher Landon       1
Jody Hill                1
Ben Lewin                1
James Dodson             1
Frank Nissen             1
C

In [32]:
actor_1_fb_likes.value_counts()

1000.0     436
11000.0    206
2000.0     189
3000.0     150
12000.0    131
13000.0    123
14000.0    120
10000.0    109
18000.0    106
22000.0     80
15000.0     71
23000.0     55
16000.0     55
4000.0      54
8000.0      51
17000.0     45
26000.0     39
20000.0     38
40000.0     36
21000.0     34
19000.0     31
5000.0      30
24000.0     29
49000.0     27
0.0         26
29000.0     20
6000.0      20
33000.0     18
826.0       17
34000.0     16
          ... 
458.0        1
77000.0      1
763.0        1
961.0        1
701.0        1
123.0        1
575.0        1
481.0        1
107.0        1
279.0        1
188.0        1
619.0        1
652.0        1
237.0        1
764.0        1
335.0        1
494.0        1
732.0        1
712.0        1
91.0         1
437.0        1
406.0        1
762.0        1
432.0        1
644.0        1
362.0        1
216.0        1
859.0        1
225.0        1
334.0        1
Name: actor_1_facebook_likes, Length: 877, dtype: int64

In [33]:
director.size

4916

In [34]:
director.shape

(4916,)

In [35]:
len(director)

4916

In [36]:
director.count()

4814

In [37]:
actor_1_fb_likes.count()

4909

In [38]:
# NOTES ABOUT RETURNS DISPLAY:
# Séparer les retours avec des virgules retourne un tuple.
# Revenir à la ligne sans crier gare retournera un autre tuple qui ecrasera le premier.
# Pour eviter cela on termine la ligne par un antislash ce qui indique que le tuple continue
# sur la ligne suivante.

# NOTE ABOUT TUPLES:
# Un tuple est une liste qui ne peut plus être modifiée.

actor_1_fb_likes.min(), actor_1_fb_likes.max(), \
actor_1_fb_likes.mean(), actor_1_fb_likes.median(), \
actor_1_fb_likes.std(), actor_1_fb_likes.sum()

(0.0, 640000.0, 6494.488490527602, 982.0, 15106.986883848309, 31881444.0)

In [39]:
actor_1_fb_likes.describe()

count      4909.000000
mean       6494.488491
std       15106.986884
min           0.000000
25%         607.000000
50%         982.000000
75%       11000.000000
max      640000.000000
Name: actor_1_facebook_likes, dtype: float64

In [40]:
director.describe()

count                 4814
unique                2397
top       Steven Spielberg
freq                    26
Name: director_name, dtype: object

In [41]:
# quantile is flexible and returns a scalar value when passed a single value but returns a Series
# when given a list as parameter.
actor_1_fb_likes.quantile(0.2)

510.0

In [42]:
actor_1_fb_likes.quantile([.1, .2, .3, .4, .5, .6 , .7, .8, .9])

0.1      240.0
0.2      510.0
0.3      694.0
0.4      854.0
0.5      982.0
0.6     1000.0
0.7     8000.0
0.8    13000.0
0.9    18000.0
Name: actor_1_facebook_likes, dtype: float64

In [43]:
director.isnull()
director.isnull().value_counts()

False    4814
True      102
Name: director_name, dtype: int64

In [44]:
actor_1_fb_likes_filled = actor_1_fb_likes.fillna(0)
actor_1_fb_likes_filled.count()

4916

In [45]:
actor_1_fb_likes_dropped = actor_1_fb_likes.dropna()
actor_1_fb_likes_dropped.count()

4909

In [46]:
director.value_counts(normalize=True)

Steven Spielberg        0.005401
Woody Allen             0.004570
Clint Eastwood          0.004155
Martin Scorsese         0.004155
Ridley Scott            0.003324
Spike Lee               0.003324
Steven Soderbergh       0.003116
Renny Harlin            0.003116
Tim Burton              0.002908
Oliver Stone            0.002908
Robert Zemeckis         0.002700
Ron Howard              0.002700
Barry Levinson          0.002700
Joel Schumacher         0.002700
Robert Rodriguez        0.002700
Tony Scott              0.002493
Michael Bay             0.002493
Brian De Palma          0.002493
Kevin Smith             0.002493
Rob Reiner              0.002285
Richard Linklater       0.002285
Sam Raimi               0.002285
Chris Columbus          0.002285
Shawn Levy              0.002285
Richard Donner          0.002285
Francis Ford Coppola    0.002285
John McTiernan          0.002077
Wes Craven              0.002077
John Carpenter          0.002077
Stephen Frears          0.002077
          

In [47]:
director.hasnans

True

In [48]:
director.notnull()

0        True
1        True
2        True
3        True
4        True
5        True
6        True
7        True
8        True
9        True
10       True
11       True
12       True
13       True
14       True
15       True
16       True
17       True
18       True
19       True
20       True
21       True
22       True
23       True
24       True
25       True
26       True
27       True
28       True
29       True
        ...  
4886     True
4887     True
4888     True
4889     True
4890     True
4891     True
4892     True
4893     True
4894     True
4895     True
4896     True
4897     True
4898     True
4899     True
4900     True
4901     True
4902     True
4903     True
4904     True
4905     True
4906     True
4907     True
4908     True
4909     True
4910     True
4911     True
4912    False
4913     True
4914     True
4915     True
Name: director_name, Length: 4916, dtype: bool

## Working with operators on a Series

In [49]:
imdb_score = movie['imdb_score']

In [50]:
imdb_score

0       7.9
1       7.1
2       6.8
3       8.5
4       7.1
5       6.6
6       6.2
7       7.8
8       7.5
9       7.5
10      6.9
11      6.1
12      6.7
13      7.3
14      6.5
15      7.2
16      6.6
17      8.1
18      6.7
19      6.8
20      7.5
21      7.0
22      6.7
23      7.9
24      6.1
25      7.2
26      7.7
27      8.2
28      5.9
29      7.0
       ... 
4886    7.0
4887    6.3
4888    7.1
4889    4.8
4890    3.3
4891    6.9
4892    4.6
4893    3.0
4894    6.6
4895    7.4
4896    6.2
4897    4.0
4898    6.1
4899    6.9
4900    7.5
4901    6.7
4902    7.4
4903    6.1
4904    5.4
4905    6.4
4906    7.0
4907    6.3
4908    6.9
4909    7.8
4910    6.4
4911    7.7
4912    7.5
4913    6.3
4914    6.3
4915    6.6
Name: imdb_score, Length: 4916, dtype: float64

In [51]:
imdb_score + 1

0       8.9
1       8.1
2       7.8
3       9.5
4       8.1
5       7.6
6       7.2
7       8.8
8       8.5
9       8.5
10      7.9
11      7.1
12      7.7
13      8.3
14      7.5
15      8.2
16      7.6
17      9.1
18      7.7
19      7.8
20      8.5
21      8.0
22      7.7
23      8.9
24      7.1
25      8.2
26      8.7
27      9.2
28      6.9
29      8.0
       ... 
4886    8.0
4887    7.3
4888    8.1
4889    5.8
4890    4.3
4891    7.9
4892    5.6
4893    4.0
4894    7.6
4895    8.4
4896    7.2
4897    5.0
4898    7.1
4899    7.9
4900    8.5
4901    7.7
4902    8.4
4903    7.1
4904    6.4
4905    7.4
4906    8.0
4907    7.3
4908    7.9
4909    8.8
4910    7.4
4911    8.7
4912    8.5
4913    7.3
4914    7.3
4915    7.6
Name: imdb_score, Length: 4916, dtype: float64

In [52]:
imdb_score * 2.5

0       19.75
1       17.75
2       17.00
3       21.25
4       17.75
5       16.50
6       15.50
7       19.50
8       18.75
9       18.75
10      17.25
11      15.25
12      16.75
13      18.25
14      16.25
15      18.00
16      16.50
17      20.25
18      16.75
19      17.00
20      18.75
21      17.50
22      16.75
23      19.75
24      15.25
25      18.00
26      19.25
27      20.50
28      14.75
29      17.50
        ...  
4886    17.50
4887    15.75
4888    17.75
4889    12.00
4890     8.25
4891    17.25
4892    11.50
4893     7.50
4894    16.50
4895    18.50
4896    15.50
4897    10.00
4898    15.25
4899    17.25
4900    18.75
4901    16.75
4902    18.50
4903    15.25
4904    13.50
4905    16.00
4906    17.50
4907    15.75
4908    17.25
4909    19.50
4910    16.00
4911    19.25
4912    18.75
4913    15.75
4914    15.75
4915    16.50
Name: imdb_score, Length: 4916, dtype: float64

In [53]:
# // floor division
imdb_score // 7

0       1.0
1       1.0
2       0.0
3       1.0
4       1.0
5       0.0
6       0.0
7       1.0
8       1.0
9       1.0
10      0.0
11      0.0
12      0.0
13      1.0
14      0.0
15      1.0
16      0.0
17      1.0
18      0.0
19      0.0
20      1.0
21      1.0
22      0.0
23      1.0
24      0.0
25      1.0
26      1.0
27      1.0
28      0.0
29      1.0
       ... 
4886    1.0
4887    0.0
4888    1.0
4889    0.0
4890    0.0
4891    0.0
4892    0.0
4893    0.0
4894    0.0
4895    1.0
4896    0.0
4897    0.0
4898    0.0
4899    0.0
4900    1.0
4901    0.0
4902    1.0
4903    0.0
4904    0.0
4905    0.0
4906    1.0
4907    0.0
4908    0.0
4909    1.0
4910    0.0
4911    1.0
4912    1.0
4913    0.0
4914    0.0
4915    0.0
Name: imdb_score, Length: 4916, dtype: float64

In [54]:
imdb_score > 7

0        True
1        True
2       False
3        True
4        True
5       False
6       False
7        True
8        True
9        True
10      False
11      False
12      False
13       True
14      False
15       True
16      False
17       True
18      False
19      False
20       True
21      False
22      False
23       True
24      False
25       True
26       True
27       True
28      False
29      False
        ...  
4886    False
4887    False
4888     True
4889    False
4890    False
4891    False
4892    False
4893    False
4894    False
4895     True
4896    False
4897    False
4898    False
4899    False
4900     True
4901    False
4902     True
4903    False
4904    False
4905    False
4906    False
4907    False
4908    False
4909     True
4910    False
4911     True
4912     True
4913    False
4914    False
4915    False
Name: imdb_score, Length: 4916, dtype: bool

In [55]:
director == 'James Cameron'

0        True
1       False
2       False
3       False
4       False
5       False
6       False
7       False
8       False
9       False
10      False
11      False
12      False
13      False
14      False
15      False
16      False
17      False
18      False
19      False
20      False
21      False
22      False
23      False
24      False
25      False
26       True
27      False
28      False
29      False
        ...  
4886    False
4887    False
4888    False
4889    False
4890    False
4891    False
4892    False
4893    False
4894    False
4895    False
4896    False
4897    False
4898    False
4899    False
4900    False
4901    False
4902    False
4903    False
4904    False
4905    False
4906    False
4907    False
4908    False
4909    False
4910    False
4911    False
4912    False
4913    False
4914    False
4915    False
Name: director_name, Length: 4916, dtype: bool

In [56]:
# Toutes ces opérations auraient recqueries de faire des for-loop dans python.
# Mais numpy et pandas, qui est basé sur numpy, acceptent ses opérations vectorielles directement.

In [57]:
# imdb_score * 2.5 = imdb_score.mul(2.5) = imdb_score.__mul__(2.5)

## Chaining Series methods together

In [58]:
# The backslash line \ is used as a continuation character when an expressions is broken on several lines.
# Alternatively, we can wrap a whole expression in parentheses.

In [59]:
director.value_counts().head()

Steven Spielberg    26
Woody Allen         22
Clint Eastwood      20
Martin Scorsese     20
Ridley Scott        16
Name: director_name, dtype: int64

In [60]:
# common way to count the number of missing values is to chain the sum method after isnull (isnull detects NaN or None)
# Pandas numerically evaluates False/True as 0/1, so the sum method returns the number of missing values.
actor_1_fb_likes.isnull().sum()

7

In [61]:
# Tips to get directly the percentage of missing values
actor_1_fb_likes.isnull().mean()*100

0.14239218877135884

In [62]:
actor_1_fb_likes.dtype

dtype('float64')

In [63]:
actor_1_fb_likes.fillna(0)\
                .astype(int)\
                .head()

0     1000
1    40000
2    11000
3    27000
4      131
Name: actor_1_facebook_likes, dtype: int32

## Making the index meaningful

In [64]:
movie2 = movie.set_index('movie_title')
movie2

Unnamed: 0_level_0,color,director_name,num_critic_for_reviews,duration,director_facebook_likes,actor_3_facebook_likes,actor_2_name,actor_1_facebook_likes,gross,genres,...,num_user_for_reviews,language,country,content_rating,budget,title_year,actor_2_facebook_likes,imdb_score,aspect_ratio,movie_facebook_likes
movie_title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Avatar,Color,James Cameron,723.0,178.0,0.0,855.0,Joel David Moore,1000.0,760505847.0,Action|Adventure|Fantasy|Sci-Fi,...,3054.0,English,USA,PG-13,237000000.0,2009.0,936.0,7.9,1.78,33000
Pirates of the Caribbean: At World's End,Color,Gore Verbinski,302.0,169.0,563.0,1000.0,Orlando Bloom,40000.0,309404152.0,Action|Adventure|Fantasy,...,1238.0,English,USA,PG-13,300000000.0,2007.0,5000.0,7.1,2.35,0
Spectre,Color,Sam Mendes,602.0,148.0,0.0,161.0,Rory Kinnear,11000.0,200074175.0,Action|Adventure|Thriller,...,994.0,English,UK,PG-13,245000000.0,2015.0,393.0,6.8,2.35,85000
The Dark Knight Rises,Color,Christopher Nolan,813.0,164.0,22000.0,23000.0,Christian Bale,27000.0,448130642.0,Action|Thriller,...,2701.0,English,USA,PG-13,250000000.0,2012.0,23000.0,8.5,2.35,164000
Star Wars: Episode VII - The Force Awakens,,Doug Walker,,,131.0,,Rob Walker,131.0,,Documentary,...,,,,,,,12.0,7.1,,0
John Carter,Color,Andrew Stanton,462.0,132.0,475.0,530.0,Samantha Morton,640.0,73058679.0,Action|Adventure|Sci-Fi,...,738.0,English,USA,PG-13,263700000.0,2012.0,632.0,6.6,2.35,24000
Spider-Man 3,Color,Sam Raimi,392.0,156.0,0.0,4000.0,James Franco,24000.0,336530303.0,Action|Adventure|Romance,...,1902.0,English,USA,PG-13,258000000.0,2007.0,11000.0,6.2,2.35,0
Tangled,Color,Nathan Greno,324.0,100.0,15.0,284.0,Donna Murphy,799.0,200807262.0,Adventure|Animation|Comedy|Family|Fantasy|Musi...,...,387.0,English,USA,PG,260000000.0,2010.0,553.0,7.8,1.85,29000
Avengers: Age of Ultron,Color,Joss Whedon,635.0,141.0,0.0,19000.0,Robert Downey Jr.,26000.0,458991599.0,Action|Adventure|Sci-Fi,...,1117.0,English,USA,PG-13,250000000.0,2015.0,21000.0,7.5,2.35,118000
Harry Potter and the Half-Blood Prince,Color,David Yates,375.0,153.0,282.0,10000.0,Daniel Radcliffe,25000.0,301956980.0,Adventure|Family|Fantasy|Mystery,...,973.0,English,UK,PG,250000000.0,2009.0,11000.0,7.5,2.35,10000


In [65]:
# Or we can set the index when we read_csv:
movie = pd.read_csv('master/data/movie.csv', index_col='movie_title')

# By default, both set_index and read_csv drop the column used as the index from the DataFrame.
# With set_index, it is possible to keep the column in the DataFrame by setting the drop parameter to False.

In [66]:
# To revert:
# movie2.reset_index()

# This method will put back the index as the first column so the column order won't be
# the same as the original one

## Renaming row and column names

In [67]:
# The rename DataFrame method accepts dictionaries that map the old value
# to the new value. Let's create one for the rows and another for the columns:

In [68]:
idx_rename = {'Avatar':'Ratava', 'Spectre': 'Ertceps'}

In [69]:
col_rename = {'director_name':'Director Name',
              'num_critic_for_reviews': 'Critical Reviews'}

In [70]:
movie_renamed = movie.rename(index=idx_rename,
                            columns=col_rename)
movie_renamed.head()

Unnamed: 0_level_0,color,Director Name,Critical Reviews,duration,director_facebook_likes,actor_3_facebook_likes,actor_2_name,actor_1_facebook_likes,gross,genres,...,num_user_for_reviews,language,country,content_rating,budget,title_year,actor_2_facebook_likes,imdb_score,aspect_ratio,movie_facebook_likes
movie_title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Ratava,Color,James Cameron,723.0,178.0,0.0,855.0,Joel David Moore,1000.0,760505847.0,Action|Adventure|Fantasy|Sci-Fi,...,3054.0,English,USA,PG-13,237000000.0,2009.0,936.0,7.9,1.78,33000
Pirates of the Caribbean: At World's End,Color,Gore Verbinski,302.0,169.0,563.0,1000.0,Orlando Bloom,40000.0,309404152.0,Action|Adventure|Fantasy,...,1238.0,English,USA,PG-13,300000000.0,2007.0,5000.0,7.1,2.35,0
Ertceps,Color,Sam Mendes,602.0,148.0,0.0,161.0,Rory Kinnear,11000.0,200074175.0,Action|Adventure|Thriller,...,994.0,English,UK,PG-13,245000000.0,2015.0,393.0,6.8,2.35,85000
The Dark Knight Rises,Color,Christopher Nolan,813.0,164.0,22000.0,23000.0,Christian Bale,27000.0,448130642.0,Action|Thriller,...,2701.0,English,USA,PG-13,250000000.0,2012.0,23000.0,8.5,2.35,164000
Star Wars: Episode VII - The Force Awakens,,Doug Walker,,,131.0,,Rob Walker,131.0,,Documentary,...,,,,,,,12.0,7.1,,0


In [71]:
# Alternative method:
movie = pd.read_csv('master/data/movie.csv', index_col='movie_title')
index = movie.index
columns = movie.columns

index_list = index.tolist()
column_list = columns.tolist()

# now, rename the row and column labels with list assignements
index_list[0] = 'Ratava'
index_list[2] = 'Ertceps'
column_list[1] = 'Director Name'
column_list[2] = 'Critical Reviews'

print(index_list)

['Ratava', "Pirates of the Caribbean: At World's End", 'Ertceps', 'The Dark Knight Rises', 'Star Wars: Episode VII - The Force Awakens', 'John Carter', 'Spider-Man 3', 'Tangled', 'Avengers: Age of Ultron', 'Harry Potter and the Half-Blood Prince', 'Batman v Superman: Dawn of Justice', 'Superman Returns', 'Quantum of Solace', "Pirates of the Caribbean: Dead Man's Chest", 'The Lone Ranger', 'Man of Steel', 'The Chronicles of Narnia: Prince Caspian', 'The Avengers', 'Pirates of the Caribbean: On Stranger Tides', 'Men in Black 3', 'The Hobbit: The Battle of the Five Armies', 'The Amazing Spider-Man', 'Robin Hood', 'The Hobbit: The Desolation of Smaug', 'The Golden Compass', 'King Kong', 'Titanic', 'Captain America: Civil War', 'Battleship', 'Jurassic World', 'Skyfall', 'Spider-Man 2', 'Iron Man 3', 'Alice in Wonderland', 'X-Men: The Last Stand', 'Monsters University', 'Transformers: Revenge of the Fallen', 'Transformers: Age of Extinction', 'Oz the Great and Powerful', 'The Amazing Spider-

In [72]:
print(column_list)

['color', 'Director Name', 'Critical Reviews', 'duration', 'director_facebook_likes', 'actor_3_facebook_likes', 'actor_2_name', 'actor_1_facebook_likes', 'gross', 'genres', 'actor_1_name', 'num_voted_users', 'cast_total_facebook_likes', 'actor_3_name', 'facenumber_in_poster', 'plot_keywords', 'movie_imdb_link', 'num_user_for_reviews', 'language', 'country', 'content_rating', 'budget', 'title_year', 'actor_2_facebook_likes', 'imdb_score', 'aspect_ratio', 'movie_facebook_likes']


In [73]:
# Finally reassiign the index and columns:
movie.index = index_list
movie.columns = column_list

## Creating and deleting columns

In [74]:
movie = pd.read_csv('master/data/movie.csv')
movie['has_seen'] = 0

In [75]:
movie['actor_director_facebook_likes'] = \
    (movie['actor_1_facebook_likes'] +
     movie['actor_2_facebook_likes'] +
     movie['actor_3_facebook_likes'] +
     movie['director_facebook_likes'])

In [76]:
movie['actor_director_facebook_likes'].isnull().sum()

122

In [77]:
movie['actor_director_facebook_likes'] = \
movie['actor_director_facebook_likes'].fillna(0)

In [78]:
# On crée une colonne qui check si la colonne cast existante est plus grande ou = que la colonne que l'on vient de créer.
# Si c'est le cas (True) pour une instance cela signifie que cast à soit :
            # - le même nombre de likes, et qu'il est donc calculé de la meme manière
            # - un nombre supérieur de likes ce qui implique qu'il a davantage de composantes

movie['is_cast_likes_more'] = \
(movie['cast_total_facebook_likes'] >=
 movie['actor_director_facebook_likes'])

In [79]:
# La methode all() nous indique si "all is True"
movie['is_cast_likes_more'].all()

# False est renvoyé ce qui signifie que cast n'a pas systématiquement un nombre égal ou supérieur de like.
# il existe donc au moins un instance pour laquelle cast a un nombre inférieur de likes.
# A priori notre colonne a donc plus de composants que cast car il serait impossible sans cela qu'il y ait une
# instance pour laquelle notre colonne ait un nombre supérieur de likes.

False

In [80]:
movie = movie.drop('actor_director_facebook_likes', axis='columns')

In [81]:
# refaisons donc un essai en mettant moins de composants dans actor_total_facebook_likes

movie['actor_total_facebook_likes'] = \
    (movie['actor_1_facebook_likes'] +
     movie['actor_2_facebook_likes'] +
     movie['actor_3_facebook_likes'])

In [82]:
movie['actor_total_facebook_likes'] = \
    movie['actor_total_facebook_likes'].fillna(0)

In [83]:
movie['is_cast_likes_more'] = \
    (movie['cast_total_facebook_likes'] >=
     movie['actor_total_facebook_likes'])

In [84]:
# Cette fois cast est bien toujours plus grand ou =
movie['is_cast_likes_more'].all()
# donc cast est soit calculé de la meme maniere que actor ou bien cast a plus de composantes que actor.
# Mais cette composantes supplémentaire ne peux pas être director car si on rajoute director
# a actor alors cast a parfois moins de likes

True

In [85]:
movie['pct_actor_cast_like'] = \
     (movie['actor_total_facebook_likes'] /
      movie['cast_total_facebook_likes'])

In [86]:
(movie['pct_actor_cast_like'].min(), 
 movie['pct_actor_cast_like'].max())

(0.0, 1.0)

In [87]:
movie.set_index('movie_title')['pct_actor_cast_like'].head()

movie_title
Avatar                                        0.577369
Pirates of the Caribbean: At World's End      0.951396
Spectre                                       0.987521
The Dark Knight Rises                         0.683783
Star Wars: Episode VII - The Force Awakens    0.000000
Name: pct_actor_cast_like, dtype: float64

In [1]:
# cast a forcément une composante supplémentaire car actor ne represente qu'un pourcentage de cast.

In [88]:
profit_index = movie.columns.get_loc('gross') + 1

In [89]:
profit_index

9

In [91]:
movie.insert(loc=profit_index,
             column='profit',
             value=movie['gross'] - movie['budget'])

In [92]:
# An alternative to movie = movie.drop('actor_director_facebook_likes', axis='columns')
# is to use a del statement
# del movie['actor_director_facebook_likes']