# Dataset Exploration

In [1]:
import numpy as np
import pandas as pd
# Limit how many rows pandas prints, purely to keep the outputs below short.
# The full option name is required: the short pattern 'max_rows' also matches
# 'styler.render.max_rows' on newer pandas and is rejected as ambiguous.
pd.set_option('display.max_rows', 8)

## Load data

In [2]:
# Read the movie dataset from a CSV file (path is relative to the working directory)
films = pd.read_csv('data/movie.csv')
# Preview the first few rows to check that the file was parsed as expected
films.head()

Unnamed: 0,color,director_name,num_critic_for_reviews,duration,director_facebook_likes,actor_3_facebook_likes,actor_2_name,actor_1_facebook_likes,gross,genres,...,num_user_for_reviews,language,country,content_rating,budget,title_year,actor_2_facebook_likes,imdb_score,aspect_ratio,movie_facebook_likes
0,Color,James Cameron,723.0,178.0,0.0,855.0,Joel David Moore,1000.0,760505847.0,Action|Adventure|Fantasy|Sci-Fi,...,3054.0,English,USA,PG-13,237000000.0,2009.0,936.0,7.9,1.78,33000
1,Color,Gore Verbinski,302.0,169.0,563.0,1000.0,Orlando Bloom,40000.0,309404152.0,Action|Adventure|Fantasy,...,1238.0,English,USA,PG-13,300000000.0,2007.0,5000.0,7.1,2.35,0
2,Color,Sam Mendes,602.0,148.0,0.0,161.0,Rory Kinnear,11000.0,200074175.0,Action|Adventure|Thriller,...,994.0,English,UK,PG-13,245000000.0,2015.0,393.0,6.8,2.35,85000
3,Color,Christopher Nolan,813.0,164.0,22000.0,23000.0,Christian Bale,27000.0,448130642.0,Action|Thriller,...,2701.0,English,USA,PG-13,250000000.0,2012.0,23000.0,8.5,2.35,164000
4,,Doug Walker,,,131.0,,Rob Walker,131.0,,Documentary,...,,,,,,,12.0,7.1,,0


## Descriptive attributes

In [7]:
# Data type of every column, returned as a Series indexed by column name
films.dtypes

color                      object
director_name              object
num_critic_for_reviews    float64
duration                  float64
                           ...   
actor_2_facebook_likes    float64
imdb_score                float64
aspect_ratio              float64
movie_facebook_likes        int64
Length: 28, dtype: object

In [32]:
# Number of dimensions: 1 for a Series, 2 for a DataFrame
films.ndim

2

In [9]:
# Shape of the DataFrame as a (number of rows, number of columns) tuple
films.shape

(4916, 28)

In [8]:
# Total number of elements in the DataFrame: rows * columns
films.size

137648

In [33]:
# Number of non-missing values in each column, returned as a Series
films.count()

color                     4897
director_name             4814
num_critic_for_reviews    4867
duration                  4901
                          ... 
actor_2_facebook_likes    4903
imdb_score                4916
aspect_ratio              4590
movie_facebook_likes      4916
Length: 28, dtype: int64

The `describe` method gives a quick statistical summary of the data. Its output varies depending on the column types.  
It accepts the following parameters:
* percentiles - list of percentiles to fetch, [0.25, 0.50, 0.75] by default
* include - dtype or list of dtypes to conduct analysis on
* exclude - dtype or list of dtypes which will be excluded from analysis

In [42]:
# For numeric columns describe() reports:
#   count - number of non-missing values
#   mean, std
#   min, max
#   the 25%, 50% and 75% quantiles
films.describe()

Unnamed: 0,num_critic_for_reviews,duration,director_facebook_likes,actor_3_facebook_likes,actor_1_facebook_likes,gross,num_voted_users,cast_total_facebook_likes,facenumber_in_poster,num_user_for_reviews,budget,title_year,actor_2_facebook_likes,imdb_score,aspect_ratio,movie_facebook_likes
count,4867.0,4901.0,4814.0,4893.0,4909.0,4054.0,4916.0,4916.0,4903.0,4895.0,4432.0,4810.0,4903.0,4916.0,4590.0,4916.0
mean,137.988905,107.090798,691.014541,631.276313,6494.488491,47644510.0,82644.92,9579.815907,1.37732,267.668846,36547490.0,2002.447609,1621.923516,6.437429,2.222349,7348.294142
std,120.239379,25.286015,2832.954125,1625.874802,15106.986884,67372550.0,138322.2,18164.31699,2.023826,372.934839,100242700.0,12.453977,4011.299523,1.127802,1.40294,19206.016458
min,1.0,7.0,0.0,0.0,0.0,162.0,5.0,0.0,0.0,1.0,218.0,1916.0,0.0,1.6,1.18,0.0
25%,49.0,93.0,7.0,132.0,607.0,5019656.0,8361.75,1394.75,0.0,64.0,6000000.0,1999.0,277.0,5.8,1.85,0.0
50%,108.0,103.0,48.0,366.0,982.0,25043960.0,33132.5,3049.0,1.0,153.0,19850000.0,2005.0,593.0,6.6,2.35,159.0
75%,191.0,118.0,189.75,633.0,11000.0,61108410.0,93772.75,13616.75,2.0,320.5,43000000.0,2011.0,912.0,7.2,2.35,2000.0
max,813.0,511.0,23000.0,23000.0,640000.0,760505800.0,1689764.0,656730.0,43.0,5060.0,4200000000.0,2016.0,137000.0,9.5,16.0,349000.0


Note that the result above contains no columns of object type - they are dropped. This is the default behaviour when the dataframe passed in has mixed categorical and numerical data types.  
To get around it we can pass `include='all'` as an argument, or filter out the numeric columns before calling the method.  
Descriptive statistics for categorical data include the count, the number of unique values, the most frequent value and its frequency.

In [43]:
# Summarise every column regardless of dtype; statistics that do not apply
# to a given column are left missing in the result
films.describe(include='all')

Unnamed: 0,color,director_name,num_critic_for_reviews,duration,director_facebook_likes,actor_3_facebook_likes,actor_2_name,actor_1_facebook_likes,gross,genres,...,num_user_for_reviews,language,country,content_rating,budget,title_year,actor_2_facebook_likes,imdb_score,aspect_ratio,movie_facebook_likes
count,4897,4814,4867.0,4901.0,4814.00,4893.0,4903,4909.0,4.054000e+03,4916,...,4895.0,4904,4911,4616,4.432000e+03,4810.0,4903.0,4916.0,4590.00,4916.0
unique,2,2397,,,,,3030,,,914,...,,47,65,18,,,,,,
top,Color,Steven Spielberg,,,,,Morgan Freeman,,,Drama,...,,English,USA,R,,,,,,
freq,4693,26,,,,,18,,,233,...,,4582,3710,2067,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
25%,,,49.0,93.0,7.00,132.0,,607.0,5.019656e+06,,...,64.0,,,,6.000000e+06,1999.0,277.0,5.8,1.85,0.0
50%,,,108.0,103.0,48.00,366.0,,982.0,2.504396e+07,,...,153.0,,,,1.985000e+07,2005.0,593.0,6.6,2.35,159.0
75%,,,191.0,118.0,189.75,633.0,,11000.0,6.110841e+07,,...,320.5,,,,4.300000e+07,2011.0,912.0,7.2,2.35,2000.0
max,,,813.0,511.0,23000.00,23000.0,,640000.0,7.605058e+08,,...,5060.0,,,,4.200000e+09,2016.0,137000.0,9.5,16.00,349000.0


In [40]:
# Restrict the frame to its object-dtype columns first, then summarise only those
(
    films
    .select_dtypes(include=object)
    .describe()
)

Unnamed: 0,color,director_name,actor_2_name,genres,actor_1_name,movie_title,actor_3_name,plot_keywords,movie_imdb_link,language,country,content_rating
count,4897,4814,4903,4916,4909,4916,4893,4764,4916,4904,4911,4616
unique,2,2397,3030,914,2095,4916,3519,4756,4916,47,65,18
top,Color,Steven Spielberg,Morgan Freeman,Drama,Robert De Niro,The American,Steve Coogan,based on novel,http://www.imdb.com/title/tt1211956/?ref_=fn_t...,English,USA,R
freq,4693,26,18,233,48,1,8,4,1,4582,3710,2067


In [22]:
# quantile() computes only the requested quantiles. A single value returns a
# Series; an iterable of values returns a DataFrame with one row per quantile.
# The `axis` parameter chooses whether quantiles are taken per column or per row.
# numeric_only=True is spelled out because pandas >= 2.0 no longer ignores the
# object columns by default and fails when it reaches them.
films.quantile((0.1, 0.9), numeric_only=True)

Unnamed: 0,num_critic_for_reviews,duration,director_facebook_likes,actor_3_facebook_likes,actor_1_facebook_likes,gross,num_voted_users,cast_total_facebook_likes,facenumber_in_poster,num_user_for_reviews,budget,title_year,actor_2_facebook_likes,imdb_score,aspect_ratio,movie_facebook_likes
0.1,17.0,86.0,0.0,32.0,240.0,374419.1,1593.5,509.5,0.0,21.0,1380020.0,1988.0,78.0,5.0,1.85,0.0
0.9,294.0,134.0,545.0,890.8,18000.0,122902852.2,213880.5,25594.5,4.0,620.6,80000000.0,2014.0,3000.0,7.8,2.35,23000.0


In addition, there is an `info()` method that shows the column names, the number of non-missing values per column, the column dtypes and the memory usage.

In [28]:
# Print a concise summary of the frame: index range, then the non-null count
# and dtype of every column
films.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4916 entries, 0 to 4915
Data columns (total 28 columns):
color                        4897 non-null object
director_name                4814 non-null object
num_critic_for_reviews       4867 non-null float64
duration                     4901 non-null float64
director_facebook_likes      4814 non-null float64
actor_3_facebook_likes       4893 non-null float64
actor_2_name                 4903 non-null object
actor_1_facebook_likes       4909 non-null float64
gross                        4054 non-null float64
genres                       4916 non-null object
actor_1_name                 4909 non-null object
movie_title                  4916 non-null object
num_voted_users              4916 non-null int64
cast_total_facebook_likes    4916 non-null int64
actor_3_name                 4893 non-null object
facenumber_in_poster         4903 non-null float64
plot_keywords                4764 non-null object
movie_imdb_link              4916 non-

### Additional note about describe

In `describe()` most results are computed with NA values skipped, so missing observations take no part in the computations. To take NA into account, call the functions that `describe()` is built from (for example `min()`) directly and pass `skipna=False`.

In [45]:
# Do not skip NA: a column that contains any missing value yields NaN instead
# of its minimum.
# numeric_only=True limits the reduction to numeric columns; older pandas
# dropped the text columns silently, while pandas >= 2.0 raises on them.
films.min(skipna=False, numeric_only=True)

  return umr_minimum(a, axis, None, out, keepdims)


num_critic_for_reviews     NaN
duration                   NaN
director_facebook_likes    NaN
actor_3_facebook_likes     NaN
                          ... 
actor_2_facebook_likes     NaN
imdb_score                 1.6
aspect_ratio               NaN
movie_facebook_likes       0.0
Length: 16, dtype: float64