# Data explorations

In [2]:
import pandas as pd
import numpy as np
import datetime as dt

In [3]:
# Show every row when a DataFrame/Series is displayed (no truncation).
pd.options.display.max_rows = None

# 1. Đọc và tiền xử lí

## a. Đọc dữ liệu

In [4]:
# Load the tab-separated movie dataset and preview its first five rows.
df = pd.read_csv('data.csv', delimiter='\t')
df.head()

Unnamed: 0,name,overview,tagline,certificate,runtime,genre,keywords,imdb_rate,metascore,director,stars,nvote,gross,release_date,countries,language,locations,company
0,Kẻ Đánh Cắp Giấc Mơ,A thief who steals corporate secrets through t...,Your mind is the scene of the crime,PG-13,148.0,"Action, Adventure, Sci-Fi","dream, ambiguous ending, subconscious, mindben...",8.8,74.0,Christopher Nolan,"Leonardo DiCaprio, Joseph Gordon-Levitt, Ellio...",2344754,292.58,"July 16, 2010 (United States)","United States, United Kingdom","English, Japanese, French","Fortress Mountain, Kananaskis Country, Alberta...","Warner Bros., Legendary Entertainment, Syncopy"
1,Hố Đen Tử Thần,A team of explorers travel through a wormhole ...,Mankind was born on Earth. It was never meant ...,C13,169.0,"Adventure, Drama, Sci-Fi","astronaut, saving the world, space travel, wor...",8.6,74.0,Christopher Nolan,"Matthew McConaughey, Anne Hathaway, Jessica Ch...",1821351,188.02,"November 7, 2014 (United States)","United States, United Kingdom, Canada",English,Iceland,"Paramount Pictures, Warner Bros., Legendary En..."
2,Kỵ Sĩ Bóng Đêm Trỗi Dậy,Eight years after the Joker's reign of anarchy...,The Legend Ends,PG-13,164.0,"Action, Drama","dc comics, batman character, bruce wayne chara...",8.4,78.0,Christopher Nolan,"Christian Bale, Tom Hardy, Anne Hathaway, Gary...",1701579,448.14,"July 20, 2012 (United States)","United States, United Kingdom","English, Arabic","Mehrangarh Fort, Jodhpur, Rajasthan, India","Warner Bros., Legendary Entertainment, DC Ente..."
3,Hành Trình Django,"With the help of a German bounty-hunter, a fre...","Life, liberty and the pursuit of vengeance.",R,165.0,"Drama, Western","racial vengeance, racial violence, slavery, on...",8.4,81.0,Quentin Tarantino,"Jamie Foxx, Christoph Waltz, Leonardo DiCaprio...",1549444,162.81,"December 25, 2012 (United States)",United States,"English, German, French, Italian","Evergreen Plantation, 4677 Highway 18, Edgard,...","The Weinstein Company, Columbia Pictures"
4,Sói Già Phố Wall,"Based on the true story of Jordan Belfort, fro...",Earn. Spend. Party.,R,180.0,"Biography, Comedy, Crime","based on true story, stockbroker, female nudit...",8.2,75.0,Martin Scorsese,"Leonardo DiCaprio, Jonah Hill, Margot Robbie, ...",1404990,116.9,"December 25, 2013 (United States)",United States,"English, French","Portofino, Genoa, Liguria, Italy","Red Granite Pictures, Appian Way, Sikelia Prod..."


Các cột:
- `name`: Tên phim
- `year`: Năm phát hành (cột này không có sẵn trong file dữ liệu, được thêm ở bước tiền xử lí từ `release_date`)
- `overview`: Mô tả tổng quát phim
- `tagline`: Câu khẩu hiệu (slogan) của phim
- `certificate`: Phân loại phim ( Ví dụ 'R' là phim cấm người dưới 18 tuổi, 'C13' là cấm người dưới 13 tuổi)
- `runtime`: Thời lượng phim (min)
- `genre`: Thể loại
- `keywords`: Các từ khóa mô tả nội dung phim
- `imdb_rate`: Số điểm đánh giá từ IMDB
- `metascore`: Số điểm đánh giá từ Metacritic
- `director`: Đạo diễn
- `stars`: Các ngôi sao của phim
- `nvote`: Số lượt đánh giá
- `gross`: Doanh thu của phim (M)
- `release_date`: Ngày phát hành
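- `countries`: Các quốc gia sản xuất phim
- `language`: Các ngôn ngữ được sử dụng trong phim
- `locations`: Các địa điểm quay phim
- `company`: Các công ty sản xuất phim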

Các dòng: Mỗi dòng là thông tin của một bộ phim

## b. Khám phá dữ liệu

### Kiểu dữ liệu:

In [4]:
# Inspect the dtype pandas inferred for each column.
df.dtypes

name             object
overview         object
tagline          object
certificate      object
runtime         float64
genre            object
keywords         object
imdb_rate       float64
metascore       float64
director         object
stars            object
nvote             int64
gross           float64
release_date     object
countries        object
language         object
locations        object
company          object
dtype: object

Các cột `name` `tagline` `overview` `certificate` `keywords` `genre` `director` `release_date` `stars` `countries` `language` `locations` `company` có type là `object`. Cần tìm kiểu dữ liệu thực sự của các giá trị trong những cột này.

In [5]:
def open_object_dtype(s):
    """Return the set of Python types found among the values of Series ``s``.

    Meant for ``object`` columns, where the dtype alone does not tell
    whether the cells hold strings, floats, lists, and so on.
    """
    return set(s.apply(type))

In [6]:
# Report, for each object-typed column, which Python types its cells contain.
object_columns = [
    'name', 'overview', 'tagline', 'keywords', 'certificate', 'genre',
    'director', 'stars', 'release_date', 'language', 'locations', 'company',
]
for column_name in object_columns:
    print(f'{column_name}:', open_object_dtype(df[column_name]))

name: {<class 'str'>}
overview: {<class 'str'>}
tagline: {<class 'str'>, <class 'float'>}
keywords: {<class 'str'>, <class 'float'>}
certificate: {<class 'str'>, <class 'float'>}
genre: {<class 'str'>}
director: {<class 'str'>}
stars: {<class 'str'>}
release_date: {<class 'str'>}
language: {<class 'str'>}
locations: {<class 'str'>, <class 'float'>}
company: {<class 'str'>}


Các cột `keywords` `tagline` `certificate` `locations` có một số giá trị kiểu `float` vì giá trị thiếu (NULL) được pandas biểu diễn bằng `NaN`, vốn có kiểu `float`.

Các cột `keywords` `genre` `director` `stars` `language` `locations` `company` chứa nhiều giá trị ngăn cách bởi dấu phẩy nên cần có kiểu list, tuy nhiên hiện đang có kiểu `str`.

`release_date` chưa có kiểu dữ liệu là datetime

#### Chỉnh sửa các column về kiểu dữ liệu phù hợp

* Các column dạng list

In [7]:
# Columns whose cells hold several values joined by ", " in the raw file.
list_columns = ['keywords', 'genre', 'director', 'stars', 'language', 'locations', 'company']


def _split_list_cell(value):
    """Split a ", "-separated string into a list; return anything else unchanged.

    Non-string cells (missing values, or lists produced by a previous run of
    this cell) pass through untouched, so the conversion is safe on every
    column and can be re-run without raising AttributeError.
    """
    return value.split(", ") if isinstance(value, str) else value


for list_column in list_columns:
    df[list_column] = df[list_column].apply(_split_list_cell)

* Column `release_date`

In [8]:
# Peek at 10 randomly chosen raw release-date strings to see their format
# (no random_state is passed, so the sampled rows differ on every run).
df[['release_date']].sample(n = 10)

Unnamed: 0,release_date
573,"December 25, 2011 (United States)"
86,"May 18, 2018 (United States)"
420,"February 14, 2013 (United States)"
261,"January 11, 2013 (United States)"
493,"March 28, 2013 (United States)"
231,"December 25, 2020 (United States)"
731,"March 6, 2014 (United States)"
318,"October 16, 2015 (United Kingdom)"
855,"January 29, 2021 (United States)"
136,"September 19, 2014 (United States)"


In [9]:
def to_datetime(date):
    """Convert a string such as 'July 16, 2010 (United States)' to a date.

    Everything from the first ' (' onwards is discarded; the remaining text
    must match the '%B %d, %Y' format.

    Returns a ``datetime.date``; ``strptime`` raises ``ValueError`` if the
    text does not match the format.
    """
    date_text, _, _ = date.partition(' (')
    parsed = dt.datetime.strptime(date_text, '%B %d, %Y')
    return parsed.date()

In [10]:
# Parse every raw release-date string into a datetime.date.
df['release_date'] = df['release_date'].apply(to_datetime)

* Thêm column `year` vào dataframe

In [11]:
# Derive the release year from the parsed release date.
df['year'] = df['release_date'].map(lambda released: released.year)

### Phân bố của từng column

### - Numeric column

In [12]:
def get_num_col_profle(df, numcol):
    """Build a small profile (missing ratio, min, max) for the given columns.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the columns to profile.
    numcol : list of str
        Names of the columns to profile. Each column must support
        ``min()`` and ``max()``.

    Returns
    -------
    pandas.DataFrame
        One column per name in ``numcol``; rows are indexed (index name
        ``'Value'``) by ``"missing_ratio(%)"``, ``"min"`` and ``"max"``.
    """
    # Use the argument, not a notebook-level global, so the function works
    # for whatever column list the caller passes.
    numcol = list(numcol)
    missing_ratio = []
    min_ = []
    max_ = []
    for col in numcol:
        column = df[col]
        # Mean of the boolean null mask is the fraction of missing values.
        missing_ratio.append(column.isnull().mean() * 100)
        min_.append(column.min())
        max_.append(column.max())
    profile = pd.DataFrame([missing_ratio, min_, max_], columns=numcol)

    profile['Value'] = pd.Series(["missing_ratio(%)", "min", "max"])
    return profile.set_index('Value')

# Columns profiled with missing ratio / min / max; `release_date` is included
# alongside the numeric columns so its earliest and latest values are shown.
nume_col = ["runtime","release_date", "imdb_rate", "metascore", "nvote", "gross"]
nume_col_profiles_df = get_num_col_profle(df, nume_col)
nume_col_profiles_df

Unnamed: 0_level_0,runtime,release_date,imdb_rate,metascore,nvote,gross
Value,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
missing_ratio(%),0.0,0.0,0.0,1.9,0.0,9.5
min,80.0,2010-01-08,1.1,18.0,88440.0,0.01
max,321.0,2022-06-24,8.8,100.0,2344754.0,936.66


### - Categorical column

In [13]:
def get_cate_col_profiles(df, cate_col):
    """Build a profile of categorical columns.

    For every column the profile reports the percentage of missing values,
    the number of distinct values and the distinct values themselves. When
    the first non-missing cell of a column is a ``list``, the column is
    treated as list-valued and the distinct values are counted over the
    individual list items.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the columns to profile.
    cate_col : list of str
        Names of the columns to profile.

    Returns
    -------
    pandas.DataFrame
        One column per name in ``cate_col``; rows are indexed (index name
        ``'Value'``) by ``"missing_ratio%"``, ``"num_diff_vals"`` and
        ``"diff_vals"``.
    """
    cate_col = list(cate_col)
    missing_ratio = []
    num_diff_vals = []
    diff_vals = []
    for col in cate_col:
        raw_column = df[col]
        column = raw_column.dropna()
        # Ratio relative to the actual number of rows, not a fixed 1000.
        missing_ratio.append(raw_column.isnull().mean() * 100)
        # Positional lookup: label 0 may have been removed by dropna().
        is_list_column = len(column) > 0 and isinstance(column.iloc[0], list)
        if is_list_column:
            # Flatten in linear time (sum(list_of_lists, []) is quadratic).
            flattened = [item for items in column for item in items]
            diff_val = pd.Series(flattened).unique()
        else:
            diff_val = column.unique()
        num_diff_vals.append(len(diff_val))
        diff_vals.append(diff_val)
    profile = pd.DataFrame([missing_ratio, num_diff_vals, diff_vals], columns=cate_col)

    profile['Value'] = pd.Series(["missing_ratio%", "num_diff_vals", "diff_vals"])
    return profile.set_index('Value')
# Columns profiled as categorical; `year` is treated as a category here
# rather than as a numeric column.
cate_col = ['name', 'tagline', 'keywords', 'year', 'overview', 'certificate', 'genre', 'director', 'stars', 'language', 'locations', 'company' ]
cate_col_profiles_df = get_cate_col_profiles(df, cate_col)
cate_col_profiles_df

Unnamed: 0_level_0,name,tagline,keywords,year,overview,certificate,genre,director,stars,language,locations,company
Value,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
missing_ratio%,0.0,7.2,0.1,0.0,0.0,3.3,0.0,0.0,0.0,0.0,3.6,0.0
num_diff_vals,999,926,2911,13,1000,10,20,632,1795,104,1210,979
diff_vals,"[Kẻ Đánh Cắp Giấc Mơ, Hố Đen Tử Thần, Kỵ Sĩ Bó...","[Your mind is the scene of the crime, Mankind ...","[dream, ambiguous ending, subconscious, mindbe...","[2010, 2014, 2012, 2013, 2019, 2018, 2016, 201...",[A thief who steals corporate secrets through ...,"[PG-13, C13, R, P, C18, C16, (Banned), PG, G, ...","[Action, Adventure, Sci-Fi, Drama, Western, Bi...","[Christopher Nolan, Quentin Tarantino, Martin ...","[Leonardo DiCaprio, Joseph Gordon-Levitt, Elli...","[English, Japanese, French, Arabic, German, It...","[Fortress Mountain, Kananaskis Country, Albert...","[Warner Bros., Legendary Entertainment, Syncop..."


# 2. Đặt câu hỏi: