# Pré Processamento das features
Notebook destinado a aplicar funções de pré-processamento nos dados do estudo

In [1]:
import pandas as pd
import numpy as np

In [3]:
# Load the cleaned movies CSV into the working frame used by every later cell.
movies_csv_path = "datasets/movies_dataset_cleaned.csv"
data = pd.read_csv(movies_csv_path)
data

Unnamed: 0,belongs_to_collection,genres,tmdb_id,imdb_id,original_language,overview,popularity,production_companies,production_countries,release_year,runtime,title,vote_average
0,toy story collection,"['animation', 'comedy', 'family']",862,tt0114709,en,"Led by Woody, Andy's toys live happily in his ...",21.946943,['pixar_animation_studios'],['us'],1995,81.0,Toy Story,7.7
1,,"['adventure', 'fantasy', 'family']",8844,tt0113497,en,When siblings Judy and Peter discover an encha...,17.015539,"['tristar_pictures', 'teitler_film', 'intersco...",['us'],1995,104.0,Jumanji,6.9
2,grumpy old men collection,"['romance', 'comedy']",15602,tt0113228,en,A family wedding reignites the ancient feud be...,11.712900,"['warner_bros.', 'lancaster_gate']",['us'],1995,101.0,Grumpier Old Men,6.5
3,,"['comedy', 'drama', 'romance']",31357,tt0114885,en,"Cheated on, mistreated and stepped on, the wom...",3.859495,['twentieth_century_fox_film_corporation'],['us'],1995,127.0,Waiting to Exhale,6.1
4,father of the bride collection,['comedy'],11862,tt0113041,en,Just when George Banks has recovered from his ...,8.387519,"['sandollar_productions', 'touchstone_pictures']",['us'],1995,106.0,Father of the Bride Part II,5.7
...,...,...,...,...,...,...,...,...,...,...,...,...,...
31202,,"['horror', 'mystery', 'thriller']",84419,tt0038621,en,An unsuccessful sculptor saves a madman named ...,0.222814,['universal_pictures'],['us'],1946,65.0,House of Horrors,6.3
31203,,['horror'],289923,tt0252966,en,A film archivist revisits the story of Rustin ...,0.386450,"['neptune_salad_entertainment', 'pirie_product...",['us'],2000,30.0,The Burkittsville 7,7.0
31204,,['science_fiction'],222848,tt0112613,en,It's the year 3000 AD. The world's most danger...,0.661558,['concorde-new_horizons'],['us'],1995,85.0,Caged Heat 3000,3.5
31205,,"['drama', 'action', 'romance']",30840,tt0102797,en,"Yet another version of the classic epic, with ...",5.683753,"['westdeutscher_rundfunk_(wdr)', 'working_titl...","['ca', 'de', 'gb', 'us']",1991,104.0,Robin Hood,5.7


## One-hot encoding na coluna 'genres'

In [4]:
# Which genres are present in the dataset?
def _split_list_string(raw):
    """Turn a stringified list such as "['a', 'b']" into ['a', 'b'].

    Non-string values (e.g. NaN for a missing cell) and empty lists yield an
    empty list. Text that was already flattened to "a, b" parses to the same
    result, so this cell can be re-run after it has overwritten the column.
    """
    if not isinstance(raw, str):
        return []
    # Drop empty fragments so "[]" does not produce a phantom '' genre.
    return [item for item in raw.replace("'", "").strip("[]").split(", ") if item]


data_genres = data['genres']

# Parse every row exactly once and reuse the result for all derived columns.
genres_per_row = [_split_list_string(aux_row) for aux_row in data_genres]

# One 0/1 indicator column per distinct genre, created in sorted order.
unique_genres = sorted({genre for row_genres in genres_per_row for genre in row_genres})
for genre in unique_genres:
    data[f"genre_{genre}"] = [int(genre in row_genres) for row_genres in genres_per_row]

# Replace the stringified list by plain comma-separated text.
genres_list = [", ".join(row_genres) for row_genres in genres_per_row]

data["genres"] = genres_list

In [5]:
# Inspect the genres column after it was rewritten as comma-separated text.
data["genres"]

0         animation, comedy, family
1        adventure, fantasy, family
2                   romance, comedy
3            comedy, drama, romance
4                            comedy
                    ...            
31202     horror, mystery, thriller
31203                        horror
31204               science_fiction
31205        drama, action, romance
31206       action, drama, thriller
Name: genres, Length: 31207, dtype: object

## Separando popularidade em classes

In [6]:
# Bucket popularity into six ordinal classes using half-open bins:
#   0: < 5 | 1: [5, 10) | 2: [10, 15) | 3: [15, 20) | 4: [20, 25) | 5: everything else
popularity_bounds = [5, 10, 15, 20, 25]
classes = np.digitize(data["popularity"].to_numpy(), popularity_bounds).tolist()
data["popularity_class"] = classes

data.head()

Unnamed: 0,belongs_to_collection,genres,tmdb_id,imdb_id,original_language,overview,popularity,production_companies,production_countries,release_year,...,genre_history,genre_horror,genre_music,genre_mystery,genre_romance,genre_science_fiction,genre_thriller,genre_war,genre_western,popularity_class
0,toy story collection,"animation, comedy, family",862,tt0114709,en,"Led by Woody, Andy's toys live happily in his ...",21.946943,['pixar_animation_studios'],['us'],1995,...,0,0,0,0,0,0,0,0,0,4
1,,"adventure, fantasy, family",8844,tt0113497,en,When siblings Judy and Peter discover an encha...,17.015539,"['tristar_pictures', 'teitler_film', 'intersco...",['us'],1995,...,0,0,0,0,0,0,0,0,0,3
2,grumpy old men collection,"romance, comedy",15602,tt0113228,en,A family wedding reignites the ancient feud be...,11.7129,"['warner_bros.', 'lancaster_gate']",['us'],1995,...,0,0,0,0,1,0,0,0,0,2
3,,"comedy, drama, romance",31357,tt0114885,en,"Cheated on, mistreated and stepped on, the wom...",3.859495,['twentieth_century_fox_film_corporation'],['us'],1995,...,0,0,0,0,1,0,0,0,0,0
4,father of the bride collection,comedy,11862,tt0113041,en,Just when George Banks has recovered from his ...,8.387519,"['sandollar_productions', 'touchstone_pictures']",['us'],1995,...,0,0,0,0,0,0,0,0,0,1


## Nome completo dos idiomas

In [7]:
# Two-letter language codes mapped to lowercase English language names.
languages = dict(
    de="german",
    en="english",
    es="spanish",
    fr="french",
    hi="hindi",
    it="italian",
    ja="japanese",
    ko="korean",
    pt="portuguese",
    ru="russian",
    zh="chinese",
)
# Values that are not keys of the mapping are left unchanged by replace().
language_full_names = data["original_language"].replace(languages)
data["original_language"] = language_full_names

In [8]:
# One-hot encode the original language: one 0/1 column per distinct value.
language_column = data["original_language"]
for language in np.unique(language_column):
    is_this_language = language_column == language
    data[f"language_{language}"] = is_this_language.values.astype(int)

In [9]:
# Display the frame with the language_* indicator columns added.
data

Unnamed: 0,belongs_to_collection,genres,tmdb_id,imdb_id,original_language,overview,popularity,production_companies,production_countries,release_year,...,language_english,language_french,language_german,language_hindi,language_italian,language_japanese,language_korean,language_portuguese,language_russian,language_spanish
0,toy story collection,"animation, comedy, family",862,tt0114709,english,"Led by Woody, Andy's toys live happily in his ...",21.946943,['pixar_animation_studios'],['us'],1995,...,1,0,0,0,0,0,0,0,0,0
1,,"adventure, fantasy, family",8844,tt0113497,english,When siblings Judy and Peter discover an encha...,17.015539,"['tristar_pictures', 'teitler_film', 'intersco...",['us'],1995,...,1,0,0,0,0,0,0,0,0,0
2,grumpy old men collection,"romance, comedy",15602,tt0113228,english,A family wedding reignites the ancient feud be...,11.712900,"['warner_bros.', 'lancaster_gate']",['us'],1995,...,1,0,0,0,0,0,0,0,0,0
3,,"comedy, drama, romance",31357,tt0114885,english,"Cheated on, mistreated and stepped on, the wom...",3.859495,['twentieth_century_fox_film_corporation'],['us'],1995,...,1,0,0,0,0,0,0,0,0,0
4,father of the bride collection,comedy,11862,tt0113041,english,Just when George Banks has recovered from his ...,8.387519,"['sandollar_productions', 'touchstone_pictures']",['us'],1995,...,1,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
31202,,"horror, mystery, thriller",84419,tt0038621,english,An unsuccessful sculptor saves a madman named ...,0.222814,['universal_pictures'],['us'],1946,...,1,0,0,0,0,0,0,0,0,0
31203,,horror,289923,tt0252966,english,A film archivist revisits the story of Rustin ...,0.386450,"['neptune_salad_entertainment', 'pirie_product...",['us'],2000,...,1,0,0,0,0,0,0,0,0,0
31204,,science_fiction,222848,tt0112613,english,It's the year 3000 AD. The world's most danger...,0.661558,['concorde-new_horizons'],['us'],1995,...,1,0,0,0,0,0,0,0,0,0
31205,,"drama, action, romance",30840,tt0102797,english,"Yet another version of the classic epic, with ...",5.683753,"['westdeutscher_rundfunk_(wdr)', 'working_titl...","['ca', 'de', 'gb', 'us']",1991,...,1,0,0,0,0,0,0,0,0,0


## Transformando países da produção e companhias em texto

In [10]:
# Flatten the stringified list of production companies into plain text,
# e.g. "['warner_bros.', 'lancaster_gate']" -> "warner_bros., lancaster_gate".
production_companies = data['production_companies']

# Non-string cells (e.g. NaN for a missing value) become an empty string.
# An explicit type check replaces the former bare `except:`, which would have
# hidden any unrelated error as well.
p_companies_list = [
    aux_row.replace("'", "").strip("[]") if isinstance(aux_row, str) else ""
    for aux_row in production_companies
]

data['production_companies'] = p_companies_list

In [11]:
# Translate the stringified list of two-letter production-country codes into
# comma-separated country names, e.g. "['ca', 'us']" -> "Canada, United States".
countries_name = {
    'ar': 'Argentina', 'at': 'Austria', 'au': 'Australia', 'be': 'Belgium', 'br': 'Brazil',
    'ca': 'Canada', 'ch': 'Switzerland', 'cn': 'China', 'de': 'Germany', 'dk': 'Denmark', 'es': 'Spain',
    'fr': 'France', 'gb': 'United Kingdom', 'hk': 'Hong Kong', 'ie': 'Ireland', 'in': 'India',
    'it': 'Italy', 'jp': 'Japan', 'kr': 'Korea', 'mx': 'Mexico', 'nl': 'Netherlands', 'nz': 'New Zealand',
    'pt': 'Portugal', 'ru': 'Russia', 'se': 'Sweden', 'tw': 'Taiwan', 'us': 'United States', 'za': 'South Africa'
}
# Names this cell itself produces. They are accepted unchanged so that running
# the cell a second time (the column is overwritten in place) does not raise
# KeyError on rows that were already translated.
translated_names = set(countries_name.values())

production_countries = data['production_countries']
p_countries_list = []
for aux_row in production_countries:
    # Non-string cells (e.g. NaN) become an empty string, matching how the
    # production_companies column treats rows it cannot parse.
    if not isinstance(aux_row, str):
        p_countries_list.append("")
        continue
    # Skip empty fragments so an empty list "[]" yields "" instead of KeyError ''.
    codes = [code for code in aux_row.replace("'", "").strip("[]").split(", ") if code]
    # Codes missing from the mapping still raise KeyError so gaps surface early.
    names = [code if code in translated_names else countries_name[code] for code in codes]
    p_countries_list.append(", ".join(names))

data['production_countries'] = p_countries_list

## Versão final

In [12]:
# Display the final frame after all preprocessing steps above.
data

Unnamed: 0,belongs_to_collection,genres,tmdb_id,imdb_id,original_language,overview,popularity,production_companies,production_countries,release_year,...,language_english,language_french,language_german,language_hindi,language_italian,language_japanese,language_korean,language_portuguese,language_russian,language_spanish
0,toy story collection,"animation, comedy, family",862,tt0114709,english,"Led by Woody, Andy's toys live happily in his ...",21.946943,pixar_animation_studios,United States,1995,...,1,0,0,0,0,0,0,0,0,0
1,,"adventure, fantasy, family",8844,tt0113497,english,When siblings Judy and Peter discover an encha...,17.015539,"tristar_pictures, teitler_film, interscope_com...",United States,1995,...,1,0,0,0,0,0,0,0,0,0
2,grumpy old men collection,"romance, comedy",15602,tt0113228,english,A family wedding reignites the ancient feud be...,11.712900,"warner_bros., lancaster_gate",United States,1995,...,1,0,0,0,0,0,0,0,0,0
3,,"comedy, drama, romance",31357,tt0114885,english,"Cheated on, mistreated and stepped on, the wom...",3.859495,twentieth_century_fox_film_corporation,United States,1995,...,1,0,0,0,0,0,0,0,0,0
4,father of the bride collection,comedy,11862,tt0113041,english,Just when George Banks has recovered from his ...,8.387519,"sandollar_productions, touchstone_pictures",United States,1995,...,1,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
31202,,"horror, mystery, thriller",84419,tt0038621,english,An unsuccessful sculptor saves a madman named ...,0.222814,universal_pictures,United States,1946,...,1,0,0,0,0,0,0,0,0,0
31203,,horror,289923,tt0252966,english,A film archivist revisits the story of Rustin ...,0.386450,"neptune_salad_entertainment, pirie_productions",United States,2000,...,1,0,0,0,0,0,0,0,0,0
31204,,science_fiction,222848,tt0112613,english,It's the year 3000 AD. The world's most danger...,0.661558,concorde-new_horizons,United States,1995,...,1,0,0,0,0,0,0,0,0,0
31205,,"drama, action, romance",30840,tt0102797,english,"Yet another version of the classic epic, with ...",5.683753,"westdeutscher_rundfunk_(wdr), working_title_fi...","Canada, Germany, United Kingdom, United States",1991,...,1,0,0,0,0,0,0,0,0,0


In [13]:
# List every column now present in the frame.
data.columns

Index(['belongs_to_collection', 'genres', 'tmdb_id', 'imdb_id',
       'original_language', 'overview', 'popularity', 'production_companies',
       'production_countries', 'release_year', 'runtime', 'title',
       'vote_average', 'genre_action', 'genre_adventure', 'genre_animation',
       'genre_comedy', 'genre_crime', 'genre_documentary', 'genre_drama',
       'genre_family', 'genre_fantasy', 'genre_foreign', 'genre_history',
       'genre_horror', 'genre_music', 'genre_mystery', 'genre_romance',
       'genre_science_fiction', 'genre_thriller', 'genre_war', 'genre_western',
       'popularity_class', 'language_chinese', 'language_english',
       'language_french', 'language_german', 'language_hindi',
       'language_italian', 'language_japanese', 'language_korean',
       'language_portuguese', 'language_russian', 'language_spanish'],
      dtype='object')

# Separação do dataset e salvamento dos dados
- Dataset FULL: Contém todas as colunas trabalhadas
- Dataset METADATA: Contém as colunas com metadados como gêneros, companhias, resumo
- Dataset MODEL: Contém as colunas que serão usadas na modelagem para agrupar os filmes semelhantes

In [18]:
# Descriptive columns kept in the METADATA subset.
metadata_columns = [
    'genres', 'release_year', 'runtime', 'vote_average', 'overview',
    'title', 'original_language', 'popularity', 'production_countries',
    'production_companies', 'belongs_to_collection',
    'imdb_id', 'tmdb_id',
]

# Indicator columns produced by the genre and language one-hot encodings.
genre_columns = [
    'genre_action', 'genre_adventure', 'genre_animation', 'genre_comedy',
    'genre_crime', 'genre_documentary', 'genre_drama', 'genre_family',
    'genre_fantasy', 'genre_foreign', 'genre_history', 'genre_horror',
    'genre_music', 'genre_mystery', 'genre_romance', 'genre_science_fiction',
    'genre_thriller', 'genre_war', 'genre_western',
]
language_columns = [
    'language_chinese', 'language_english', 'language_french',
    'language_german', 'language_hindi', 'language_italian',
    'language_japanese', 'language_korean', 'language_portuguese',
    'language_russian', 'language_spanish',
]

# Columns kept in the MODEL subset: year, runtime, the id, and all indicators.
model_columns = ['release_year', 'runtime', 'tmdb_id'] + genre_columns + language_columns

data_metadata = data[metadata_columns]

data_model = data[model_columns]

In [19]:
# Write the full frame and its two column subsets to CSV.
# NOTE(review): to_csv keeps its default index=True, so each file gets a
# leading unnamed index column — confirm downstream readers expect that.
csv_exports = (
    (data, "datasets/movies_dataset_full.csv"),
    (data_metadata, "datasets/movies_dataset_metadata.csv"),
    (data_model, "datasets/movies_dataset_model.csv"),
)
for frame, csv_path in csv_exports:
    frame.to_csv(csv_path)