In [1]:
"""
This script transform the data files into a standard format (columns,names,order etc.).
In the feature_engineering script you only generate features and not change format.
It has to be that way so that when we want to generate a feature matrix from the test data we'll need only 
to refomatted it according to the train format and then use the feature engineering script as is.
For example, if we take the 5 most popular genre as onehot columns, they could be different in train and test, therefore,
we'll need to adjust the test according to the train and continue from there.
"""

# TODO: Add external data sources
# Kaggle original data - need to populate it with vote count/avg (https://github.com/jasonliu119/movie_predict)
                        # Integrate it 
                        # check for duplications
                        # fix fields (in all data)

"\nThis script transform the data files into a standard format (columns,names,order etc.).\nIn the feature_engineering script you only generate features and not change format.\nIt has to be that way so that when we want to generate a feature matrix from the test data we'll need only \nto refomatted it according to the train format and then use the feature engineering script as is.\nFor example, if we take the 5 most popular genre as onehot columns, they could be different in train and test, therefore,\nwe'll need to adjust the test according to the train and continue from there.\n"

In [1]:
import pandas as pd
import numpy as np 
import json
import ast
from collections import Counter
# pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)

In [2]:
DATA_FOLDER = "/StudentData/hw1/"

In [3]:
train = pd.read_csv(DATA_FOLDER + "train.tsv",sep="\t")
test = pd.read_csv(DATA_FOLDER + "test.tsv",sep="\t")
test.head()
one_hot_columns = {}

In [4]:
# fixing fields from external source
dataset = pd.concat([train, test], ignore_index=True, sort=True)
train.shape, test.shape, dataset.shape

((5215, 27), (1738, 27), (6953, 27))

In [5]:
def get_values(elem_str,field):
    elem_list = [] if pd.isna(elem_str) else ast.literal_eval(elem_str)
    if isinstance(elem_list, dict):
        elem_list = [elem_list]
    values = []
    for elem in elem_list:
        values.append(elem[field])
    return values


def json_to_onehot(df,col,field,cutoff_cretria):
    column_format = lambda val : col+"_"+str(val)
    possible_values = Counter()
    df[col].apply(lambda elem_str : possible_values.update(get_values(elem_str,field)))
    possible_values = sorted(cutoff_cretria(possible_values))
    one_hot_columns[col] = possible_values
    new_columns = [column_format(val) for val in possible_values]
    for new_col in new_columns:
        df[new_col] = 0
    df[new_columns].replace(np.nan, 0, inplace=True)
    print(col," added ",len(new_columns), " columns")
#     print(col,field,possible_values)
    for i,row in df.iterrows():
        for val in [elem for elem in get_values(row[col],field) if elem in possible_values]:
            df.loc[i,column_format(val)] = 1


# def add_one_hot(df,col,cutoff_cretria):
#     possible_values = Counter(df[col])
#     possible_values = sorted(cutoff_cretria(possible_values))
            
            
def reformat_data(df_input,max_number_of_columns,threshold_cutoff):  
    df = df_input.copy()
    # adding one hot columns only if the number of samples reaches the threshold
    cutoff_cretria = lambda possible_values : [field_value for field_value, num_samples in possible_values.most_common(max_number_of_columns) if num_samples>=threshold_cutoff]
    json_to_onehot(df,"belongs_to_collection","name",cutoff_cretria)
    json_to_onehot(df,"genres","name",cutoff_cretria)
    json_to_onehot(df,"production_companies","id",cutoff_cretria)
    json_to_onehot(df,"production_countries","name",cutoff_cretria)
    json_to_onehot(df,"spoken_languages","iso_639_1",cutoff_cretria)
    json_to_onehot(df,"cast","name",cutoff_cretria)
    json_to_onehot(df,"crew","name",cutoff_cretria)
#     one_hot_lang = pd.get_dummies(df['original_language'],prefix='original_language')
#     df = df.join(one_hot_lang)
#     chosen_columns = ['budget','homepage', 'original_title', 'overview', 'popularity', 'release_date', 'revenue', 'runtime', 'status', 'tagline', 'title', 'video', 'vote_average', 'vote_count']
#     add_columns = df.columns[len(df_input.columns):]
#     df = df[list(chosen_columns) + list(add_columns)]
    return df 

In [6]:
# train_reformated = reformat_data(train,20,5)
# train_reformated.head()
dataset_reformated = reformat_data(dataset,20,5)
dataset_reformated

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  method=method,


belongs_to_collection  added  20  columns
genres  added  19  columns
production_companies  added  20  columns
production_countries  added  20  columns
spoken_languages  added  20  columns
cast  added  20  columns
crew  added  20  columns


Unnamed: 0,Keywords,backdrop_path,belongs_to_collection,budget,cast,crew,genres,homepage,id,imdb_id,original_language,original_title,overview,popularity,poster_path,production_companies,production_countries,release_date,revenue,runtime,spoken_languages,status,tagline,title,video,vote_average,vote_count,belongs_to_collection_A Nightmare on Elm Street Collection,belongs_to_collection_Child's Play Collection,belongs_to_collection_Die Hard Collection,belongs_to_collection_Friday the 13th Collection,belongs_to_collection_Halloween Collection,belongs_to_collection_Harry Potter Collection,belongs_to_collection_Ice Age Collection,belongs_to_collection_James Bond Collection,belongs_to_collection_One Piece Collection,belongs_to_collection_Paranormal Activity Collection,belongs_to_collection_Pirates of the Caribbean Collection,belongs_to_collection_Pokémon Collection,belongs_to_collection_Police Academy Collection,belongs_to_collection_Rocky Collection,belongs_to_collection_Saw Collection,belongs_to_collection_Star Trek: The Original Series Collection,belongs_to_collection_The Exorcist Collection,belongs_to_collection_The Muppet Collection,belongs_to_collection_The Pink Panther (Original) Collection,belongs_to_collection_The Terminator Collection,genres_Action,genres_Adventure,genres_Animation,genres_Comedy,genres_Crime,genres_Documentary,genres_Drama,genres_Family,genres_Fantasy,genres_History,genres_Horror,genres_Music,genres_Mystery,genres_Romance,genres_Science Fiction,genres_TV Movie,genres_Thriller,genres_War,genres_Western,production_companies_2,production_companies_4,production_companies_5,production_companies_7,production_companies_12,production_companies_14,production_companies_21,production_companies_25,production_companies_33,production_companies_34,production_companies_60,production_companies_79,production_companies_104,production_companies_174,production_companies_508,production_companies_559,production_companies_694,production_companies_1632,production_companies_7295,production_companies_9195,production_countries_Australia,production_countries_Belgium,production_countries_Canada,production_countries_China,production_countries_Denmark,production_countries_France,production_countries_Germany,production_countries_Hong Kong,production_countries_India,production_countries_Ireland,production_countries_Italy,production_countries_Japan,production_countries_Mexico,production_countries_Netherlands,production_countries_Russia,production_countries_South Korea,production_countries_Spain,production_countries_Sweden,production_countries_United Kingdom,production_countries_United States of America,spoken_languages_ar,spoken_languages_cn,spoken_languages_cs,spoken_languages_de,spoken_languages_en,spoken_languages_es,spoken_languages_fr,spoken_languages_he,spoken_languages_hi,spoken_languages_hu,spoken_languages_it,spoken_languages_ja,spoken_languages_ko,spoken_languages_la,spoken_languages_pl,spoken_languages_pt,spoken_languages_ru,spoken_languages_sv,spoken_languages_tr,spoken_languages_zh,cast_Alec Baldwin,cast_Bruce Willis,cast_Dennis Quaid,cast_Frank Welker,cast_J.K. Simmons,cast_John Goodman,cast_John Leguizamo,cast_Johnny Depp,cast_Keith David,cast_Liam Neeson,cast_Matt Damon,cast_Morgan Freeman,cast_Nicolas Cage,cast_Richard Jenkins,cast_Robert De Niro,cast_Robin Williams,cast_Samuel L. Jackson,cast_Steve Buscemi,cast_Sylvester Stallone,cast_Willem Dafoe,crew_Alan Silvestri,crew_Avy Kaufman,crew_Barbara Harris,crew_Bob Weinstein,crew_Dan O'Connell,crew_Dan Perri,crew_Deborah Aquila,crew_Gary Burritt,crew_Hans Zimmer,crew_Harvey Weinstein,crew_James Horner,crew_James Newton Howard,crew_John T. Cucci,crew_Kerry Barden,crew_Mary Vernieu,crew_Mo Henry,crew_Nerses Gezalyan,crew_Robert Rodriguez,crew_Steven Spielberg,crew_Tricia Wood
0,"[{'id': 697, 'name': 'loss of loved one'}, {'i...",/7IBpOrw0ATwL1AOV97mtsceDpYs.jpg,"{'id': 556, 'name': 'Spider-Man Collection', '...",258000000,"[{'cast_id': 30, 'character': 'Peter Parker / ...","[{'credit_id': '52fe4252c3a36847f80151a5', 'de...","[{'id': 14, 'name': 'Fantasy'}, {'id': 28, 'na...",https://www.sonypictures.com/movies/spiderman3,559,tt0413300,en,Spider-Man 3,The seemingly invincible Spider-Man goes up ag...,22.024,/2N9lhZg6VtVJoGCZDjXVC3a81Ea.jpg,"[{'id': 19551, 'logo_path': '/2WpWp9b108hizjHK...","[{'iso_3166_1': 'US', 'name': 'United States o...",2007-05-01,890871626,139.0,"[{'iso_639_1': 'en', 'name': 'English'}, {'iso...",Released,The battle within.,Spider-Man 3,False,6.2,8180,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,"[{'id': 65, 'name': 'holiday'}, {'id': 1991, '...",/lYeE7k0OR3HXyoq7FeswyaxFJvL.jpg,"{'id': 256296, 'name': 'Silent Night, Deadly N...",250000,"[{'cast_id': 1000, 'character': 'Ricky Caldwel...","[{'credit_id': '52fe47d4c3a36847f814a70f', 'de...","[{'id': 35, 'name': 'Comedy'}, {'id': 27, 'nam...",,50719,tt0093974,en,"Silent Night, Deadly Night Part 2",After being traumatized by his brother Billy's...,4.756,/64ATtm2eMd0yfrTJsHANQ0NmMSn.jpg,"[{'id': 18924, 'logo_path': None, 'name': 'Sil...","[{'iso_3166_1': 'US', 'name': 'United States o...",1987-04-10,154323,88.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,The nightmare is about to begin ... AGAIN!,"Silent Night, Deadly Night Part 2",False,4.3,68,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,"[{'id': 158718, 'name': 'lgbt'}]",/gVXLIfMR2hLmkn0nACctlMCJBfx.jpg,,0,"[{'cast_id': 1, 'character': 'Himself', 'credi...","[{'credit_id': '5980e931c3a3680cfb002d0e', 'de...","[{'id': 99, 'name': 'Documentary'}]",https://www.scottymovie.com/,469062,tt2773246,en,Scotty and the Secret History of Hollywood,A deliciously scandalous portrait of unsung Ho...,4.746,/2uXDNXBoIrRBbfpVrvegoLr8OVt.jpg,"[{'id': 88564, 'logo_path': '/pn3p12IC4Tb0K8re...","[{'iso_3166_1': 'US', 'name': 'United States o...",2018-07-27,176236,98.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,,Scotty and the Secret History of Hollywood,False,6.5,15,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,"[{'id': 1449, 'name': 'underworld'}, {'id': 31...",/rKjE17ncAGNzeImNWbdGTimzjtk.jpg,"{'id': 8917, 'name': 'Hellraiser Collection', ...",1000000,"[{'cast_id': 15, 'character': 'Larry Cotton', ...","[{'credit_id': '533fd9260e0a262b92001027', 'de...","[{'id': 27, 'name': 'Horror'}]",,9003,tt0093177,en,Hellraiser,An unfaithful wife encounters the zombie of he...,13.828,/4nfAhOTlfZUHNorHJXEib7GYFpp.jpg,"[{'id': 1950, 'logo_path': None, 'name': 'New ...","[{'iso_3166_1': 'GB', 'name': 'United Kingdom'}]",1987-09-11,14564027,94.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,He'll tear your soul apart.,Hellraiser,False,6.9,1115,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4,"[{'id': 5493, 'name': 'relatives'}, {'id': 731...",/zcJxJVhvxNHJJ2J7Q7NhgO9nPUV.jpg,"{'id': 108693, 'name': ""National Lampoon's Vac...",15000000,"[{'cast_id': 1, 'character': 'Clark Griswold',...","[{'credit_id': '52fe44039251416c75025f27', 'de...","[{'id': 35, 'name': 'Comedy'}, {'id': 12, 'nam...",,11153,tt0085995,en,National Lampoon's Vacation,Clark Griswold is on a quest to take his famil...,15.070,/ySI88wO1IFyKGWpSKRTSPilE3t2.jpg,"[{'id': 174, 'logo_path': '/IuAlhI9eVC9Z8UQWOI...","[{'iso_3166_1': 'US', 'name': 'United States o...",1983-07-28,61399552,99.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Every summer Chevy Chase takes his family on a...,National Lampoon's Vacation,False,7.1,782,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6948,"[{'id': 331, 'name': 'tattoo'}, {'id': 1794, '...",/mpzIULJhWFVrqylATpcAnGOviYc.jpg,"{'id': 385528, 'name': 'Outrage Collection', '...",0,"[{'cast_id': 4, 'character': ""Mr. Chairman Kan...","[{'credit_id': '52fe46bac3a36847f810efcb', 'de...","[{'id': 28, 'name': 'Action'}, {'id': 80, 'nam...",http://www.magnetreleasing.com/outrage/,45284,tt1462667,ja,アウトレイジ,"The story begins with Sekiuchi, boss of the Sa...",5.968,/gNW7mTbXYF11YNIgpqwcTYxzO49.jpg,"[{'id': 6516, 'logo_path': None, 'name': 'Toky...","[{'iso_3166_1': 'JP', 'name': 'Japan'}]",2010-05-17,8428636,109.0,"[{'iso_639_1': 'ja', 'name': '日本語'}]",Released,,Outrage,False,6.9,170,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
6949,"[{'id': 833, 'name': 'the white house'}, {'id'...",/dyVFl6I0IQEKE3YAbLKwYLGymyp.jpg,,150000000,"[{'cast_id': 9, 'character': 'Cale', 'credit_i...","[{'credit_id': '52fe4baec3a36847f820f3c3', 'de...","[{'id': 28, 'name': 'Action'}, {'id': 18, 'nam...",,117251,tt2334879,en,White House Down,Capitol Policeman John Cale has just been deni...,12.367,/niYdnzkrtBduR5lKtfeLXKXNaTT.jpg,"[{'id': 34981, 'logo_path': None, 'name': 'Iro...","[{'iso_3166_1': 'US', 'name': 'United States o...",2013-06-27,205366737,131.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,It will start like any other day.,White House Down,False,6.4,3123,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
6950,"[{'id': 9799, 'name': 'romantic comedy'}, {'id...",/fiQnsl4edGJUFuliRIxdsLiVPO1.jpg,,0,"[{'cast_id': 1, 'character': 'Peter Plunkett',...","[{'credit_id': '576c87d69251411540001333', 'de...","[{'id': 14, 'name': 'Fantasy'}, {'id': 35, 'na...",,21362,tt0095304,en,High Spirits,When Peter Plunkett's Irish castle turned hote...,9.143,/mX85r9x5biMJGCtmgXWVbnQQdyK.jpg,"[{'id': 927, 'logo_path': None, 'name': 'Palac...","[{'iso_3166_1': 'IE', 'name': 'Ireland'}, {'is...",1988-11-18,7399763,99.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,He's an American. She's a ghost. Vacation roma...,High Spirits,False,5.8,105,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
6951,"[{'id': 509, 'name': 'denmark'}, {'id': 2037, ...",/sWNv1EtnBaboIBfrQQ7exon7zub.jpg,,70000000,"[{'cast_id': 24, 'character': 'Beowulf / Golde...","[{'credit_id': '52fe434cc3a36847f8049c1b', 'de...","[{'id': 12, 'name': 'Adventure'}, {'id': 28, '...",http://www.beowulfmovie.com/,2310,tt0442933,en,Beowulf,"6th-century Scandinavian warrior, Beowulf emba...",14.250,/j90lLkvlcPNL6VVoLGQ0yzPUfJV.jpg,"[{'id': 4, 'logo_path': '/fycMZt242LVjagMByZOL...","[{'iso_3166_1': 'US', 'name': 'United States o...",2007-11-05,195735876,115.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Evil breeds pain.,Beowulf,False,5.7,1694,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [7]:
dataset_reformated.to_csv(DATA_FOLDER + "dataset_reformated.csv",index=False)

In [8]:
# import pickle
# with open(DATA_FOLDER + "one_hot_columns", 'wb') as file:
#     pickle.dump(one_hot_columns, file)

In [9]:
# Counter(dataset["original_language"]).most_common(20)

[('en', 5802),
 ('fr', 208),
 ('hi', 161),
 ('ja', 110),
 ('es', 97),
 ('ru', 89),
 ('it', 78),
 ('ko', 68),
 ('zh', 54),
 ('de', 50),
 ('cn', 47),
 ('ta', 21),
 ('tr', 20),
 ('ml', 18),
 ('pt', 16),
 ('sv', 15),
 ('da', 12),
 ('ar', 9),
 ('te', 8),
 ('fa', 7)]

In [8]:
one_hot_columns

{'belongs_to_collection': ['A Nightmare on Elm Street Collection',
  "Child's Play Collection",
  'Die Hard Collection',
  'Friday the 13th Collection',
  'Halloween Collection',
  'Harry Potter Collection',
  'Ice Age Collection',
  'James Bond Collection',
  'One Piece Collection',
  'Paranormal Activity Collection',
  'Pirates of the Caribbean Collection',
  'Pokémon Collection',
  'Police Academy Collection',
  'Rocky Collection',
  'Saw Collection',
  'Star Trek: The Original Series Collection',
  'The Exorcist Collection',
  'The Muppet Collection',
  'The Pink Panther (Original) Collection',
  'The Terminator Collection'],
 'genres': ['Action',
  'Adventure',
  'Animation',
  'Comedy',
  'Crime',
  'Documentary',
  'Drama',
  'Family',
  'Fantasy',
  'History',
  'Horror',
  'Music',
  'Mystery',
  'Romance',
  'Science Fiction',
  'TV Movie',
  'Thriller',
  'War',
  'Western'],
 'production_companies': [2,
  4,
  5,
  7,
  12,
  14,
  21,
  25,
  33,
  34,
  60,
  79,
  104,
 