In [1]:
"""
This script transforms the data files into a standard format (columns, names, order, etc.).
The feature_engineering script only generates features and does not change the format.
It has to be that way so that when we want to generate a feature matrix from the test data, we only need
to reformat it according to the train format and then use the feature engineering script as is.
For example, if we take the 5 most popular genres as one-hot columns, they could be different in train and test; therefore,
we'll need to adjust the test according to the train and continue from there.
"""

In [1]:
import pandas as pd
import numpy as np 
import json
import ast
from collections import Counter
# Widen pandas' display limits so that wide frames are rendered without column truncation.
# The row limit is left at the pandas default; uncomment the next line to raise it as well.
# pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)

In [2]:
# Folder that holds the input TSV files and receives the reformatted CSV.
# Keep the trailing slash: file paths are built by plain string concatenation.
DATA_FOLDER = "/StudentData/"

In [3]:
# Read both tab-separated splits from DATA_FOLDER, then preview the test split.
train, test = (
    pd.read_csv(DATA_FOLDER + file_name, sep="\t")
    for file_name in ("train.tsv", "test.tsv")
)
test.head()

Unnamed: 0,backdrop_path,belongs_to_collection,budget,genres,homepage,id,imdb_id,original_language,original_title,overview,popularity,poster_path,production_companies,production_countries,release_date,revenue,runtime,spoken_languages,status,tagline,title,video,vote_average,vote_count,Keywords,cast,crew
0,/nNh7vHHISVAaziJEqAq0P9iL52w.jpg,,58000000,"[{'id': 27, 'name': 'Horror'}, {'id': 878, 'na...",http://www.lifemovie.com/,395992,tt5442430,en,Life,The six-member crew of the International Space...,17.409,/h2mhfbEBGABSHo2vXG1ECMKAJa7.jpg,"[{'id': 5, 'logo_path': '/71BqEFAF4V3qjjMPCpLu...","[{'iso_3166_1': 'US', 'name': 'United States o...",2017-03-22,100541806,103.0,"[{'iso_639_1': 'cn', 'name': '广州话 / 廣州話'}, {'i...",Released,Be careful what you search for,Life,False,6.4,4738,"[{'id': 839, 'name': 'planet mars'}, {'id': 48...","[{'cast_id': 0, 'character': 'Dr. David Jordan...","[{'credit_id': '58bd72b0c3a3686630048e38', 'de..."
1,/7dadBR1sXrl1TTzRQYNGUvQinF4.jpg,,0,"[{'id': 35, 'name': 'Comedy'}, {'id': 80, 'nam...",,20832,tt1182908,hi,क्रैजी 4,A psychiatrist spends the majority of his time...,2.257,/8owptcZPbNIqugAHQOzawgRQM5s.jpg,"[{'id': 64779, 'logo_path': None, 'name': 'Fil...","[{'iso_3166_1': 'IN', 'name': 'India'}]",2008-04-11,8000000,110.0,"[{'iso_639_1': 'hi', 'name': 'हिन्दी'}]",Released,,Krazzy 4,False,4.6,19,[],"[{'cast_id': 4, 'character': 'Raja', 'credit_i...","[{'credit_id': '5cab55c7925141565402137b', 'de..."
2,/ijDUMFmb34Fv7jBwxzhNc3lbVP3.jpg,"{'id': 410261, 'name': 'A Goofy Movie Collecti...",0,"[{'id': 10749, 'name': 'Romance'}, {'id': 16, ...",,15789,tt0113198,en,A Goofy Movie,"Though Goofy always means well, his amiable cl...",13.558,/bycmMhO3iIoEDzP768sUjq2RV4T.jpg,"[{'id': 3475, 'logo_path': '/jTPNzDEn7eHmp3nEX...","[{'iso_3166_1': 'FR', 'name': 'France'}, {'iso...",1995-04-07,35348597,78.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,It’s hard to be cool... when your dad’s Goofy.,A Goofy Movie,False,6.9,964,"[{'id': 970, 'name': 'parent child relationshi...","[{'cast_id': 2, 'character': 'Goofy Goof (voic...","[{'credit_id': '591464b9925141583c00cea9', 'de..."
3,/kb3RNnn4CMPRSaGbrX1ejc7lcRN.jpg,,4000000,"[{'id': 18, 'name': 'Drama'}]",,265180,tt2802154,ru,Левиафан,"In a Russian coastal town, Kolya is forced to ...",7.158,/foMdJ8ijYk5G1Jn0HZRplFTu9Dt.jpg,"[{'id': 5630, 'logo_path': '/s0mHCw53fp6EAapR7...","[{'iso_3166_1': 'RU', 'name': 'Russia'}]",2014-09-24,4396821,141.0,"[{'iso_639_1': 'ru', 'name': 'Pусский'}]",Released,,Leviathan,False,7.5,447,"[{'id': 1415, 'name': 'small town'}, {'id': 19...","[{'cast_id': 3, 'character': 'Nikolay', 'credi...","[{'credit_id': '5b9c25890e0a266e12001f38', 'de..."
4,/wEo4UtBAWdB4aOmQ9wRjI1aS7Dt.jpg,,250000,"[{'id': 35, 'name': 'Comedy'}, {'id': 18, 'nam...",,10098,tt0012349,en,The Kid,A tramp cares for a boy after he's abandoned a...,10.523,/drgMcyTsySQBnUPGaBThCHGdlWT.jpg,"[{'id': 3245, 'logo_path': '/9dBTQp9XitrHkx20i...","[{'iso_3166_1': 'US', 'name': 'United States o...",1921-02-06,2500000,68.0,"[{'iso_639_1': 'xx', 'name': 'No Language'}]",Released,6 reels of Joy.,The Kid,False,8.2,1008,"[{'id': 290, 'name': 'angel'}, {'id': 1252, 'n...","[{'cast_id': 10, 'character': 'A Tramp', 'cred...","[{'credit_id': '52fe43269251416c75005611', 'de..."


In [4]:
def get_values(elem_str,field):
    """Extract ``field`` from every record encoded in a stringified cell.

    Parameters
    ----------
    elem_str : str or NaN
        Python-literal text of either a single dict or a list of dicts.
        A missing value (NaN/None) is treated as "no records".
    field : hashable
        Key looked up in each record.

    Returns
    -------
    list
        ``record[field]`` for each record, in order of appearance.
    """
    if pd.isna(elem_str):
        records = []
    else:
        records = ast.literal_eval(elem_str)
    # A lone dict (e.g. a single collection) is handled as a one-record list.
    if isinstance(records, dict):
        records = [records]
    return [record[field] for record in records]


def json_to_onehot(df,col,field,cutoff_cretria):
    """Add 0/1 indicator columns to ``df`` (in place) for selected values of a JSON-like column.

    Each cell of ``df[col]`` is decoded with ``get_values`` into the list of its
    ``field`` values. The occurrences of every value are tallied over the whole
    frame, ``cutoff_cretria`` picks the values that deserve a column, and one
    integer column named ``"<col>_<value>"`` is written per selected value
    (1 when the row contains the value, 0 otherwise).

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to extend; modified in place.
    col : str
        Name of the column holding the stringified dict / list of dicts.
    field : hashable
        Key extracted from each record of the cell.
    cutoff_cretria : callable
        Receives the ``Counter`` of value occurrences and returns the values to
        keep. The returned values must be mutually sortable.

    Returns
    -------
    None
        The new columns are added to ``df``; a summary line is printed.
    """
    def column_format(val):
        return col + "_" + str(val)

    # Decode every cell exactly once; the parsed lists feed both the tally and
    # the indicator columns.
    row_values = [get_values(elem_str, field) for elem_str in df[col]]

    # Tally occurrences. A value repeated inside one row counts each time.
    value_counts = Counter()
    for values in row_values:
        value_counts.update(values)

    selected_values = sorted(cutoff_cretria(value_counts))
    new_columns = [column_format(val) for val in selected_values]
    print(col," added ",len(new_columns), " columns")

    # Build each indicator as a whole column. Assignment is positional, so it
    # does not depend on the index labels being unique, and no temporary copy
    # of the frame is mutated.
    row_sets = [set(values) for values in row_values]
    for val, new_col in zip(selected_values, new_columns):
        df[new_col] = [int(val in present) for present in row_sets]

def reformat_data(df_input,max_number_of_columns,threshold_cutoff):
    """Return a reformatted copy of ``df_input`` with one-hot columns appended.

    The JSON-like columns are expanded with ``json_to_onehot`` and
    ``original_language`` with ``pd.get_dummies``. The result keeps a fixed
    list of scalar columns followed by every newly created indicator column.
    The one-hot vocabularies are computed from ``df_input`` itself.

    Parameters
    ----------
    df_input : pandas.DataFrame
        Raw frame; it is copied, never modified.
    max_number_of_columns : int
        Upper bound on the number of indicator columns created per JSON-like column.
    threshold_cutoff : int
        Minimum number of occurrences a value needs in order to get a column.

    Returns
    -------
    pandas.DataFrame
        The chosen scalar columns followed by the added indicator columns.
    """
    df = df_input.copy()

    def cutoff_cretria(possible_values):
        # adding one hot columns only if the number of samples reaches the threshold
        return [
            field_value
            for field_value, num_samples in possible_values.most_common(max_number_of_columns)
            if num_samples >= threshold_cutoff
        ]

    # (column, key extracted from each record) pairs, expanded in this order.
    json_columns = [
        ("belongs_to_collection", "name"),
        ("genres", "name"),
        ("production_companies", "id"),
        ("production_countries", "name"),
        ("spoken_languages", "iso_639_1"),
        ("cast", "name"),
        ("crew", "name"),
    ]
    for col, field in json_columns:
        json_to_onehot(df, col, field, cutoff_cretria)

    # dtype=int keeps these indicators 0/1 like the ones created above; newer
    # pandas versions would otherwise produce boolean columns here.
    one_hot_lang = pd.get_dummies(df['original_language'],prefix='original_language',dtype=int)
    df = df.join(one_hot_lang)

    chosen_columns = ['budget','homepage', 'original_title', 'overview', 'popularity', 'release_date', 'revenue', 'runtime', 'status', 'tagline', 'title', 'video', 'vote_average', 'vote_count']
    # Everything positioned after the original columns was appended by the steps above.
    add_columns = df.columns[len(df_input.columns):]
    return df[chosen_columns + list(add_columns)]

In [5]:
# At most 20 one-hot columns per field, each backed by at least 5 occurrences.
train_reformated = reformat_data(
    train,
    max_number_of_columns=20,
    threshold_cutoff=5,
)
train_reformated.head()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  method=method,


belongs_to_collection  added  9  columns
genres  added  19  columns
production_companies  added  20  columns
production_countries  added  20  columns
spoken_languages  added  20  columns
cast  added  20  columns
crew  added  20  columns


Unnamed: 0,budget,homepage,original_title,overview,popularity,release_date,revenue,runtime,status,tagline,title,video,vote_average,vote_count,belongs_to_collection_A Nightmare on Elm Street Collection,belongs_to_collection_Friday the 13th Collection,belongs_to_collection_Halloween Collection,belongs_to_collection_James Bond Collection,belongs_to_collection_One Piece Collection,belongs_to_collection_Police Academy Collection,belongs_to_collection_Saw Collection,belongs_to_collection_The Pink Panther (Original) Collection,belongs_to_collection_The Terminator Collection,genres_Action,genres_Adventure,genres_Animation,genres_Comedy,genres_Crime,genres_Documentary,genres_Drama,genres_Family,genres_Fantasy,genres_History,genres_Horror,genres_Music,genres_Mystery,genres_Romance,genres_Science Fiction,genres_TV Movie,genres_Thriller,genres_War,genres_Western,production_companies_2,production_companies_4,production_companies_5,production_companies_7,production_companies_12,production_companies_14,production_companies_21,production_companies_25,production_companies_33,production_companies_34,production_companies_60,production_companies_79,production_companies_104,production_companies_174,production_companies_559,production_companies_694,production_companies_1632,production_companies_7295,production_companies_9195,production_companies_10163,production_countries_Australia,production_countries_Belgium,production_countries_Canada,production_countries_China,production_countries_Denmark,production_countries_France,production_countries_Germany,production_countries_Hong Kong,production_countries_India,production_countries_Ireland,production_countries_Italy,production_countries_Japan,production_countries_Mexico,production_countries_Netherlands,production_countries_Russia,production_countries_South Korea,production_countries_Spain,production_countries_Sweden,production_countries_United Kingdom,production_countries_United States of 
America,spoken_languages_ar,spoken_languages_cn,spoken_languages_cs,spoken_languages_de,spoken_languages_en,spoken_languages_es,spoken_languages_fr,spoken_languages_he,spoken_languages_hi,spoken_languages_hu,spoken_languages_it,spoken_languages_ja,spoken_languages_ko,spoken_languages_la,spoken_languages_pl,spoken_languages_pt,spoken_languages_ru,spoken_languages_sv,spoken_languages_tr,spoken_languages_zh,cast_Alec Baldwin,cast_Brad Pitt,cast_Bruce Willis,cast_Dennis Quaid,cast_Frank Welker,cast_J.K. Simmons,cast_John Goodman,cast_John Leguizamo,cast_Keith David,cast_Liam Neeson,cast_Matt Damon,cast_Morgan Freeman,cast_Nicolas Cage,cast_Richard Jenkins,cast_Robert De Niro,cast_Robin Williams,cast_Samuel L. Jackson,cast_Steve Buscemi,cast_Tommy Lee Jones,cast_Willem Dafoe,crew_Avy Kaufman,crew_Barbara Harris,crew_Bob Weinstein,crew_Clint Eastwood,crew_Dan O'Connell,crew_Dan Perri,crew_Deborah Aquila,crew_Gary Burritt,crew_Hans Zimmer,crew_Harvey Weinstein,crew_James Horner,crew_James Newton Howard,crew_John T. 
Cucci,crew_Kerry Barden,crew_Luc Besson,crew_Mary Vernieu,crew_Mo Henry,crew_Nerses Gezalyan,crew_Robert Rodriguez,crew_Steven Spielberg,original_language_ab,original_language_af,original_language_ar,original_language_bn,original_language_cn,original_language_cs,original_language_da,original_language_de,original_language_el,original_language_en,original_language_es,original_language_fa,original_language_fi,original_language_fr,original_language_he,original_language_hi,original_language_hu,original_language_id,original_language_it,original_language_ja,original_language_ka,original_language_ko,original_language_mi,original_language_ml,original_language_nb,original_language_nl,original_language_no,original_language_pl,original_language_pt,original_language_ro,original_language_ru,original_language_sk,original_language_sl,original_language_sv,original_language_sw,original_language_ta,original_language_te,original_language_th,original_language_tl,original_language_tr,original_language_vi,original_language_xx,original_language_zh
0,258000000,https://www.sonypictures.com/movies/spiderman3,Spider-Man 3,The seemingly invincible Spider-Man goes up ag...,22.024,2007-05-01,890871626,139.0,Released,The battle within.,Spider-Man 3,False,6.2,8180,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,250000,,"Silent Night, Deadly Night Part 2",After being traumatized by his brother Billy's...,4.756,1987-04-10,154323,88.0,Released,The nightmare is about to begin ... AGAIN!,"Silent Night, Deadly Night Part 2",False,4.3,68,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,0,https://www.scottymovie.com/,Scotty and the Secret History of Hollywood,A deliciously scandalous portrait of unsung Ho...,4.746,2018-07-27,176236,98.0,Released,,Scotty and the Secret History of Hollywood,False,6.5,15,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,1000000,,Hellraiser,An unfaithful wife encounters the zombie of he...,13.828,1987-09-11,14564027,94.0,Released,He'll tear your soul apart.,Hellraiser,False,6.9,1115,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4,15000000,,National Lampoon's Vacation,Clark Griswold is on a quest to take his famil...,15.07,1983-07-28,61399552,99.0,Released,Every summer Chevy Chase takes his family on a...,National Lampoon's Vacation,False,7.1,782,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [6]:
# Save the reformatted training frame as CSV in DATA_FOLDER, without the row index.
train_reformated.to_csv(DATA_FOLDER + "train_reformated.csv",index=False)