In [18]:
import pandas as pd
import numpy as np

pd.set_option('display.max_rows', 200)
pd.set_option('display.max_columns', 100)

In [19]:
df_train = pd.read_csv("train_columns_remove.csv", engine='python', header=None)
df_test = pd.read_csv("test_columns_remove.csv", engine='python', header=None)


데이터셋의 칼럼리스트는 csv에서 바로 가져왔습니다 ㅎㅅㅎ;

In [20]:
# train과 test 간의 데이터프레임 columns 수 맞춰주기 위해 일단 공갈 revenue를 df_test에 넣어줍니다

df_test.insert(len(df_test.columns), 'revenue', np.nan)

In [21]:
column_list = """id,belongs_to_collection,budget,genres,homepage,imdb_id,original_language,original_title,overview,popularity,poster_path,production_companies,production_countries,release_date,runtime,spoken_languages,status,tagline,title,Keywords,cast,crew,revenue""".split(",")



In [22]:
df_train.columns = column_list
df_test.columns = column_list

쓸모 없는 피쳐 제거

In [23]:
df_train = df_train.drop(['id', 'belongs_to_collection', 'homepage', 'imdb_id', 'original_language', 'original_title', 'overview', 'poster_path', 'production_companies', 'release_date', 'runtime', 'status', 'tagline', 'Keywords', 'crew'], axis=1)
df_test = df_test.drop(['id', 'belongs_to_collection', 'homepage', 'imdb_id', 'original_language', 'original_title', 'overview', 'poster_path', 'production_companies', 'release_date', 'runtime', 'status', 'tagline', 'Keywords', 'crew'], axis=1)


In [24]:
# Test 데이터 null이 있는 행은 제거

df_train = df_train.loc[~df_train['genres'].isna()]
df_train = df_train.loc[~df_train['production_countries'].isna()]
df_train = df_train.loc[~df_train['spoken_languages'].isna()]
df_train = df_train.loc[~df_train['cast'].isna()]



# Test 데이터 null이 있는 행은 제거

df_test = df_test.loc[~df_test['genres'].isna()]
df_test = df_test.loc[~df_test['production_countries'].isna()]
df_test = df_test.loc[~df_test['spoken_languages'].isna()]
df_test = df_test.loc[~df_test['cast'].isna()]



In [25]:
# null 제거 뒤 reindex

df_train = df_train.reset_index().drop('index', axis = 1)
df_test = df_test.reset_index().drop('index', axis = 1)

In [26]:
# 데이터가 모두 스트링 타입으로 기입되어 있다. 고유 데이터 타입으로 전환

import ast

def literal_return(val):
    try:
        return ast.literal_eval(val)
    except (ValueError, SyntaxError) as e:
        return val


df_train['genres'] = df_train['genres'].apply(literal_return)
df_train['production_countries'] = df_train['production_countries'].apply(literal_return)
df_train['spoken_languages'] = df_train['spoken_languages'].apply(literal_return)
df_train['cast'] = df_train['cast'].apply(literal_return)


df_test['genres'] = df_test['genres'].apply(literal_return)
df_test['production_countries'] = df_test['production_countries'].apply(literal_return)
df_test['spoken_languages'] = df_test['spoken_languages'].apply(literal_return)
df_test['cast'] = df_test['cast'].apply(literal_return)


## 피쳐 다루기

### production countries

1. dict 벗겨내서 첫 번재 국가로 변환

2. 북미, 유럽, 기타 국가로 카테고리 지정 (value_counts 를 통해 자주 나오는 국가만 임의로 취합했습니다)

In [28]:
df_train['production_countries'] = pd.Series(item[0]['iso_3166_1'] for item in df_train['production_countries'])

df_test['production_countries'] = pd.Series(item[0]['iso_3166_1'] for item in df_test['production_countries'])


In [29]:
def categrize_country(country):
    if country in (['US', 'CA']):
        return 'north_america'
    elif country in (['GB', 'FR', 'DE', 'RU', 'ES', 'IT']):  # 영 프 독 러 스페인 이탈리아
        return 'europe'
    elif country in (['IN', 'in']):
        return 'india'
    else:
        return 'etc'


# 카테고리로 구분하는 칼럼 만들기

df_train['production_counties_cate'] = df_train['production_countries'].apply(categrize_country)

df_test['production_counties_cate'] = df_test['production_countries'].apply(categrize_country)


## spoken_languages

1. production countries 칼럼과 비슷하지만 다름 (소문자 사용)
2. 절대다수 영어이기 때문에 영어와 그 외로 구분

In [30]:
df_train['spoken_languages'] = pd.Series(item[0]['iso_639_1'] for item in df_train['spoken_languages'])

df_test['spoken_languages'] = pd.Series(item[0]['iso_639_1'] for item in df_test['spoken_languages'])



def categrize_language(lan):
    if lan == 'en':
        return 'en'
    else:
        return 'etc'



# 카테고리로 구분하는 칼럼 만들기

df_train['spoken_languages_cate'] = df_train['spoken_languages'].apply(categrize_language)


df_test['spoken_languages_cate'] = df_test['spoken_languages'].apply(categrize_language)



## 장르 dict 벗겨내서 인코딩 칼럼으로 만들기

In [32]:
# 장르를 원핫인코딩 형태로 칼럼으로 전환 (더미변수)


for num, temp_list in enumerate(df_train['genres']):
    for genre_dict in temp_list:
        try:
            df_train.insert(0, f"genre_{genre_dict['name']}", np.nan)
            df_train.loc[df_train.index == num, f"genre_{genre_dict['name']}"] = 1
        except:
            pass

        
for num, temp_list in enumerate(df_test['genres']):
    for genre_dict in temp_list:
        try:
            df_test.insert(0, f"genre_{genre_dict['name']}", np.nan)
            df_test.loc[df_test.index == num, f"genre_{genre_dict['name']}"] = 1
        except:
            pass

In [33]:
# 코미디 장르인 1번 영화는 genre_Comedy 칼럼에 1.0 있고 나머진 NaN
# 장르가 여러개면 여러 칼럼에 1.0 표기됨

df_train.head(1)

Unnamed: 0,genre_TV Movie,genre_History,genre_Documentary,genre_Western,genre_War,genre_Fantasy,genre_Animation,genre_Foreign,genre_Mystery,genre_Science Fiction,genre_Crime,genre_Adventure,genre_Music,genre_Horror,genre_Action,genre_Thriller,genre_Romance,genre_Family,genre_Drama,genre_Comedy,budget,genres,popularity,production_countries,spoken_languages,title,cast,revenue,production_counties_cate,spoken_languages_cate,budget_level
0,,,,,,,,,,,,,,,,,,,,1.0,14000000,"[{'id': 35, 'name': 'Comedy'}]",6.575393,US,en,Hot Tub Time Machine 2,"[{'cast_id': 4, 'character': 'Lou', 'credit_id...",12314651,north_america,en,2


## 영화예산 (budget)으로 등급 만들기
-> 회귀에 사용해도 되지만, 캐스팅 배우를 등급 나누기 위해 만들었습니다

In [31]:
# 영화 예산을 자체적으로 3등분
# 상위 25% 는 3점 ... 이런 식으로

def budget_level(budget):
    budget_list = df_train['budget'].tolist()
    q1, q3 = np.percentile(budget_list, [25, 75])
    if budget > q3:
        return 3
    elif q1 < budget <= q3:
        return 2
    else:
        return 1

df_train['budget_level'] = df_train['budget'].apply(budget_level)

df_test['budget_level'] = df_test['budget'].apply(budget_level)


## 출연 배우 (cast) 전처리

In [35]:
# 캐스팅 배우 전처리 방식 1 - 일단 규모로 파악하기

df_train['casting_size'] = pd.Series(len(temp_list) for temp_list in df_train['cast'])

df_test['casting_size'] = pd.Series(len(temp_list) for temp_list in df_test['cast'])


캐스팅 배우 전처리 방식 2 - 비싼 영화에 출연했는지 여부로 배우 등급 먹이기


step 1. cast의 dict를 벗겨내서 주요 출연자만 5명 가량 뽑히도록 리스트를 만들어줍니다 (def main_cast)

step 2. budget level에 따라 비싼 영화에 출연한 배우 리스트, 중급 영화에 출연한 배우 리스트 등을 만듭니다


step 3. 주요 출연진이 어느 등급에 속했는지에 따라 점수를 매겨 합해 채점합니다



In [37]:

# step 1 메인 출연진 뽑아내기

def main_cast(casts):

    temp_main_cast_list = []
    
    if 5 < len(casts) < 15:
        for order in range(int(len(casts)*0.5)):
            temp_main_cast_list.append(casts[order]['name'])
    elif 15 < len(casts) < 20:
        for order in range(int(len(casts)*0.2)):
            temp_main_cast_list.append(casts[order]['name'])
    elif 20 < len(casts):
        for order in range(int(len(casts)*0.2)):
            temp_main_cast_list.append(casts[order]['name'])
    else:
        for order in range(len(casts)):
            temp_main_cast_list.append(casts[order]['name'])
    return temp_main_cast_list

df_train['main_cast'] =  df_train['cast'].apply(main_cast)

df_test['main_cast'] =  df_test['cast'].apply(main_cast)




# step 2 배우 등급 나누기

high_level_actor = []
for actor_list in df_train.loc[df_train['budget_level'] == 3, 'main_cast']:
    for actor in actor_list:
        high_level_actor.append(actor)
        
mid_level_actor = []
for actor_list in df_train.loc[df_train['budget_level'] == 2, 'main_cast']:
    for actor in actor_list:
        high_level_actor.append(actor)

low_level_actor = []
for actor_list in df_train.loc[df_train['budget_level'] == 1, 'main_cast']:
    for actor in actor_list:
        high_level_actor.append(actor)


        

In [38]:
# step 3 총점 합하기

def train_cast_point(main_cast_list):
    final_score = []
    if len(main_cast_list) == 0:
        return 0
    else:
        for actor in main_cast_list:
            if actor in high_level_actor:
                final_score.append(3)
            elif actor in mid_level_actor:
                final_score.append(2)
            else:
                final_score.append(1)

        return sum(final_score)

df_train['cast_score'] = df_train['main_cast'].apply(train_cast_point)

In [40]:
# df_test에서도 마찬가지로 진행
high_level_actor = []
for actor_list in df_test.loc[df_test['budget_level'] == 3, 'main_cast']:
    for actor in actor_list:
        high_level_actor.append(actor)
        
mid_level_actor = []
for actor_list in df_test.loc[df_test['budget_level'] == 2, 'main_cast']:
    for actor in actor_list:
        high_level_actor.append(actor)

low_level_actor = []
for actor_list in df_test.loc[df_test['budget_level'] == 1, 'main_cast']:
    for actor in actor_list:
        high_level_actor.append(actor)

def test_cast_point(main_cast_list):
    final_score = []
    if len(main_cast_list) == 0:
        return 0
    else:
        for actor in main_cast_list:
            if actor in high_level_actor:
                final_score.append(3)
            elif actor in mid_level_actor:
                final_score.append(2)
            else:
                final_score.append(1)

        return sum(final_score)



df_test['cast_score'] = df_test['main_cast'].apply(test_cast_point)
df_test = df_test.drop(['genres', 'title', 'cast', 'spoken_languages', 'production_countries'], axis = 1)

## 쓸모없어진 칼럼 제거

In [41]:
df_train = df_train.drop(['genres', 'title', 'cast', 'spoken_languages', 'production_countries'], axis = 1)

In [42]:
# revenue 칼럼이 타겟으로 가야하는데 어중간한 위치에 속해있어서 마지막 칼럼으로 옮겨주는 작업

df_train.columns[22]

'revenue'

In [43]:
cols = df_train.columns.tolist()
cols = cols[:21] + cols[23:] + [cols[22]]
df_train = df_train[cols]

In [44]:
# 공갈로 만들어둔 df_test의 revenue도 이제 제거

df_test = df_test.drop('revenue', axis=1)

In [45]:
df_train.head()

Unnamed: 0,genre_TV Movie,genre_History,genre_Documentary,genre_Western,genre_War,genre_Fantasy,genre_Animation,genre_Foreign,genre_Mystery,genre_Science Fiction,genre_Crime,genre_Adventure,genre_Music,genre_Horror,genre_Action,genre_Thriller,genre_Romance,genre_Family,genre_Drama,genre_Comedy,budget,production_counties_cate,spoken_languages_cate,budget_level,casting_size,main_cast,cast_score,revenue
0,,,,,,,,,,,,,,,,,,,,1.0,14000000,north_america,en,2,24,"[Rob Corddry, Craig Robinson, Clark Duke, Adam...",12,12314651
1,,,,,,,,,,,,,,,,,1.0,1.0,1.0,,40000000,north_america,en,3,20,"[Anne Hathaway, Julie Andrews, H√©ctor Elizond...",60,95149435
2,,,,,,,,,,,,,,,,,,,,,3300000,north_america,en,2,51,"[Miles Teller, J.K. Simmons, Melissa Benoist, ...",30,13092000
3,,,,,,,,,,,,,,,,1.0,,,,,1200000,india,en,2,7,"[Vidya Balan, Nawazuddin Siddiqui, Parambrata ...",9,16000000
4,,,,,,,,,,,,,,,1.0,,,,,,0,etc,etc,1,4,"[Kim Kang-woo, Jo Jae-hyeon, Park Si-yeon, Kim...",12,3923970


In [46]:
df_test.head()

Unnamed: 0,genre_Foreign,genre_Western,genre_Crime,genre_Music,genre_Action,genre_Thriller,genre_Documentary,genre_History,genre_Mystery,genre_War,genre_Drama,genre_Romance,genre_Comedy,genre_Science Fiction,genre_Horror,genre_Fantasy,genre_Family,genre_Animation,genre_Adventure,budget,popularity,production_counties_cate,spoken_languages_cate,budget_level,casting_size,main_cast,cast_score
0,,,,,,,,,,,,,,,,1.0,1.0,1.0,1.0,0,3.851534,etc,en,1,7,"[Craig Blair, Emily Bauer, Sarah Natochenny]",9
1,,,,,,,,,,,,,,1.0,1.0,,,,,88000,3.559789,north_america,en,2,10,"[Allison Hayes, William Hudson, Yvette Vickers...",15
2,,,,,,,,,,,,1.0,1.0,,,,,,,0,8.085194,north_america,en,1,9,"[Meg Ryan, Matthew Broderick, Kelly Preston, T...",12
3,,,,,,,,,1.0,1.0,1.0,,,,,,,,,6800000,8.596012,north_america,etc,2,23,"[Lubna Azabal, M√©lissa D√©sormeaux-Poulin, Ma...",12
4,,,,,,,1.0,1.0,,,,,,,,,,,,2000000,3.21768,north_america,en,2,4,"[Dennis Hopper, Peter Bart, Warren Beatty, Car...",12
