In [1]:
import os
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
from sklearn.preprocessing import LabelEncoder
from pandas_profiling import ProfileReport # profile report を作る用

In [2]:
INPUT_DIR = 'atmacup10_dataset'
OUTPUT_DIR = 'outputs'

os.makedirs(OUTPUT_DIR, exist_ok=True)

train_df = pd.read_csv(os.path.join(INPUT_DIR, 'train.csv'))
test_df = pd.read_csv(os.path.join(INPUT_DIR, 'test.csv'))
color_df = pd.read_csv(os.path.join(INPUT_DIR, "color.csv"))
historical_person_df = pd.read_csv(os.path.join(INPUT_DIR, "historical_person.csv"))
maker_df = pd.read_csv(os.path.join(INPUT_DIR, "maker.csv"))
material_df = pd.read_csv(os.path.join(INPUT_DIR, "material.csv"))
object_collection_df = pd.read_csv(os.path.join(INPUT_DIR, "object_collection.csv"))
palette_df = pd.read_csv(os.path.join(INPUT_DIR, "palette.csv"))
principal_maker_df = pd.read_csv(os.path.join(INPUT_DIR, "principal_maker.csv"))
principal_maker_occupation_df = pd.read_csv(os.path.join(INPUT_DIR, "principal_maker_occupation.csv"))
production_place_df = pd.read_csv(os.path.join(INPUT_DIR, "production_place.csv"))
technique_df = pd.read_csv(os.path.join(INPUT_DIR, "technique.csv"))

In [4]:
type(train_df['object_id'][0])

str

In [9]:
main_makers_list = pd.DataFrame(train_df['principal_maker'].value_counts())  # anonymous == unknown
main_makers_list.rename(columns={'principal_maker': 'main_object_count'}, inplace=True)
main_makers_list['name'] = main_makers_list.index
main_makers_list.replace(1692, np.nan, inplace=True)

sub_makers_list = pd.DataFrame(train_df['principal_or_first_maker'].value_counts())  # anonymous == unknown
sub_makers_list.rename(columns={'principal_or_first_maker': 'sub_object_count'}, inplace=True)
sub_makers_list['name'] = sub_makers_list.index
sub_makers_list.replace(1650, np.nan, inplace=True)

train_df = pd.merge(train_df, main_makers_list, left_on='principal_maker', right_on='name', how='left')
train_df = pd.merge(train_df, sub_makers_list, left_on='principal_or_first_maker', right_on='name', how='left')

In [10]:
# train_df.query("likes < 20")[["desc_len", "likes"]]
train_df['desc_len'] = train_df['description'].str.len()

In [11]:
from fasttext import load_model

model = load_model("bin/lid.176.bin")

train_df["title_lang_ft"] = train_df["title"].fillna("").map(lambda x: model.predict(x.replace("\n", ""))[0][0])



In [12]:
from PIL import ImageColor
import colorsys

In [7]:
tmp = pd.DataFrame(color_df['hex'].str.strip().map(ImageColor.getrgb).values.tolist(), columns=['R', 'G', 'B'])
color_df_add_rgb = pd.concat([color_df, tmp], axis=1)

function = lambda x: colorsys.rgb_to_hsv(x[3], x[4], x[5])
tmp = color_df_add_rgb.apply(function, axis=1)
tmp = pd.DataFrame(list(tmp), columns=['H', 'S', 'V'])
color_df_rgb_hsv = pd.concat([color_df_add_rgb, tmp], axis=1)

train_df = pd.merge(train_df, color_df_rgb_hsv, on='object_id')

直接は不可  
color_df_add_rgb = pd.concat([color_df, pd.DataFrame(color_df['hex'].str.strip().map(ImageColor.getrgb).values.tolist(), columns=['R', 'G', 'B'])], axis=1)

In [13]:
palette_df['percentage'] = palette_df['ratio'] * 100

function = lambda x: colorsys.rgb_to_hsv(x[1], x[2], x[3])
tmp = palette_df.apply(function, axis=1)
tmp = pd.DataFrame(list(tmp), columns=['color_H', 'color_S', 'color_V'])
palette_df_add_hsv = pd.concat([palette_df, tmp], axis=1)

train_df = pd.merge(train_df, palette_df_add_hsv, on='object_id')

In [14]:
from geopy.geocoders import Nominatim
import numpy as np

def place2country(address):
    geolocator = Nominatim(user_agent='sample', timeout=200)
    loc = geolocator.geocode(address, language='en')
    coordinates = (loc.latitude, loc.longitude)
    location = geolocator.reverse(coordinates, language='en')
    country = location.raw['address']['country']
    return country

In [15]:
from tqdm import tqdm_notebook as tqdm

place_list = production_place_df['name'].unique()
country_dict = {}
for place in tqdm(place_list):
    try:
        country = place2country(place)
        country_dict[place] = country
    except:
        # 国名を取得できない場合はnan
        country_dict[place] = np.nan

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  """


  0%|          | 0/144 [00:00<?, ?it/s]

In [16]:
production_place_df['country_name'] = production_place_df['name'].map(country_dict)

train_df = pd.merge(train_df, production_place_df, on='object_id')

In [17]:
train_df = train_df[['object_id', 'art_series_id', 'dating_sorting_date', 'dating_year_early', 'dating_year_late', 'likes', 'main_object_count', 'sub_object_count',
                    'desc_len', 'title_lang_ft', 'ratio', 'color_r', 'color_g', 'color_b', 'percentage', 'color_H', 'color_S', 'color_V', 'name']].drop_duplicates(subset=['object_id', 'art_series_id', 'dating_sorting_date', 'dating_year_early', 'dating_year_late', 'likes', 'main_object_count', 'sub_object_count',
                    'desc_len', 'title_lang_ft', 'ratio', 'color_r', 'color_g', 'color_b', 'percentage', 'color_H', 'color_S', 'color_V', 'name'])

In [18]:
train_df = pd.get_dummies(train_df, columns=['name'], drop_first=True)
train_df = pd.get_dummies(train_df, columns=['title_lang_ft'], drop_first=True)

In [3]:
function = lambda x: int(x)
train_df['object_id'] = train_df['object_id'].apply(function)

ValueError: invalid literal for int() with base 10: '0011d6be41612ec9eae3'

In [23]:
train_df.head(3).T # len = 12026

Unnamed: 0,0,1,2
object_id,<map object at 0x0000021AC7FC7DA0>,<map object at 0x0000021AC7FC7DA0>,<map object at 0x0000021AC7FC7DA0>
art_series_id,95c14fb11c54281ad7e0,95c14fb11c54281ad7e0,95c14fb11c54281ad7e0
dating_sorting_date,1900.0,1900.0,1900.0
dating_year_early,1900.0,1900.0,1900.0
dating_year_late,1930.0,1930.0,1930.0
...,...,...,...
title_lang_ft___label__sk,0,0,0
title_lang_ft___label__sl,0,0,0
title_lang_ft___label__sq,0,0,0
title_lang_ft___label__sv,0,0,0


In [19]:
len(train_df)

194170