In [19]:
import pandas as pd
import dateutil.easter as easter
from sklearn.preprocessing import LabelEncoder
import numpy as np

In [20]:
# Load the training and test sets
train, test = (
    pd.read_csv(csv_path)
    for csv_path in (
        "C:\\Users\\hp\\Desktop\\kaggle_train.csv",
        "C:\\Users\\hp\\Desktop\\kaggle_test.csv",
    )
)

In [21]:
def _last_wednesday_of_june(year):
    """Return the last Wednesday of June of ``year`` as a ``pd.Timestamp``."""
    june_30 = pd.Timestamp(year=year, month=6, day=30)
    # Monday is weekday 0, so Wednesday is 2; step back from June 30 to it.
    return june_30 - pd.Timedelta(days=(june_30.weekday() - 2) % 7)


def _first_sunday_of_november(year):
    """Return the first Sunday of November of ``year`` as a ``pd.Timestamp``."""
    nov_1 = pd.Timestamp(year=year, month=11, day=1)
    # Sunday is weekday 6; step forward from November 1 to it.
    return nov_1 + pd.Timedelta(days=(6 - nov_1.weekday()) % 7)


def _dates_by_year(years, date_for_year):
    """Map every entry of ``years`` to ``date_for_year(year)``.

    The anchor date is computed once per distinct year instead of once per row.
    """
    lookup = {int(y): pd.Timestamp(date_for_year(int(y))) for y in years.unique()}
    return years.map(lookup)


def data_process(
    df,
    gdp_path="C:\\Users\\hp\\Desktop\\GDP_per_capita_2015_to_2019_Finland_Norway_Sweden.csv",
    weather_path="C:\\Users\\hp\\Desktop\\nordics_weather.csv",
    holidays_path="C:\\Users\\hp\\Desktop\\holidays.csv",
):
    """Add calendar, GDP, snow-depth and holiday features to ``df`` in place.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a ``date`` column (anything ``pd.to_datetime`` accepts)
        and a ``country`` column. New columns are written directly into this
        frame: ``year``, ``month``, ``day``, ``weekday``, ``GDP_PC``,
        ``snow_depth``, ``xmas_adjust``, ``easter_adj``,
        ``days_from_wed_jun``, ``days_from_sun_nov`` and ``holiday``.
    gdp_path : str
        CSV with a ``year`` column and one further column per country; the
        column names are looked up with the values of ``df["country"]``.
    weather_path : str
        CSV with ``country``, ``date`` (``%m/%d/%Y``) and ``snow_depth``
        columns.
    holidays_path : str
        CSV with ``country`` and ``date`` columns.

    The three path defaults are the locations that used to be hard-coded, so
    ``data_process(df)`` reads the same files as before.

    Returns
    -------
    None
        ``df`` is modified in place.
    """
    # Parse the date column so the .dt accessors below are available
    df["date"] = pd.to_datetime(df["date"])

    # Year, month, day of month and day of week (Monday=0)
    df["year"] = df["date"].dt.year
    df["month"] = df["date"].dt.month
    df["day"] = df["date"].dt.day
    df["weekday"] = df["date"].dt.weekday

    # GDP per capita: unstacking the year-indexed table yields
    # {(column name, year): value}, which is looked up by (country, year).
    GDP_PC = pd.read_csv(gdp_path, index_col="year")
    GDP_PC_dict = GDP_PC.unstack().to_dict()
    df["GDP_PC"] = df.set_index(["country", "year"]).index.map(GDP_PC_dict)

    # Snow depth, looked up by (country, year, month, day)
    snow = pd.read_csv(weather_path)
    snow["date"] = pd.to_datetime(snow["date"], format="%m/%d/%Y")
    snow["year"] = snow["date"].dt.year
    snow["month"] = snow["date"].dt.month
    snow["day"] = snow["date"].dt.day
    snow_map = snow.set_index(["country", "year", "month", "day"])["snow_depth"].to_dict()
    df["snow_depth"] = df.set_index(["country", "year", "month", "day"]).index.map(snow_map)

    # Christmas effect: days relative to Dec 25 of the same year, clipped to [-20, 6]
    xmas_date = _dates_by_year(df["year"], lambda y: pd.Timestamp(year=y, month=12, day=25))
    df["xmas_adjust"] = (df["date"] - xmas_date).dt.days.clip(lower=-20, upper=6)

    # Easter effect: days relative to Easter Sunday, clipped to [-3, 60];
    # offsets 12..38 are then collapsed into the single value 12.
    easter_date = _dates_by_year(df["year"], easter.easter)
    df["easter_adj"] = (df["date"] - easter_date).dt.days.clip(lower=-3, upper=60)
    df.loc[df["easter_adj"].isin(range(12, 39)), "easter_adj"] = 12

    # Last Wednesday of June, computed from the calendar so that years
    # outside 2015-2019 get a value as well.
    wed_june_date = _dates_by_year(df["year"], _last_wednesday_of_june)
    df["days_from_wed_jun"] = (df["date"] - wed_june_date).dt.days.clip(-5, 5)

    # First Sunday of November (second Sunday is Father's Day)
    sun_nov_date = _dates_by_year(df["year"], _first_sunday_of_november)
    df["days_from_sun_nov"] = (df["date"] - sun_nov_date).dt.days.clip(-1, 9)

    # Country-specific public holidays. The CSV dates are converted to
    # datetimes explicitly: matching a datetime column against raw strings
    # with isin() relies on implicit casting that pandas has deprecated.
    holiday_df = pd.read_csv(holidays_path)
    holiday_df["date"] = pd.to_datetime(holiday_df["date"])

    df["holiday"] = np.zeros(df.shape[0]).astype(int)
    for country in ("Finland", "Sweden", "Norway"):
        in_country = df["country"] == country
        country_dates = holiday_df.loc[holiday_df["country"] == country, "date"]
        # A row is flagged only when the date is a holiday in that row's own country.
        df.loc[in_country, "holiday"] = df.loc[in_country, "date"].isin(country_dates).astype(int)

In [22]:
# Engineer features on both frames; data_process writes the new columns into each frame
for frame in (train, test):
    data_process(frame)

In [23]:
# Split the engineered frames into model inputs and the target column.
response_label = "num_sold"
drop_label = ["row_id", "date"]
train_x = train.drop([response_label, *drop_label], axis=1)
test_x = test.drop([*drop_label], axis=1)
train_y = train[response_label]


for c in ["country", "store", "product"]:
    le = LabelEncoder()

    # Fit the encoder on the training column only and reuse that mapping for
    # the test column. Re-fitting on test would assign codes independently,
    # so the same category could get different integers in train and test.
    # transform() raises ValueError if test contains a label unseen in train.
    train_x[c] = le.fit_transform(train_x[c])
    test_x[c] = le.transform(test_x[c])

In [24]:
# Preview the first five rows of the encoded feature matrix
train_x.head(n=5)

Unnamed: 0,country,store,product,year,month,day,weekday,GDP_PC,snow_depth,xmas_adjust,easter_adj,days_from_wed_jun,days_from_sun_nov,holiday
0,0,0,1,2015,1,1,3,42802,284.545455,-20,-3,-5,-1,1
1,0,0,0,2015,1,1,3,42802,284.545455,-20,-3,-5,-1,1
2,0,0,2,2015,1,1,3,42802,284.545455,-20,-3,-5,-1,1
3,0,1,1,2015,1,1,3,42802,284.545455,-20,-3,-5,-1,1
4,0,1,0,2015,1,1,3,42802,284.545455,-20,-3,-5,-1,1


In [25]:
# Log-transform the target. It is recomputed from the raw column so that
# re-running this cell does not apply the logarithm a second time.
train_y = np.log(train[response_label])