Data source: https://www.kaggle.com/c/expedia-hotel-recommendations/data

In [2]:
import pandas as pd
train = pd.read_csv("/Users/leosu/Desktop/BD_Final Project/expedia-hotel-recommendations/train.csv")

In [3]:
destinations = pd.read_csv("/Users/leosu/Desktop/BD_Final Project/expedia-hotel-recommendations/destinations.csv")
test = pd.read_csv("/Users/leosu/Desktop/BD_Final Project/expedia-hotel-recommendations/test.csv")

In [4]:
test_id = test["id"] 

In [5]:
test = test.drop(columns="id")

In [6]:
train["date_time"] = pd.to_datetime(train["date_time"])
train["year"] = train["date_time"].dt.year
train["month"] = train["date_time"].dt.month

In [7]:
test["date_time"] = pd.to_datetime(test["date_time"])
test["year"] = test["date_time"].dt.year
test["month"] = test["date_time"].dt.month

In [8]:
test['is_booking']=1 #在 test 新增加 is_booking 這一行，並且都設為1

In [9]:
import random

unique_user_id = set(train.user_id.unique())
random.seed(50)
sel_user_id = random.sample(unique_user_id,10000) #選10000個不同的user_id
sel_train = train[train.user_id.isin(sel_user_id)]

In [10]:
# t1 = sel_train[((sel_train.year == 2013) | ((sel_train.year == 2014) & (sel_train.month < 8)))]
# t2 = sel_train[((sel_train.year == 2014) & (sel_train.month >= 8))]

In [11]:
# t1.shape

In [12]:
# t2 = t2[t2.is_booking == True]

## PCA (destination)

In [13]:
from sklearn.decomposition import PCA

pca = PCA(n_components=3) #只留下三個主成份
dest_small = pca.fit_transform(destinations[["d{0}".format(i + 1) for i in range(149)]])
dest_small = pd.DataFrame(dest_small)
dest_small["srch_destination_id"] = destinations["srch_destination_id"]

In [14]:
def calc_fast_features(df):
    df["date_time"] = pd.to_datetime(df["date_time"])
    df["srch_ci"] = pd.to_datetime(df["srch_ci"], format='%Y-%m-%d', errors="coerce")
    df["srch_co"] = pd.to_datetime(df["srch_co"], format='%Y-%m-%d', errors="coerce")

    props = {}
    for prop in ["month", "day", "hour", "minute", "dayofweek", "quarter"]:
        props[prop] = getattr(df["date_time"].dt, prop)

    carryover = [p for p in df.columns if p not in ["date_time", "srch_ci", "srch_co"]]
    for prop in carryover:
        props[prop] = df[prop]

    date_props = ["month", "day", "dayofweek", "quarter"]
    for prop in date_props:
        props["ci_{0}".format(prop)] = getattr(df["srch_ci"].dt, prop)
        props["co_{0}".format(prop)] = getattr(df["srch_co"].dt, prop)
    props["stay_span"] = (df["srch_co"] - df["srch_ci"]).astype('timedelta64[h]')

    ret = pd.DataFrame(props)

    ret = ret.join(dest_small, on="srch_destination_id", how='left', rsuffix="dest")
    ret = ret.drop("srch_destination_iddest", axis=1)
    return ret

In [15]:
df_train = calc_fast_features(sel_train)
df_train.fillna(-1, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  after removing the cwd from sys.path.


In [16]:
df_train = df_train.drop(columns=['cnt']) #把train裡的cnt這個column拿掉
df_train.shape

(320954, 39)

In [17]:
df_test = calc_fast_features(test)
df_test.fillna(-1, inplace=True)

In [18]:
predictors = [c for c in df_train.columns if c not in ["hotel_cluster"]]

## Making predictions based on ......

In [72]:
def make_key(items):
    return "_".join([str(i) for i in items])

match_cols = ['user_location_region']
cluster_cols = match_cols + ['hotel_cluster']
groups = sel_train.groupby(cluster_cols)
top_clusters = {}
for name, group in groups:
    clicks = len(group.is_booking[group.is_booking == False])
    bookings = len(group.is_booking[group.is_booking == True])

    score = bookings + .15 * clicks

    clus_name = make_key(name[:len(match_cols)])
    if clus_name not in top_clusters:
        top_clusters[clus_name] = {}
    top_clusters[clus_name][name[-1]] = score

In [79]:
import operator

cluster_dict = {}
for n in top_clusters:
    tc = top_clusters[n]
    top = [l[0] for l in sorted(tc.items(), key=operator.itemgetter(1), reverse=True)[:5]]
    cluster_dict[n] = top

In [None]:
test_preds = []
for index, row in test.iterrows():
    key = make_key([row[m] for m in match_cols])
    if key in cluster_dict:
        test_preds.append(cluster_dict[key])
    else:
        test_preds.append([])

### Write to file

In [None]:
test = pd.read_csv("/Users/leosu/Desktop/BD_Final Project/expedia-hotel-recommendations/test.csv")

In [None]:
write_p = [" ".join([str(l) for l in p]) for p in test_preds]
write_frame = ["{0},{1}".format(test["id"][i], write_p[i]) for i in range(len(test_preds))]
write_frame = ["id,hotel_cluster"] + write_frame

In [None]:
import os
import time
with open('submission_'+time.strftime("%Y-%m-%d %H%M%S", time.localtime())+'.csv', "w+") as f:
    f.write("\n".join(write_frame))