In [1]:
import pandas as pd
from datetime import datetime

In [2]:
# City whose sub-directory under ./data/ is read and written by this notebook.
# Alternative value: 'beijing'.
CITY = 'shanghai'

In [3]:
# Resolve the city-specific backup CSVs first, then load them as the raw room and host tables.
room_csv_path = './data/{}/room.csv.bak'.format(CITY)
host_csv_path = './data/{}/host.csv.bak'.format(CITY)

room_df = pd.read_csv(room_csv_path)
host_df = pd.read_csv(host_csv_path)

In [4]:
def stats_income(room_df: pd.DataFrame, raw_host_df: pd.DataFrame,
                 snapshot_date: datetime = datetime(2021, 2, 22)):
    """Add listing-count and income-estimate columns to a copy of the host table.

    Income is estimated per listing as ``price * number_of_reviews`` and summed
    per host. The input frames are not modified.

    Args:
        room_df: Listing table with ``host_id``, ``price`` and
            ``number_of_reviews`` columns.
        raw_host_df: Host table with ``host_id`` and ``host_since`` columns.
        snapshot_date: Reference date from which the number of days since
            ``host_since`` is measured. Defaults to 2021-02-22, the date that
            was previously hard-coded.

    Returns:
        A deep copy of ``raw_host_df`` with four added columns:
        ``room_count`` (number of listings of the host), ``income`` (summed
        estimate), ``daily_income`` (income / days since ``host_since``) and
        ``ave_room_income`` (income / (days * room_count)).

    Notes:
        Zero denominators are not special-cased: a host without listings gets
        NaN in ``ave_room_income``, and a missing ``host_since`` yields NaN in
        both per-day columns.
    """
    host_df = raw_host_df.copy(deep=True)

    # Aggregate the listings once instead of re-scanning the room table with a
    # boolean mask for every single host.
    listing_income = pd.DataFrame({
        'host_id': room_df['host_id'].to_numpy(),
        'income': (room_df['price'] * room_df['number_of_reviews']).to_numpy(),
    })
    income_by_host = listing_income.groupby('host_id')['income']
    rooms_per_host = income_by_host.size()
    income_per_host = income_by_host.sum()

    # Hosts that own no listing are absent from the aggregates: count 0, income 0.
    # Cast to float so the column dtypes do not depend on whether such hosts exist.
    room_count = host_df['host_id'].map(rooms_per_host).fillna(0).astype(float)
    income = host_df['host_id'].map(income_per_host).fillna(0).astype(float)

    # If a host_id occurs on several host rows, every one of them uses the
    # host_since of its first occurrence.
    first_since = (host_df.drop_duplicates(subset='host_id')
                   .set_index('host_id')['host_since'])
    # Values are parsed one by one so that differently formatted date strings
    # are each interpreted on their own; the outer call only normalises the dtype.
    host_since = pd.to_datetime(
        host_df['host_id'].map(first_since).map(pd.to_datetime))
    time_span = (pd.Timestamp(snapshot_date) - host_since).dt.days

    host_df['room_count'] = room_count
    host_df['income'] = income
    host_df['daily_income'] = income / time_span
    host_df['ave_room_income'] = income / (time_span * room_count)
    return host_df

In [5]:
def tag_host(raw_host_df: pd.DataFrame, best_room_path=None):
    """Add the ``is_single`` and ``is_best`` flag columns to a copy of the host table.

    Args:
        raw_host_df: Host table with ``host_id`` and ``room_count`` columns.
        best_room_path: Path (or file-like object) of a CSV with a ``host_id``
            column listing the hosts of the best rooms. When omitted, the file
            ``./data/<CITY>/best_room.csv`` of the city selected by the
            notebook-level ``CITY`` constant is used. Earlier versions always
            read ``./data/beijing/best_room.csv``; pass that path explicitly to
            reproduce the old behaviour.

    Returns:
        A deep copy of ``raw_host_df`` with two added string columns holding
        ``'t'`` or ``'f'``:
        ``is_single`` is ``'t'`` when ``room_count`` is below 5, and
        ``is_best`` is ``'t'`` when the host's id appears in the best-room file.
    """
    host_df = raw_host_df.copy(deep=True)
    flag = {True: 't', False: 'f'}

    # NOTE(review): the column is called is_single but the cut-off is 5 rooms;
    # kept as is, confirm the intended threshold.
    host_df['is_single'] = (host_df['room_count'] < 5).map(flag)

    if best_room_path is None:
        # Follow the same city as every other input/output path of the notebook.
        best_room_path = './data/{}/best_room.csv'.format(CITY)
    best_room_df = pd.read_csv(best_room_path)

    # Vectorised membership test instead of scanning a Python list for every host.
    host_df['is_best'] = host_df['host_id'].isin(best_room_df['host_id']).map(flag)
    return host_df

In [6]:
def format_host(raw_host_df: pd.DataFrame, rooms: pd.DataFrame = None):
    """Normalise types and fill missing values in a copy of the host table.

    Steps, in order:
      1. ``host_response_rate`` strings such as ``'95%'`` become fractions
         rounded to two decimals (``0.95``).
      2. Missing values in numeric columns are replaced by the column mean
         rounded to two decimals.
      3. Missing ``host_identity_verified`` / ``host_has_profile_pic`` become
         ``'f'`` and missing ``host_name`` becomes ``'_'``.
      4. ``room_count`` is cast to ``int``.
      5. Missing ``host_since`` is replaced by the smallest ``first_review``
         among the host's rooms.

    Args:
        raw_host_df: Host table containing the columns named above plus
            ``host_id``.
        rooms: Room table with ``host_id`` and ``first_review`` columns, used
            only for step 5. When omitted, the notebook-level ``room_df`` is
            used, which is what this function always did before the parameter
            existed.

    Returns:
        The formatted deep copy; ``raw_host_df`` is left untouched. A host with
        missing ``host_since`` and no room (or no reviewed room) keeps a
        missing ``host_since``.
    """
    host_df = raw_host_df.copy(deep=True)

    def _rate_to_fraction(rate):
        # Only percentage strings are converted. Missing values and numbers
        # pass through, so already-formatted data can be processed again.
        if isinstance(rate, str):
            return round(float(rate[:-1]) / 100, 2)
        return rate

    host_df['host_response_rate'] = host_df['host_response_rate'].map(_rate_to_fraction)

    # numeric_only=True is required on pandas >= 2.0, where mean() no longer
    # skips the string columns of a mixed frame on its own.
    host_df = host_df.fillna(host_df.mean(numeric_only=True).round(2))
    host_df = host_df.fillna({'host_identity_verified': 'f', 'host_name': '_', 'host_has_profile_pic': 'f'})
    host_df['room_count'] = host_df['room_count'].astype(int)

    missing_since = host_df['host_since'].isna()
    if missing_since.any():
        # The room table is only consulted when something has to be back-filled.
        room_source = room_df if rooms is None else rooms
        # Sorting puts each host's smallest first_review first (missing values
        # last), so keeping the first row per host yields that value.
        earliest_review = (room_source.sort_values(by='first_review')
                           .drop_duplicates(subset='host_id')
                           .set_index('host_id')['first_review'])
        # Hosts without rooms map to NaN rather than raising an IndexError.
        host_df.loc[missing_since, 'host_since'] = (
            host_df.loc[missing_since, 'host_id'].map(earliest_review).to_numpy())
    return host_df

In [7]:
def preprocess_host(room_df: pd.DataFrame, raw_host_df: pd.DataFrame):
    """Run the host pipeline: income statistics, then tagging, then formatting.

    Args:
        room_df: Room table handed to ``stats_income``.
        raw_host_df: Raw host table; the pipeline works on a deep copy of it.

    Returns:
        The host table returned by the final ``format_host`` step.
    """
    with_income = stats_income(room_df, raw_host_df.copy(deep=True))
    tagged = tag_host(with_income)
    return format_host(tagged)

In [8]:
# Run the full host pipeline on the loaded tables.
# NOTE: this rebinds `host_df` from the raw table to the processed one, so re-running this
# cell without re-running the load cell feeds already-processed data back into the pipeline.
host_df = preprocess_host(room_df, host_df)

In [9]:
# Write the processed host table to host.csv in the selected city's data directory
# (the inputs were read from the *.csv.bak files); index=False leaves the row index out.
host_df.to_csv('./data/{}/host.csv'.format(CITY), index=False)

In [10]:
# Show column names, non-null counts and dtypes of the processed host table.
host_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4865 entries, 0 to 4864
Data columns (total 12 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   host_id                 4865 non-null   int64  
 1   host_name               4865 non-null   object 
 2   host_since              4865 non-null   object 
 3   host_has_profile_pic    4865 non-null   object 
 4   host_identity_verified  4865 non-null   object 
 5   host_response_rate      4865 non-null   float64
 6   room_count              4865 non-null   int32  
 7   income                  4865 non-null   float64
 8   daily_income            4865 non-null   float64
 9   ave_room_income         4865 non-null   float64
 10  is_single               4865 non-null   object 
 11  is_best                 4865 non-null   object 
dtypes: float64(4), int32(1), int64(1), object(6)
memory usage: 437.2+ KB
