In [1]:
import numpy as np # linear algebra
import json
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
pd.options.mode.chained_assignment = None  # default='warn'
from scipy import sparse
import xgboost as xgb
import random
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import log_loss
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler

In [2]:
with open('/Users/apple1/desktop/kaggle/RentalListing/train.json') as data_file:    
    train_df = pd.read_json(data_file)
with open('/Users/apple1/desktop/kaggle/RentalListing/test.json') as data_file:    
    test_df = pd.read_json(data_file)

In [3]:
import math
def cart2rho(x, y):
    rho = np.sqrt(x**2 + y**2)
    return rho


def cart2phi(x, y):
    phi = np.arctan2(y, x)
    return phi


def rotation_x(row, alpha):
    x = row['latitude']
    y = row['longitude']
    return x*math.cos(alpha) + y*math.sin(alpha)


def rotation_y(row, alpha):
    x = row['latitude']
    y = row['longitude']
    return y*math.cos(alpha) - x*math.sin(alpha)


def add_rotation(degrees, df):
    namex = "rot" + str(degrees) + "_X"
    namey = "rot" + str(degrees) + "_Y"

    df['num_' + namex] = df.apply(lambda row: rotation_x(row, math.pi/(180/degrees)), axis=1)
    df['num_' + namey] = df.apply(lambda row: rotation_y(row, math.pi/(180/degrees)), axis=1)

    return df

def operate_on_coordinates(tr_df, te_df):
    for df in [tr_df, te_df]:
        #polar coordinates system
        df["num_rho"] = df.apply(lambda x: cart2rho(x["latitude"] - 40.78222222, x["longitude"]+73.96527777), axis=1)
        df["num_phi"] = df.apply(lambda x: cart2phi(x["latitude"] - 40.78222222, x["longitude"]+73.96527777), axis=1)
        #rotations
        for angle in [15,30,45,60]:
            df = add_rotation(angle, df)

    return tr_df, te_df

train_df, test_df = operate_on_coordinates(train_df, test_df)

In [4]:
import re

def cap_share(x):
    return sum(1 for c in x if c.isupper())/float(len(x)+1)

for df in [train_df, test_df]:
    # do you think that users might feel annoyed BY A DESCRIPTION THAT IS SHOUTING AT THEM?
    df['num_cap_share'] = df['description'].apply(cap_share)
    
    # how long in lines the desc is?
    df['num_nr_of_lines'] = df['description'].apply(lambda x: x.count('<br /><br />'))
   
    # is the description redacted by the website?        
    df['num_redacted'] = 0
    df['num_redacted'].ix[df['description'].str.contains('website_redacted')] = 1

    
    # can we contact someone via e-mail to ask for the details?
    df['num_email'] = 0
    df['num_email'].ix[df['description'].str.contains('@')] = 1
    
    #and... can we call them?
    
    reg = re.compile(".*?(\(?\d{3}\D{0,3}\d{3}\D{0,3}\d{4}).*?", re.S)
    def try_and_find_nr(description):
        if reg.match(description) is None:
            return 0
        return 1

    df['num_phone_nr'] = df['description'].apply(try_and_find_nr)

In [5]:
list(train_df)

['bathrooms',
 'bedrooms',
 'building_id',
 'created',
 'description',
 'display_address',
 'features',
 'interest_level',
 'latitude',
 'listing_id',
 'longitude',
 'manager_id',
 'photos',
 'price',
 'street_address',
 'num_rho',
 'num_phi',
 'num_rot15_X',
 'num_rot15_Y',
 'num_rot30_X',
 'num_rot30_Y',
 'num_rot45_X',
 'num_rot45_Y',
 'num_rot60_X',
 'num_rot60_Y',
 'num_cap_share',
 'num_nr_of_lines',
 'num_redacted',
 'num_email',
 'num_phone_nr']

In [6]:
def runXGB(train_X, train_y, test_X, test_y=None, feature_names=None, seed_val=321, num_rounds=1800):
    param = {}
    param['objective'] = 'multi:softprob'
    param['eta'] = 0.021
    param['max_depth'] = 6
    param['silent'] = 1
    param['num_class'] = 3
    param['eval_metric'] = "mlogloss"
    param['min_child_weight'] = 1
    param['subsample'] = 0.7
    param['colsample_bytree'] = 0.7
    param['seed'] = seed_val
    num_rounds = num_rounds

    plst = list(param.items())
    xgtrain = xgb.DMatrix(train_X, label=train_y)

    if test_y is not None:
        xgtest = xgb.DMatrix(test_X, label=test_y)
        watchlist = [ (xgtrain,'train'), (xgtest, 'test') ]
        model = xgb.train(plst, xgtrain, num_rounds, watchlist, early_stopping_rounds=20)
    else:
        xgtest = xgb.DMatrix(test_X)
        model = xgb.train(plst, xgtrain, num_rounds)

    pred_test_y = model.predict(xgtest)
    return pred_test_y, model


In [7]:
test_df["bathrooms"].loc[19671] = 1.5
test_df["bathrooms"].loc[22977] = 2.0
test_df["bathrooms"].loc[63719] = 2.0
train_df["price"] = train_df["price"].clip(upper=13000)

In [8]:
train_df["logprice"] = np.log(train_df["price"])
test_df["logprice"] = np.log(test_df["price"])

In [9]:
train_df['half_bathrooms'] = train_df["bathrooms"] - train_df["bathrooms"].apply(int)#.astype(int) # Half bathrooms? 1.5, 2.5, 3.5...
test_df['half_bathrooms'] = test_df["bathrooms"] - test_df["bathrooms"].apply(int)#.astype(int) # Half bathrooms? 

In [10]:
train_df["price_t"] =train_df["price"]/train_df["bedrooms"]
test_df["price_t"] = test_df["price"]/test_df["bedrooms"] 

train_df["room_sum"] = train_df["bedrooms"]+train_df["bathrooms"] 
test_df["room_sum"] = test_df["bedrooms"]+test_df["bathrooms"] 

train_df['price_per_room'] = train_df['price']/train_df['room_sum']
test_df['price_per_room'] = test_df['price']/test_df['room_sum']

train_df["num_photos"] = train_df["photos"].apply(len)
test_df["num_photos"] = test_df["photos"].apply(len)

train_df["num_features"] = train_df["features"].apply(len)
test_df["num_features"] = test_df["features"].apply(len)

train_df["num_description_words"] = train_df["description"].apply(lambda x: len(x.split(" ")))
test_df["num_description_words"] = test_df["description"].apply(lambda x: len(x.split(" ")))

train_df["created"] = pd.to_datetime(train_df["created"])
test_df["created"] = pd.to_datetime(test_df["created"])
train_df["created_year"] = train_df["created"].dt.year
test_df["created_year"] = test_df["created"].dt.year
train_df["created_month"] = train_df["created"].dt.month
test_df["created_month"] = test_df["created"].dt.month
train_df["created_day"] = train_df["created"].dt.day
test_df["created_day"] = test_df["created"].dt.day
train_df["created_hour"] = train_df["created"].dt.hour
test_df["created_hour"] = test_df["created"].dt.hour

#train_df["created_weekday"] = train_df["created"].dt.weekday
#test_df["created_weekday"] = test_df["created"].dt.weekday
#train_df["created_week"] = train_df["created"].dt.week
#test_df["created_week"] = test_df["created"].dt.week

train_df["pos"] = train_df.longitude.round(3).astype(str) + '_' + train_df.latitude.round(3).astype(str)
test_df["pos"] = test_df.longitude.round(3).astype(str) + '_' + test_df.latitude.round(3).astype(str)

vals = train_df['pos'].value_counts()
dvals = vals.to_dict()
train_df["density"] = train_df['pos'].apply(lambda x: dvals.get(x, vals.min()))
test_df["density"] = test_df['pos'].apply(lambda x: dvals.get(x, vals.min()))

In [11]:
#train_df["bedBathDiff"] = train_df['bedrooms'] - train_df['bathrooms']
#test_df["bedBathDiff"] = test_df['bedrooms'] - test_df['bathrooms']

#train_df["bedsPerc"] = train_df["bedrooms"] / train_df["room_sum"]
#test_df["bedsPerc"] = test_df["bedrooms"] / test_df["room_sum"]

In [12]:
#bedroom*bathroom
train_df["bed_bath"] = train_df["bedrooms"]*train_df["bathrooms"]
test_df["bed_bath"] = test_df["bedrooms"]*test_df["bathrooms"]

#bedroom*price
train_df["bed_price"] = train_df["bedrooms"]*train_df["price"]
test_df["bed_price"] = test_df["bedrooms"]*test_df["price"]

#bathroom*price
train_df["bath_price"] = train_df["bathrooms"]*train_df["price"]
test_df["bath_price"] = test_df["bathrooms"]*test_df["price"]

#bathroom*photos
train_df["bath_photo"] = train_df["bathrooms"]*train_df["num_photos"]
test_df["bath_photo"] = test_df["bathrooms"]*test_df["num_photos"]

#bedroom*photos
train_df["bed_photo"] = train_df["bedrooms"]*train_df["num_photos"]
test_df["bed_photo"] = test_df["bedrooms"]*test_df["num_photos"]

#price*photos
train_df["price_photo"] = train_df["price"]*train_df["num_photos"]
test_df["price_photo"] = test_df["price"]*test_df["num_photos"]

#price*features
train_df["price_feat"] = train_df["price"]*train_df["num_features"]
test_df["price_feat"] = test_df["price"]*test_df["num_features"]

#bed*features
train_df["bed_feat"] = train_df["bedrooms"]*train_df["num_features"]
test_df["bed_feat"] = test_df["bedrooms"]*test_df["num_features"]

#bath*features
train_df["bath_feat"] = train_df["bathrooms"]*train_df["num_features"]
test_df["bath_feat"] = test_df["bathrooms"]*test_df["num_features"]

#bed*features
train_df["bed_desc"] = train_df["bedrooms"]*train_df["num_description_words"]
test_df["bed_desc"] = test_df["bedrooms"]*test_df["num_description_words"]

#bath*features
train_df["bath_desc"] = train_df["bathrooms"]*train_df["num_description_words"]
test_df["bath_desc"] = test_df["bathrooms"]*test_df["num_description_words"]

#price*features
train_df["price_desc"] = train_df["price"]*train_df["num_description_words"]
test_df["price_desc"] = test_df["price"]*test_df["num_description_words"]

In [13]:
train_df["listing_id2"] = train_df["listing_id"] - 68119576.0
test_df["listing_id2"] =  test_df["listing_id"] - 68119576.0

train_df["total_days"] =   (train_df["created_month"] -4.0)*30 + train_df["created_day"] +  train_df["created_hour"] /25.0
test_df["total_days"] =(test_df["created_month"] -4.0)*30 + test_df["created_day"] +  test_df["created_hour"] /25.0        
train_df["diff_rank"]= train_df["total_days"]/train_df["listing_id2"]
test_df["diff_rank"]= test_df["total_days"]/test_df["listing_id2"]

In [14]:
image_date = pd.read_csv("/Users/apple1/desktop/kaggle/RentalListing/listing_image_time.csv")

# rename columns so you can join tables later on
image_date.columns = ["listing_id", "time_stamp"]

# reassign the only one timestamp from April, all others from Oct/Nov
image_date.loc[80240,"time_stamp"] = 1478129766 

image_date["img_date"]                  = pd.to_datetime(image_date["time_stamp"], unit="s")
image_date["img_days_passed"]           = (image_date["img_date"].max() - image_date["img_date"]).astype("timedelta64[D]").astype(int)
image_date["img_date_month"]            = image_date["img_date"].dt.month
image_date["img_date_week"]             = image_date["img_date"].dt.week
image_date["img_date_day"]              = image_date["img_date"].dt.day
image_date["img_date_dayofweek"]        = image_date["img_date"].dt.dayofweek
image_date["img_date_dayofyear"]        = image_date["img_date"].dt.dayofyear
image_date["img_date_hour"]             = image_date["img_date"].dt.hour
image_date["img_date_monthBeginMidEnd"] = image_date["img_date_day"].apply(lambda x: 1 if x<10 else 2 if x<20 else 3)

train_df = pd.merge(train_df, image_date, on="listing_id", how="left")
test_df = pd.merge(test_df, image_date, on="listing_id", how="left")

In [15]:
features_to_use=["bathrooms", "bedrooms", "latitude", "longitude", "price","price_t","price_per_room", "logprice", "density", "half_bathrooms",
"num_photos", "num_features", "num_description_words","listing_id", "created_year", "created_month", "created_day", "created_hour",#"bedBathDiff","bedsPerc",
"img_days_passed","img_date_month","img_date_week","img_date_day","img_date_dayofweek","img_date_dayofyear",
"img_date_hour","img_date_monthBeginMidEnd","bed_bath","bed_price","bath_price","bed_photo","price_photo","price_feat","bed_feat","bath_feat",
"bath_desc","price_desc","total_days","diff_rank","num_rho","num_phi","num_rot15_X", 'num_rot15_Y','num_rot30_X','num_rot30_Y',
'num_rot45_X','num_rot45_Y','num_rot60_X','num_rot60_Y','num_cap_share','num_nr_of_lines','num_redacted','num_email','num_phone_nr']

In [16]:
index=list(range(train_df.shape[0]))
random.shuffle(index)
a=[np.nan]*len(train_df)
b=[np.nan]*len(train_df)
c=[np.nan]*len(train_df)

In [17]:
for i in range(5):
    building_level={}
    for j in train_df['manager_id'].values:
        building_level[j]=[0,0,0]
    
    test_index=index[int((i*train_df.shape[0])/5):int(((i+1)*train_df.shape[0])/5)]
    train_index=list(set(index).difference(test_index))
    
    for j in train_index:
        temp=train_df.iloc[j]
        if temp['interest_level']=='low':
            building_level[temp['manager_id']][0]+=1
        if temp['interest_level']=='medium':
            building_level[temp['manager_id']][1]+=1
        if temp['interest_level']=='high':
            building_level[temp['manager_id']][2]+=1
            
    for j in test_index:
        temp=train_df.iloc[j]
        if sum(building_level[temp['manager_id']])!=0:
            a[j]=building_level[temp['manager_id']][0]*1.0/sum(building_level[temp['manager_id']])
            b[j]=building_level[temp['manager_id']][1]*1.0/sum(building_level[temp['manager_id']])
            c[j]=building_level[temp['manager_id']][2]*1.0/sum(building_level[temp['manager_id']])

In [18]:
train_df['manager_level_low']=a
train_df['manager_level_medium']=b
train_df['manager_level_high']=c

In [19]:
a=[]
b=[]
c=[]
building_level={}
for j in train_df['manager_id'].values:
    building_level[j]=[0,0,0]

for j in range(train_df.shape[0]):
    temp=train_df.iloc[j]
    if temp['interest_level']=='low':
        building_level[temp['manager_id']][0]+=1
    if temp['interest_level']=='medium':
        building_level[temp['manager_id']][1]+=1
    if temp['interest_level']=='high':
        building_level[temp['manager_id']][2]+=1

for i in test_df['manager_id'].values:
    if i not in building_level.keys():
        a.append(np.nan)
        b.append(np.nan)
        c.append(np.nan)
    else:
        a.append(building_level[i][0]*1.0/sum(building_level[i]))
        b.append(building_level[i][1]*1.0/sum(building_level[i]))
        c.append(building_level[i][2]*1.0/sum(building_level[i]))
test_df['manager_level_low']=a
test_df['manager_level_medium']=b
test_df['manager_level_high']=c

In [20]:
features_to_use.append('manager_level_low') 
features_to_use.append('manager_level_medium') 
features_to_use.append('manager_level_high')

In [21]:
categorical = ["street_address", "display_address", "manager_id", "building_id"]
for f in categorical:
        if train_df[f].dtype=='object':
            lbl = LabelEncoder()
            lbl.fit(list(train_df[f].values) + list(test_df[f].values))
            train_df[f] = lbl.transform(list(train_df[f].values))
            test_df[f] = lbl.transform(list(test_df[f].values))
            features_to_use.append(f)

In [22]:
with open('/Users/apple1/desktop/kaggle/RentalListing/train.json') as data_file:    
    train_df2 = pd.read_json(data_file)
with open('/Users/apple1/desktop/kaggle/RentalListing/test.json') as data_file:    
    test_df2 = pd.read_json(data_file)
train_df2['features'] = train_df2["features"].apply(lambda x: " ".join(["_".join(i.split(" ")) for i in x]))
test_df2['features'] = test_df2["features"].apply(lambda x: " ".join(["_".join(i.split(" ")) for i in x]))


In [24]:
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from nltk.tokenize import sent_tokenize

def description_sentiment(sentences):
    analyzer = SentimentIntensityAnalyzer()
    result = []
    for sentence in sentences:
        vs = analyzer.polarity_scores(sentence)
        result.append(vs)
    return pd.DataFrame(result).mean()

In [25]:
sdf = train_df
sdf['description_tokens'] = sdf['description'].apply(sent_tokenize)
sdf = pd.concat([sdf,sdf['description_tokens'].apply(description_sentiment)],axis=1)

In [26]:
sdf_test = test_df
sdf_test['description_tokens'] = sdf_test['description'].apply(sent_tokenize)
sdf_test = pd.concat([sdf_test,sdf_test['description_tokens'].apply(description_sentiment)],axis=1)

In [27]:
#sdf['pos']= np.log(sdf['pos'])
#sdf_test['pos']= np.log(sdf_test['pos'])
#sdf['neg']= np.log(sdf['neg'])
#sdf_test['neg']= np.log(sdf_test['neg'])
#sdf['compound']= np.log(sdf['compound'])
#sdf_test['compound']= np.log(sdf_test['compound'])
#sdf['neu']= np.log(sdf['neu'])
#sdf_test['neu']= np.log(sdf_test['neu'])

train_df = sdf
test_df = sdf_test
features_to_use.append("compound")
features_to_use.append("neu")
features_to_use.append("neg")
#features_to_use.append("pos")

In [80]:
bows = { "dogs": ("dogs", "dog"), "cats": ("cats","cat"), "nofee": ("no fee", "no-fee", "no fee", "nofee", "no_fee"), "lowfee": ("reduced_fee", "low_fee", "reduced fee", "low fee"), "exclusive": ("Exclusive"), "parquet": ("parquet", "hardwood"), "concierge": ("concierge", "doorman", "housekeep", "in_super"), "prewar": ("prewar", "pre_war", "pre war", "pre-war"), "laundry": ("laundry", "lndry" ,"Laundry in Unit"), "health": ("health", "gym", "fitness", "training"), "transport": ("train", "subway", "transport"), "parking": ("parking",), "utilities": ("utilities", "heat water", "water included" ,"Dishwasher"), "elevator":("Elevator") } 

In [93]:
bows

{'cats': ('cats', 'cat'),
 'concierge': ('concierge', 'doorman', 'housekeep', 'in_super'),
 'dogs': ('dogs', 'dog'),
 'elevator': 'Elevator',
 'exclusive': 'Exclusive',
 'health': ('health', 'gym', 'fitness', 'training'),
 'laundry': ('laundry', 'lndry', 'Laundry in Unit'),
 'lowfee': ('reduced_fee', 'low_fee', 'reduced fee', 'low fee'),
 'nofee': ('no fee', 'no-fee', 'no fee', 'nofee', 'no_fee'),
 'parking': ('parking',),
 'parquet': ('parquet', 'hardwood'),
 'prewar': ('prewar', 'pre_war', 'pre war', 'pre-war'),
 'transport': ('train', 'subway', 'transport'),
 'utilities': ('utilities', 'heat water', 'water included', 'Dishwasher')}

In [98]:
train_df["features"][1]

['Doorman', 'Elevator', 'Fitness Center', 'Cats Allowed', 'Dogs Allowed']

In [99]:
train_df.shape[0]

49352

In [102]:
for i in range(3):
    for feat in train_df["features"][i]:
        print(feat)

Doorman
Elevator
Fitness Center
Cats Allowed
Dogs Allowed
Laundry In Building
Dishwasher
Hardwood Floors
Pets Allowed Case by Case


In [103]:
bows

{'cats': ('cats', 'cat'),
 'concierge': ('concierge', 'doorman', 'housekeep', 'in_super'),
 'dogs': ('dogs', 'dog'),
 'elevator': 'Elevator',
 'exclusive': 'Exclusive',
 'health': ('health', 'gym', 'fitness', 'training'),
 'laundry': ('laundry', 'lndry', 'Laundry in Unit'),
 'lowfee': ('reduced_fee', 'low_fee', 'reduced fee', 'low fee'),
 'nofee': ('no fee', 'no-fee', 'no fee', 'nofee', 'no_fee'),
 'parking': ('parking',),
 'parquet': ('parquet', 'hardwood'),
 'prewar': ('prewar', 'pre_war', 'pre war', 'pre-war'),
 'transport': ('train', 'subway', 'transport'),
 'utilities': ('utilities', 'heat water', 'water included', 'Dishwasher')}

In [105]:
bows['cats']

('cats', 'cat')

In [107]:
train_df["cats"]=0
train_df["concierge"]=0
train_df["dogs"]=0
train_df["elevator"]=0
train_df["health"]=0
train_df["laundry"]=0
train_df["lowfee"]=0
train_df["nofee"]=0
train_df["parking"]=0
train_df["parquet"]=0
train_df["prewar"]=0
train_df["transport"]=0
train_df["utilities"]=0

In [108]:
for i in range(train_df.shape[0]):
    for feat in train_df["features"][i]:
        if feat in bows['cats']:
            train_df["cats"][i]=1
        if feat in bows["concierge"]:
            train_df["concierge"]=1
        if feat in bows['dogs']:
            train_df["dogs"][i]=1
        if feat in bows["elevator"]:
            train_df["elevator"]=1
        if feat in bows['health']:
            train_df["health"][i]=1
        if feat in bows["laundry"]:
            train_df["laundry"]=1
        if feat in bows['lowfee']:
            train_df["lowfee"][i]=1
        if feat in bows["nofee"]:
            train_df["nofee"]=1
        if feat in bows['parking']:
            train_df["parking"][i]=1
        if feat in bows["parquet"]:
            train_df["parquet"]=1
        if feat in bows['prewar']:
            train_df["prewar"][i]=1
        if feat in bows["transport"]:
            train_df["transport"]=1
        if feat in bows["utilities"]:
            train_df["utilities"]=1
            

In [128]:
test_df["cats"]=0
test_df["concierge"]=0
test_df["dogs"]=0
test_df["elevator"]=0
test_df["health"]=0
test_df["laundry"]=0
test_df["lowfee"]=0
test_df["nofee"]=0
test_df["parking"]=0
test_df["parquet"]=0
test_df["prewar"]=0
test_df["transport"]=0
test_df["utilities"]=0

In [130]:
for i in range(test_df.shape[0]):
    for feat in test_df["features"][i]:
        if feat in bows['cats']:
            test_df["cats"][i]=1
        if feat in bows["concierge"]:
            test_df["concierge"]=1
        if feat in bows['dogs']:
            test_df["dogs"][i]=1
        if feat in bows["elevator"]:
            test_df["elevator"]=1
        if feat in bows['health']:
            test_df["health"][i]=1
        if feat in bows["laundry"]:
            test_df["laundry"]=1
        if feat in bows['lowfee']:
            test_df["lowfee"][i]=1
        if feat in bows["nofee"]:
            test_df["nofee"]=1
        if feat in bows['parking']:
            test_df["parking"][i]=1
        if feat in bows["parquet"]:
            test_df["parquet"]=1
        if feat in bows['prewar']:
            test_df["prewar"][i]=1
        if feat in bows["transport"]:
            test_df["transport"]=1
        if feat in bows["utilities"]:
            test_df["utilities"]=1
            

In [None]:
features_to_use.append('cats')
features_to_use.append('concierge')
features_to_use.append('dogs')
features_to_use.append('elevator')
features_to_use.append('health')
features_to_use.append('laundry')
features_to_use.append('lowfee')
features_to_use.append('nofee')
features_to_use.append('parking')
features_to_use.append('parquet')
features_to_use.append('prewar')
features_to_use.append('transport')
features_to_use.append('utilities')

In [91]:
import math
R = 6373.0
location_dict = {
'manhatten_loc' : [40.7527, -73.9943],
'brooklyn_loc' : [45.0761,-73.9442],
'bronx_loc' : [40.8448,-73.8648],
'queens_loc' : [40.7282,-73.7949],
'staten_loc' : [40.5795,-74.1502]}

for location in location_dict.keys():

    lat1 = train_df['latitude'].apply(math.radians)
    lon1 = train_df['longitude'].apply(math.radians)
    lat2 = math.radians(location_dict[location][0])
    lon2 = math.radians(location_dict[location][1])

    dlon = lon2 - lon1
    dlat = lat2 - lat1

    def power(x):
        return x**2

    a = (dlat/2).apply(math.sin).apply(power) + lat1.apply(math.cos) * math.cos(lat2) * (dlon/2).apply(math.sin).apply(power)
    c = 2 * a.apply(math.sqrt).apply(math.sin)

    ### Add a new column called distance
    train_df['distance_' + location] = R * c
    features_to_use.append('distance_' + location)


In [100]:
import math
R = 6373.0
location_dict = {
'manhatten_loc' : [40.7527, -73.9943],
'brooklyn_loc' : [45.0761,-73.9442],
'bronx_loc' : [40.8448,-73.8648],
'queens_loc' : [40.7282,-73.7949],
'staten_loc' : [40.5795,-74.1502]}

for location in location_dict.keys():

    lat1 = test_df['latitude'].apply(math.radians)
    lon1 = test_df['longitude'].apply(math.radians)
    lat2 = math.radians(location_dict[location][0])
    lon2 = math.radians(location_dict[location][1])

    dlon = lon2 - lon1
    dlat = lat2 - lat1

    def power(x):
        return x**2

    a = (dlat/2).apply(math.sin).apply(power) + lat1.apply(math.cos) * math.cos(lat2) * (dlon/2).apply(math.sin).apply(power)
    c = 2 * a.apply(math.sqrt).apply(math.sin)

    ### Add a new column called distance
    test_df['distance_' + location] = R * c

In [62]:
import reverse_geocoder as rg
lat_lon = []
listings = []

for i, j in train_df.iterrows():
    lat_lon.append((j["latitude"], j["longitude"]))
    listings.append(j["listing_id"])
    
results = rg.search(lat_lon) 
nbd = [[listings[i], results[i]['name']] for i in range(0, len(results))]
nbd = pd.DataFrame(data=nbd,columns=["listing_id","neighborhood"])
train_df = pd.merge(train_df, nbd, on='listing_id', how='left')



In [66]:
lat_lon = []
listings = []
for i, j in test_df.iterrows():
    lat_lon.append((j["latitude"], j["longitude"]))
    listings.append(j["listing_id"])
    
results = rg.search(lat_lon) 
nbd_test = [[listings[i], results[i]['name']] for i in range(0, len(results))]
nbd_test = pd.DataFrame(data=nbd_test,columns=["listing_id","neighborhood"])
test_df = pd.merge(test_df, nbd_test, on='listing_id', how='left')

In [None]:
features_to_use.append("neighborhood")

In [65]:
test_df.drop('neighborhood', axis=1, inplace=True)

In [56]:
train_df["neighborhood"].dtype=='object'

True

In [67]:
categorical = ["neighborhood"]
for f in categorical:
        if train_df[f].dtype=='object':
            lbl = LabelEncoder()
            lbl.fit(list(train_df[f].values) + list(test_df[f].values))
            train_df[f] = lbl.transform(list(train_df[f].values))
            test_df[f] = lbl.transform(list(test_df[f].values))

In [71]:
# Add the number of exclamation signs:
train_df['num_exclamations'] = train_df['description'].apply(lambda x: len(x.split('!')))
test_df['num_exclamations'] = test_df['description'].apply(lambda x: len(x.split('!')))
features_to_use.append("num_exclamations")

In [30]:
tfidf = CountVectorizer(stop_words='english', max_features=200)
tr_sparse = tfidf.fit_transform(train_df2["features"])
te_sparse = tfidf.transform(test_df2["features"])

In [125]:
features_to_use=['bathrooms',
 'bedrooms',
 'latitude',
 'longitude',
 'price',
 'price_t',
 'price_per_room',
 'logprice',
 'density',
 'half_bathrooms',
 'num_photos',
 'num_features',
 'num_description_words',
 'listing_id',
 'created_year',
 'created_month',
 'created_day',
 'created_hour',
 'img_days_passed',
 'img_date_month',
 'img_date_week',
 'img_date_day',
 'img_date_dayofweek',
 'img_date_dayofyear',
 'img_date_hour',
 'img_date_monthBeginMidEnd',
 'bed_bath',
 'bed_price',
 'bath_price',
 'bed_photo',
 'price_photo',
 'price_feat',
 'bed_feat',
 'bath_feat',
 'bath_desc',
 'price_desc',
 'total_days',
 'diff_rank',
 'num_rho',
 'num_phi',
 'num_rot15_X',
 'num_rot15_Y',
 'num_rot30_X',
 'num_rot30_Y',
 'num_rot45_X',
 'num_rot45_Y',
 'num_rot60_X',
 'num_rot60_Y',
 'num_cap_share',
 'num_nr_of_lines',
 'num_redacted',
 'num_email',
 'num_phone_nr',
 'manager_level_low',
 'manager_level_medium',
 'manager_level_high',
 'street_address',
 'display_address',
 'manager_id',
 'building_id',
 'compound',
 'neu',
 'neg',
 'distance_manhatten_loc',
 'distance_queens_loc',
 'distance_bronx_loc',
 'distance_brooklyn_loc',
 'distance_staten_loc',
 'cats',
 'concierge',
 'dogs',
 'elevator',
 'health',
 'laundry',
 'lowfee',
 'nofee',
 'parking',
 'parquet',
 'prewar',
 'transport',
 'utilities']



In [137]:
train_X = sparse.hstack([train_df[features_to_use], tr_sparse]).tocsr()
test_X = sparse.hstack([test_df[features_to_use], te_sparse]).tocsr()

target_num_map = {'high':0, 'medium':1, 'low':2}
train_y = np.array(train_df['interest_level'].apply(lambda x: target_num_map[x]))

In [138]:
from sklearn import model_selection, preprocessing, ensemble
from sklearn.metrics import log_loss
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer

In [136]:
features_to_use=['bathrooms',
 'bedrooms',
 'latitude',
 'longitude',
 'price',
 'price_t',
 'price_per_room',
 'logprice',
 'density',
 'half_bathrooms',
 'num_photos',
 'num_features',
 'num_description_words',
 'listing_id',
 'created_year',
 'created_month',
 'created_day',
 'created_hour',
 'img_days_passed',
 'img_date_month',
 'img_date_week',
 'img_date_day',
 'img_date_dayofweek',
 'img_date_dayofyear',
 'img_date_hour',
 'img_date_monthBeginMidEnd',
 'bed_bath',
 'bed_price',
 'bath_price',
 'bed_photo',
 'price_photo',
 'price_feat',
 'bed_feat',
 'bath_feat',
 'bath_desc',
 'price_desc',
 'total_days',
 'diff_rank',
 'num_rho',
 'num_phi',
 'num_rot15_X',
 'num_rot15_Y',
 'num_rot30_X',
 'num_rot30_Y',
 'num_rot45_X',
 'num_rot45_Y',
 'num_rot60_X',
 'num_rot60_Y',
 'num_cap_share',
 'num_nr_of_lines',
 'num_redacted',
 'num_email',
 'num_phone_nr',
 'manager_level_low',
 'manager_level_medium',
 'manager_level_high',
 'street_address',
 'display_address',
 'manager_id',
 'building_id',
 'compound',
 'neu',
 'neg',
 'distance_manhatten_loc',
 'distance_queens_loc',
 'distance_bronx_loc',
 'distance_brooklyn_loc',
 'distance_staten_loc',]
 #'cats',
 #'concierge',
 #'dogs',
 #'elevator',
 #'health',
 #'laundry',
 #'lowfee',
 #'nofee',
 #'parking',
 #'parquet',
 #'prewar',
 #'transport',
 #'utilities']

In [139]:
cv_scores = []
kf = model_selection.KFold(n_splits=5, shuffle=True, random_state=2016)
for dev_index, val_index in kf.split(range(train_X.shape[0])):
        dev_X, val_X = train_X[dev_index,:], train_X[val_index,:]
        dev_y, val_y = train_y[dev_index], train_y[val_index]
        preds, model = runXGB(dev_X, dev_y, val_X, val_y)
        cv_scores.append(log_loss(val_y, preds))
        print(cv_scores)
        break

[0]	train-mlogloss:1.08384	test-mlogloss:1.08418
Multiple eval metrics have been passed: 'test-mlogloss' will be used for early stopping.

Will train until test-mlogloss hasn't improved in 20 rounds.
[1]	train-mlogloss:1.06959	test-mlogloss:1.07033
[2]	train-mlogloss:1.05575	test-mlogloss:1.05684
[3]	train-mlogloss:1.04243	test-mlogloss:1.04389
[4]	train-mlogloss:1.02949	test-mlogloss:1.03131
[5]	train-mlogloss:1.01704	test-mlogloss:1.01923
[6]	train-mlogloss:1.00501	test-mlogloss:1.00747
[7]	train-mlogloss:0.993372	test-mlogloss:0.996174
[8]	train-mlogloss:0.98199	test-mlogloss:0.985159
[9]	train-mlogloss:0.971038	test-mlogloss:0.974522
[10]	train-mlogloss:0.960377	test-mlogloss:0.96422
[11]	train-mlogloss:0.950209	test-mlogloss:0.95436
[12]	train-mlogloss:0.940301	test-mlogloss:0.9448
[13]	train-mlogloss:0.93062	test-mlogloss:0.935405
[14]	train-mlogloss:0.921194	test-mlogloss:0.92624
[15]	train-mlogloss:0.911935	test-mlogloss:0.917366
[16]	train-mlogloss:0.902925	test-mlogloss:0.908

In [140]:
preds, model = runXGB(train_X, train_y, test_X, num_rounds=1800)
out_df = pd.DataFrame(preds)
out_df.columns = ["high", "medium", "low"]
out_df["listing_id"] = test_df.listing_id.values
#out_df["listing_id"]=out_df["listing_id"]+68119576.0
out_df.to_csv("best_single_model_sent_nofeatdistance_new_0425.csv", index=False)