In [1]:
#import packages
import os
import sys
import operator
import numpy as np
import pandas as pd
from scipy import sparse
from sklearn import model_selection, preprocessing, ensemble
from sklearn.metrics import log_loss
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer

In [2]:
# Load the training listings from the competition JSON file and preview
# the first rows to confirm the columns came through as expected.
train_df = pd.read_json("../input/train.json")
print("This is train data:")
train_df.head()

This is train data:


Unnamed: 0,bathrooms,bedrooms,building_id,created,description,display_address,features,interest_level,latitude,listing_id,longitude,manager_id,photos,price,street_address
10,1.5,3,53a5b119ba8f7b61d4e010512e0dfc85,2016-06-24 07:54:24,A Brand New 3 Bedroom 1.5 bath ApartmentEnjoy ...,Metropolitan Avenue,[],medium,40.7145,7211212,-73.9425,5ba989232d0489da1b5f2c45f6688adc,[https://photos.renthop.com/2/7211212_1ed4542e...,3000,792 Metropolitan Avenue
10000,1.0,2,c5c8a357cba207596b04d1afd1e4f130,2016-06-12 12:19:27,,Columbus Avenue,"[Doorman, Elevator, Fitness Center, Cats Allow...",low,40.7947,7150865,-73.9667,7533621a882f71e25173b27e3139d83d,[https://photos.renthop.com/2/7150865_be3306c5...,5465,808 Columbus Avenue
100004,1.0,1,c3ba40552e2120b0acfc3cb5730bb2aa,2016-04-17 03:26:41,"Top Top West Village location, beautiful Pre-w...",W 13 Street,"[Laundry In Building, Dishwasher, Hardwood Flo...",high,40.7388,6887163,-74.0018,d9039c43983f6e564b1482b273bd7b01,[https://photos.renthop.com/2/6887163_de85c427...,2850,241 W 13 Street
100007,1.0,1,28d9ad350afeaab8027513a3e52ac8d5,2016-04-18 02:22:02,Building Amenities - Garage - Garden - fitness...,East 49th Street,"[Hardwood Floors, No Fee]",low,40.7539,6888711,-73.9677,1067e078446a7897d2da493d2f741316,[https://photos.renthop.com/2/6888711_6e660cee...,3275,333 East 49th Street
100013,1.0,4,0,2016-04-28 01:32:41,Beautifully renovated 3 bedroom flex 4 bedroom...,West 143rd Street,[Pre-War],low,40.8241,6934781,-73.9493,98e13ad4b495b9613cef886d79a6291f,[https://photos.renthop.com/2/6934781_1fa4b41a...,3350,500 West 143rd Street


In [3]:
# Save the raw training frame to CSV for offline inspection.
# (No predictions exist at this point; the file holds only the input data.)

train_df.to_csv("traindata.csv", index=False)

In [4]:
# Count nulls per column and report what was actually found, rather than
# printing the conclusion before anything has been checked.
train_null_counts = train_df.isnull().sum()
if train_null_counts.any():
    print("Null values found in the train data")
else:
    print("No null values in the train data")
train_null_counts

No null values in the train data


bathrooms          0
bedrooms           0
building_id        0
created            0
description        0
display_address    0
features           0
interest_level     0
latitude           0
listing_id         0
longitude          0
manager_id         0
photos             0
price              0
street_address     0
dtype: int64

In [5]:
#data cleaning
#basic features
# Price per bedroom. A listing with 0 bedrooms would make the plain ratio
# infinite, so such listings are counted as a single room.
train_df["price_t"] = train_df["price"] / train_df["bedrooms"].clip(lower=1)

# Total room count. (Neither price_t nor room_sum is added to
# features_to_use below, so the models never see them.)
train_df["room_sum"] = train_df["bedrooms"] + train_df["bathrooms"]

# count of photos #
train_df["num_photos"] = train_df["photos"].apply(len)

# count of "features" #
train_df["num_features"] = train_df["features"].apply(len)

# count of space-separated tokens in the description column #
# NOTE: splitting on a single space means consecutive spaces and an empty
# description still contribute tokens. Left unchanged so the column keeps
# the same definition as the one computed for the test data.
train_df["num_description_words"] = train_df["description"].apply(lambda x: len(x.split(" ")))

#split the time data into year, month, day and hour
train_df["created"] = pd.to_datetime(train_df["created"])
train_df["created_year"] = train_df["created"].dt.year
train_df["created_month"] = train_df["created"].dt.month
train_df["created_day"] = train_df["created"].dt.day
train_df["created_hour"] = train_df["created"].dt.hour

# Numeric columns fed to the models. Later cells select columns by
# position, so the order of this list matters.
features_to_use = ["bathrooms", "bedrooms", "latitude", "longitude", "price"]
features_to_use.extend(["num_photos", "num_features", "num_description_words",
                        "created_year", "created_month", "created_day", "listing_id", "created_hour"])

#deal with the address data and id data
# Each string column is replaced in place by integer codes 0..n_classes-1.
# The dtype guard skips columns that are already encoded, e.g. when the
# cell is run a second time.
categorical = ["display_address", "manager_id", "building_id", "street_address"]
for f in categorical:
    if train_df[f].dtype == 'object':
        lbl = preprocessing.LabelEncoder()
        lbl.fit(list(train_df[f].values))
        train_df[f] = lbl.transform(list(train_df[f].values))
        features_to_use.append(f)


In [6]:
# Define the feature matrix X (every column collected in features_to_use)
# and the target y (the interest_level label), then preview X.
X = train_df[features_to_use]
y = train_df["interest_level"]
X.head()


Unnamed: 0,bathrooms,bedrooms,latitude,longitude,price,num_photos,num_features,num_description_words,created_year,created_month,created_day,listing_id,created_hour,display_address,manager_id,building_id,street_address
10,1.5,3,40.7145,-73.9425,3000,5,0,95,2016,6,24,7211212,7,6544,1239,2431,14074
10000,1.0,2,40.7947,-73.9667,5465,11,5,9,2016,6,12,7150865,12,4506,1583,5862,14195
100004,1.0,1,40.7388,-74.0018,2850,8,4,94,2016,4,17,6887163,3,7387,2965,5806,5876
100007,1.0,1,40.7539,-73.9677,3275,3,2,80,2016,4,18,6888711,2,5703,225,1201,8574
100013,1.0,4,40.8241,-73.9493,3350,3,1,68,2016,4,28,6934781,1,8271,2081,0,11554


In [7]:
# Hold out 20% of the rows for validation. The split is stratified on y so
# both parts keep the same class proportions, and seeded so it is repeatable.
from sklearn.model_selection import train_test_split
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=123,stratify=y)

In [8]:
# Baseline model: logistic regression fitted on every column of X_train.
from sklearn.linear_model import LogisticRegression
classifier = LogisticRegression().fit(X_train, y_train)
y_val_pred1 = classifier.predict_proba(X_val)
# Tabulate the fitted coefficients: one row per class, one column per
# input feature (columns follow the order of features_to_use).
m = pd.DataFrame(classifier.coef_)
print("These are all the coefficient of all features")
m.head()

These are all the coefficient of all features


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16
0,1.080428e-08,3.430188e-07,6.303506e-08,-1.156729e-07,-0.000554,3.295821e-07,2.406693e-07,7e-06,2e-06,1.52047e-08,-1.23869e-06,-4.781419e-08,2.435864e-06,-0.000119,-5.7e-05,8.8e-05,6e-06
1,-3.050641e-08,-4.155728e-07,-7.710697e-08,1.441497e-07,0.000355,-4.825485e-07,-1.06778e-06,-1.7e-05,-3e-06,1.164875e-08,7.607464e-07,-4.410861e-08,-1.781377e-06,7.5e-05,2e-06,-0.000136,-5e-06
2,7.193171e-09,2.32001e-07,3.096738e-08,-5.901894e-08,-0.000226,2.947356e-07,7.988754e-07,1.2e-05,1e-06,1.881615e-09,-2.757795e-07,-1.05587e-07,8.655298e-07,-3.5e-05,1.5e-05,0.00012,7e-06


In [9]:
# Baseline metrics: log-loss on the hold-out and training splits, followed
# by 5-fold cross-validated accuracy on the full training set.
from sklearn.model_selection import cross_val_score
print("Calculate the log_loss value for all the inputting features:")
val_loss_all = log_loss(y_val, y_val_pred1)
print("test log_loss value for all the inputting features:", val_loss_all)
y_train_pred1 = classifier.predict_proba(X_train)
train_loss_all = log_loss(y_train, y_train_pred1)
print("train log_loss value for all the inputting features:", train_loss_all)
scores = cross_val_score(classifier, X, y, cv=5)
accuracy_summary = "Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2)
print("accuracy Scores for all the inputting features:", scores, accuracy_summary)

Calculate the log_loss value for all the inputting features:
test log_loss value for all the inputting features: 0.73989938182


  np.exp(prob, prob)


train log_loss value for all the inputting features: 0.741926065087
accuracy Scores for all the inputting features: [ 0.69476244  0.6957755   0.69668727  0.69466113  0.69466964] Accuracy: 0.70 (+/- 0.00)


In [10]:
#Experiment1:
# Refit the logistic regression on a 7-column subset of X (positions
# 4, 7, 8, 13, 14, 15, 16) and report the same metrics as the baseline.
from sklearn.model_selection import cross_val_score
print("Experiment1: choose 4,7,8,13,14,15,16 as independent vairables")
exp1_positions = [4, 7, 8, 13, 14, 15, 16]
X_ef = X.iloc[:, exp1_positions]
X_train_ef = X_train.iloc[:, exp1_positions]
X_val_ef = X_val.iloc[:, exp1_positions]
classifier_xef = LogisticRegression().fit(X_train_ef, y_train)
y_val_pred_xef = classifier_xef.predict_proba(X_val_ef)
print("test log_loss value for 7 inputting features:", log_loss(y_val, y_val_pred_xef))
y_train_pred_xef_t = classifier_xef.predict_proba(X_train_ef)
print("train log_loss value for 7 inputting features:", log_loss(y_train, y_train_pred_xef_t))
scores_xef = cross_val_score(classifier_xef, X_ef, y, cv=5)
exp1_summary = "Accuracy: %0.2f (+/- %0.2f)" % (scores_xef.mean(), scores_xef.std() * 2)
print("accuracy Scores for 7 inputting features:", scores_xef, exp1_summary)

Experiment1: choose 4,7,8,13,14,15,16 as independent vairables
test log_loss value for 7 inputting features: 0.735695888506


  np.exp(prob, prob)


train log_loss value for 7 inputting features: 0.73622832811
accuracy Scores for 7 inputting features: [ 0.6912167   0.69547158  0.69557289  0.69314153  0.69365626] Accuracy: 0.69 (+/- 0.00)


In [11]:
#Experiment2:
# Refit the logistic regression on a 12-column subset of X (positions
# 1, 4, 5, 6, 7, 8, 10, 12, 13, 14, 15, 16) and report the same metrics.
from sklearn.model_selection import cross_val_score
print("Experiment2: choose 1,4,5,6,7,8,10,12,13,14,15,16 as independent vairables")
exp2_positions = [1, 4, 5, 6, 7, 8, 10, 12, 13, 14, 15, 16]
X_ef2 = X.iloc[:, exp2_positions]
X_train_ef2 = X_train.iloc[:, exp2_positions]
X_val_ef2 = X_val.iloc[:, exp2_positions]
classifier_xef2 = LogisticRegression().fit(X_train_ef2, y_train)
y_val_pred_xef2 = classifier_xef2.predict_proba(X_val_ef2)
print("test log_loss value for 12 inputting features:", log_loss(y_val, y_val_pred_xef2))
y_train_pred_xef_t2 = classifier_xef2.predict_proba(X_train_ef2)
print("train log_loss value for 12 inputting features:", log_loss(y_train, y_train_pred_xef_t2))
scores_xef2 = cross_val_score(classifier_xef2, X_ef2, y, cv=5)
exp2_summary = "Accuracy: %0.2f (+/- %0.2f)" % (scores_xef2.mean(), scores_xef2.std() * 2)
print("accuracy Scores for 12 inputting features:", scores_xef2, exp2_summary)

Experiment2: choose 1,4,5,6,7,8,10,12,13,14,15,16 as independent vairables
test log_loss value for 12 inputting features: 0.698265356954


  np.exp(prob, prob)


train log_loss value for 12 inputting features: 0.699332138113
accuracy Scores for 12 inputting features: [ 0.6912167   0.69628204  0.69638335  0.69496505  0.69669639] Accuracy: 0.70 (+/- 0.00)


In [12]:
# Freeze the 12-column subset from Experiment 2 as the training and
# validation inputs for the models fitted in the following cells.
cols_12v = [1, 4, 5, 6, 7, 8, 10, 12, 13, 14, 15, 16]
X_train_12v = X_train.iloc[:, cols_12v]
X_val_12v = X_val.iloc[:, cols_12v]

In [14]:
#model1:GradientBoostingClassifier model
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import cross_val_score
print("Train models:")
print("model1:GradientBosstingClassifier model")
# subsample < 1 makes every boosting stage draw a random subset of rows,
# so the estimator is seeded (like the train/validation split) to make the
# reported numbers repeatable from one run to the next.
clf = GradientBoostingClassifier(n_estimators=420, max_depth=4, subsample=0.7, random_state=123)
clf.fit(X_train_12v, y_train)
# Log-loss on the held-out validation rows.
y_val_pred2 = clf.predict_proba(X_val_12v)
print("test log_loss value:",log_loss(y_val, y_val_pred2))
# Log-loss on the rows the model was fitted on, for an overfitting check.
y_train_pred2 = clf.predict_proba(X_train_12v)
print("train log_loss value:",log_loss(y_train, y_train_pred2))
# 5-fold cross-validated accuracy on the same 12 columns of the full X.
scores1 = cross_val_score(clf, X.iloc[:,[1,4,5,6,7,8,10,12,13,14,15,16]], y, cv=5)
print("accuracy Scores:", scores1, "Accuracy: %0.2f (+/- %0.2f)" % (scores1.mean(), scores1.std() * 2))
    

Train models:
model1:GradientBosstingClassifier model
test log_loss value: 0.599552414203
train log_loss value: 0.482967695319
accuracy Scores: [ 0.73072637  0.73305643  0.7340695   0.73569041  0.73175922] Accuracy: 0.73 (+/- 0.00)


In [None]:
#model2: VotingClassifier model
# Soft-voting ensemble of logistic regression, random forest and Gaussian
# naive Bayes, with the class probabilities weighted 6 : 5 : 1.
print("model2: VotingClassifier model")
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import VotingClassifier

clf1 = LogisticRegression(random_state=1)
clf2 = RandomForestClassifier(random_state=1)
clf3 = GaussianNB()
voting_members = [('lr', clf1), ('rf', clf2), ('gnb', clf3)]
eclf = VotingClassifier(estimators=voting_members, voting='soft', weights=[6,5,1])
eclf.fit(X_train_12v, y_train)

# Log-loss on the held-out validation rows.
y_val_pred3 = eclf.predict_proba(X_val_12v)
print("test log_loss value:", log_loss(y_val, y_val_pred3))

# Log-loss on the rows the ensemble was fitted on.
y_train_pred3 = eclf.predict_proba(X_train_12v)
print("train log_loss value:", log_loss(y_train, y_train_pred3))

# 5-fold cross-validated accuracy on the same 12 columns of the full X.
voting_cv_columns = [1, 4, 5, 6, 7, 8, 10, 12, 13, 14, 15, 16]
scores2 = cross_val_score(eclf, X.iloc[:, voting_cv_columns], y, cv=5)
voting_summary = "Accuracy: %0.2f (+/- %0.2f)" % (scores2.mean(), scores2.std() * 2)
print("accuracy Scores:", scores2, voting_summary)

In [None]:
#model3: RandomForestClassifier model
print("model3: RandomForestClassifier model")
from sklearn.ensemble import RandomForestClassifier
# make_classification is not used in this cell; the import is kept in case
# other cells rely on the name being in scope.
from sklearn.datasets import make_classification
from sklearn.model_selection import cross_val_score
# Seeded like the forest inside the voting ensemble, so the reported
# metrics and the submission built from clf_ran are repeatable.
clf_ran = RandomForestClassifier(n_estimators=1000, random_state=1)
clf_ran.fit(X_train_12v, y_train)
# Log-loss on the held-out validation rows.
y_val_pred4=clf_ran.predict_proba(X_val_12v)
print("test log_loss value:",log_loss(y_val, y_val_pred4))
# Log-loss on the rows the forest was fitted on.
y_train_pred4 = clf_ran.predict_proba(X_train_12v)
print("train log_loss value:",log_loss(y_train, y_train_pred4))
# 5-fold cross-validated accuracy on the same 12 columns of the full X.
scores3 = cross_val_score(clf_ran, X.iloc[:,[1,4,5,6,7,8,10,12,13,14,15,16]], y, cv=5)
print("accuracy Scores:", scores3, "Accuracy: %0.2f (+/- %0.2f)" % (scores3.mean(), scores3.std() * 2))


In [None]:
# Load the competition test listings from JSON and preview the first rows.
test_df = pd.read_json("../input/test.json")
print("This is test data")
test_df.head()

In [None]:
# Save the raw test frame to CSV for offline inspection.
test_df.to_csv("testdata.csv", index=False)

In [None]:
# Count nulls per column and report what was actually found, rather than
# printing the conclusion before anything has been checked.
test_null_counts = test_df.isnull().sum()
if test_null_counts.any():
    print("Null values found in test data")
else:
    print("No null values in test data")
test_null_counts

In [None]:
#clean test data
# Derive the same columns that were derived for the training frame.
# A listing with 0 bedrooms would make price/bedrooms infinite, so such
# listings are counted as a single room.
test_df["price_t"] = test_df["price"] / test_df["bedrooms"].clip(lower=1)
test_df["room_sum"] = test_df["bedrooms"] + test_df["bathrooms"]
test_df["num_photos"] = test_df["photos"].apply(len)
test_df["num_features"] = test_df["features"].apply(len)
test_df["num_description_words"] = test_df["description"].apply(lambda x: len(x.split(" ")))
test_df["created"] = pd.to_datetime(test_df["created"])
test_df["created_year"] = test_df["created"].dt.year
test_df["created_month"] = test_df["created"].dt.month
test_df["created_day"] = test_df["created"].dt.day
test_df["created_hour"] = test_df["created"].dt.hour
#only choose the 12 independent variables:
# The order must match the column order the 12-feature models were fitted on.
f1 = ["bedrooms", "price", "num_photos", "num_features", "num_description_words",
      "created_year", "created_day", "created_hour"]
#deal with address data and id data in the test dataset
# The models were fitted on integer codes produced by a LabelEncoder fitted
# on the TRAINING strings. Fitting a fresh encoder on the test strings would
# hand out unrelated codes, so the same address/id would reach the model as
# a different number. The training strings in train_df have already been
# overwritten with their codes, so the raw file is read again and the
# encoder is refitted on it; LabelEncoder assigns codes by sorted unique
# value, which reproduces the training mapping.
categorical = ["display_address", "manager_id", "building_id", "street_address"]
train_raw = pd.read_json("../input/train.json")
for f in categorical:
    if test_df[f].dtype == 'object':
        lbl = preprocessing.LabelEncoder()
        lbl.fit(list(train_raw[f].values))
        code_of = {value: code for code, value in enumerate(lbl.classes_)}
        # Values never seen in training have no code; mark them with -1.
        test_df[f] = test_df[f].map(code_of).fillna(-1).astype(int)
        f1.append(f)

In [None]:
# Score the real test set with the logistic regression from Experiment 2
# (classifier_xef2). f1 is the column list assembled in the test-cleaning
# cell; the trailing expression shows the shape of the probability matrix.
X_logisticr = test_df[f1]
y_logisticr = classifier_xef2.predict_proba(X_logisticr)
y_logisticr.shape

In [None]:
# Map each interest-level label to its column in the predict_proba output.
# The probabilities in y_logisticr come from classifier_xef2, so the column
# order is read from that same estimator rather than from the baseline model.
labels2idx = {label: i for i, label in enumerate(classifier_xef2.classes_)}
labels2idx

In [None]:
# Build the logistic-regression submission. The frame starts with the raw
# probability columns, then gains the listing id and one named column per
# interest level; only those four named columns are written to the CSV.
sub_logisticr = pd.DataFrame(y_logisticr).assign(listing_id=test_df.listing_id.values)
for label in ("high", "medium", "low"):
    sub_logisticr[label] = y_logisticr[:, labels2idx[label]]
sub_logisticr[["listing_id", "high", "medium", "low"]].to_csv("submission_logisticr.csv", index=False)

In [None]:
# Score the real test set with the gradient-boosting model (clf), using the
# column list f1; the trailing expression shows the probability matrix shape.
X_GBC = test_df[f1]
y_GBC = clf.predict_proba(X_GBC)
y_GBC.shape

In [None]:
# Map each interest-level label to its column position in the
# gradient-boosting model's predict_proba output.
labels3idx = dict(zip(clf.classes_, range(len(clf.classes_))))
labels3idx

In [None]:
# Build the gradient-boosting submission. The frame starts with the raw
# probability columns, then gains the listing id and one named column per
# interest level; only those four named columns are written to the CSV.
sub_GBC = pd.DataFrame(y_GBC).assign(listing_id=test_df.listing_id.values)
for label in ("high", "medium", "low"):
    sub_GBC[label] = y_GBC[:, labels3idx[label]]
sub_GBC[["listing_id", "high", "medium", "low"]].to_csv("submission_GBC.csv", index=False)

In [None]:
# Score the real test set with the soft-voting ensemble (eclf), using the
# column list f1; the trailing expression shows the probability matrix shape.
X_VC = test_df[f1]
y_VC = eclf.predict_proba(X_VC)
y_VC.shape

In [None]:
# Map each interest-level label to its column position in the voting
# ensemble's predict_proba output.
labels4idx = dict(zip(eclf.classes_, range(len(eclf.classes_))))
labels4idx

In [None]:
# Build the voting-ensemble submission. The frame starts with the raw
# probability columns, then gains the listing id and one named column per
# interest level; only those four named columns are written to the CSV.
sub_VC = pd.DataFrame(y_VC).assign(listing_id=test_df.listing_id.values)
for label in ("high", "medium", "low"):
    sub_VC[label] = y_VC[:, labels4idx[label]]
sub_VC[["listing_id", "high", "medium", "low"]].to_csv("submission_VC.csv", index=False)

In [None]:
# Score the real test set with the random-forest model (clf_ran), using the
# column list f1; the trailing expression shows the probability matrix shape.
X_RAN = test_df[f1]
y_RAN = clf_ran.predict_proba(X_RAN)
y_RAN.shape

In [None]:
# Map each interest-level label to its column position in the random
# forest's predict_proba output.
labels5idx = dict(zip(clf_ran.classes_, range(len(clf_ran.classes_))))
labels5idx

In [None]:
# Build the random-forest submission. The frame starts with the raw
# probability columns, then gains the listing id and one named column per
# interest level; only those four named columns are written to the CSV.
sub_RAN = pd.DataFrame(y_RAN).assign(listing_id=test_df.listing_id.values)
for label in ("high", "medium", "low"):
    sub_RAN[label] = y_RAN[:, labels5idx[label]]
sub_RAN[["listing_id", "high", "medium", "low"]].to_csv("submission_RAN.csv", index=False)