In [1]:
import pylab
import calendar
import numpy as np
import pandas as pd
import seaborn as sns
from scipy import stats
from datetime import datetime
import matplotlib.pyplot as plt
import random

In [2]:
data=pd.read_json("train.json")

In [3]:
data.columns

Index([u'bathrooms', u'bedrooms', u'building_id', u'created', u'description',
       u'display_address', u'features', u'interest_level', u'latitude',
       u'listing_id', u'longitude', u'manager_id', u'photos', u'price',
       u'street_address'],
      dtype='object')

In [4]:
random.seed(0)
train_data=data.sample(n=data.shape[0]*7/10)
test_data=data.drop(train_data.index)

In [5]:
%matplotlib inline

In [132]:
##########################first try with all numeric features in datasets##########################################
train=train_data.loc[:,['bathrooms','bedrooms','latitude','longitude','price','interest_level']]
test=test_data.loc[:,['bathrooms','bedrooms','latitude','longitude','price','interest_level']]

In [7]:
#base model 1: multinomial logistic regression
#base model 2: bagged decision trees
#base model 3: Random Forest trees
#base model 4: SVM
#base model 5: bayes classifier?
x_train=train.loc[:,['bathrooms','bedrooms','latitude','longitude','price']]
y_train=train.loc[:,'interest_level']
x_test=test.loc[:,['bathrooms','bedrooms','latitude','longitude','price']]
y_test=test.loc[:,'interest_level']

In [8]:
#base model 1: multinomial logistic regression
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
def mlog(x_train,y_train,x_test):
    lr = LogisticRegression().fit(x_train, y_train)
    return lr.predict(x_test)

In [11]:
###accuracy on total train and test
accuracy_score(mlog(x_train,y_train,x_test),test.loc[:,'interest_level'])

0.6960691611508848

In [49]:
#base model 2: bagged decision trees
from sklearn import model_selection
from sklearn.ensemble import BaggingClassifier
from sklearn.tree import DecisionTreeClassifier
def bagDT(x_train,y_train,x_test,y_test):
    kfold = model_selection.KFold(n_splits=10)
    cart = DecisionTreeClassifier()
    num_trees = 100
    model = BaggingClassifier(base_estimator=cart, n_estimators=num_trees, random_state=seed)
    model.fit(x_train,y_train)
    predict = model_selection.cross_val_predict(model, x_test, y_test, cv=kfold)
    return predict

In [50]:
bagDT(x_train,y_train,x_test,y_test)

array([u'low', u'low', u'high', ..., u'high', u'low', u'low'], dtype=object)

In [13]:
#base model 3: Random Forest trees
from sklearn.ensemble import RandomForestClassifier
seed = 7
def rfClassifier(x_train,y_train,x_test,y_test):
    num_trees = 100
    max_features = 3
    kfold = model_selection.KFold(n_splits=10, random_state=seed)
    model = RandomForestClassifier(n_estimators=num_trees, max_features=max_features)
    model.fit(x_train,y_train)
    predicted = model_selection.cross_val_predict(model, x_test, y_test, cv=kfold)
    return predicted

In [14]:
#base model 4: SVM
#use cross validation to select cost, cost(default)=1
from sklearn import svm
def svmm(x_train,y_train,x_test):
    clf = svm.SVC(decision_function_shape='ovr')
    clf.fit(x_train, y_train) 
    return clf.predict(x_test)

In [None]:
#accuracy_score(svmm(x_train,y_train,x_test),test_data.loc[:,'interest_level'])

In [63]:
#base model 5: Naive Bayes Classifier
from sklearn.naive_bayes import GaussianNB
def mnb(x_train,y_train,x_test):
    gnb=GaussianNB()
    y_pred = gnb.fit(x_train, y_train).predict(x_test)
    return y_pred

In [55]:
###############################now we ensemble them ###########################################################
#1st: partition the train set into 5 test sets
k=x_train.shape[0]/5
x_sp=[[],[],[],[],[]]
for i in range(4):
    sample=random.sample(x_train.index,k)
    x_sp[i]=x_train.ix[sample]
    x_train=x_train.drop(sample)
x_sp[4]=x_train

In [68]:
#2nd: create train_meta and test_meta

train_meta=pd.DataFrame()
train_meta

In [69]:
#3rd: for each fold in 1st, use other 4 folds as training set to predict the result for that fold.
#and save them in train_meta
x_train=train.loc[:,['bathrooms','bedrooms','latitude','longitude','price']]
for i in range(5):
    x_sub_test=x_sp[i]
    x_sub_train=x_train.drop(x_sub_test.index)
    y_sub_test=y_train[x_sub_test.index]
    y_sub_train=y_train[x_sub_train.index]
    M1=pd.Series(mlog(x_sub_train,y_sub_train,x_sub_test),index=x_sub_test.index)
    M2=pd.Series(bagDT(x_sub_train,y_sub_train,x_sub_test,y_sub_test),index=x_sub_test.index)
    M3=pd.Series(rfClassifier(x_sub_train,y_sub_train,x_sub_test,y_sub_test),index=x_sub_test.index)
    M4=pd.Series(svmm(x_sub_train,y_sub_train,x_sub_test),index=x_sub_test.index)
    M5=pd.Series(mnb(x_sub_train,y_sub_train,x_sub_test),index=x_sub_test.index)
    app={'M1':M1,'M2':M2,'M3':M3, 'M4':M4, 'M5':M5}
    train_meta=train_meta.append(pd.DataFrame(app))

In [71]:
#4th:Fit each base model to the full training dataset 
#and make predictions on the test dataset. Store these predictions inside test_meta
M1=pd.Series(mlog(x_train,y_train,x_test),index=x_test.index)
M2=pd.Series(bagDT(x_train,y_train,x_test,y_test),index=x_test.index)
M3=pd.Series(rfClassifier(x_train,y_train,x_test,y_test),index=x_test.index)
M4=pd.Series(svmm(x_train,y_train,x_test),index=x_test.index)
M5=pd.Series(mnb(x_train,y_train,x_test),index=x_test.index)
test['M1']=M1
test['M2']=M2
test['M3']=M3
test['M4']=M4
test['M5']=M5


In [99]:
#5th: Fit a new model, S (i.e the stacking model) to train_meta, using M1 and M2 as features.
#Optionally, include other features from the original training dataset or engineered features
##==> transfer to dummy variables
train_meta_dummy=pd.get_dummies(train_meta)
test_meta=test.loc[:,['M1','M2','M3','M4','M5']]
test_meta_dummy=pd.get_dummies(test_meta)

In [101]:
###using only meta data to test##########
accuracy_score(mlog(train_meta_dummy,y_train,test_meta_dummy),y_test)
buffer1=pd.Series(mlog(train_meta_dummy,y_train,test_meta_dummy))
buffer1.value_counts()

0.69714980413345939

In [127]:
#random forest
res=rfClassifier(train_meta_dummy,y_train,test_meta_dummy,y_test)
print accuracy_score(res,y_test)
pd.Series(res).value_counts()

0.705727407808


low       13793
medium      805
high        208
dtype: int64

In [125]:
####concate meta to original dataset and train it again########

In [142]:
x_last_train=pd.concat([train_meta_dummy,x_train],axis=1)
x_last_test=pd.concat([test_meta_dummy,x_test],axis=1)

In [143]:
#logistic
buffer2=pd.Series(mlog(x_last_train,y_train,x_last_test))
print accuracy_score(buffer1,y_test)
buffer2.value_counts()

0.697149804133


low    14806
dtype: int64

In [144]:
#random forest
res=rfClassifier(x_last_train,y_train,x_last_test,y_test)
print accuracy_score(res,y_test)
pd.Series(res).value_counts()

0.724571119816


low       11990
medium     2208
high        608
dtype: int64