In [13]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import sklearn.cross_validation as CV
from sklearn.ensemble import GradientBoostingRegressor as GBR
from sklearn.feature_extraction import DictVectorizer as DV
from datetime import date
import itertools

%matplotlib inline

In [2]:
def myScore(y, y_pred):
    res = np.ndarray([1,2]);
    u = (np.power((y-y_pred),2)).sum();
    v = (np.power((y-y.mean()),2)).sum();
    res[0,0] = 1-(u/v);

    
    yLog = np.log(y);
    ind = np.isinf(yLog);
    yLog[ind == True] = 0; 
    y_predLog = np.log(y_pred);
    ind = np.isinf(y_predLog);
    y_predLog[ind == True] = 0;   
    summ = np.sum(np.power(yLog-y_predLog,2))
    res[0,1] = np.sqrt(summ/y.shape[0]);

    return res

# replace qualitative estimation to number
def replaceQualVal(dataSet,fNameList):
    # dictionary of values
    vocab = {
        'Ex': 5, 'EX': 5, # excellent    
        'Gd': 4, 'GD': 4, # good
        'TA': 3, 'Ta': 3, # normal
        'FA': 2, 'Fa': 2, # fair
        'PO': 1, 'Po': 1  # 
        }
    
    for fName in fNameList:
        
        # replace stings to numbers
        for word in vocab:
            searchDict = {fName:[word]};
            X = dataSet.isin(searchDict);
            dataSet.loc[X[fName],fName] = vocab[word];
        
        # convert to numeric type
        dataSet[[fName]] = dataSet[[fName]].apply(pd.to_numeric);
    return dataSet;

# replace YN estimation to number
def replaceYNVal(dataSet,fNameList):
    # dictionary of values
    vocab = {
        'Yes': 1, 'Y': 5, # yes
        'No': 1, 'N': 1  # no
        }
    
    for fName in fNameList:
        
        # replace stings to numbers
        for word in vocab:
            searchDict = {fName:[word]};
            X = dataSet.isin(searchDict);
            dataSet.loc[X[fName],fName] = vocab[word];
        
        # convert to numeric type
        dataSet[[fName]] = dataSet[[fName]].apply(pd.to_numeric);
    return dataSet;

# main fit function
def fitData(folds,regressor,features):
    
    
    num_features = features.select_dtypes(exclude=['object']);
    num_features.fillna(0,inplace=True);
    
    obj_features = features.select_dtypes(include=['object']);
    obj_features.fillna('empty',inplace=True)
    
    encoder = DV(sparse = False);
    encoded_data = encoder.fit_transform(obj_features.T.to_dict().values());
    
    newFeatures = np.hstack([num_features, encoded_data]);

    score = np.empty([1,2]);
    
    for [trainInds, testInds] in folds:
        regressor.fit(newFeatures[trainInds,:],price[trainInds]);
        y_pr = regressor.predict(newFeatures[testInds,:]);
        pr = myScore(price[testInds],y_pr);
        print pr
        score =  np.append(score,pr,axis = 0);
    
    score = np.delete(score,0,0);
    return score

In [3]:
data = pd.read_csv('train.csv',index_col='Id')
features = data.drop('SalePrice',axis = 1)
price = data.SalePrice;

In [4]:
# define regressor
trees = GBR(verbose = 0, n_estimators = 1000, max_depth = 3);

# define cross-validation folds
nFolds = 10;
folds = CV.KFold(price.size, n_folds=nFolds, random_state = 43);

In [16]:
features = data.drop('SalePrice',axis = 1)
price = data['SalePrice'].get_values()


#features["OverallQualLog"]=np.log(features["OverallQual"])
#features.drop(['OverallQual'],axis = 1,inplace = True);

# drop heating type
#features.drop('Heating',axis = 1,inplace = True);
#features.drop(['MoSold', 'YrSold'],axis = 1,inplace = True);
features = replaceYNVal(features,{'CentralAir'});

# process GarageYrBlt
empty = pd.isnull(features['GarageYrBlt']);
ind = empty[empty == True].index;
features.loc[ind.values,'GarageYrBlt'] = features.loc[ind.values,'YearBuilt'];

# add new feature
features.loc[:,'houseAgeLog'] = np.log(date.today().year - features.loc[:,'YearBuilt']);
features.loc[:,'garageAgeLog'] = np.log(date.today().year - features.loc[:,'GarageYrBlt']);
features.loc[:,'remodeAge'] = features.loc[:,'YearRemodAdd'] - features.loc[:,'YearBuilt'];


featureList = {'ExterQual','ExterCond','BsmtQual','BsmtCond','PoolQC','HeatingQC','KitchenQual','GarageQual',
               'GarageCond','FireplaceQu','PoolQC'};
res = list();

for fNames in itertools.combinations(featureList, 1):
    print fNames
    newFeatures = replaceQualVal(features,fNames);
    
    num_features = newFeatures.select_dtypes(exclude=['object']);
    num_features.fillna(0,inplace=True);
    
    obj_features = newFeatures.select_dtypes(include=['object']);
    obj_features.fillna('empty',inplace=True)
    encoder = DV(sparse = False);
    encoded_data = encoder.fit_transform(obj_features.T.to_dict().values());
    arrFeatures = np.hstack([num_features, encoded_data]);
       
    y_pr = CV.cross_val_predict(trees, arrFeatures, y=price, cv=folds, n_jobs=16, verbose=0)
    sc = myScore(price, y_pr);
    res.append([fName,sc]);
    print sc
    
for fNames in itertools.combinations(featureList, 2):
    print fNames
    newFeatures = replaceQualVal(features,fNames);
    
    num_features = newFeatures.select_dtypes(exclude=['object']);
    num_features.fillna(0,inplace=True);
    
    obj_features = newFeatures.select_dtypes(include=['object']);
    obj_features.fillna('empty',inplace=True)
    encoder = DV(sparse = False);
    encoded_data = encoder.fit_transform(obj_features.T.to_dict().values());
    arrFeatures = np.hstack([num_features, encoded_data]);
       
    y_pr = CV.cross_val_predict(trees, arrFeatures, y=price, cv=folds, n_jobs=8, verbose=0)
    sc = myScore(price, y_pr);
    res.append([fName,sc]);
    print sc
    
for fNames in itertools.combinations(featureList, 3):
    print fNames
    newFeatures = replaceQualVal(features,fNames);
    
    num_features = newFeatures.select_dtypes(exclude=['object']);
    num_features.fillna(0,inplace=True);
    
    obj_features = newFeatures.select_dtypes(include=['object']);
    obj_features.fillna('empty',inplace=True)
    encoder = DV(sparse = False);
    encoded_data = encoder.fit_transform(obj_features.T.to_dict().values());
    arrFeatures = np.hstack([num_features, encoded_data]);
       
    y_pr = CV.cross_val_predict(trees, arrFeatures, y=price, cv=folds, n_jobs=8, verbose=0)
    sc = myScore(price, y_pr);
    res.append([fName,sc]);
    print sc
    
for fNames in itertools.combinations(featureList, 4):
    print fNames
    newFeatures = replaceQualVal(features,fNames);
    
    num_features = newFeatures.select_dtypes(exclude=['object']);
    num_features.fillna(0,inplace=True);
    
    obj_features = newFeatures.select_dtypes(include=['object']);
    obj_features.fillna('empty',inplace=True)
    encoder = DV(sparse = False);
    encoded_data = encoder.fit_transform(obj_features.T.to_dict().values());
    arrFeatures = np.hstack([num_features, encoded_data]);
       
    y_pr = CV.cross_val_predict(trees, arrFeatures, y=price, cv=folds, n_jobs=8, verbose=0)
    sc = myScore(price, y_pr);
    res.append([fName,sc]);
    print sc
    
for fNames in itertools.combinations(featureList, 5):
    print fNames
    newFeatures = replaceQualVal(features,fNames);
    
    num_features = newFeatures.select_dtypes(exclude=['object']);
    num_features.fillna(0,inplace=True);
    
    obj_features = newFeatures.select_dtypes(include=['object']);
    obj_features.fillna('empty',inplace=True)
    encoder = DV(sparse = False);
    encoded_data = encoder.fit_transform(obj_features.T.to_dict().values());
    arrFeatures = np.hstack([num_features, encoded_data]);
       
    y_pr = CV.cross_val_predict(trees, arrFeatures, y=price, cv=folds, n_jobs=8, verbose=0)
    sc = myScore(price, y_pr);
    res.append([fName,sc]);
    print sc    

('GarageCond',)
[[ 0.89742668  0.12401376]]
('BsmtCond',)
[[ 0.89815044  0.1232306 ]]
('HeatingQC',)
[[ 0.89734659  0.12408832]]
('GarageQual',)
[[ 0.89677224  0.12316641]]
('ExterCond',)
[[ 0.89452388  0.12473895]]
('ExterQual',)
[[ 0.90062564  0.12319583]]
('FireplaceQu',)
[[ 0.89517397  0.12426535]]
('KitchenQual',)
[[ 0.89797255  0.12464263]]
('BsmtQual',)
[[ 0.89624575  0.12398415]]
('PoolQC',)
[[ 0.89783051  0.12394848]]
('GarageCond', 'BsmtCond')
[[ 0.89786375  0.1237549 ]]
('GarageCond', 'HeatingQC')
[[ 0.89516854  0.12451854]]
('GarageCond', 'GarageQual')
[[ 0.89755265  0.12425586]]
('GarageCond', 'ExterCond')
[[ 0.89846682  0.12326589]]
('GarageCond', 'ExterQual')
[[ 0.89618775  0.12438076]]
('GarageCond', 'FireplaceQu')
[[ 0.89759292  0.12413761]]
('GarageCond', 'KitchenQual')
[[ 0.89576183  0.12398255]]
('GarageCond', 'BsmtQual')
[[ 0.89817232  0.12387093]]
('GarageCond', 'PoolQC')
[[ 0.89783313  0.12420547]]
('BsmtCond', 'HeatingQC')
[[ 0.89909881  0.12350619]]
('BsmtCond'

In [10]:
features = data.drop('SalePrice',axis = 1)
price = data['SalePrice'].get_values()


#features["OverallQualLog"]=np.log(features["OverallQual"])
#features.drop(['OverallQual'],axis = 1,inplace = True);

# drop heating type
#features.drop('Heating',axis = 1,inplace = True);
#features.drop(['MoSold', 'YrSold'],axis = 1,inplace = True);
features = replaceYNVal(features,{'CentralAir'});

# process GarageYrBlt
empty = pd.isnull(features['GarageYrBlt']);
ind = empty[empty == True].index;
features.loc[ind.values,'GarageYrBlt'] = features.loc[ind.values,'YearBuilt'];

# add new feature
features.loc[:,'houseAgeLog'] = np.log(date.today().year - features.loc[:,'YearBuilt']);
features.loc[:,'garageAgeLog'] = np.log(date.today().year - features.loc[:,'GarageYrBlt']);
features.loc[:,'remodeAge'] = features.loc[:,'YearRemodAdd'] - features.loc[:,'YearBuilt'];
newFeatures = features;

num_features = newFeatures.select_dtypes(exclude=['object']);
num_features.fillna(0,inplace=True);
    
obj_features = newFeatures.select_dtypes(include=['object']);
obj_features.fillna('empty',inplace=True)
encoder = DV(sparse = False);
encoded_data = encoder.fit_transform(obj_features.T.to_dict().values());
arrFeatures = np.hstack([num_features, encoded_data]);
       
y_pr = CV.cross_val_predict(trees, arrFeatures, y=price, cv=folds, n_jobs=8, verbose=1)
print myScore(price, y_pr)

[[ 0.90359605  0.12205031]]


[Parallel(n_jobs=8)]: Done  10 out of  10 | elapsed:   30.8s finished


In [17]:
res

[['BsmtCond', array([[ 0.89742668,  0.12401376]])],
 ['BsmtCond', array([[ 0.89815044,  0.1232306 ]])],
 ['BsmtCond', array([[ 0.89734659,  0.12408832]])],
 ['BsmtCond', array([[ 0.89677224,  0.12316641]])],
 ['BsmtCond', array([[ 0.89452388,  0.12473895]])],
 ['BsmtCond', array([[ 0.90062564,  0.12319583]])],
 ['BsmtCond', array([[ 0.89517397,  0.12426535]])],
 ['BsmtCond', array([[ 0.89797255,  0.12464263]])],
 ['BsmtCond', array([[ 0.89624575,  0.12398415]])],
 ['BsmtCond', array([[ 0.89783051,  0.12394848]])],
 ['BsmtCond', array([[ 0.89786375,  0.1237549 ]])],
 ['BsmtCond', array([[ 0.89516854,  0.12451854]])],
 ['BsmtCond', array([[ 0.89755265,  0.12425586]])],
 ['BsmtCond', array([[ 0.89846682,  0.12326589]])],
 ['BsmtCond', array([[ 0.89618775,  0.12438076]])],
 ['BsmtCond', array([[ 0.89759292,  0.12413761]])],
 ['BsmtCond', array([[ 0.89576183,  0.12398255]])],
 ['BsmtCond', array([[ 0.89817232,  0.12387093]])],
 ['BsmtCond', array([[ 0.89783313,  0.12420547]])],
 ['BsmtCond'