In [1]:
import graphlab

In [2]:
sales = graphlab.SFrame('kc_house_data.gl/')

2016-04-04 13:11:33,792 [INFO] graphlab.cython.cy_server, 176: GraphLab Create v1.8.5 started. Logging: C:\Users\dato\AppData\Local\Temp\graphlab_server_1459743091.log.0


This non-commercial license of GraphLab Create is assigned to dohk@koreatech.ac.kr and will expire on January 17, 2017. For commercial licensing options, visit https://dato.com/buy/.


In [3]:
train_data, test_data = sales.random_split(.8,seed=0)

In [4]:
example_features = ['sqft_living','bedrooms','bathrooms']

## Learning a multiple regression model

In [5]:
example_model = graphlab.linear_regression.create(train_data,target='price',features=example_features,validation_set=None)

# 다변수(여러 개의 feature들)로 이루어진 회귀 모델을 fitting하는 단계. 

In [6]:
# data를 모델에 성공적으로 적용했으므로, 회귀 가중치(regression weights;coefficients)를 뽑아낼 수 있다.

example_weight_summary = example_model.get("coefficients")

In [7]:
print example_weight_summary

+-------------+-------+----------------+---------------+
|     name    | index |     value      |     stderr    |
+-------------+-------+----------------+---------------+
| (intercept) |  None | 87910.0724924  |  7873.3381434 |
| sqft_living |  None | 315.403440552  | 3.45570032585 |
|   bedrooms  |  None | -65080.2155528 | 2717.45685442 |
|  bathrooms  |  None | 6944.02019265  | 3923.11493144 |
+-------------+-------+----------------+---------------+
[4 rows x 4 columns]



In [8]:
example_predictions = example_model.predict(train_data)

In [9]:
print example_predictions[0]

271789.505878


In [46]:
def get_residual_sum_of_squares(model, data, outcome):
    # First get the predictions
    predicted_data = model.predict(data)
    sum_of_predicted_data = predicted_data.sum()
    print sum_of_predicted_data
    
    # Then compute the residuals/errors
    rss = outcome.sum() - sum_of_predicted_data
    print "original data.sum() is : " + str(outcome.sum())
    print "residual : " + str(rss)
    
    # Then square and add them up
    squared_rss = rss*rss
    RSS = squared_rss 
    print "rss is " +str(squared_rss)
    
    return RSS

In [11]:
rss_example_train = get_residual_sum_of_squares(example_model, test_data, test_data['price'])

2277972837.72
original data.sum() is : 2296575546.0
residual : 18602708.2762
3.46060755208e+14


In [12]:
print rss_example_train

#                나의 결과         |     나왔어야 할 값
#RSS       |  3.46060755208e+14    |     2.7376153833e+14
#residual  |  1.86027082762e+7     |     1.6545740791e+7
#predicted |  2.27797283772e+9     |     2.27797283772e+9
#무시해도 괜찮은 수준인가..

3.46060755208e+14


# Create some new features

In [13]:
from math import log

In [21]:
# interaction feature를 이용하기 위해 이런식의 조작을 한다.

# bedrooms * bedrooms
train_data['bedrooms_squared'] = train_data['bedrooms'].apply(lambda x: x**2)
test_data['bedrooms_squared'] = test_data['bedrooms'].apply(lambda x: x**2)

# bedrooms * bathrooms
train_data['bed_bath_rooms'] = train_data['bedrooms']*train_data['bathrooms']
test_data['bed_bath_rooms'] = test_data['bedrooms']*test_data['bathrooms']

# log of squarefeet
train_data['log_sqft_living'] = train_data['sqft_living'].apply(lambda x: log(x))
test_data['log_sqft_living'] = test_data['sqft_living'].apply(lambda x: log(x))

# latitude
train_data['lat_plus_long'] = train_data['lat']+train_data['long']
test_data['lat_plus_long'] = test_data['lat']+test_data['long']

In [25]:
mean1 = test_data['bedrooms_squared'].mean()
mean2 = test_data['bed_bath_rooms'].mean()
mean3 = test_data['log_sqft_living'].mean()
mean4 = test_data['lat_plus_long'].mean()

print "each of the test_data's mean is : "
print  str(mean1)
print  str(mean2)
print  str(mean3)
print  str(mean4)

each of the test_data's mean is : 
12.4466777016
7.50390163159
7.55027467965
-74.6533349722


#Learning multiple models
#### [1] feature 설정

In [40]:
# 이제, 다변수 집 값 예측을 위하여 아래와 같은 과정을 거친다.

# model #1은 세 모델 중 가장 적은 개수의 feature를 갖게 한다. model1이 갖는 feature는 다음과 같다.
#  --- squarefeet, # bedrooms, #bathrooms, latitude, longtitude

# model #2는 첫 번째 모델에서 feature 한 개가 더 추가 된 모델이다. ; bedrooms*bathrooms

# model #3는 두 번째 모델에서 여러 개의 feature가 더 추가 된 모델이다. 다음과 같은 feature들을 갖는다.
# --- log squarefeet, bedrooms squared, latitude+longtitude

model_1_feature = ['sqft_living', 'bedrooms', 'bathrooms', 'lat', 'long']
model_2_feature = model_1_feature+['bed_bath_rooms']
model_3_feature = model_2_feature+['log_sqft_living', 'bedrooms_squared', 'lat_plus_long']


####[2] 모델 생성

In [41]:
model_1 = graphlab.linear_regression.create(train_data, target='price', features=model_1_feature, validation_set=None)

In [42]:
model_2 = graphlab.linear_regression.create(train_data, target='price', features=model_2_feature, validation_set=None)

In [43]:
model_3 = graphlab.linear_regression.create(train_data, target='price', features=model_3_feature, validation_set=None)

####[3] weights(coefficients) 추출

In [44]:
model_1_coefficients = model_1.get('coefficients')
model_2_coefficients = model_2.get('coefficients')
model_3_coefficients = model_3.get('coefficients')

print "coefficients of model 1 is following"
print model_1_coefficients

print "coefficients of model 2 is following"
print model_2_coefficients

print "coefficients of model 3 is following"
print model_3_coefficients

coefficients of model 1 is following
+-------------+-------+----------------+---------------+
|     name    | index |     value      |     stderr    |
+-------------+-------+----------------+---------------+
| (intercept) |  None | -56140675.7444 | 1649985.42028 |
| sqft_living |  None | 310.263325778  | 3.18882960408 |
|   bedrooms  |  None | -59577.1160683 | 2487.27977322 |
|  bathrooms  |  None | 13811.8405419  | 3593.54213297 |
|     lat     |  None | 629865.789485  | 13120.7100323 |
|     long    |  None | -214790.285186 | 13284.2851607 |
+-------------+-------+----------------+---------------+
[6 rows x 4 columns]

coefficients of model 2 is following
+----------------+-------+----------------+---------------+
|      name      | index |     value      |     stderr    |
+----------------+-------+----------------+---------------+
|  (intercept)   |  None | -54410676.1152 | 1650405.16541 |
|  sqft_living   |  None | 304.449298056  | 3.20217535637 |
|    bedrooms    |  None | -116366

# Comparing multiple models
## evaluate which model is best
####[1] RSS

#### on Testing data

In [47]:
rss_example_model_1 = get_residual_sum_of_squares(model_1, test_data, test_data['price'])

2291043421.74
original data.sum() is : 2296575546.0
residual : 5532124.25967
rss is 3.06043988244e+13


In [48]:
rss_example_model_2 = get_residual_sum_of_squares(model_2, test_data, test_data['price'])

2290440224.87
original data.sum() is : 2296575546.0
residual : 6135321.13281
rss is 3.76421654027e+13


In [49]:
rss_example_model_3 = get_residual_sum_of_squares(model_3, test_data, test_data['price'])

2285802201.22
original data.sum() is : 2296575546.0
residual : 10773344.7835
rss is 1.16064957824e+14


#### on Training data

In [50]:
rss_example_model_1 = get_residual_sum_of_squares(model_1, train_data, train_data['price'])

9376349465.0
original data.sum() is : 9376349465.0
residual : 0.000225067138672
rss is 5.06552169099e-08


In [51]:
get_residual_sum_of_squares(model_2, train_data, train_data['price'])

9376349465.0
original data.sum() is : 9376349465.0
residual : 0.000263214111328
rss is 6.92816684023e-08


6.928166840225458e-08

In [52]:
get_residual_sum_of_squares(model_3, train_data, train_data['price'])

9376349465.0
original data.sum() is : 9376349465.0
residual : -0.000200271606445
rss is 4.01087163482e-08


4.0108716348186135e-08