In [1]:
from datetime import datetime
import pickle
import pandas as pd
import sklearn.model_selection
import sklearn.linear_model
import sklearn.metrics
from sklearn.pipeline import Pipeline
from sklearn.ensemble import GradientBoostingRegressor
import glob
from sklearn.base import BaseEstimator, TransformerMixin

In [2]:
# Odczytanie danych z plików
PATH = r'C:\Users\Wojtek\Google Drive\Studia\Semestr 4\Technologie_Sieciowe_Lab_A_Szwabe\REEML\uploads'

all_files = glob.glob(PATH + "/*")

all_files

['C:\\Users\\Wojtek\\Google Drive\\Studia\\Semestr 4\\Technologie_Sieciowe_Lab_A_Szwabe\\REEML\\uploads\\ceny_mieszkan_w_poznaniu.tsv',
 'C:\\Users\\Wojtek\\Google Drive\\Studia\\Semestr 4\\Technologie_Sieciowe_Lab_A_Szwabe\\REEML\\uploads\\test5.tsv']

In [3]:
li = []

for filename in all_files:
    df = pd.read_csv(filename, index_col=None, header=0, sep='\t')
    li.append(df)

r = pd.concat(li, axis=0, ignore_index=True)
print("[",datetime.now().strftime("%d/%b/%y %H:%M:%S") ,"] Rows count for data model training: [", r.__len__(),"]")

[ 28/May/19 18:53:40 ] Rows count for data model training: [ 1674 ]


## Linear Regression (with Feature Enchancing, Pipelineing)

In [4]:
r_train, r_test = sklearn.model_selection.train_test_split(r, test_size=0.2, random_state=2)

In [5]:
model = sklearn.linear_model.LinearRegression()

In [6]:
class FeatureEnhancer(BaseEstimator):
    def fit(self, X, y=None):
        return self

    def transform(self, X):      
        
        def get_is_center(district):
            if district is "Wilda":
                return 1
            return 0
        
        out = X.copy()
        out['is_center'] = X['location'].map(get_is_center)
        #print(out)
        return out[['sqrMeters','rooms','is_center']]

In [7]:
features = FeatureEnhancer()
p = Pipeline([
    ('feature selection', features),
    ('regression', model)
])

In [8]:
label = ['price']
x_train = r_train  # [features]
y_train = r_train[label].values.reshape(-1, 1)
y_test = r_test[label].values.reshape(-1, 1)

In [9]:
p.fit(x_train, y_train)

Pipeline(memory=None,
         steps=[('feature selection', FeatureEnhancer()),
                ('regression',
                 LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None,
                                  normalize=False))],
         verbose=False)

In [10]:
print(p.score(r_test, y_test))

0.2749055683193399


In [11]:
input_df = pd.DataFrame({
    'isNew': pd.Series([False]),
    'rooms': pd.Series([4]),
    'floor': pd.Series([0]),
    'location': pd.Series(['Centrum']),
    'sqrMeters': pd.Series([60])
})

p.predict(input_df)

array([[486318.75427999]])

## Gradient Boosting Regressor

### Encoding Locations

In [12]:
from sklearn import preprocessing
le = preprocessing.LabelEncoder()

In [13]:
encoded_locations = le.fit(r['location']) 

In [14]:
le.classes_

array(['Antoninek', 'Bonin', 'Centrum', 'Chwaliszewo', 'Dolna',
       'Dąbrowskiego', 'Dębiec', 'Garbary', 'Grunwald', 'Górczyn',
       'Głuszyna', 'Jeżyce', 'Kobylepole', 'Komandoria', 'Komorniki',
       'Kopernika', 'Malta', 'Morasko', 'Nadolnik', 'Naramowice', 'Nowe',
       'Ogrody', 'Piatkowo', 'Piątkowo', 'Plewiska', 'Podolany', 'Rataje',
       'Smochowice', 'Sołacz', 'Stare', 'Starołęka', 'Strzeszyn', 'Suchy',
       'Szczepankowo', 'Wilczak', 'Wilda', 'Winiary', 'Winogrady',
       'Zawady', 'Ławica', 'Łazarz', 'Śródka', 'Świerczewo', 'Żegrze'],
      dtype=object)

In [15]:
encoded_locations = le.transform(r['location'])
encoded_locations

array([ 2, 28, 28, ...,  1, 25,  2])

In [16]:
le.inverse_transform(encoded_locations)

array(['Centrum', 'Sołacz', 'Sołacz', ..., 'Bonin', 'Podolany', 'Centrum'],
      dtype=object)

In [17]:
r['location'] = le.transform(r['location'])

In [18]:
train1 = r
labels = r['price']
train1 = r.drop(['price'],axis=1)
x_train , x_test , y_train , y_test = sklearn.model_selection.train_test_split(train1 , labels , test_size = 0.10,random_state =2)

In [19]:
train1.head()

Unnamed: 0,isNew,rooms,floor,location,sqrMeters
0,False,3,1,2,78
1,False,3,2,28,62
2,False,3,0,28,15
3,False,4,0,28,14
4,False,3,0,28,15


In [20]:
labels.head()

0    476118.0
1    459531.0
2    411557.0
3    496416.0
4    406032.0
Name: price, dtype: float64

In [21]:
y_train.head()

1230    263581.0
111     549000.0
1181    249000.0
795     489000.0
1150    329000.0
Name: price, dtype: float64

### Train model using GradienBoostingRegressor

In [22]:
clf = GradientBoostingRegressor(n_estimators = 400, max_depth = 5, min_samples_split = 2,
          learning_rate = 0.1, loss = 'ls')

In [23]:
clf.fit(x_train, y_train)

GradientBoostingRegressor(alpha=0.9, criterion='friedman_mse', init=None,
                          learning_rate=0.1, loss='ls', max_depth=5,
                          max_features=None, max_leaf_nodes=None,
                          min_impurity_decrease=0.0, min_impurity_split=None,
                          min_samples_leaf=1, min_samples_split=2,
                          min_weight_fraction_leaf=0.0, n_estimators=400,
                          n_iter_no_change=None, presort='auto',
                          random_state=None, subsample=1.0, tol=0.0001,
                          validation_fraction=0.1, verbose=0, warm_start=False)

In [24]:
clf.score(x_test,y_test)

0.363171441708651

In [25]:
input_df = pd.DataFrame({
    'isNew': pd.Series([False]),
    'rooms': pd.Series([4]),
    'floor': pd.Series([0]),
    'location': pd.Series(le.transform(['Centrum'])),
    'sqrMeters': pd.Series([60])
})

input_df

Unnamed: 0,isNew,rooms,floor,location,sqrMeters
0,False,4,0,2,60


In [26]:
clf.predict(input_df)

array([590429.8832152])

## Gradient Boosting Classifier

In [27]:
from sklearn.datasets import make_hastie_10_2
from sklearn.ensemble import GradientBoostingClassifier

In [28]:
clf = GradientBoostingClassifier(n_estimators=100, learning_rate=1.0,
    max_depth=1, random_state=0).fit(x_train, y_train)

ValueError: Unknown label type: 'continuous'

In [29]:
clf.score(x_test, y_test)  

0.363171441708651

## Save Model

In [30]:
pickle.dump(p, open("model.pkl", "wb"))

In [31]:
r.head()

Unnamed: 0,price,isNew,rooms,floor,location,sqrMeters
0,476118.0,False,3,1,2,78
1,459531.0,False,3,2,28,62
2,411557.0,False,3,0,28,15
3,496416.0,False,4,0,28,14
4,406032.0,False,3,0,28,15
