In [12]:
#!pip install pandas

In [30]:
%%time
import pandas as pd

# Load the listings, rename the area column, keep only rows whose area is
# greater than zero, then take a reproducible 10% random sample.
data = (
    pd.read_csv("house_prices.csv", index_col=0)
    .rename(columns={"Area_in_Marla": "area"})  # rename the column
    .loc[lambda frame: frame["area"] > 0]
    .sample(frac=0.1, random_state=0)
)
data

CPU times: total: 141 ms
Wall time: 137 ms


Unnamed: 0,property_type,price,location,city,baths,purpose,bedrooms,area
371,House,250000,DHA Defence,Karachi,7,For Rent,6,20.0
108296,House,7500000,Samanabad,Lahore,3,For Sale,3,2.0
160098,Flat,58000,Clifton,Karachi,3,For Rent,3,8.0
77478,Lower Portion,35000,Al Najaf Colony,Faisalabad,3,For Rent,3,6.0
157504,Upper Portion,55000,G-10,Islamabad,2,For Rent,3,10.9
...,...,...,...,...,...,...,...,...
84070,House,13000000,DHA Defence,Lahore,4,For Sale,3,7.0
58065,Flat,40000,DHA Defence,Islamabad,3,For Rent,3,11.8
118219,House,13000000,Bahria Town,Lahore,7,For Sale,5,5.0
62097,Upper Portion,95000,E-11,Islamabad,4,For Rent,3,20.0


In [3]:
from sklearn.model_selection import train_test_split

# Split the data: 70% for training, then halve the remaining 30% into the
# test and validation sets. Fixed seeds keep the split reproducible.
train_data, holdout_data = train_test_split(data, test_size=0.3, random_state=0)
test_data, valid_data = train_test_split(holdout_data, test_size=0.5, random_state=0)
len(train_data), len(valid_data), len(test_data)

(6964, 1493, 1492)

In [4]:
# Input columns fed to the model -- edit this single list to change them.
features = [
    "property_type",
    "baths",
    "bedrooms",
    "city",
    "area",
]
train_data[features]

Unnamed: 0,property_type,baths,bedrooms,city,area
161486,Flat,3,3,Karachi,10.0
167652,House,3,3,Lahore,5.0
106280,Flat,3,3,Karachi,7.1
43780,Flat,1,1,Islamabad,1.6
133386,House,5,5,Lahore,10.0
...,...,...,...,...,...
6750,Lower Portion,3,3,Islamabad,14.2
34789,House,5,4,Rawalpindi,5.0
47245,House,7,5,Karachi,4.8
54123,Flat,1,2,Karachi,3.0


In [5]:
label = "price" # name of the target column (the ground truth to predict)
train_data[label]

161486       36000
167652     5000000
106280     3800000
43780      1750000
133386    20000000
            ...   
6750         70000
34789      7500000
47245     21200000
54123      5200000
9739        150000
Name: price, Length: 6964, dtype: int64

In [32]:
#!pip install scikit-learn

In [37]:
from sklearn.pipeline import make_pipeline
from sklearn.compose import make_column_transformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.feature_extraction.text import TfidfVectorizer as TfidfVec, HashingVectorizer
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import SGDRegressor
from sklearn.tree import DecisionTreeRegressor

def create_pipeline():
    """Build a fresh, unfitted preprocessing + regression pipeline.

    Column handling:
      * ``property_type`` is one-hot encoded (``min_frequency=0.01``).
      * ``city`` is hashed into 5 features with ``HashingVectorizer``.
      * ``baths`` is standardised with ``StandardScaler``.
      * every other input column is passed through unchanged.

    The final step is ``DecisionTreeRegressor(random_state=0)``.
    """
    # https://scikit-learn.org/stable/modules/preprocessing.html
    # Disabled options kept for reference:
    #   (TfidfVec(ngram_range=(1, 3), min_df=5, max_df=0.5, sublinear_tf=True), "location")
    #   (StandardScaler(), ["bedrooms"])
    #   (StandardScaler(), ["area"])
    preprocessor = make_column_transformer(
        (OneHotEncoder(min_frequency=0.01), ["property_type"]),
        (HashingVectorizer(n_features=5), "city"),  # n_features is a hyperparameter
        (StandardScaler(), ["baths"]),
        remainder="passthrough",
    )

    # https://scikit-learn.org/stable/tutorial/machine_learning_map/index.html
    # Disabled alternative for this step: SGDRegressor(random_state=0)
    regressor = DecisionTreeRegressor(random_state=0)

    return make_pipeline(preprocessor, regressor)
 
# Build an unfitted pipeline and display it as the cell output.
pipeline = create_pipeline()
pipeline

In [7]:
# Show every parameter of the pipeline and of its nested steps.
pipeline.get_params()

{'memory': None,
 'steps': [('columntransformer',
   ColumnTransformer(remainder='passthrough',
                     transformers=[('onehotencoder',
                                    OneHotEncoder(min_frequency=0.01),
                                    ['property_type']),
                                   ('hashingvectorizer',
                                    HashingVectorizer(n_features=5), 'city')])),
  ('decisiontreeregressor', DecisionTreeRegressor(random_state=0))],
 'verbose': False,
 'columntransformer': ColumnTransformer(remainder='passthrough',
                   transformers=[('onehotencoder',
                                  OneHotEncoder(min_frequency=0.01),
                                  ['property_type']),
                                 ('hashingvectorizer',
                                  HashingVectorizer(n_features=5), 'city')]),
 'decisiontreeregressor': DecisionTreeRegressor(random_state=0),
 'columntransformer__force_int_remainder_cols': True,
 'colum

In [39]:
%time pipeline.fit(train_data[features], train_data[label]) # วัดเวลาแค่บรรทัดเดียว เพราะใช้ % ตัวเดียว
score = pipeline.score(train_data[features], train_data[label])
print(f"{score:.3f}")
#score เลขละเอียดเกินไป

CPU times: total: 78.1 ms
Wall time: 115 ms
0.638


In [9]:
from sklearn.model_selection import RandomizedSearchCV

import warnings
warnings.filterwarnings("ignore")

params = dict(
    # sgdregressor__loss=["squared_error", "huber", "epsilon_insensitive", "squared_epsilon_insensitive"],
    # sgdregressor__penalty=["l2", "l1", "elasticnet"],
    # sgdregressor__alpha=[0.0001, 0.001, 0.00001],
    # sgdregressor__l1_ratio=[0.1, 0.5, 0.9],
    decisiontreeregressor__criterion=["squared_error", "friedman_mse", "absolute_error", "poisson"],
    decisiontreeregressor__max_features=["auto", "sqrt", "log2"],
)

pipeline = create_pipeline()

# https://scikit-learn.org/stable/modules/grid_search.html
search = RandomizedSearchCV(pipeline, params, n_iter=10, random_state=0)
%time search.fit(train_data[features], train_data[label])
search.best_params_, f"{search.best_score_:.3f}"

CPU times: total: 5.22 s
Wall time: 5.55 s


({'decisiontreeregressor__max_features': 'log2',
  'decisiontreeregressor__criterion': 'poisson'},
 '0.354')

In [10]:
# Number of features the final model actually receives after preprocessing:
# the input columns plus the extra columns created by OneHotEncoder and by
# the hashing step.
search.best_estimator_[-1].n_features_in_

13

In [11]:
from sklearn.inspection import permutation_importance as per_imp

# Permutation importance of each input column, evaluated on the validation
# set, sorted from most to least important and drawn as a bar chart.
importance_result = per_imp(
    search.best_estimator_,
    valid_data[features],
    valid_data[label],
    random_state=0,
)
imps = pd.Series(importance_result.importances_mean, index=features)
imps = imps.sort_values(ascending=False)
imps.plot(kind="bar")

ImportError: matplotlib is required for plotting when the default backend "matplotlib" is selected.

In [None]:
# Draw five validation rows (fixed seed) to use as prediction examples.
samples = valid_data.sample(n=5, random_state=0)
samples

In [None]:
# Predict prices for the sampled rows. Only the feature columns are passed,
# matching how the estimator is fitted and scored elsewhere in this notebook,
# so the label column is never handed to the model.
predicted = search.best_estimator_.predict(samples[features])
predicted

In [None]:
def predict(sample, estimator=None):
    """Return the prediction for the first row of ``sample`` as an ``int``.

    Args:
        sample: Input rows accepted by the estimator's ``predict`` method.
            Only the first prediction is used.
        estimator: Fitted object exposing ``predict``. When omitted, the
            notebook-level ``search.best_estimator_`` is used.

    Returns:
        The first predicted value converted with ``int`` (fraction dropped).
    """
    if estimator is None:
        # Looked up at call time so a re-run search is picked up automatically.
        estimator = search.best_estimator_
    return int(estimator.predict(sample)[0])

In [None]:
# One-row frame describing a minimal house in Lahore.
# ("location", e.g. "DHA Defence", is not among the model inputs.)
sample = pd.DataFrame(
    [
        dict(
            city="Lahore",
            property_type="House",
            baths=1,
            bedrooms=1,
            area=1.0,
        )
    ]
)
predicted = predict(sample)
predicted

In [None]:
# One-row frame describing a larger house in Lahore.
# ("location", e.g. "DHA Defence", is not among the model inputs.)
sample = pd.DataFrame(
    [
        dict(
            city="Lahore",
            property_type="House",
            baths=5,
            bedrooms=5,
            area=20.0,
        )
    ]
)
predicted = predict(sample)
predicted

In [None]:
%%time
import matplotlib.pyplot as plt
import numpy as np

from sklearn.model_selection import LearningCurveDisplay, ShuffleSplit

fig, ax = plt.subplots(figsize=(8, 6), sharey=True)

# Fresh pipeline configured with the best parameters found by the search.
pipeline = create_pipeline()
pipeline.set_params(**search.best_params_)

# Learning curve: train and test scores at 5 training-set sizes, each averaged
# over 50 random 80/20 splits of the training data.
LearningCurveDisplay.from_estimator(
    estimator=pipeline,
    X=train_data[features],
    y=train_data[label],
    train_sizes=np.linspace(0.1, 1.0, 5),
    cv=ShuffleSplit(n_splits=50, test_size=0.2, random_state=0),
    score_type="both",
    line_kw=dict(marker="o"),
    std_display_style="fill_between",
    # No `scoring` is given, so the regressor's own score (R²) is plotted;
    # the axis title says so instead of "Accuracy".
    score_name="R² score",
    ax=ax,
)

# Unpack into `legend_labels`, not `label`: `label` is the notebook-level
# target column name and must not be overwritten, or re-running this or any
# earlier cell would index the data with a list of legend strings.
handles, legend_labels = ax.get_legend_handles_labels()
ax.legend(handles[:2], ["Training Score", "Test Score"])
ax.set_ylim(ymin=0, ymax=1)