In [1]:
import pandas as pd
import numpy as np 
from sklearn import datasets, linear_model, metrics 
from sklearn.model_selection import train_test_split

In [2]:
def read_data(X_path, Y_path):
    """Load the feature table and the target table from two CSV sources.

    Both sources are parsed with ``pd.read_csv`` using the ``id`` column as
    the index.

    Parameters
    ----------
    X_path : location of the features CSV.
    Y_path : location of the targets CSV.

    Returns
    -------
    tuple
        ``(features, targets)`` as two DataFrames, in that order.
    """
    features, targets = (
        pd.read_csv(source, index_col='id') for source in (X_path, Y_path)
    )
    return features, targets

In [3]:
def train_model(X_train, Y_train):
    """Fit a fresh ``LinearRegression`` on the given data and return it.

    Parameters
    ----------
    X_train : training features passed straight to ``fit``.
    Y_train : training targets passed straight to ``fit``.
    """
    regressor = linear_model.LinearRegression()
    # ``fit`` returns the estimator itself, so its result is what we hand back.
    fitted = regressor.fit(X_train, Y_train)
    return fitted

In [4]:
def model_validation(model, X_train, X_test, Y_train, Y_test):
    """Print fit metrics of ``model`` for the training and the test split.

    For each split three lines are printed: ``model.score`` multiplied by 100,
    the mean squared difference between the targets and ``model.predict``,
    and the square root of that mean. A dashed line separates the two splits.
    Nothing is returned.
    """
    splits = (
        ("In-sample match: ", "In-sample mse: ", "In-sample rmse: ", X_train, Y_train),
        ("Out-sample match: ", "Out-sample mse: ", "Out-sample rmse: ", X_test, Y_test),
    )

    for position, (match_label, mse_label, rmse_label, features, targets) in enumerate(splits):
        if position:
            # Visual separator between the in-sample and out-of-sample reports.
            print('---------------------------------------')

        predicted = model.predict(features)
        residuals = targets.to_numpy() - predicted
        mean_squared = np.mean(residuals**2)

        print(match_label, model.score(features, targets)*100, "%")
        print(mse_label, mean_squared)
        print(rmse_label, np.sqrt(mean_squared))

# Model 1

### Đặc điểm model?

In [5]:
# Load the feature and target tables (both indexed by 'id') for model 1.
X, Y = read_data("X_train.csv", "Y_train.csv")

In [6]:
def preprocess_data(X, Y):
    """Build the model-1 design matrix: numeric columns plus indicator columns.

    The numeric columns are copied as they are. For every distinct value of
    every categorical column an indicator column named ``'is' + value`` is
    added. Rows that still contain a missing value afterwards are removed
    from both the features and the targets.

    Parameters
    ----------
    X : DataFrame holding the numeric and categorical columns listed below.
    Y : DataFrame of targets sharing ``X``'s index.

    Returns
    -------
    tuple
        ``(X_df, Y_df)``: the all-float feature frame and the targets with
        the same rows removed.

    Notes
    -----
    * Open question from the original author: when a separate test file is
      processed on its own, how do we know how many indicator columns to
      create? The columns depend on the values present in ``X``, so two
      frames processed independently can end up with different columns.
    * Two categorical columns sharing a value produce the same column name;
      the later column overwrites the earlier one.
    * A row whose category is missing gets 0 in every indicator of that
      column and is kept.
    """
    numeric_cols = ['odometer', 'year', 'engineCapacity',
                    'feature_0', 'feature_1', 'feature_2',
                    'feature_3', 'feature_4', 'feature_5',
                    'feature_6', 'feature_7', 'feature_8', 'feature_9']
    categorical_cols = ['manufacturer', 'transmission', 'color', 'bodyType', 'drivetrain', 'engineFuel']

    X_df = X[numeric_cols].copy()

    for col in categorical_cols:
        # dropna(): a missing category cannot be turned into a column name
        # (string concatenation with NaN/None raises TypeError).
        for val in X[col].dropna().unique():
            X_df['is' + str(val)] = (X[col] == val)

    X_df = X_df.astype(float)

    # One vectorized mask instead of a Python-level loop over every row.
    has_missing = X_df.isnull().any(axis=1)
    Y_df = Y.drop(X_df.index[has_missing])
    X_df = X_df.dropna()

    return X_df, Y_df

In [7]:
X_df, Y_df = preprocess_data(X, Y)

# TODO (author's notes): the samples should not be split from already-processed data.
# Split first and preprocess afterwards, to find out whether the test set can be processed on its own.
# What has to be dropped from Y_test when X_test is preprocessed separately and dropna removes rows?
X_train, X_test, Y_train, Y_test = train_test_split(X_df, Y_df, test_size=0.2, random_state=0)

X_train.shape, Y_train.shape, X_test.shape, Y_test.shape

((23992, 97), (23992, 1), (5999, 97), (5999, 1))

In [8]:
# Fit the linear regression of model 1 on the training split.
regressor_1 = train_model(X_train, Y_train)

In [9]:
# Print the in-sample and out-of-sample metrics of model 1.
model_validation(regressor_1, X_train, X_test, Y_train, Y_test)

In-sample match:  75.43855536366775 %
In-sample mse:  9154609.666163972
In-sample rmse:  3025.658550822279
---------------------------------------
Out-sample match:  75.8540408509474 %
Out-sample mse:  8807896.382743344
Out-sample rmse:  2967.8100314446247


# Model 2

### Model chỉ dùng các cột dữ liệu số làm đặc trưng huấn luyện

In [10]:
# Read X and Y again from the CSV files (both indexed by 'id') before building model 2.
X, Y = read_data("X_train.csv", "Y_train.csv")

In [11]:
def _model_2_features(X):
    """Build the model-2 feature frame and flag its incomplete rows.

    Returns ``(features, incomplete)``: ``features`` still contains the rows
    with missing values, ``incomplete`` is a boolean Series marking them.
    """
    numeric_cols = ['odometer', 'year', 'engineCapacity',
                    'feature_0', 'feature_1', 'feature_2', 'feature_3', 'feature_4',
                    'feature_5', 'feature_6', 'feature_7', 'feature_8', 'feature_9']
    features = X[numeric_cols].copy().astype(float)
    features['x0'] = 1
    # Series.min() skips NaN. The builtin min() returns NaN whenever the first
    # value is NaN, which would turn the whole 'year' column into NaN.
    # NOTE(review): the offset is the minimum of the frame passed in, so a
    # training split and a test split processed separately may be shifted by
    # different amounts.
    features['year'] -= X['year'].min()
    incomplete = features.isnull().any(axis=1)
    return features, incomplete


def model_2_training_process(X, Y):
    """Prepare the numeric-only features of model 2 together with the targets.

    Selects the numeric columns as floats, adds a constant column ``x0`` and
    shifts ``year`` by the smallest year in ``X``. Rows containing a missing
    value are removed from both the features and ``Y``.

    Returns
    -------
    tuple
        ``(X_train, Y_train)`` with the incomplete rows removed.
    """
    features, incomplete = _model_2_features(X)
    Y_train = Y.drop(features.index[incomplete])
    X_train = features.loc[~incomplete]

    return X_train, Y_train


def model_2_process(X):
    """Prepare the numeric-only features of model 2 when targets are handled by the caller.

    Applies the same transformation as ``model_2_training_process``.

    Returns
    -------
    tuple
        ``(X_train, drop_index)``: the features without incomplete rows and
        the list of index labels that were removed, so the caller can drop
        the matching targets.
    """
    features, incomplete = _model_2_features(X)
    drop_index = features.index[incomplete].tolist()
    X_train = features.loc[~incomplete]

    return X_train, drop_index

In [12]:
# Split the raw data first (20% test, fixed random_state), then preprocess each split separately.
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=0)

# Training split: features and targets are filtered together.
X_train, Y_train = model_2_training_process(X_train, Y_train)

# Test split: drop from Y_test the index labels that were removed from X_test.
X_test, drop_inx = model_2_process(X_test)
Y_test = Y_test.drop(drop_inx)

X_train.shape, Y_train.shape, X_test.shape, Y_test.shape

((23994, 14), (23994, 1), (5997, 14), (5997, 1))

In [13]:
# Fit the linear regression of model 2 on the numeric-only training features.
regressor_2 = train_model(X_train, Y_train)

In [14]:
# Print the in-sample and out-of-sample metrics of model 2.
model_validation(regressor_2, X_train, X_test, Y_train, Y_test)

In-sample match:  67.63542428064184 %
In-sample mse:  12032033.047601737
In-sample rmse:  3468.7221058484547
---------------------------------------
Out-sample match:  67.4735839975811 %
Out-sample mse:  11983445.001762254
Out-sample rmse:  3461.7112822652116


# Model 3

### Tự tính weight cho nhãn
$$ w_{\text{nhãn}} = \frac{\text{giá trung bình}(\text{nhãn})}{\min\limits_{\text{nhãn}' \,\in\, \text{cột}} \text{giá trung bình}(\text{nhãn}')} $$

In [15]:
import pickle
import os.path

In [16]:
def scale_nominal(X, Y, col):
    """Compute a relative price weight for every label of a nominal column.

    The weight of a label is the mean ``price`` of the rows carrying that
    label, divided by the smallest such mean in the column. The cheapest
    label therefore gets weight 1.0.

    Parameters
    ----------
    X : DataFrame containing the column ``col``.
    Y : DataFrame with a ``price`` column; it must contain every index label
        of ``X`` (it may contain more).
    col : name of the nominal column in ``X``.

    Returns
    -------
    dict
        Mapping ``label -> weight``. Empty when ``X`` has no rows. Missing
        labels (NaN) get no entry.

    Raises
    ------
    KeyError
        If an index label of ``X`` is absent from ``Y``.
    """
    if len(X) == 0:
        return {}

    # Look prices up by index label. The earlier positional lookup
    # (``Y.iloc[id - 1]``) was only right when the ids were exactly 1..N in
    # file order, and read the wrong rows or raised otherwise.
    prices = Y.loc[X.index, 'price']

    # Group positionally (plain array) so duplicate index labels cannot cause
    # alignment problems. sort=False keeps labels in order of first appearance.
    mean_price = prices.groupby(X[col].to_numpy(), sort=False).mean()

    return (mean_price / mean_price.min()).to_dict()

In [17]:
def calculate_nominal(X, save_to = None, targets = None):
    """Compute the label weights of every nominal column and optionally pickle them.

    Rows of ``X`` containing any missing value are ignored. For each nominal
    column ``scale_nominal`` is called to build the ``label -> weight`` map.

    Parameters
    ----------
    X : DataFrame containing the nominal columns listed in ``label_col``.
    save_to : optional file path; when given, the result is pickled there.
    targets : optional DataFrame with a ``price`` column. When omitted, the
        notebook-level ``Y`` is used, which is what this function always did
        before the parameter existed.

    Returns
    -------
    dict
        ``{column name: {label: weight}}``.
    """
    label_col = ["manufacturer",  "model","transmission","color","engineFuel","engineType","bodyType","drivetrain"]

    # Prefer the explicit argument; fall back to the global for old callers.
    prices = Y if targets is None else targets

    X_train = X.dropna()

    scale = {col: scale_nominal(X_train, prices, col) for col in label_col}

    if save_to is not None:
        # Context manager: the file is closed (and flushed) even if dump fails.
        with open(save_to, 'wb') as output:
            pickle.dump(scale, output)

    return scale

In [18]:
def model_self_labeled_process(X, scale):
    """Turn the raw features into an all-float frame using precomputed label weights.

    Steps:
      1. remove rows containing any missing value;
      2. shift ``year`` by the smallest year found in ``X``;
      3. replace every label of each nominal column by its weight from
         ``scale[column]``; rows whose label has no weight are removed;
      4. add a constant column ``x0`` and cast everything to float.

    Parameters
    ----------
    X : DataFrame with the nominal columns listed in ``label_col`` and a
        ``year`` column; every other column must be castable to float.
    scale : ``{column name: {label: weight}}``.

    Returns
    -------
    tuple
        ``(X_train, index)``: the processed features and a flat list of the
        index labels that were removed, so the caller can drop the matching
        targets.

    Notes
    -----
    NOTE(review): the previous code called ``X_train.drop(["photos"], axis=1)``
    and discarded the result, so ``photos`` has always stayed in the feature
    set. That behaviour is preserved here; drop the column explicitly if it
    was meant to be excluded.
    """
    label_col = ["manufacturer", "model","transmission","color","engineFuel","engineType","bodyType","drivetrain"]

    has_missing = X.isnull().any(axis=1)
    index = X.index[has_missing].tolist()
    X_train = X.loc[~has_missing].copy()

    # Series.min() skips NaN, unlike the builtin min().
    # NOTE(review): the offset comes from the frame passed in, so separately
    # processed train and test splits may be shifted by different amounts.
    X_train['year'] -= X['year'].min()

    for s in label_col:
        # Map within this column only. A frame-wide replace would also rewrite
        # equal labels in other columns using this column's weights.
        X_train[s] = X_train[s].map(scale[s])

        # Labels without a weight become NaN: record and remove those rows.
        unmapped = X_train[s].isnull()
        if unmapped.any():
            # extend, not append, so the returned list stays flat.
            index.extend(X_train.index[unmapped].tolist())
            X_train = X_train.loc[~unmapped].copy()

    X_train['x0'] = 1

    # The float cast also converts boolean columns to 1.0 / 0.0.
    X_train = X_train.astype(float)

    return X_train, index

In [19]:
# Read X and Y again from the CSV files (both indexed by 'id') before building model 3.
X, Y = read_data("X_train.csv", "Y_train.csv")

In [20]:
# Reuse the pickled label weights when the cache file exists; otherwise
# compute them from X and store them.
# NOTE(review): the cache is never invalidated. Delete the file after changing
# the data or calculate_nominal, or stale weights will be loaded silently.
fname = "model_scaling.pkl"
if not os.path.isfile(fname):
    scale = calculate_nominal(X, fname)
    print("Saved scale")
else:
    print("Loaded scale")
    # Only unpickle files written by this notebook: loading an untrusted
    # pickle can execute arbitrary code.
    with open(fname, 'rb') as cache_file:
        scale = pickle.load(cache_file)

Loaded scale


In [21]:
# Split the raw data (20% test, fixed random_state) before any model-3 preprocessing.
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=0)

X_train.shape, Y_train.shape, X_test.shape, Y_test.shape

((24000, 22), (24000, 1), (6000, 22), (6000, 1))

In [22]:
# Preprocess the training features, then drop from Y_train the index labels removed from X_train.
X_train, drop_idx = model_self_labeled_process(X_train, scale)
Y_train = Y_train.drop(drop_idx)

X_train.shape, Y_train.shape

((23994, 23), (23994, 1))

In [23]:
# Fit the linear regression of model 3 on the weight-encoded training features.
regressor_3 = train_model(X_train, Y_train)

In [24]:
# Preprocess the test features with the same scale, then drop the removed index labels from Y_test.
X_test, drop_idx = model_self_labeled_process(X_test, scale)
Y_test = Y_test.drop(drop_idx)

X_test.shape, Y_test.shape

((5997, 23), (5997, 1))

In [25]:
# Print the in-sample and out-of-sample metrics of model 3.
model_validation(regressor_3, X_train, X_test, Y_train, Y_test)

In-sample match:  77.66415806315936 %
In-sample mse:  8303695.702994527
In-sample rmse:  2881.613385413548
---------------------------------------
Out-sample match:  77.68226816774353 %
Out-sample mse:  8222341.863795728
Out-sample rmse:  2867.4626176806087


# Model 4

### Đặc điểm?

In [26]:
# Read X and Y again from the CSV files (both indexed by 'id') before building model 4.
X, Y = read_data("X_train.csv", "Y_train.csv")

In [27]:
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import PolynomialFeatures
from sklearn.preprocessing import OrdinalEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer

In [28]:
# Column names feature_0 ... feature_9.
features = [f'feature_{idx}' for idx in range(10)]
# Nominal (non-numeric) columns, followed by the combined list with the feature_* columns.
nAn_features = ["manufacturer", "transmission", "color", "engineFuel", "engineType", "bodyType", "drivetrain"]
nAn_features_all = [*nAn_features, *features]

In [29]:
def transfrom_data(X, pipeline):
    """Run ``X`` through ``pipeline.transform`` and return the result as is.

    Parameters
    ----------
    X : data accepted by ``pipeline.transform``.
    pipeline : any object exposing a ``transform`` method.
    """
    transformed = pipeline.transform(X)
    return transformed

In [30]:
# Build the preprocessing pipeline of model 4 once and cache it on disk;
# later runs load the pickled pipeline instead of refitting.
# NOTE(review): the cache is never invalidated. Delete the file after changing
# the data or the pipeline definition, or a stale pipeline will be loaded.
fname = "pipeline.pkl"
X_num = X[['odometer', 'year', 'engineCapacity']].astype(float)
if not os.path.isfile(fname):
    # Numeric columns: median imputation -> degree-3 polynomial terms -> standardisation.
    num_pipeline = Pipeline([
        ('imputer', SimpleImputer(strategy="median")),
        ('poly_features', PolynomialFeatures(degree=3, include_bias=False)),
        ('std_scaler', StandardScaler()),
    ])

    num_attribs = list(X_num)
    # One-hot encoded columns: the nominal columns except 'engineType' and the feature_* columns.
    cat_attribs = [n for n in nAn_features_all if n != 'engineType' and n not in features]
    # The feature_* columns are ordinal-encoded instead.
    cat_attribs_binary = features

    full_pipeline = ColumnTransformer([
        ("num", num_pipeline, num_attribs),
        ("cat_binary", OrdinalEncoder(), cat_attribs_binary),
        ("cat", OneHotEncoder(), cat_attribs),
    ])
    # NOTE(review): the pipeline is fitted on the full X, i.e. before the
    # train/test split, so the test rows influence the imputer, the scaler
    # and the encoder categories.
    full_pipeline.fit(X)
    # Context manager: the file is closed (and flushed) even if dump fails.
    with open(fname, 'wb') as cache_file:
        pickle.dump(full_pipeline, cache_file)
    print("Saved pipeline")
else:
    # Only unpickle files written by this notebook: loading an untrusted
    # pickle can execute arbitrary code.
    with open(fname, 'rb') as cache_file:
        full_pipeline = pickle.load(cache_file)
    print("Loaded pipeline")

Loaded pipeline


In [31]:
# Split the raw data (20% test, fixed random_state); the pipeline is applied to each split afterwards.
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=0)

X_train.shape, Y_train.shape, X_test.shape, Y_test.shape

((24000, 22), (24000, 1), (6000, 22), (6000, 1))

In [32]:
# Transform the training split with the fitted pipeline.
X_prepared = transfrom_data(X_train, full_pipeline)
X_prepared.shape

(24000, 113)

In [33]:
# Fit the linear regression of model 4 on the pipeline-transformed training features.
regressor_4 = train_model(X_prepared, Y_train)

In [34]:
# Transform the test split with the same pipeline, then print the metrics of model 4.
X_test_transform = transfrom_data(X_test, full_pipeline)

model_validation(regressor_4, X_prepared, X_test_transform, Y_train, Y_test)

In-sample match:  86.6801100430009 %
In-sample mse:  4953932.207964233
In-sample rmse:  2225.7430687220467
---------------------------------------
Out-sample match:  87.00055743489553 %
Out-sample mse:  4813671.81627385
Out-sample rmse:  2194.008162307937
