In [12]:
%run data_read.py

X_train, X_test, y_train, y_test = train_test_split(data['comment'], data['score'], test_size=0.2, random_state=42)

# Number of cross-validation folds
cv = 5
n_jobs = -1
verbose = 1

current_dir = os.getcwd()
model_save_dir = os.path.join(current_dir, "..", "model")

Data path: C:\Users\Pop\Documents\GitHub\utcc_independent_study\training\..\data\data.xlsx
data_size: 1000
variable: data train_data test_data


1. Linear Model

In [13]:
pipeline_linear = Pipeline([
    ('tfidf', TfidfVectorizer(tokenizer=thai_tokenizer)),
    ('regressor', LinearRegression())
])

param_grid_linear = {
    'regressor__fit_intercept': [True, False],
    'regressor__copy_X': [True, False]
}

# Perform the grid search for Linear Regression
print('Performing grid search for Linear Regression...')
grid_search_linear = GridSearchCV(
    pipeline_linear,
    param_grid=param_grid_linear,
    n_jobs=n_jobs,
    verbose=verbose
)

# Fit the grid search object on the training data
start_time = time.time()
grid_search_linear.fit(X_train, y_train)
end_time = time.time()

train_time = end_time - start_time

# Print the best hyperparameters and corresponding mean test score
print(f'Best hyperparameters for Linear Regression: {grid_search_linear.best_params_}')
print(f'Mean test score: {grid_search_linear.best_score_:.4f}')

# Evaluate the best model on the test set
start_time = time.time()
y_pred = grid_search_linear.predict(X_test)
end_time = time.time()

test_time = end_time - start_time

mse = mean_squared_error(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)
print(f'Linear Regression MSE on test set: {mse:.4f}')
print(f'Linear Regression MAE on test set: {mae:.4f}')

# Evaluate the best model on the test set and create DataFrame with results
result = pd.DataFrame({'comment': X_test,
                        'actual': y_test,
                        'predicted': grid_search_linear.predict(X_test)})
result['predicted'] = result['predicted'].clip(lower=0, upper=9)  # Clip predictions to valid score range of 0 to 1

# Print the DataFrame with results
sorted_df = result.sort_values(by='predicted', ascending=False)

print('train: %.4f' % train_time)
print('test: %.4f' % test_time)

for index, row in sorted_df.iterrows():
    break
    print(f"comment: {row['comment']}")
    print(f"actual: {row['actual']}")
    print(f"predicted: {row['predicted']}")
    print('')

joblib.dump(grid_search_linear, os.path.join(model_save_dir, 'linear_model.pkl'))

Performing grid search for Linear Regression...
Fitting 5 folds for each of 4 candidates, totalling 20 fits




Best hyperparameters for Linear Regression: {'regressor__copy_X': True, 'regressor__fit_intercept': False}
Mean test score: 0.0205
Linear Regression MSE on test set: 6.5504
Linear Regression MAE on test set: 2.0323
train: 15.2036
test: 0.1753


['C:\\Users\\Pop\\Documents\\GitHub\\utcc_independent_study\\training\\..\\model\\linear_model.pkl']

2. Support Vector Machine

In [4]:
# Define the pipeline and parameter search space for SVR
pipeline_svm = Pipeline([
    ('tfidf', TfidfVectorizer(tokenizer=thai_tokenizer)),
    ('regressor', SVR())
])

param_grid_svm = {
    'regressor__kernel': ['linear', 'poly', 'rbf'],
    'regressor__C': [0.1, 1, 10],
    'regressor__gamma': ['scale', 'auto'],
}

# Perform the grid search for SVR
print('Performing grid search for SVR...')
grid_search_svm = GridSearchCV(
    pipeline_svm,
    param_grid=param_grid_svm,
    cv=cv,
    n_jobs=n_jobs,
    verbose=verbose
)

# Fit the grid search object on the training data
start_time = time.time()
grid_search_svm.fit(X_train, y_train)
end_time = time.time()

train_time = end_time - start_time

# Print the best hyperparameters and corresponding mean test score
print(f'Best hyperparameters for SVR: {grid_search_svm.best_params_}')
print(f'Mean test score: {grid_search_svm.best_score_:.4f}')

# Evaluate the best model on the test set
start_time = time.time()
y_pred = grid_search_svm.predict(X_test)
end_time = time.time()

test_time = end_time - start_time

mse = mean_squared_error(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)
print(f'SVM MSE on test set: {mse:.4f}')
print(f'SVM MAE on test set: {mae:.4f}')

# Evaluate the best model on the test set and create DataFrame with results
result = pd.DataFrame({'comment': X_test,
                        'actual': y_test,
                        'predicted': grid_search_svm.predict(X_test)})
result['predicted'] = result['predicted'].clip(lower=0, upper=9)  # Clip predictions to valid score range of 0 to 1

# Print the sorted DataFrame with results
sorted_df = result.sort_values(by='predicted', ascending=False)

print('train: %.4f' % train_time)
print('test: %.4f' % test_time)

for index, row in sorted_df.iterrows():
    break
    print(f"comment: {row['comment']}")
    print(f"actual: {row['actual']}")
    print(f"predicted: {row['predicted']}")
    print('')
    
joblib.dump(grid_search_svm, os.path.join(model_save_dir, 'svm_model.pkl'))

Performing grid search for SVR...
Fitting 5 folds for each of 18 candidates, totalling 90 fits




Best hyperparameters for SVR: {'regressor__C': 1, 'regressor__gamma': 'scale', 'regressor__kernel': 'linear'}
Mean test score: 0.4926
SVM MSE on test set: 2.2989
SVM MAE on test set: 1.2287
train: 3.3642
test: 0.0429


['C:\\Users\\Pop\\Documents\\GitHub\\utcc_independent_study\\training\\..\\model\\svm_model.pkl']

In [5]:
pipeline_tree = Pipeline([
    ('tfidf', TfidfVectorizer(tokenizer=thai_tokenizer)),
    ('regressor', DecisionTreeRegressor())
])

param_grid_tree = {
    'regressor__max_depth': [5, 10, 20],
    'regressor__min_samples_split': [2, 5, 10]
}

# Perform the grid search for DecisionTreeRegressor
print('Performing grid search for DecisionTreeRegressor...')
grid_search_tree = GridSearchCV(
    pipeline_tree,
    param_grid=param_grid_tree,
    cv=cv,
    n_jobs=n_jobs,
    verbose=verbose
)

# Fit the grid search object on the training data
start_time = time.time()
grid_search_tree.fit(X_train, y_train)
end_time = time.time()

train_time = end_time - start_time

# Print the best hyperparameters and corresponding mean test score
print(f'Best hyperparameters for DecisionTreeRegressor: {grid_search_tree.best_params_}')
print(f'Mean test score: {grid_search_tree.best_score_:.4f}')

# Evaluate the best model on the test set
start_time = time.time()
y_pred = grid_search_tree.predict(X_test)
end_time = time.time()

test_time = end_time - start_time

mse = mean_squared_error(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)
print(f'DecisionTreeRegressor MSE on test set: {mse:.4f}')
print(f'DecisionTreeRegressor MAE on test set: {mae:.4f}')

# Evaluate the best model on the test set and create DataFrame with results
result = pd.DataFrame({'comment': X_test,
                        'actual': y_test,
                        'predicted': grid_search_tree.predict(X_test)})
result['predicted'] = result['predicted'].clip(lower=0, upper=9)  # Clip predictions to valid score range of 0 to 1

# Print the sorted DataFrame with results
sorted_df = result.sort_values(by='predicted', ascending=False)

print('train: %.4f' % train_time)
print('test: %.4f' % test_time)

for index, row in sorted_df.iterrows():
    break
    print(f"comment: {row['comment']}")
    print(f"actual: {row['actual']}")
    print(f"predicted: {row['predicted']}")
    print('')
    
joblib.dump(grid_search_tree, os.path.join(model_save_dir, 'tree_model.pkl'))

Performing grid search for DecisionTreeRegressor...
Fitting 5 folds for each of 9 candidates, totalling 45 fits




Best hyperparameters for DecisionTreeRegressor: {'regressor__max_depth': 10, 'regressor__min_samples_split': 10}
Mean test score: 0.1556
DecisionTreeRegressor MSE on test set: 4.8232
DecisionTreeRegressor MAE on test set: 1.5534
train: 2.2123
test: 0.0380


['C:\\Users\\Pop\\Documents\\GitHub\\utcc_independent_study\\training\\..\\model\\tree_model.pkl']

In [6]:
pipeline_knn = Pipeline([
    ('tfidf', TfidfVectorizer(tokenizer=thai_tokenizer)),
    ('regressor', KNeighborsRegressor())
])

param_grid_knn = {
    'regressor__n_neighbors': [5, 10, 15],
    'regressor__weights': ['uniform', 'distance'],
    'regressor__p': [1, 2]
}

# Perform the grid search for KNeighborsRegressor
print('Performing grid search for KNeighborsRegressor...')
grid_search_knn = GridSearchCV(
    pipeline_knn,
    param_grid=param_grid_knn,
    cv=cv,
    n_jobs=n_jobs,
    verbose=verbose
)

# Fit the grid search object on the training data
start_time = time.time()
grid_search_knn.fit(X_train, y_train)
end_time = time.time()

train_time = end_time - start_time

# Print the best hyperparameters and corresponding mean test score
print(f'Best hyperparameters for KNeighborsRegressor: {grid_search_knn.best_params_}')
print(f'Mean test score: {grid_search_knn.best_score_:.4f}')

# Evaluate the best model on the test set
start_time = time.time()
y_pred = grid_search_knn.predict(X_test)
end_time = time.time()

test_time = end_time - start_time

mse = mean_squared_error(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)
print(f'KNeighborsRegressor MSE on test set: {mse:.4f}')
print(f'KNeighborsRegressor MAE on test set: {mae:.4f}')

# Evaluate the best model on the test set and create DataFrame with results
result = pd.DataFrame({'comment': X_test,
                        'actual': y_test,
                        'predicted': grid_search_knn.predict(X_test)})
result['predicted'] = result['predicted'].clip(lower=0, upper=9)  # Clip predictions to valid score range of 0 to 1

# Print the sorted DataFrame with results
sorted_df = result.sort_values(by='predicted', ascending=False)

print('train: %.4f' % train_time)
print('test: %.4f' % test_time)

for index, row in sorted_df.iterrows():
    break
    print(f"comment: {row['comment']}")
    print(f"actual: {row['actual']}")
    print(f"predicted: {row['predicted']}")
    print('')
    
joblib.dump(grid_search_knn, os.path.join(model_save_dir, 'knn_model.pkl'))

Performing grid search for KNeighborsRegressor...
Fitting 5 folds for each of 12 candidates, totalling 60 fits




Best hyperparameters for KNeighborsRegressor: {'regressor__n_neighbors': 15, 'regressor__p': 2, 'regressor__weights': 'distance'}
Mean test score: 0.1095
KNeighborsRegressor MSE on test set: 3.9512
KNeighborsRegressor MAE on test set: 1.5595
train: 2.2369
test: 0.1621


['C:\\Users\\Pop\\Documents\\GitHub\\utcc_independent_study\\training\\..\\model\\knn_model.pkl']

In [14]:
# os.path.join(model_save_dir, 'linear_model.pkl')
# os.path.join(model_save_dir, 'knn_model.pkl')
# os.path.join(model_save_dir, 'tree_model.pkl')
# os.path.join(model_save_dir, 'svm_model.pkl')

model = joblib.load(os.path.join(model_save_dir, 'linear_model.pkl'))

# Make predictions on new data
new_data = ['สินค้าดี เจ๋งสุดๆ', 'ไม่ดี']
predictions = model.predict(new_data)

# Print the predicted scores and comments
for i in range(len(new_data)):
    print(f"c: {new_data[i]}")
    print(f"p: {predictions[i]}")
    print('')

c: สินค้าดี เจ๋งสุดๆ
p: 10.117131330550155

c: ไม่ดี
p: 1.8893167150926962

