In [44]:
%run data_read.py

X_train, X_test, y_train, y_test = train_test_split(data['comment'], data['score'], test_size=0.2, random_state=42)

Data path: C:\Users\POP PC\Documents\GitHub\utcc_independent_study\training\..\data\300_data_pop.xlsx
data_size: 318
variable: data train_data test_data


1. Linear Model

In [46]:
pipeline_linear = Pipeline([
    ('tfidf', TfidfVectorizer(tokenizer=thai_tokenizer)),
    ('regressor', LinearRegression())
])

param_grid_linear = {
    'regressor__fit_intercept': [True, False],
    'regressor__copy_X': [True, False]
}

# Define the grid search parameters
cv = 3  # Number of cross-validation folds

# Perform the grid search for Linear Regression
print('Performing grid search for Linear Regression...')
grid_search_linear = GridSearchCV(
    pipeline_linear,
    param_grid=param_grid_linear,
    cv=cv,
    n_jobs=-1,
    verbose=1
)

# Fit the grid search object on the training data
grid_search_linear.fit(X_train, y_train)

# Print the best hyperparameters and corresponding mean test score
print(f'Best hyperparameters for Linear Regression: {grid_search_linear.best_params_}')
print(f'Mean test score: {grid_search_linear.best_score_:.4f}')

# Evaluate the best model on the test set
y_pred = grid_search_linear.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)
print(f'Linear Regression MSE on test set: {mse:.4f}')
print(f'Linear Regression MAE on test set: {mae:.4f}')

# Evaluate the best model on the test set and create DataFrame with results
result = pd.DataFrame({'comment': X_test,
                        'actual': y_test,
                        'predicted': grid_search_linear.predict(X_test)})
result['predicted'] = result['predicted'].clip(lower=0, upper=9)  # Clip predictions to valid score range of 0 to 1

# Print the DataFrame with results
sorted_df = result.sort_values(by='predicted', ascending=False)

for index, row in sorted_df.iterrows():
    print(f"comment: {row['comment']}")
    print(f"actual: {row['actual']}")
    print(f"predicted: {row['predicted']}")
    print('')


Performing grid search for Linear Regression...
Fitting 3 folds for each of 4 candidates, totalling 12 fits




Best hyperparameters for Linear Regression: {'regressor__copy_X': True, 'regressor__fit_intercept': False}
Mean test score: 0.4616
Linear Regression MSE on test set: 3.0274
Linear Regression MAE on test set: 1.3422
comment: ‡∏Ñ‡∏∏‡∏ì‡∏†‡∏≤‡∏û‡∏Ç‡∏≠‡∏á‡∏™‡∏¥‡∏ô‡∏Ñ‡πâ‡∏≤‡∏î‡∏µ‡∏°‡∏≤‡∏Å ‡∏Ñ‡∏ß‡∏≤‡∏°‡∏Ñ‡∏∏‡πâ‡∏°‡∏Ñ‡πà‡∏≤‡∏î‡∏µ‡∏°‡∏≤‡∏Å ‡∏Ñ‡∏ß‡∏≤‡∏°‡πÄ‡∏£‡πá‡∏ß‡πÉ‡∏ô‡∏Å‡∏≤‡∏£‡∏à‡∏±‡∏î‡∏™‡πà‡∏á‡∏î‡∏µ‡∏°‡∏≤‡∏Å‡πÜ ‡∏Å‡∏≤‡∏£‡πÉ‡∏´‡πâ‡∏ö‡∏£‡∏¥‡∏Å‡∏≤‡∏£‡∏à‡∏≤‡∏Å‡∏£‡πâ‡∏≤‡∏ô‡∏Ñ‡πâ‡∏≤‡∏î‡∏µ‡∏°‡∏≤‡∏Å ‡∏ñ‡∏π‡∏Å‡πÉ‡∏à‡∏°‡∏≤‡∏Å‡πÜ ‡∏ó‡∏∏‡∏Å‡∏≠‡∏¢‡πà‡∏≤‡∏á‚ú®
actual: 8
predicted: 9.0

comment: ‡∏à‡∏±‡∏î‡∏™‡πà‡∏á‡πÄ‡∏£‡πá‡∏ß‡∏Ñ‡πà‡∏∞ ‡∏Ç‡∏≠‡∏á‡πÑ‡∏î‡πâ‡∏ï‡∏£‡∏á‡∏£‡∏π‡∏õ‡∏ï‡∏£‡∏á‡∏õ‡∏Å‡∏ó‡∏±‡πâ‡∏á‡πÄ‡∏°‡∏≤‡∏™‡πå‡∏ó‡∏±‡πâ‡∏á‡πÅ‡∏õ‡πâ‡∏ô‡∏û‡∏¥‡∏°‡∏û‡πå ‡∏™‡πà‡∏ß‡∏ô‡∏ï‡∏±‡∏ß‡∏£‡∏π‡πâ‡∏™‡∏∂‡∏Å‡∏ß‡πà‡∏≤‡πÅ‡∏õ‡πâ‡∏ô‡∏û‡∏¥‡∏°‡∏û‡πå‡∏ö‡∏≤‡∏á‡πÑ‡∏õ‡∏´‡∏ô‡πà‡∏≠‡∏¢‡πÅ‡∏ï‡πà‡∏Å‡πá‡∏™‡∏°‡∏Å‡∏±‡∏ö‡∏£‡∏≤‡∏Ñ‡∏≤‡∏ô‡∏µ‡πâ‡∏Ñ‡πà‡∏∞ ‡∏°‡∏≤‡∏ó‡∏µ‡πà‡πÄ‡∏°‡∏≤‡∏™‡πå‡πÅ‡∏ô‡∏∞‡∏ô‡∏≥‡∏ß‡πà‡∏≤‡πÉ‡∏´‡πâ‡∏´‡∏≤‡πÅ‡∏ú‡πà‡∏ô‡∏£‡∏≠‡∏á‡πÄ

2. Support Vector Machine

In [47]:
# Define the pipeline and parameter search space for SVR
pipeline_svm = Pipeline([
    ('tfidf', TfidfVectorizer(tokenizer=thai_tokenizer)),
    ('regressor', SVR())
])

param_grid_svm = {
    'regressor__kernel': ['linear', 'poly', 'rbf'],
    'regressor__C': [0.1, 1, 10],
    'regressor__gamma': ['scale', 'auto'],
}

# Perform the grid search for SVR
print('Performing grid search for SVR...')
grid_search_svm = GridSearchCV(
    pipeline_svm,
    param_grid=param_grid_svm,
    cv=3,
    n_jobs=-1,
    verbose=1
)

# Fit the grid search object on the training data
grid_search_svm.fit(X_train, y_train)

# Print the best hyperparameters and corresponding mean test score
print(f'Best hyperparameters for SVR: {grid_search_svm.best_params_}')
print(f'Mean test score: {grid_search_svm.best_score_:.4f}')

# Evaluate the best model on the test set
y_pred = grid_search_svm.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)
print(f'SVM MSE on test set: {mse:.4f}')
print(f'SVM MAE on test set: {mae:.4f}')

# Evaluate the best model on the test set and create DataFrame with results
result = pd.DataFrame({'comment': X_test,
                        'actual': y_test,
                        'predicted': grid_search_svm.predict(X_test)})
result['predicted'] = result['predicted'].clip(lower=0, upper=9)  # Clip predictions to valid score range of 0 to 1

# Print the sorted DataFrame with results
sorted_df = result.sort_values(by='predicted', ascending=False)
for index, row in sorted_df.iterrows():
    print(f"comment: {row['comment']}")
    print(f"actual: {row['actual']}")
    print(f"predicted: {row['predicted']}")
    print('')

Performing grid search for SVR...
Fitting 3 folds for each of 18 candidates, totalling 54 fits




Best hyperparameters for SVR: {'regressor__C': 1, 'regressor__gamma': 'scale', 'regressor__kernel': 'linear'}
Mean test score: 0.4728
SVM MSE on test set: 2.3056
SVM MAE on test set: 1.2300
comment: ‡πÑ‡∏î‡πâ‡∏£‡∏±‡∏ö‡∏Ç‡∏≠‡∏á‡∏Ñ‡∏£‡∏ö‡∏ñ‡πâ‡∏ß‡∏ô‡∏Ñ‡πà‡∏∞ ‡∏Å‡∏≤‡∏£‡∏à‡∏±‡∏î‡∏™‡πà‡∏á‡πÑ‡∏ß ‡∏™‡∏¥‡∏ô‡∏Ñ‡πâ‡∏≤‡∏î‡∏π‡∏î‡∏µ ‡∏£‡∏≤‡∏Ñ‡∏≤‡πÑ‡∏°‡πà‡πÅ‡∏û‡∏á ‡∏Ñ‡∏∏‡πâ‡∏°‡∏Ñ‡πà‡∏≤‡∏°‡∏≤‡∏Å‡πÜ ‡∏Ñ‡∏£‡∏±‡πâ‡∏á‡∏ï‡πà‡∏≠‡πÑ‡∏õ‡∏™‡∏±‡πà‡∏á‡∏≠‡∏µ‡∏Å‡πÅ‡∏ô‡πà‡∏ô‡∏≠‡∏ô‡∏Ñ‡πà‡∏∞ ‡∏™‡∏≤‡∏¢‡∏™‡∏ß‡∏¢‡πÜ ‡∏ô‡πà‡∏≤‡∏£‡∏±‡∏Å‡πÜ
actual: 7
predicted: 7.841295234528582

comment: ‡∏à‡∏±‡∏î‡∏™‡πà‡∏á‡πÑ‡∏ß‡∏°‡∏≤‡∏Å‡∏Ñ‡πà‡∏∞ ‡∏£‡∏≤‡∏Ñ‡∏≤‡∏Å‡πá‡∏ñ‡∏π‡∏Å‡∏î‡∏µ ‡πÄ‡∏ä‡∏∑‡πà‡∏≠‡∏°‡∏ï‡πà‡∏≠‡∏á‡πà‡∏≤‡∏¢‡∏°‡∏≤‡∏Å ‡πÄ‡∏ß‡∏•‡∏≤‡πÉ‡∏ä‡πâ‡∏Ñ‡πà‡∏≠‡∏ô‡∏Ç‡πâ‡∏≤‡∏á‡∏•‡∏∑‡πà‡∏ô ‡πÑ‡∏ß‡∏Ñ‡πà‡∏∞ ‡∏Å‡πá‡∏ñ‡∏ô‡∏±‡∏î‡∏î‡∏µ ‡πÑ‡∏ß‡πâ‡∏à‡∏∞‡∏≠‡∏∏‡∏î‡∏´‡∏ô‡∏∏‡∏ô‡πÉ‡∏´‡∏°‡πà‡∏ô‡∏∞‡∏Ñ‡∏∞
actual: 7
predicted: 7.751694725532005

comment: ‡∏Ñ‡∏∏‡∏ì‡∏†‡∏≤‡∏û‡∏Ç‡∏≠‡∏á‡∏™‡∏¥‡∏ô‡∏Ñ‡πâ‡∏≤‡∏î‡∏µ‡∏°‡∏≤‡∏Å ‡∏Ñ‡∏ß‡∏≤‡∏°‡∏Ñ‡∏∏‡πâ‡∏°‡∏Ñ‡πà‡∏≤‡∏î‡∏µ‡∏°‡∏≤

In [48]:
pipeline_tree = Pipeline([
    ('tfidf', TfidfVectorizer(tokenizer=thai_tokenizer)),
    ('regressor', DecisionTreeRegressor())
])

param_grid_tree = {
    'regressor__max_depth': [5, 10, 20],
    'regressor__min_samples_split': [2, 5, 10]
}

# Perform the grid search for DecisionTreeRegressor
print('Performing grid search for DecisionTreeRegressor...')
grid_search_tree = GridSearchCV(
    pipeline_tree,
    param_grid=param_grid_tree,
    cv=3,
    n_jobs=-1,
    verbose=1
)

# Fit the grid search object on the training data
grid_search_tree.fit(X_train, y_train)

# Print the best hyperparameters and corresponding mean test score
print(f'Best hyperparameters for DecisionTreeRegressor: {grid_search_tree.best_params_}')
print(f'Mean test score: {grid_search_tree.best_score_:.4f}')

# Evaluate the best model on the test set
y_pred = grid_search_tree.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)
print(f'DecisionTreeRegressor MSE on test set: {mse:.4f}')
print(f'DecisionTreeRegressor MAE on test set: {mae:.4f}')

# Evaluate the best model on the test set and create DataFrame with results
result = pd.DataFrame({'comment': X_test,
                        'actual': y_test,
                        'predicted': grid_search_tree.predict(X_test)})
result['predicted'] = result['predicted'].clip(lower=0, upper=9)  # Clip predictions to valid score range of 0 to 1

# Print the sorted DataFrame with results
sorted_df = result.sort_values(by='predicted', ascending=False)
for index, row in sorted_df.iterrows():
    print(f"comment: {row['comment']}")
    print(f"actual: {row['actual']}")
    print(f"predicted: {row['predicted']}")
    print('')

Performing grid search for DecisionTreeRegressor...
Fitting 3 folds for each of 9 candidates, totalling 27 fits




Best hyperparameters for DecisionTreeRegressor: {'regressor__max_depth': 5, 'regressor__min_samples_split': 10}
Mean test score: 0.3172
DecisionTreeRegressor MSE on test set: 4.5369
DecisionTreeRegressor MAE on test set: 1.5624
comment: ‡∏ï‡∏∞‡∏°‡∏∏‡∏ï‡∏∞‡∏°‡∏¥‡∏ô‡πà‡∏≤‡∏£‡∏±‡∏Å  ‡πÄ‡∏™‡∏µ‡∏¢‡∏á‡πÅ‡∏à‡πà‡∏°‡∏î‡∏µ‡∏Ñ‡πà‡∏∞  ‡∏ä‡∏≠‡∏ö‡∏°‡∏≤‡∏Å  ‡∏£‡∏≤‡∏Ñ‡∏≤‡πÑ‡∏°‡πà‡πÅ‡∏û‡∏á‡∏î‡πâ‡∏ß‡∏¢
actual: 3
predicted: 9.0

comment: ‡πÑ‡∏î‡πâ‡∏£‡∏±‡∏ö‡∏™‡∏¥‡∏ô‡∏Ñ‡πâ‡∏≤‡πÅ‡∏•‡πâ‡∏ß‡∏ß ‡∏™‡πà‡∏á‡∏Å‡πà‡∏≠‡∏ô‡∏Å‡∏≥‡∏´‡∏ô‡∏î4‡∏ß‡∏±‡∏ô‡πÄ‡∏•‡∏¢ ‡∏™‡∏¥‡∏ô‡∏Ñ‡πâ‡∏≤‡∏ï‡∏£‡∏á‡∏ï‡∏≤‡∏°‡∏õ‡∏Å ‡∏£‡∏≤‡∏Ñ‡∏≤‡∏î‡∏µ‡πÑ‡∏°‡πà‡πÅ‡∏û‡∏á ‡πÄ‡∏Ñ‡∏¢‡πÉ‡∏ä‡πâ‡πÅ‡∏•‡πâ‡∏ß‡∏ä‡∏≠‡∏ö ‡∏£‡∏π‡πâ‡∏™‡∏∂‡∏Å‡∏õ‡πâ‡∏≠‡∏á‡∏Å‡∏±‡∏ô‡∏•‡∏¥‡∏õ‡πÄ‡∏õ‡∏∑‡πâ‡∏≠‡∏ô‡πÅ‡∏°‡∏™‡πÑ‡∏î‡πâ‡∏î‡∏µ
actual: 8
predicted: 9.0

comment: ‡πÑ‡∏î‡πâ‡∏£‡∏±‡∏ö‡∏Ç‡∏≠‡∏á‡∏Ñ‡∏£‡∏ö‡∏ñ‡πâ‡∏ß‡∏ô‡∏Ñ‡πà‡∏∞ ‡∏Å‡∏≤‡∏£‡∏à‡∏±‡∏î‡∏™‡πà‡∏á‡πÑ‡∏ß ‡∏™‡∏¥‡∏ô‡∏Ñ‡πâ‡∏≤‡∏î‡∏π‡∏î‡∏µ ‡∏£‡∏≤‡∏Ñ‡∏≤‡πÑ‡∏°‡πà‡πÅ‡∏û‡∏á ‡∏Ñ‡∏∏‡πâ‡∏°‡∏Ñ‡πà‡∏≤‡∏°‡∏≤‡∏Å‡πÜ ‡∏Ñ‡∏£‡∏±‡πâ‡∏á‡∏ï‡πà‡∏≠‡πÑ‡∏õ‡∏™‡∏

In [49]:
pipeline_knn = Pipeline([
    ('tfidf', TfidfVectorizer(tokenizer=thai_tokenizer)),
    ('regressor', KNeighborsRegressor())
])

param_grid_knn = {
    'regressor__n_neighbors': [5, 10, 15],
    'regressor__weights': ['uniform', 'distance'],
    'regressor__p': [1, 2]
}

# Perform the grid search for KNeighborsRegressor
print('Performing grid search for KNeighborsRegressor...')
grid_search_knn = GridSearchCV(
    pipeline_knn,
    param_grid=param_grid_knn,
    cv=3,
    n_jobs=-1,
    verbose=1
)

# Fit the grid search object on the training data
grid_search_knn.fit(X_train, y_train)

# Print the best hyperparameters and corresponding mean test score
print(f'Best hyperparameters for KNeighborsRegressor: {grid_search_knn.best_params_}')
print(f'Mean test score: {grid_search_knn.best_score_:.4f}')

# Evaluate the best model on the test set
y_pred = grid_search_knn.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)
print(f'KNeighborsRegressor MSE on test set: {mse:.4f}')
print(f'KNeighborsRegressor MAE on test set: {mae:.4f}')

# Evaluate the best model on the test set and create DataFrame with results
result = pd.DataFrame({'comment': X_test,
                        'actual': y_test,
                        'predicted': grid_search_knn.predict(X_test)})
result['predicted'] = result['predicted'].clip(lower=0, upper=9)  # Clip predictions to valid score range of 0 to 1

# Print the sorted DataFrame with results
sorted_df = result.sort_values(by='predicted', ascending=False)
for index, row in sorted_df.iterrows():
    print(f"comment: {row['comment']}")
    print(f"actual: {row['actual']}")
    print(f"predicted: {row['predicted']}")
    print('')

Performing grid search for KNeighborsRegressor...
Fitting 3 folds for each of 12 candidates, totalling 36 fits




Best hyperparameters for KNeighborsRegressor: {'regressor__n_neighbors': 5, 'regressor__p': 2, 'regressor__weights': 'distance'}
Mean test score: 0.1307
KNeighborsRegressor MSE on test set: 4.1410
KNeighborsRegressor MAE on test set: 1.5667
comment: ‡πÑ‡∏î‡πâ‡∏£‡∏±‡∏ö‡∏™‡∏¥‡∏ô‡∏Ñ‡πâ‡∏≤‡πÅ‡∏•‡πâ‡∏ß‡∏ß ‡∏™‡πà‡∏á‡∏Å‡πà‡∏≠‡∏ô‡∏Å‡∏≥‡∏´‡∏ô‡∏î4‡∏ß‡∏±‡∏ô‡πÄ‡∏•‡∏¢ ‡∏™‡∏¥‡∏ô‡∏Ñ‡πâ‡∏≤‡∏ï‡∏£‡∏á‡∏ï‡∏≤‡∏°‡∏õ‡∏Å ‡∏£‡∏≤‡∏Ñ‡∏≤‡∏î‡∏µ‡πÑ‡∏°‡πà‡πÅ‡∏û‡∏á ‡πÄ‡∏Ñ‡∏¢‡πÉ‡∏ä‡πâ‡πÅ‡∏•‡πâ‡∏ß‡∏ä‡∏≠‡∏ö ‡∏£‡∏π‡πâ‡∏™‡∏∂‡∏Å‡∏õ‡πâ‡∏≠‡∏á‡∏Å‡∏±‡∏ô‡∏•‡∏¥‡∏õ‡πÄ‡∏õ‡∏∑‡πâ‡∏≠‡∏ô‡πÅ‡∏°‡∏™‡πÑ‡∏î‡πâ‡∏î‡∏µ
actual: 8
predicted: 8.403383524066054

comment: ‡∏Å‡πà‡∏≠‡∏ô‡∏ã‡∏∑‡πâ‡∏≠‡∏Å‡πá‡∏≠‡∏∏‡∏ï‡∏™‡πà‡∏≤‡∏´‡πå‡∏î‡∏π‡πÉ‡∏ô‡∏£‡∏µ‡∏ß‡∏¥‡∏ß‡πÅ‡∏•‡πâ‡∏ß‡∏ô‡∏∞ ‡∏Å‡πá‡πÄ‡∏´‡πá‡∏ô‡∏ß‡πà‡∏≤‡πÄ‡∏õ‡πá‡∏ô‡∏õ‡∏∏‡πà‡∏°‡πÑ‡∏ó‡∏¢‡πÅ‡∏ó‡πâ ‡πÅ‡∏ï‡πà‡∏û‡∏≠‡πÑ‡∏î‡πâ‡∏£‡∏±‡∏ö‡∏™‡∏¥‡∏ô‡∏Ñ‡πâ‡∏≤‡∏°‡∏≤‡∏Å‡πá‡∏£‡∏π‡πâ‡∏™‡∏∂‡∏Å‡∏ú‡∏¥‡∏î‡∏´‡∏ß‡∏±‡∏á‡∏°‡∏≤‡∏Å ‡πÄ‡∏õ‡πá‡∏ô‡∏™‡∏ï‡∏¥‡πä‡∏Å‡πÄ‡∏Å‡∏≠‡∏£‡πå‡πÑ‡∏ó‡∏¢‡∏ã‡∏∞‡∏á‡∏±‡πâ‡∏ô ‡∏ã‡∏∂‡πà‡∏á‡∏û‡∏≠‡∏ï‡∏¥‡∏î‡∏™