In [12]:
# Import libraries
import numpy as np
import pandas as pd
from sklearn import preprocessing
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score

In [2]:
# import data
df_train = pd.read_csv("./data/training.csv")
df_test = pd.read_csv("./data/testing.csv")


In [27]:
df_train

Unnamed: 0.1,Unnamed: 0,horse_num,race_id,horse_id,jockey_code,owner_code,trainer_code,horseweight,carry_weight,SexCD,...,owner_avg_rank,trainer_rate_top3,trainer_avg_rank,distance(m),racetrack_type,weather_type,turf_condition,dirt_condition,speed,top3
0,0,1,200801050610101,2005107051,1031,898009,437,440.0,540,2,...,9.059701,0.104493,9.619384,1200.0,24.0,2.0,0.0,1.0,1.041667,0
1,1,2,200801050610101,2005102028,733,498800,138,440.0,540,2,...,7.726708,0.209557,7.601792,1200.0,24.0,2.0,0.0,1.0,1.053556,1
2,2,3,200801050610101,2005104156,684,486800,1054,462.0,540,2,...,6.736876,0.256176,7.136932,1200.0,24.0,2.0,0.0,1.0,1.050788,1
3,3,4,200801050610101,2005105319,672,515800,1081,392.0,540,2,...,7.361809,0.208280,7.884763,1200.0,24.0,2.0,0.0,1.0,1.044386,0
4,4,5,200801050610101,2005101044,663,163002,1088,484.0,540,2,...,8.377990,0.222673,7.484941,1200.0,24.0,2.0,0.0,1.0,1.053556,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
495964,500256,11,201712280950912,2012101892,1126,7803,411,482.0,570,1,...,6.287449,0.259989,6.730879,1400.0,17.0,2.0,1.0,0.0,1.154163,1
495965,500257,12,201712280950912,2013104774,1032,265008,1078,452.0,550,2,...,7.650888,0.202165,7.850908,1400.0,17.0,2.0,1.0,0.0,1.143791,0
495966,500258,13,201712280950912,2012101704,732,897009,1073,504.0,570,1,...,7.687500,0.263523,6.807906,1400.0,17.0,2.0,1.0,0.0,1.148482,0
495967,500259,14,201712280950912,2012102404,1015,12033,1089,480.0,550,2,...,9.250000,0.171372,8.453053,1400.0,17.0,2.0,1.0,0.0,1.140994,0


In [3]:
cols = ['horseweight', 'carry_weight', 'SexCD',
        'horse_age', 'horse_type', 'jokey_rate_top3',
        'jokey_avg_rank', 'owner_rate_top3', 'owner_avg_rank',
        'trainer_rate_top3', 'trainer_avg_rank', 'distance(m)',
        'racetrack_type', 'weather_type', 'turf_condition', 'dirt_condition']
X_train = df_train[cols]
y_train = df_train["speed"]
X_test = df_test[cols]
y_test = df_test["speed"]

In [6]:
scaler = preprocessing.StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.fit_transform(X_test)

In [7]:
print(X_train.shape)

(495969, 16)


In [21]:
from sklearn.feature_selection import SelectKBest, f_regression

# SelectKBest với f_regression để chọn 10 đặc trưng tốt nhất
selector = SelectKBest(score_func=f_regression, k=10)
X_train_selected = selector.fit_transform(X_train, y_train)
X_test_selected = selector.fit_transform(X_test, y_test)

In [23]:
X_train_selected.shape

(495969, 10)

In [22]:


# Create a Linear Regression model
model = LinearRegression()

# Train the model
model.fit(X_train_selected, y_train)

# Predict on the test set
y_pred = model.predict(X_test_selected)

# Evaluate the model
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f'Mean Squared Error: {mse}')
print(f'R-squared: {r2}')

Mean Squared Error: 0.009364108816773431
R-squared: 0.06294317712118636


In [17]:
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import SelectKBest, f_regression
# Tạo pipeline: Chuẩn hóa dữ liệu, chọn đặc trưng và mô hình hồi quy tuyến tính
pipeline = make_pipeline(
    StandardScaler(),  # Chuẩn hóa dữ liệu
    SelectKBest(score_func=f_regression),  # Chọn đặc trưng tốt nhất
    LinearRegression()  # Mô hình hồi quy tuyến tính
)

# Huấn luyện mô hình với dữ liệu huấn luyện
pipeline.fit(X_train, y_train)

# Dự đoán trên tập kiểm tra
y_pred = pipeline.predict(X_test)

# Đánh giá mô hình
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f'Mean Squared Error: {mse}')
print(f'R-squared: {r2}')

Mean Squared Error: 0.008595047133499146
R-squared: 0.13990239573214114


In [31]:
from sklearn.linear_model import Ridge
from sklearn.model_selection import GridSearchCV
# Tạo pipeline cho mô hình Ridge Regression
pipeline = make_pipeline(StandardScaler(), Ridge())

# Danh sách các siêu tham số để thử nghiệm
param_grid = {
    'ridge__alpha': [0.1, 1, 10, 100]  # Các giá trị alpha để thử
}

# GridSearchCV để tìm siêu tham số tốt nhất
grid_search = GridSearchCV(pipeline, param_grid, cv=5, scoring='neg_mean_squared_error')
grid_search.fit(X_train, y_train)

# Lấy mô hình tốt nhất từ GridSearchCV
best_model = grid_search.best_estimator_

# Dự đoán trên tập test
y_pred = best_model.predict(X_test)

# Đánh giá mô hình
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f'Mean Squared Error: {mse}')
print(f'R-squared: {r2}')
print(f'Best Parameters: {grid_search.best_params_}')

Mean Squared Error: 0.00859255319274255
R-squared: 0.14015196184115908
Best Parameters: {'ridge__alpha': 100}


In [25]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
# Xây dựng mạng nơ-ron
model = Sequential()

# Thêm các layer vào mô hình
model.add(Dense(64, activation='relu', input_shape=(X_train.shape[1],)))
model.add(Dense(32, activation='relu'))
model.add(Dense(1))  # Số đơn vị ở layer cuối cùng phải là 1 vì chúng ta dự đoán một giá trị duy nhất

# Biên dịch mô hình
model.compile(optimizer='adam', loss='mean_squared_error', metrics=['mae'])  # Chọn metrics để theo dõi hiệu suất

# Huấn luyện mô hình
model.fit(X_train, y_train, epochs=50, batch_size=32, validation_split=0.2)  # Validation split để kiểm tra hiệu suất trên tập validation

# Đánh giá mô hình trên tập kiểm tra
loss, mae = model.evaluate(X_test, y_test)
print(f"Mean Squared Error: {loss}")
print(f"Mean Absolute Error: {mae}")

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50
Mean Squared Error: 0.0023227042984217405
Mean Absolute Error: 0.02063528448343277


In [26]:
loss, mae = model.evaluate(X_train, y_train)
print(f"Mean Squared Error: {loss}")
print(f"Mean Absolute Error: {mae}")

Mean Squared Error: 0.0032388726249337196
Mean Absolute Error: 0.022003285586833954
