In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [2]:
import tensorflow as tf

In [3]:
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

In [4]:
from sklearn.linear_model import LinearRegression
from xgboost import XGBRegressor
from sklearn.ensemble import RandomForestRegressor

In [5]:
df_train = pd.read_csv('/content/drive/MyDrive/Machine Learning/Scrabble Player Rating/train.csv')
df_turn = pd.read_csv('/content/drive/MyDrive/Machine Learning/Scrabble Player Rating/turns.csv')
df_test = pd.read_csv('/content/drive/MyDrive/Machine Learning/Scrabble Player Rating/test.csv')
df_games = pd.read_csv('/content/drive/MyDrive/Machine Learning/Scrabble Player Rating/games.csv')

# Table data train

In [6]:
df_train.head()

Unnamed: 0,game_id,nickname,score,rating
0,1,BetterBot,335,1637
1,1,stevy,429,1500
2,3,davidavid,440,1811
3,3,BetterBot,318,2071
4,4,Inandoutworker,119,1473


# Table data test

In [7]:
df_test.head()

Unnamed: 0,game_id,nickname,score,rating
0,2,Super,488,
1,2,BetterBot,401,2000.0
2,7,STEEBot,377,2082.0
3,7,Priya1,379,
4,11,STEEBot,334,1829.0


# Table data turns


In [8]:
df_turn.head()

Unnamed: 0,game_id,turn_number,nickname,rack,location,move,points,score,turn_type
0,1,1,BetterBot,DDEGITT,8G,DIG,10,10,Play
1,1,2,stevy,AEHOPUX,7H,HAP,18,18,Play
2,1,3,BetterBot,DEELTTU,6I,LUTE,16,26,Play
3,1,4,stevy,EMORSUX,5K,UM,16,34,Play
4,1,5,BetterBot,ACDEITU,L5,..DICATE,28,54,Play


# Table data games

In [9]:
df_games.head()

Unnamed: 0,game_id,first,time_control_name,game_end_reason,winner,created_at,lexicon,initial_time_seconds,increment_seconds,rating_mode,max_overtime_minutes,game_duration_seconds
0,1,BetterBot,regular,STANDARD,1,2022-08-26 03:38:49,NWL20,1200,0,CASUAL,1,674.844274
1,2,Super,regular,STANDARD,1,2022-08-10 19:19:59,CSW21,3600,0,RATED,1,364.214418
2,3,BetterBot,regular,STANDARD,1,2022-09-04 08:04:27,CSW21,900,0,RATED,5,492.268262
3,4,BetterBot,regular,RESIGNED,0,2022-09-12 02:36:19,CSW21,3600,0,CASUAL,1,350.861141
4,5,STEEBot,regular,STANDARD,0,2022-09-06 04:31:36,NWL20,1200,0,CASUAL,1,642.688722


In [10]:
def des_ana(dataframe):
    print('========================Shape==========================')
    print(f'{dataframe.shape}\n\n')
    print('========================Columns==========================')
    print(f'{dataframe.columns}\n\n')
    print('========================Info==========================')
    print(f'{dataframe.info()}\n\n')
    print('========================Descritive==========================')
    print(f'{dataframe.describe()}\n\n')
    print('========================Null Values==========================')
    print(f'{dataframe.isnull().sum()}\n\n')
    print('========================Unique Values==========================')
    print(f'{dataframe.nunique()}\n\n')

In [11]:
# "TRAIN"
des_ana(df_train)

(100820, 4)


Index(['game_id', 'nickname', 'score', 'rating'], dtype='object')


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100820 entries, 0 to 100819
Data columns (total 4 columns):
 #   Column    Non-Null Count   Dtype 
---  ------    --------------   ----- 
 0   game_id   100820 non-null  int64 
 1   nickname  100820 non-null  object
 2   score     100820 non-null  int64 
 3   rating    100820 non-null  int64 
dtypes: int64(3), object(1)
memory usage: 3.1+ MB
None


             game_id          score         rating
count  100820.000000  100820.000000  100820.000000
mean    36367.328605     399.029022    1875.592938
std     21020.288705      75.184778     231.791016
min         1.000000     -64.000000    1033.000000
25%     18139.000000     352.000000    1664.000000
50%     36339.500000     398.000000    1907.000000
75%     54625.000000     445.000000    2060.000000
max     72773.000000    1132.000000    2510.000000


game_id     0
nickname    0
score       0
rating      0


In [12]:
# "TEST"
des_ana(df_test)

(44726, 4)


Index(['game_id', 'nickname', 'score', 'rating'], dtype='object')


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 44726 entries, 0 to 44725
Data columns (total 4 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   game_id   44726 non-null  int64  
 1   nickname  44726 non-null  object 
 2   score     44726 non-null  int64  
 3   rating    22363 non-null  float64
dtypes: float64(1), int64(2), object(1)
memory usage: 1.4+ MB
None


            game_id         score        rating
count  44726.000000  44726.000000  22363.000000
mean   36431.342664    401.016791   2002.547780
std    20979.878622     79.889084    163.343333
min        2.000000      0.000000   1407.000000
25%    18347.500000    352.000000   1891.000000
50%    36474.000000    401.000000   2006.000000
75%    54477.750000    450.000000   2127.000000
max    72769.000000   1171.000000   2377.000000


game_id         0
nickname        0
score           0
rating      22363
dt

In [13]:
# "GAMES"
des_ana(df_games)

(72773, 12)


Index(['game_id', 'first', 'time_control_name', 'game_end_reason', 'winner',
       'created_at', 'lexicon', 'initial_time_seconds', 'increment_seconds',
       'rating_mode', 'max_overtime_minutes', 'game_duration_seconds'],
      dtype='object')


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 72773 entries, 0 to 72772
Data columns (total 12 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   game_id                72773 non-null  int64  
 1   first                  72773 non-null  object 
 2   time_control_name      72773 non-null  object 
 3   game_end_reason        72773 non-null  object 
 4   winner                 72773 non-null  int64  
 5   created_at             72773 non-null  object 
 6   lexicon                72773 non-null  object 
 7   initial_time_seconds   72773 non-null  int64  
 8   increment_seconds      72773 non-null  int64  
 9   rating_mode            72773 non-null  object 
 10

In [14]:
df_games.drop('created_at',axis=1, inplace=True)

In [15]:
df_games['winner'].value_counts()

 0    46713
 1    25792
-1      268
Name: winner, dtype: int64

In [16]:
df_train['game_id']

0             1
1             1
2             3
3             3
4             4
          ...  
100815    72771
100816    72772
100817    72772
100818    72773
100819    72773
Name: game_id, Length: 100820, dtype: int64

In [17]:
# winner:
# 1 -> player win
# 0 -> player lose
# -1 -> draw

In [18]:
len(df_train['game_id'])

100820

In [19]:
df_train['game_id']

0             1
1             1
2             3
3             3
4             4
          ...  
100815    72771
100816    72772
100817    72772
100818    72773
100819    72773
Name: game_id, Length: 100820, dtype: int64

In [20]:
df_train.tail(10)

Unnamed: 0,game_id,nickname,score,rating
100810,72767,HAYDEN,340,1395
100811,72767,BetterBot,414,1952
100812,72770,HastyBot,590,2237
100813,72770,samsiah06,97,1332
100814,72771,BB-8,390,1500
100815,72771,HastyBot,393,1614
100816,72772,BetterBot,442,1674
100817,72772,Gtowngrad,388,1364
100818,72773,adola,383,2075
100819,72773,HastyBot,346,2302


In [21]:
for i in range(0,len(df_train['game_id']),2):
    print(df_train['game_id'][i+1])

[1;30;43mKết quả truyền trực tuyến bị cắt bớt đến 5000 dòng cuối.[0m
65546
65547
65548
65549
65551
65552
65553
65554
65557
65559
65560
65563
65564
65566
65567
65570
65571
65572
65573
65574
65575
65576
65580
65581
65582
65584
65586
65587
65588
65591
65593
65595
65596
65597
65600
65601
65603
65604
65605
65606
65607
65608
65609
65610
65611
65612
65613
65614
65615
65616
65617
65620
65622
65623
65625
65627
65628
65629
65630
65631
65632
65634
65635
65636
65637
65638
65639
65640
65642
65643
65645
65646
65647
65648
65650
65652
65653
65654
65655
65656
65657
65659
65664
65667
65668
65669
65671
65673
65674
65676
65678
65679
65680
65683
65686
65687
65688
65689
65690
65692
65693
65695
65699
65700
65701
65702
65703
65707
65709
65710
65711
65713
65714
65716
65721
65722
65723
65724
65725
65727
65728
65733
65734
65735
65736
65737
65738
65741
65742
65743
65744
65745
65746
65748
65749
65751
65752
65753
65754
65758
65759
65760
65761
65763
65764
65765
65769
65771
65773
65775
65776
65777
65779
65781
65782

In [22]:
# Sign the winner-loser or draw

def sign_winner(data):
    arr = []
    for i in range(0,len(data['game_id']),2):
        if (data['score'][i] > data['score'][i+1]):
            a=1
            b=0
            arr.append(a)
            arr.append(b)
        elif (data['score'][i] < data['score'][i+1]):
            a=0
            b=1
            arr.append(a)
            arr.append(b)
        elif (data['score'][i] == data['score'][i+1]):
            a=b=-1
            arr.append(a)
            arr.append(b)
    return arr

In [23]:
# This function will take columns have conformity rows then give into data train and data test
def conform_column(df_games, data):

    # combine columns in 'df_games' to data train and test.
    cols = ['game_id','time_control_name', 'game_end_reason','lexicon','initial_time_seconds','increment_seconds','rating_mode','max_overtime_minutes','game_duration_seconds']
    new_data = pd.merge(data,df_games[cols],on='game_id')
    new_data = new_data.assign(winner=sign_winner(data))

    # move column 'rating' to the end. Is just for easy to look
    temp = new_data['rating']
    new_data.drop('rating',axis=1,inplace=True)
    new_data = new_data.assign(rating=temp)
    return new_data

In [24]:
df_train = conform_column(df_games, df_train)
df_test = conform_column(df_games, df_test)

In [25]:
df_train.shape

(100820, 13)

In [26]:
df_test.head()

Unnamed: 0,game_id,nickname,score,time_control_name,game_end_reason,lexicon,initial_time_seconds,increment_seconds,rating_mode,max_overtime_minutes,game_duration_seconds,winner,rating
0,2,Super,488,regular,STANDARD,CSW21,3600,0,RATED,1,364.214418,1,
1,2,BetterBot,401,regular,STANDARD,CSW21,3600,0,RATED,1,364.214418,0,2000.0
2,7,STEEBot,377,regular,STANDARD,CSW21,1260,0,RATED,1,385.599607,0,2082.0
3,7,Priya1,379,regular,STANDARD,CSW21,1260,0,RATED,1,385.599607,1,
4,11,STEEBot,334,regular,STANDARD,NWL20,1200,0,CASUAL,1,501.739156,0,1829.0


In [27]:
# Player hasn't rating. just only bot has
df_player_test = df_test[df_test['rating'].isnull()]
df_bot_test = df_test[df_test['rating'].notnull()]

In [28]:
df_train = df_train.append(df_bot_test)

  df_train = df_train.append(df_bot_test)


In [29]:
df_test = df_player_test

In [30]:
df_train.drop(['game_id', 'nickname'], axis=1,inplace=True)
df_test.drop(['game_id', 'nickname', 'rating'], axis=1,inplace=True)

In [31]:
df_train.head()

Unnamed: 0,score,time_control_name,game_end_reason,lexicon,initial_time_seconds,increment_seconds,rating_mode,max_overtime_minutes,game_duration_seconds,winner,rating
0,335,regular,STANDARD,NWL20,1200,0,CASUAL,1,674.844274,0,1637.0
1,429,regular,STANDARD,NWL20,1200,0,CASUAL,1,674.844274,1,1500.0
2,440,regular,STANDARD,CSW21,900,0,RATED,5,492.268262,1,1811.0
3,318,regular,STANDARD,CSW21,900,0,RATED,5,492.268262,0,2071.0
4,119,regular,RESIGNED,CSW21,3600,0,CASUAL,1,350.861141,0,1473.0


In [32]:
le = LabelEncoder()

cat_cols = ['time_control_name','game_end_reason','lexicon','rating_mode','winner']

for col in cat_cols:
    df_train[col] = le.fit_transform(df_train[col])
    df_train[col] = df_train[col].astype('category')
    df_test[col] = le.fit_transform(df_test[col])
    df_test[col] = df_test[col].astype('category')

In [33]:
df_train.head()

Unnamed: 0,score,time_control_name,game_end_reason,lexicon,initial_time_seconds,increment_seconds,rating_mode,max_overtime_minutes,game_duration_seconds,winner,rating
0,335,2,2,3,1200,0,0,1,674.844274,1,1637.0
1,429,2,2,3,1200,0,0,1,674.844274,2,1500.0
2,440,2,2,0,900,0,1,5,492.268262,2,1811.0
3,318,2,2,0,900,0,1,5,492.268262,1,2071.0
4,119,2,1,0,3600,0,0,1,350.861141,1,1473.0


In [34]:
# num_cols = ['score', 'initial_time_seconds','increment_seconds','max_overtime_minutes','game_duration_seconds','rating']

# ct = ColumnTransformer(
#     [
#         ('scale', StandardScaler(), ['score', 'initial_time_seconds','increment_seconds','max_overtime_minutes','game_duration_seconds','rating']),
#     ],remainder='passthrough'
# )

# df_train[num_cols] = ct.fit_transform(df_train[num_cols])

In [35]:
# train, valid, _ = np.split(df_train.sample(frac=1), [int(0.7*len(df_train)), int(0.95*len(df_train))])

In [36]:
# train

In [37]:
scale = StandardScaler()

num_cols_train = ['score', 'initial_time_seconds','increment_seconds','max_overtime_minutes','game_duration_seconds','rating']
num_cols_test = ['score', 'initial_time_seconds','increment_seconds','max_overtime_minutes','game_duration_seconds']

# train[num_cols_train] = scale.fit_transform(train[num_cols_train])
# valid[num_cols_train] = scale.transform(valid[num_cols_train])
df_train[num_cols_train] = scale.fit_transform(df_train[num_cols_train])
#df_test[num_cols_test] = scale.fit_transform(df_test[num_cols_test])

In [38]:
X = df_train.iloc[:,:-1].values
y = df_train.iloc[:,-1].values
# X_train = train.iloc[:,:-1].values
# y_train = train.iloc[:,-1].values
# X_valid = valid.iloc[:,:-1].values
# y_valid = valid.iloc[:,-1].values

In [39]:
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.3, random_state=42)

In [40]:
xgb = XGBRegressor()
xgb.fit(X_train, y_train)
y_pred = xgb.predict(X_test)


In [41]:
ln = LinearRegression()
ln.fit(X_train, y_train)
y_pred = ln.predict(X_test)

In [42]:
rf = RandomForestRegressor()
rf.fit(X_train, y_train)
y_pred = rf.predict(X_test)

In [43]:
df_scale_test = np.hstack((X_test, np.reshape(y_test, (-1,1))))
df_scale_pred = np.hstack((X_test, np.reshape(y_pred, (-1,1))))

In [44]:
df_scale_test = np.delete(df_scale_test, [1,2,3,6,9],axis=1)

In [45]:
df_scale_pred = np.delete(df_scale_pred, [1,2,3,6,9],axis=1)

In [46]:
df_scale_test

array([[ 0.22196224, -1.60650597, -0.09038784, -0.40867287, -1.12592301,
         0.85002285],
       [ 0.14260441, -0.42868459, -0.09038784,  2.44794549, -0.91740801,
        -1.76156638],
       [-0.35999522, -0.42868459, -0.09038784, -0.40867287,  0.04073494,
         0.2888184 ],
       ...,
       [-0.3335426 , -0.42868459, -0.09038784,  2.44794549, -0.49306564,
        -1.76156638],
       [ 0.89650384, -0.0080341 , -0.09038784, -0.40867287,  0.28997592,
         0.39929171],
       [ 0.10292549, -0.0080341 , -0.09038784, -0.40867287,  1.04697792,
        -0.82033371]])

In [47]:
df_scale_test = scale.inverse_transform(df_scale_test)
df_scale_pred = scale.inverse_transform(df_scale_pred)

In [53]:
df_scale_test[:,-1]

array([2091., 1500., 1964., ..., 1500., 1989., 1713.])

In [49]:
df_scale_pred[:,-1]

array([2081.73, 1512.04, 1994.8 , ..., 1586.9 , 2092.62, 1753.35])

In [54]:
print(mean_squared_error(df_scale_test[:,-1], df_scale_pred[:,-1]))

17211.31815953269


In [51]:
print(mean_squared_error(df_scale_test[:,-1], df_scale_pred[:,-1]))

17211.31815953269


In [52]:
print(mean_squared_error(df_scale_test[:,-1], df_scale_pred[:,-1]))

17211.31815953269


In [57]:
X_train.reshape(-1).shape

(862280,)

In [58]:
X_train.shape

(86228, 10)

In [59]:
nn_model = tf.keras.Sequential([
    tf.keras.layers.Dense(32, activation='relu'),
    tf.keras.layers.Dense(32, activation='relu'),
    tf.keras.layers.Dense(1, activation='relu')
])

nn_model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=0.1),loss='mean_squared_error')

In [None]:
history = nn_model.fit(
    X_train, y_train,
    validation_data = (X_test, y_test),
    verbose=0, epochs=10
)

In [None]:
plot_loss(history)

NameError: ignored