In [34]:
import pandas as pd
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder
import re
import numpy as np
from sklearn.metrics import accuracy_score

In [2]:
# Read the games CSV; the path is relative to the notebook's working directory.
games_csv_path = 'women_clean_data.csv'
df = pd.read_csv(games_csv_path)

In [4]:
# Moves: cast every cell to text, then blank out cells whose text is exactly 'nan'
# (the string a missing value turns into after the cast).
moves_as_text = df['Moves'].astype(str)
df['Moves'] = moves_as_text.replace('nan', '')

# TimeControl: missing entries fall back to the label 'standard'.
time_control_filled = df['TimeControl'].fillna('standard')
df['TimeControl'] = time_control_filled

In [5]:
# Per-column count of nulls remaining after the fill step above.
missing_per_column = df.isnull().sum()
print("\nMissing Values After Filling:", missing_per_column, sep="\n")


Missing Values After Filling:
White          0
Black          0
Result         0
TimeControl    0
Moves          0
dtype: int64


In [18]:
# Build one training record per pair of consecutive moves in every game:
# (White, Black, Result, TimeControl, CurrentMove) -> NextMove.

# Token filter deciding which space-separated entries of 'Moves' count as moves.
# The pattern text is unchanged; it is compiled once instead of once per token.
# NOTE(review): only the first alternative is anchored at both ends. The second
# accepts any token that merely *starts* with an optional piece letter, an
# optional 'x' and a square (which is why 'Bb5+' passes), while tokens such as
# 'exd5' or 'Nbd7' match no alternative and are silently dropped, so the
# "next" move may skip over moves actually played -- confirm this is intended.
MOVE_PATTERN = re.compile(r'^[a-h][1-8][qrbn]?$|^[KQRBNA]?x?[a-h][1-8]|O-O(-O)?$')

# Explicit column list so the resulting frame has the expected columns even
# when no game produces a pair.
PAIR_COLUMNS = ['White', 'Black', 'Result', 'TimeControl', 'CurrentMove', 'NextMove']

# Must be named `next_moves`: the loop appends to it and the DataFrame is built
# from it, so any other name raises NameError on a fresh kernel.
next_moves = []
for _index, row in df.iterrows():
    tokens = row['Moves'].split(' ')

    # Keep only the tokens accepted by the move pattern
    valid_moves = [token for token in tokens if MOVE_PATTERN.match(token)]

    # Pair each move with its successor; games with fewer than two accepted
    # moves yield no pairs and are therefore skipped.
    for current_move, following_move in zip(valid_moves, valid_moves[1:]):
        next_moves.append({
            'White': row['White'],
            'Black': row['Black'],
            'Result': row['Result'],
            'TimeControl': row['TimeControl'],
            'CurrentMove': current_move,
            'NextMove': following_move
        })

# Convert to DataFrame
next_moves_df = pd.DataFrame(next_moves, columns=PAIR_COLUMNS)

In [19]:
# Show the first rows of the per-move table as a quick sanity check.
preview_rows = next_moves_df.head()
print("\nNext Moves DataFrame:", preview_rows, sep="\n")


Next Moves DataFrame:
                     White              Black Result TimeControl CurrentMove  \
0  Goryachkina, Aleksandra  Koutlas, Nikolaos    1-0       180+2          e4   
1  Goryachkina, Aleksandra  Koutlas, Nikolaos    1-0       180+2          c5   
2  Goryachkina, Aleksandra  Koutlas, Nikolaos    1-0       180+2         Nf3   
3  Goryachkina, Aleksandra  Koutlas, Nikolaos    1-0       180+2          d6   
4  Goryachkina, Aleksandra  Koutlas, Nikolaos    1-0       180+2        Bb5+   

  NextMove  
0       c5  
1      Nf3  
2       d6  
3     Bb5+  
4      Nd7  


In [20]:
# Every column of the per-move table is categorical text, so each one gets its
# own LabelEncoder. The fitted encoders are kept by column name for later use.
encoded_columns = ['White', 'Black', 'Result', 'TimeControl', 'CurrentMove', 'NextMove']

label_encoders = {
    name: LabelEncoder().fit(next_moves_df[name])
    for name in encoded_columns
}

# Replace each column, in place, with its integer codes.
for name, encoder in label_encoders.items():
    next_moves_df[name] = encoder.transform(next_moves_df[name])

In [21]:
# Feature matrix: everything except the move being predicted.
feature_columns = ['White', 'Black', 'Result', 'TimeControl', 'CurrentMove']
# Target: the move that follows CurrentMove.
target_column = 'NextMove'

X = next_moves_df[feature_columns]
y = next_moves_df[target_column]


In [22]:
# Hold out 20% of the rows for testing; the seed fixes the shuffle.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)


In [30]:

# Shrink the training set: test_size=0.8 sends 80% of the training rows to the
# discarded half of the split, leaving 20% in the *_sampled variables.
subsample_split = train_test_split(X_train, y_train, test_size=0.8, random_state=42)
X_train_sampled, _, y_train_sampled, _ = subsample_split


In [31]:
# Candidate hyper-parameter values for the random forest search.
# Only n_estimators offers more than one value, so the grid holds two settings.
param_grid = dict(
    n_estimators=[5, 10],    # small forests only
    max_depth=[5],           # shallow trees
    min_samples_split=[2],
    min_samples_leaf=[1],
)

In [32]:
# Randomised hyper-parameter search over `param_grid` using a random forest.
# `random_state` is set on the search itself (not only on the forest) so the
# candidate settings are sampled in the same order on every run.
random_search = RandomizedSearchCV(
    RandomForestClassifier(random_state=42),
    param_distributions=param_grid,
    n_iter=2,  # Number of parameter settings to sample
    cv=2,  # Keep cross-validation folds low
    n_jobs=1,  # Single core to avoid memory overuse
    verbose=2,
    random_state=42,
)

In [33]:
# Run the search on the subsampled training data.
random_search.fit(X_train_sampled, y_train_sampled)

# Report the winning setting and its mean cross-validation score.
search_summary = (
    ("Best parameters found:", random_search.best_params_),
    ("Best cross-validation score:", random_search.best_score_),
)
for label, value in search_summary:
    print(label, value)


Fitting 2 folds for each of 2 candidates, totalling 4 fits




[CV] END max_depth=5, min_samples_leaf=1, min_samples_split=2, n_estimators=5; total time=   8.7s
[CV] END max_depth=5, min_samples_leaf=1, min_samples_split=2, n_estimators=5; total time=   8.5s
[CV] END max_depth=5, min_samples_leaf=1, min_samples_split=2, n_estimators=10; total time=  14.5s
[CV] END max_depth=5, min_samples_leaf=1, min_samples_split=2, n_estimators=10; total time=  14.4s


  _data = np.array(data, dtype=dtype, copy=copy,


Best parameters found: {'n_estimators': 10, 'min_samples_split': 2, 'min_samples_leaf': 1, 'max_depth': 5}
Best cross-validation score: 0.027805683786232445


In [35]:
# Score the refitted best estimator from the search on the held-out test split.
best_model = random_search.best_estimator_
y_pred = best_model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)

# Reported as a percentage with two decimals.
print(f"Test Accuracy: {accuracy * 100:.2f}%")


Test Accuracy: 2.94%
