# Connect 4 — Combine Datasets & Deduplicate

Load all datasets (yours + Sebastian's + the additional best_20k set + **Professor MCTS7500**), keep only **unique board positions** (majority vote when the same board has conflicting moves), then shuffle, split into train/val/test, and save a combined dataset ready for CNN and Transformer training.

## 1. Mount Drive & Paths

In [5]:
import os
import pickle
import numpy as np
from collections import defaultdict

# Storage root: the current working directory by default, Google Drive when
# the notebook runs on Colab and the mount succeeds.
BASE = os.getcwd()
DRIVE_MOUNTED = False
try:
    from google.colab import drive
    drive.mount('/content/drive')
except Exception as e:
    # Deliberately best-effort: any failure (not on Colab, mount refused, ...)
    # falls back to local storage instead of stopping the notebook.
    print('Drive mount skipped, using local:', type(e).__name__)
else:
    BASE = '/content/drive/MyDrive'
    DRIVE_MOUNTED = True
    print('Using Google Drive')

# Output for combined dataset
OUTPUT_DIR = f'{BASE}/Connect4_Combined'
# makedirs creates OUTPUT_DIR itself on the way to the datasets subfolder.
os.makedirs(f'{OUTPUT_DIR}/datasets', exist_ok=True)

# Only claim the mount when it actually happened.
if DRIVE_MOUNTED:
    print('Drive mounted')
print(f'Output dir: {OUTPUT_DIR}')

Mounted at /content/drive
Using Google Drive
Drive mounted
Output dir: /content/drive/MyDrive/Connect4_Combined


## 2. Dataset Paths — Update if yours differ

In [6]:
# ============ UPDATE THESE PATHS IF YOUR FOLDER STRUCTURE DIFFERS ============

# Source .npz archives, written relative to BASE and expanded to full paths.
DATASET_PATHS = [
    f'{BASE}/{relative_path}'
    for relative_path in (
        # Your Fast
        'Connect4_FAST/datasets/connect4_fast_5000.npz',
        'Connect4_FAST/datasets/connect4_high_depth.npz',
        # Your Deep Search
        'Connect4_DeepSearch/datasets/connect4_deep_search.npz',
        # Sebastian's Fast
        'Connect4FastSearch_Sebastian/Connect4_FAST/datasets/connect4_fast_5000.npz',
        # Sebastian's Deep Search
        'Connect4DeepSearch_Sebastian/Connect4_DeepSearch/datasets/connect4_deep_search.npz',
    )
]

# Additional DataGenerator (best_20k) — upload to Drive or point to local path
# Option A: If uploaded to Drive under Connect4_AdditionalData or similar
ADDITIONAL_X, ADDITIONAL_Y = (
    f'{BASE}/Connect4_AdditionalData/Additional_DataGenerator/best_20k_{suffix}.npy'
    for suffix in ('X', 'Y')
)

# Option B: If you upload the .npy files to Colab session storage (use file picker below)
# ADDITIONAL_X = '/content/best_20k_X.npy'
# ADDITIONAL_Y = '/content/best_20k_Y.npy'

USE_ADDITIONAL = True  # Set False to skip best_20k if not available

# Professor's MCTS7500 dataset (option-a encoding: 6x7 +1/-1/0)
# Candidate locations, checked in order: Drive/BASE root, relative path, cwd.
PROFESSOR_CANDIDATES = [
    f'{BASE}/mcts7500_pool.pickle',
    'mcts7500_pool.pickle',
    os.path.join(os.getcwd(), 'mcts7500_pool.pickle'),
]
# First candidate that exists; otherwise the first entry, so the "skipped"
# report still names a concrete path.
PROFESSOR_PICKLE = next(
    filter(os.path.exists, PROFESSOR_CANDIDATES),
    PROFESSOR_CANDIDATES[0],
)
USE_PROFESSOR = True  # Set False to skip professor dataset

## 3. Load All Datasets

In [7]:
def load_npz(path):
    """Load board and move arrays from an ``.npz`` archive.

    Two key layouts are supported: ``X_train``/``y_train``, or ``X`` paired
    with ``y_move`` (falling back to ``y`` when ``y_move`` is absent).

    Args:
        path: Path to the ``.npz`` file.

    Returns:
        Tuple ``(X, y)`` with ``X`` cast to float32 and ``y`` cast to int64.

    Raises:
        KeyError: If the archive contains neither ``X_train`` nor ``X``, or
            the matching label array is missing.
    """
    # np.load returns an NpzFile that keeps the file open; the context manager
    # closes it once the needed arrays have been read into memory.
    with np.load(path) as data:
        if 'X_train' in data:
            X, y = data['X_train'], data['y_train']
        elif 'X' in data:
            X = data['X']
            y = data['y_move'] if 'y_move' in data else data['y']
        else:
            raise KeyError(f'Unknown keys: {list(data.keys())}')
    return X.astype(np.float32), y.astype(np.int64)

def option_a_to_b(b):
    """Convert option-a board(s) (+1/-1/0 cells) into two binary planes.

    Plane 0 marks cells equal to +1 and plane 1 marks cells equal to -1. The
    planes are stacked on a new last axis, so a single (6, 7) board becomes
    (6, 7, 2) and a batch (N, 6, 7) becomes (N, 6, 7, 2).
    """
    b = np.asarray(b)
    return np.stack([b == 1, b == -1], axis=-1).astype(np.float32)


def _add_dataset(name, X, y):
    """Register one source after checking that boards and moves line up."""
    if len(X) != len(y):
        # A mismatch would silently misalign boards and labels once everything
        # is concatenated, so stop here with the offending source named.
        raise ValueError(f'{name}: {len(X):,} boards but {len(y):,} moves')
    all_X.append(X)
    all_y.append(y)
    loaded.append((name, len(X)))


# Per-source arrays, plus bookkeeping for the summary printed at the end.
all_X, all_y = [], []
loaded, skipped = [], []

for p in DATASET_PATHS:
    if os.path.exists(p):
        X, y = load_npz(p)
        _add_dataset(p.split('/')[-1], X, y)
    else:
        skipped.append(p)

# Additional (best_20k)
if USE_ADDITIONAL and os.path.exists(ADDITIONAL_X) and os.path.exists(ADDITIONAL_Y):
    X_add = np.load(ADDITIONAL_X).astype(np.float32)
    y_add = np.load(ADDITIONAL_Y).astype(np.int64)
    _add_dataset('best_20k (additional)', X_add, y_add)
elif USE_ADDITIONAL:
    skipped.append(f'{ADDITIONAL_X} / {ADDITIONAL_Y}')

# Professor's MCTS7500 (option-a -> option-b conversion)
if USE_PROFESSOR and os.path.exists(PROFESSOR_PICKLE):
    # NOTE(security): pickle.load can execute arbitrary code embedded in the
    # file. Only load a pickle obtained from a source you trust.
    with open(PROFESSOR_PICKLE, 'rb') as f:
        prof = pickle.load(f)
    board_x = np.array(prof['board_x'], dtype=np.float32)
    play_y = np.array(prof['play_y'], dtype=np.int64)
    # One vectorized conversion for the whole batch instead of a Python loop
    # over every board.
    X_prof = option_a_to_b(board_x)
    _add_dataset('professor_MCTS7500', X_prof, play_y)
elif USE_PROFESSOR:
    skipped.append(PROFESSOR_PICKLE)

print('Loaded:')
for name, n in loaded:
    print(f'  {name}: {n:,} samples')
if skipped:
    print('\nSkipped (not found):')
    for p in skipped:
        print(f'  {p}')

  prof = pickle.load(f)


Loaded:
  connect4_fast_5000.npz: 254,230 samples
  connect4_high_depth.npz: 3,997 samples
  connect4_deep_search.npz: 59,948 samples
  connect4_fast_5000.npz: 254,228 samples
  connect4_deep_search.npz: 60,954 samples
  professor_MCTS7500: 265,620 samples

Skipped (not found):
  /content/drive/MyDrive/Connect4_AdditionalData/best_20k_X.npy / /content/drive/MyDrive/Connect4_AdditionalData/best_20k_Y.npy


## 4. Concatenate & Deduplicate (Unique Boards Only)

In [8]:
from collections import Counter

# Fail early with a clear message instead of np.concatenate's generic error.
if not all_X:
    raise ValueError('No datasets were loaded; check the paths in section 2.')

# Concatenate all
X_all = np.concatenate(all_X, axis=0)
y_all = np.concatenate(all_y, axis=0)
if len(X_all) != len(y_all):
    raise ValueError(
        f'Board/move count mismatch: {len(X_all):,} boards vs {len(y_all):,} moves'
    )
print(f'Total before dedup: {len(X_all):,} samples')

# Deduplicate by board (hash) — keep one per unique position, majority vote for move.
# Each entry maps the board's raw bytes to (index of its first occurrence in
# X_all, Counter of the moves labelled for it). Keeping an index and a tally
# avoids holding one array view per sample.
board_to_moves = {}

for i, (board, move) in enumerate(zip(X_all, y_all)):
    key = board.tobytes()
    entry = board_to_moves.get(key)
    if entry is None:
        entry = board_to_moves[key] = (i, Counter())
    entry[1][int(move)] += 1
    if (i + 1) % 100000 == 0:
        print(f'  Processed {i+1:,} / {len(X_all):,}')

# For each unique board (in first-seen order), take majority vote on move.
# Counter.most_common breaks ties in favour of the move encountered first.
first_rows = np.array(
    [first for first, _ in board_to_moves.values()], dtype=np.int64
)
X_unique = X_all[first_rows].astype(np.float32, copy=False)
y_unique = np.array(
    [votes.most_common(1)[0][0] for _, votes in board_to_moves.values()],
    dtype=np.int64,
)
# A board is a conflict when its sources disagree on the move.
conflicts = sum(len(votes) > 1 for _, votes in board_to_moves.values())

print(f'\nAfter dedup: {len(X_unique):,} unique boards')
print(f'Positions with conflicting moves (used majority): {conflicts:,}')

Total before dedup: 898,977 samples
  Processed 100,000 / 898,977
  Processed 200,000 / 898,977
  Processed 300,000 / 898,977
  Processed 400,000 / 898,977
  Processed 500,000 / 898,977
  Processed 600,000 / 898,977
  Processed 700,000 / 898,977
  Processed 800,000 / 898,977

After dedup: 680,166 unique boards
Positions with conflicting moves (used majority): 19,830


## 5. Shuffle & Train/Val/Test Split

In [None]:
# Split configuration: fixed seed for a reproducible shuffle, 10% test, 10% val.
SEED = 42
TEST_SPLIT = 0.1
VAL_SPLIT = 0.1

# Shuffle boards and moves with the same permutation so pairs stay aligned.
np.random.seed(SEED)
idx = np.random.permutation(len(X_unique))
X_unique, y_unique = X_unique[idx], y_unique[idx]

# Partition sizes; the training set receives whatever is left after rounding.
n = len(X_unique)
n_test = int(n * TEST_SPLIT)
n_val = int(n * VAL_SPLIT)
n_train = n - n_test - n_val

# Layout of the shuffled arrays: [ test | val | train ].
X_test, y_test = X_unique[:n_test], y_unique[:n_test]
X_val, y_val = X_unique[n_test:n_test+n_val], y_unique[n_test:n_test+n_val]
X_train, y_train = X_unique[n_test+n_val:], y_unique[n_test+n_val:]

print('Split:')
print(f'  Train: {X_train.shape[0]:,}')
print(f'  Val:   {X_val.shape[0]:,}')
print(f'  Test:  {X_test.shape[0]:,}')

## 6. Save Combined Dataset

In [None]:
# Destination archive inside the output folder created in section 1.
DATASET_PATH = f'{OUTPUT_DIR}/datasets/connect4_combined_unique.npz'

# Write all six split arrays into one compressed archive, keyed by name.
split_arrays = dict(
    X_train=X_train,
    y_train=y_train,
    X_val=X_val,
    y_val=y_val,
    X_test=X_test,
    y_test=y_test,
)
np.savez_compressed(DATASET_PATH, **split_arrays)

# Report the on-disk size in MiB.
size_mb = os.path.getsize(DATASET_PATH) / (1024**2)
print(f'Saved: {DATASET_PATH}')
print(f'Size: {size_mb:.1f} MB')
print('\nKeys: X_train, y_train, X_val, y_val, X_test, y_test')

## 7. Quick Load Test (for your training notebook)

In [None]:
# Verify — use this pattern in your CNN/Transformer training notebook.
# np.load returns an NpzFile that keeps the archive open; the context manager
# closes it once the arrays have been read into memory.
with np.load(DATASET_PATH) as data:
    X_tr = data['X_train']
    y_tr = data['y_train']
    X_v = data['X_val']
    y_v = data['y_val']
    X_te = data['X_test']
    y_te = data['y_test']

print(f'X_train: {X_tr.shape} (6x7x2)')
print(f'y_train: {y_tr.shape} (column 0-6)')
# minlength=7 keeps one bin per column even if some column never appears.
print(f'Move distribution: {np.bincount(y_tr.astype(int), minlength=7)}')
print('\nReady for CNN and Transformer training!')

## 8. Upload best_20k if not on Drive (optional)

In [None]:
# Run this cell only if you need to upload best_20k_X.npy and best_20k_Y.npy from your computer.
# Then enable Option B in section 2 (USE_ADDITIONAL=True, ADDITIONAL_X='/content/best_20k_X.npy',
# ADDITIONAL_Y='/content/best_20k_Y.npy') and re-run from section 2 onward.

try:
    from google.colab import files
except ImportError:
    # The upload widget only exists on Colab; on a local run, skip instead of
    # failing so the rest of the notebook is unaffected.
    uploaded = {}
    print('google.colab is not available; place the .npy files locally and update ADDITIONAL_X / ADDITIONAL_Y.')
else:
    print('Upload best_20k_X.npy and best_20k_Y.npy (one at a time)')
    uploaded = files.upload()  # Will prompt for files
    # Files save to /content/ — then set ADDITIONAL_X = '/content/best_20k_X.npy', etc.