In [None]:
# --- RadioML 2016.10A: Baseline ML models (LogReg + Decision Tree) ---

import pickle, numpy as np, pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.pipeline import make_pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, classification_report

# 1) Load the dataset dict {(mod, snr): np.array[num_samples, 2, 128]}
# NOTE(security): unpickling can execute arbitrary code embedded in the file.
# Only load a copy of this dataset obtained from a source you trust.
with open("RML2016.10a_dict.pkl", "rb") as f:
    data_dict = pickle.load(f, encoding="latin1")  # latin1 handles older pickles

# 2) Build X (features) and y (labels)
# Every dict entry contributes one block of rows. The SNR half of the key is
# not used, so examples from all SNRs are pooled under their modulation label.
X_list, y_list = [], []
for (mod, snr), arr in data_dict.items():
    arr = np.asarray(arr)
    # Flatten everything after the first axis: (N, 2, 128) -> (N, 256),
    # i.e. the I row followed by the Q row for each example.
    feats = arr.reshape(arr.shape[0], -1)
    X_list.append(feats)
    # One label per row, built directly as an array rather than as an
    # N-element Python list that is converted afterwards.
    y_list.append(np.repeat(mod, arr.shape[0]))

# copy=False avoids a second full-size copy when the stacked matrix is
# already float32; a conversion copy still happens for any other dtype.
X = np.vstack(X_list).astype(np.float32, copy=False)
y = np.concatenate(y_list)

print("X shape:", X.shape, "| y shape:", y.shape, "| classes:", np.unique(y))

# (Optional) Random subsample so experiments stay light on RAM.
# Set MAX_SAMPLES to 0 or None (or drop this block) to use every example.
MAX_SAMPLES = 60000
n_available = len(y)
if MAX_SAMPLES and n_available > MAX_SAMPLES:
    # Fixed seed, sampling without replacement: the same rows are picked on
    # every run and no example is selected twice.
    rng = np.random.default_rng(42)
    idx = rng.choice(n_available, size=MAX_SAMPLES, replace=False)
    X = X[idx]
    y = y[idx]
    print(f"Subsampled to: {X.shape[0]} examples")

# 3) Encode labels: map each class-name string in y to an integer id.
#    `le` is kept so the ids can be turned back into names for reporting.
le = LabelEncoder().fit(y)
y_enc = le.transform(y)

# 4) Train/test split: hold out 20% of the rows, stratified on the encoded
#    labels so both parts keep the same class balance.
split_options = dict(test_size=0.2, random_state=42, stratify=y_enc)
X_train, X_test, y_train, y_test = train_test_split(X, y_enc, **split_options)

# 5) Models
# The decision tree is fit on the features as-is; the seed fixes its
# tie-breaking so repeated runs give the same tree.
tree = DecisionTreeClassifier(random_state=42)

# Logistic regression is wrapped in a pipeline so the scaler is fit on the
# training rows only and then applied unchanged at predict time.
scaler = StandardScaler(with_mean=True)  # helps linear models
linear_clf = LogisticRegression(max_iter=2000, n_jobs=-1)
logreg = make_pipeline(scaler, linear_clf)

# 6) Fit + evaluate
def run_model(name, model):
    """Fit ``model`` on the training split and print its test-split metrics.

    Reads the module-level ``X_train``, ``y_train``, ``X_test`` and ``y_test``
    arrays and the fitted label encoder ``le``. Prints the overall accuracy
    followed by a per-class precision/recall/F1 report. Returns ``None``.

    Args:
        name: Display name used in the printed accuracy line.
        model: Estimator exposing ``fit(X, y)`` and ``predict(X)``; it is
            fitted in place.
    """
    model.fit(X_train, y_train)
    preds = model.predict(X_test)
    acc = accuracy_score(y_test, preds)
    print(f"\n{name} Accuracy: {acc:.3f}")
    # Pin the label ids explicitly. Without `labels`, the report infers the
    # classes from y_test/preds, and raises a length mismatch against
    # target_names whenever a class is absent from both (e.g. after a very
    # small subsample).
    class_ids = le.transform(le.classes_)
    print(
        classification_report(
            y_test,
            preds,
            labels=class_ids,
            target_names=le.classes_,
            digits=3,
        )
    )

# Train and report each baseline in turn, linear model first.
for model_name, estimator in (
    ("Logistic Regression", logreg),
    ("Decision Tree", tree),
):
    run_model(model_name, estimator)



X shape: (220000, 256) | y shape: (220000,) | classes: ['8PSK' 'AM-DSB' 'AM-SSB' 'BPSK' 'CPFSK' 'GFSK' 'PAM4' 'QAM16' 'QAM64'
 'QPSK' 'WBFM']
Subsampled to: 60000 examples

Logistic Regression Accuracy: 0.151
              precision    recall  f1-score   support

        8PSK      0.099     0.116     0.107      1108
      AM-DSB      0.332     0.416     0.369      1080
      AM-SSB      0.160     0.198     0.177      1102
        BPSK      0.095     0.078     0.086      1074
       CPFSK      0.115     0.109     0.112      1094
        GFSK      0.100     0.071     0.083      1092
        PAM4      0.118     0.129     0.123      1105
       QAM16      0.116     0.096     0.105      1079
       QAM64      0.113     0.096     0.104      1088
        QPSK      0.114     0.111     0.113      1093
        WBFM      0.218     0.239     0.228      1085

    accuracy                          0.151     12000
   macro avg      0.144     0.151     0.146     12000
weighted avg      0.143     0.151