In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.feature_selection import SelectKBest, f_classif
from onnxmltools.convert import convert_xgboost
from onnxmltools.convert.common.data_types import FloatTensorType
import json
import warnings

warnings.filterwarnings('ignore')



In [2]:
# Locations used by the cells below: the input trace read by load_data(),
# and the two artifacts written by save_onnx_artifacts().
DATA_PATH = 'cache_trace_with_target.csv'
ONNX_MODEL_SAVE_PATH = 'model.onnx'
CONFIG_SAVE_PATH = 'model_config.json'


def load_data(filepath):
    """Load the cache-trace dataset from a delimited text file.

    The first line decides the format: if it starts with the header
    'Address,Set,Tag' the file is read as comma-separated and that header
    line is skipped; otherwise the file is read as tab-separated with no
    header line.

    Args:
        filepath: Path of the trace file to read.

    Returns:
        DataFrame with exactly the columns 'Address', 'Set', 'Tag', 'PC',
        'Cycle', 'Type', 'ReuseDist', 'Way' (any additional trailing columns
        in the file are discarded).

    Raises:
        ValueError: If the file contains fewer columns than expected.
    """
    column_names = ['Address', 'Set', 'Tag', 'PC', 'Cycle', 'Type', 'ReuseDist', 'Way']

    # Peek at the first line only, to pick the delimiter/header handling.
    with open(filepath, 'r') as f:
        first_line = f.readline()

    if first_line.startswith('Address,Set,Tag'):
        df = pd.read_csv(filepath, sep=',', skiprows=1, header=None)
    else:
        df = pd.read_csv(filepath, sep='\t', header=None)

    # Fail with an actionable message instead of pandas' generic
    # "Length mismatch" error when renaming the columns below.
    if df.shape[1] < len(column_names):
        raise ValueError(
            f"{filepath}: expected at least {len(column_names)} columns "
            f"({', '.join(column_names)}), found {df.shape[1]}"
        )

    # Keep only the leading, known columns and give them their names.
    df = df.iloc[:, :len(column_names)]
    df.columns = column_names

    return df

# Read the raw trace into `df` and report its (rows, columns) shape.
print("--- Cache Replacement Model Training ---")
print("\nLoading data...")
df = load_data(DATA_PATH)
print(f"Initial shape: {df.shape}")


--- Cache Replacement Model Training ---

Loading data...
Initial shape: (1048575, 8)


In [3]:
print("--- Dataset Before Preprocessing ---")

# Take a 10-row window around the midpoint of the frame: the 5 rows before
# the middle position and the 5 rows starting at it.
middle_index = len(df) // 2
middle_rows = df.iloc[middle_index-5:middle_index+5]

print("\n10 rows from middle of dataset:")
display(middle_rows)

--- Dataset Before Preprocessing ---

10 rows from middle of dataset:


Unnamed: 0,Address,Set,Tag,PC,Cycle,Type,ReuseDist,Way
524282,5287688.0,10,1290.0,5287688.0,3515146.0,0.0,2373.0,1.0
524283,4808835.0,6,1174.0,4808835.0,3515187.0,0.0,122015.0,3.0
524284,4808886.0,6,1174.0,4808886.0,3515189.0,0.0,5444.0,3.0
524285,4808937.0,6,1174.0,4808937.0,3515190.0,0.0,5444.0,3.0
524286,4809009.0,6,1174.0,4809009.0,3515191.0,0.0,5444.0,3.0
524287,4808472.0,5,1173.0,4808472.0,3515192.0,0.0,160.0,1.0
524288,4581165.0,14,1118.0,4581165.0,3515194.0,0.0,160.0,0.0
524289,4581226.0,14,1118.0,4581226.0,3515196.0,0.0,160.0,0.0
524290,4581075.0,14,1118.0,4581075.0,3515198.0,0.0,225659.0,0.0
524291,4581120.0,14,1118.0,4581120.0,3515199.0,0.0,160.0,0.0


In [4]:
# Preprocess data function and execution
def preprocess_data(df):
    """Clean the raw trace and add the engineered features used for training.

    The input frame is not modified; all work happens on a new frame.

    Args:
        df: DataFrame containing at least the 'Address', 'Set', 'PC' and
            'ReuseDist' columns. Every column is coerced to a numeric dtype.

    Returns:
        A new DataFrame holding only the rows that are fully numeric and have
        a non-zero 'Address', with three additional columns:
        'Address_diff' and 'PC_diff' (difference to the previous row of the
        same 'Set', 0 for the first row of each set) and 'ReuseDist_log'
        (log1p of 'ReuseDist' with negative values clipped to 0).
    """
    # Coerce every column into a *new* frame rather than assigning column by
    # column into the caller's object; unparseable values become NaN.
    df = df.apply(pd.to_numeric, errors='coerce')
    df = df.dropna()

    # Basic filtering. The explicit copy guarantees the column assignments
    # below target an independent frame and not a slice of the filtered one.
    df = df[df['Address'] != 0].copy()

    # Only essential engineered features
    df['Address_diff'] = df.groupby('Set')['Address'].diff().fillna(0)
    df['PC_diff'] = df.groupby('Set')['PC'].diff().fillna(0)
    df['ReuseDist_log'] = np.log1p(df['ReuseDist'].clip(lower=0))

    return df

# NOTE: `df` is rebound to the preprocessed frame here, so the raw frame
# loaded earlier is no longer reachable and re-running this cell would
# preprocess already-preprocessed data.
print("\nPreprocessing data...")
df = preprocess_data(df)
print(f"After preprocessing: {df.shape}")

# Stop early if cleaning removed every row.
if df.empty:
    raise ValueError("No data remaining after preprocessing!")




Preprocessing data...
After preprocessing: (1012171, 11)


In [5]:
    
print("\n--- Dataset After Preprocessing ---")
# Same 10-row window around the midpoint, now taken from the preprocessed frame.
middle_index = len(df) // 2
middle_rows = df.iloc[middle_index-5:middle_index+5]


print("\n10 rows from middle of dataset:")
display(middle_rows)


--- Dataset After Preprocessing ---

10 rows from middle of dataset:


Unnamed: 0,Address,Set,Tag,PC,Cycle,Type,ReuseDist,Way,Address_diff,PC_diff,ReuseDist_log
518431,4764705.0,11,1163.0,4764705.0,3376056.0,0.0,43028.0,1.0,25.0,25.0,10.66963
518432,4764594.0,11,1163.0,4764594.0,3376085.0,0.0,43056.0,1.0,-111.0,-111.0,10.67028
518433,4810408.0,6,1174.0,4810408.0,3376087.0,0.0,43056.0,3.0,8.0,8.0,10.67028
518434,4808817.0,6,1174.0,4808817.0,3376088.0,0.0,33080.0,3.0,-1591.0,-1591.0,10.406714
518435,5287552.0,10,1290.0,5287552.0,3376089.0,0.0,12186.0,1.0,524560.0,524560.0,9.408125
518436,5287586.0,10,1290.0,5287586.0,3376091.0,0.0,12186.0,1.0,34.0,34.0,9.408125
518437,5287642.0,10,1290.0,5287642.0,3376093.0,0.0,12186.0,1.0,56.0,56.0,9.408125
518438,5287902.0,10,1290.0,5287902.0,3376094.0,0.0,12186.0,1.0,260.0,260.0,9.408125
518439,5290752.0,11,1291.0,5290752.0,3376095.0,0.0,12186.0,3.0,526158.0,526158.0,9.408125
518440,5290916.0,11,1291.0,5290916.0,3376128.0,0.0,12405.0,3.0,164.0,164.0,9.425936


In [6]:
#  Filter rare classes function and execution

def filter_rare_classes(df, target_col='Way', min_samples=5):
    """Keep only the rows whose class appears at least ``min_samples`` times.

    Args:
        df: DataFrame containing the target column.
        target_col: Name of the column holding the class labels.
        min_samples: Minimum number of occurrences a class needs to be kept.

    Returns:
        The subset of ``df`` (original index preserved) whose ``target_col``
        value belongs to a sufficiently frequent class.
    """
    class_sizes = df[target_col].value_counts()
    frequent_classes = class_sizes.loc[class_sizes >= min_samples].index
    keep_mask = df[target_col].isin(frequent_classes)
    return df[keep_mask]

# Drop classes of 'Way' below the default min_samples threshold and show the
# resulting per-class counts.
print("\nFiltering rare classes...")
df_filtered = filter_rare_classes(df)
print(f"After filtering: {df_filtered.shape}")
print("Class distribution:")
print(df_filtered['Way'].value_counts())

# Stop early if filtering removed every row.
if df_filtered.empty:
    raise ValueError("No data remaining after filtering!")


Filtering rare classes...
After filtering: (1012165, 11)
Class distribution:
Way
0.0     530591
1.0     154488
3.0     103028
2.0      96945
4.0      31711
5.0      31612
6.0      31497
7.0      28579
8.0       1111
9.0       1071
10.0       812
11.0       707
13.0         8
12.0         5
Name: count, dtype: int64


In [7]:
# Prepare features and target: 'Way' is the label, every other column of the
# filtered frame is used as an input feature.
X = df_filtered.drop('Way', axis=1)
y = df_filtered['Way']

In [8]:
# Feature scaling: NaN and +/-inf are replaced by 0 before standardising.
# NOTE(review): the scaler is fit on all rows, before the train/test split
# done in a later cell, so test rows contribute to the scaling statistics.
print("\nScaling features...")
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X.fillna(0).replace([np.inf, -np.inf], 0))
# Re-wrap the scaled array in a DataFrame to restore the column names; the
# new frame gets a default index rather than X's index.
X_scaled = pd.DataFrame(X_scaled, columns=X.columns)


Scaling features...


In [9]:
#  Feature selection function and execution
def feature_selection(X, y, k=15):
    """Pick the ``k`` highest-scoring features according to the ANOVA F-value.

    Args:
        X: Feature DataFrame.
        y: Target labels, one per row of ``X``.
        k: Number of features to keep; capped at the number of columns of ``X``.

    Returns:
        A 3-tuple ``(selected_frame, selector, kept_columns)``: the selected
        feature values in a new DataFrame whose columns are named
        ``f0, f1, ...``, the fitted ``SelectKBest`` instance, and the original
        column names of the retained features.
    """
    n_keep = min(k, X.shape[1])
    selector = SelectKBest(score_func=f_classif, k=n_keep)
    selected_values = selector.fit_transform(X, y)

    # Positional f0, f1, ... names are used for ONNX compatibility.
    onnx_names = [f'f{idx}' for idx in range(selected_values.shape[1])]
    selected_frame = pd.DataFrame(selected_values, columns=onnx_names)
    kept_columns = X.columns[selector.get_support()]
    return selected_frame, selector, kept_columns

# Run feature selection with the default k on the scaled features.
# NOTE(review): like the scaler, the selector is fit on all rows before the
# train/test split performed in a later cell.
print("\nSelecting best features...")
X_selected, selector, original_features = feature_selection(X_scaled, y)
print(f"Selected {len(original_features)} features: {original_features.tolist()}")


Selecting best features...
Selected 10 features: ['Address', 'Set', 'Tag', 'PC', 'Cycle', 'Type', 'ReuseDist', 'Address_diff', 'PC_diff', 'ReuseDist_log']


In [10]:
# Train/test split: hold out 20% of the rows, stratified on the target, with
# a fixed seed so the split is reproducible.
print("\nSplitting data...")
X_train, X_test, y_train, y_test = train_test_split(
    X_selected, y, 
    test_size=0.2, 
    random_state=42, 
    stratify=y
)


Splitting data...


In [11]:
# Train model function and execution
def train_model(X_train, y_train):
    """Fit an XGBoost multi-class classifier on the training split.

    The targets are label-encoded before fitting; the fitted encoder is
    attached to the returned model as ``label_encoder_`` so predictions can
    be mapped back to the original labels.

    Args:
        X_train: Training features.
        y_train: Training labels.

    Returns:
        The fitted ``XGBClassifier`` with an extra ``label_encoder_`` attribute.
    """
    encoder = LabelEncoder()
    encoded_targets = encoder.fit_transform(y_train)

    hyperparams = dict(
        objective='multi:softmax',
        num_class=len(encoder.classes_),
        n_estimators=300,
        max_depth=10,
        learning_rate=0.05,
        subsample=0.8,
        colsample_bytree=0.8,
        gamma=0.5,
        reg_alpha=0.1,
        reg_lambda=1.0,
        random_state=42,
        n_jobs=-1,
        tree_method='hist',
    )
    classifier = XGBClassifier(**hyperparams)

    classifier.fit(X_train, encoded_targets)
    # Keep the encoder next to the model for decoding predictions later.
    classifier.label_encoder_ = encoder
    return classifier

# Fit the XGBoost classifier on the training split.
print("\nTraining XGBoost model...")
model = train_model(X_train, y_train)


Training XGBoost model...


In [12]:
# Evaluate the XGBoost model on the held-out test split.
print("\nEvaluating model...")
# The model predicts encoded class ids; map them back to the original 'Way'
# labels with the encoder attached by train_model().
y_pred = model.label_encoder_.inverse_transform(model.predict(X_test))
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy of the model is: {accuracy:.4f}")
print("\nClassification Report:")
print(classification_report(y_test, y_pred))


Evaluating model...
Accuracy of the model is: 0.8863

Classification Report:
              precision    recall  f1-score   support

         0.0       0.90      0.98      0.94    106118
         1.0       0.93      0.90      0.91     30898
         2.0       0.88      0.83      0.86     19389
         3.0       0.89      0.84      0.87     20606
         4.0       0.72      0.58      0.65      6342
         5.0       0.74      0.59      0.66      6322
         6.0       0.75      0.59      0.66      6300
         7.0       0.71      0.57      0.63      5716
         8.0       0.67      0.53      0.59       222
         9.0       0.70      0.51      0.59       214
        10.0       0.58      0.50      0.54       162
        11.0       0.54      0.55      0.54       141
        12.0       0.00      0.00      0.00         1
        13.0       0.00      0.00      0.00         2

    accuracy                           0.89    202433
   macro avg       0.64      0.57      0.60    202433
we

In [14]:
from sklearn.ensemble import RandomForestClassifier
# Train model function and execution - Random Forest Version
def train_rf_model(X_train, y_train):
    """Fit a Random Forest classifier on the training split.

    The targets are label-encoded before fitting; the fitted encoder is
    attached to the returned model as ``label_encoder_`` so predictions can
    be mapped back to the original labels.

    Args:
        X_train: Training features.
        y_train: Training labels.

    Returns:
        The fitted ``RandomForestClassifier`` with an extra ``label_encoder_``
        attribute.
    """
    encoder = LabelEncoder()
    encoded_targets = encoder.fit_transform(y_train)

    forest_params = dict(
        n_estimators=200,
        max_depth=15,
        min_samples_split=5,
        min_samples_leaf=2,
        random_state=42,
        n_jobs=-1,
        # 'balanced' re-weights classes to compensate for class imbalance.
        class_weight='balanced',
        bootstrap=True,
        # Also compute the optional out-of-bag score estimate while fitting.
        oob_score=True,
    )
    forest = RandomForestClassifier(**forest_params)

    forest.fit(X_train, encoded_targets)
    # Keep the encoder next to the model for decoding predictions later.
    forest.label_encoder_ = encoder
    return forest

# Fit the Random Forest on the same training split used for XGBoost.
print("\nTraining Random Forest model...")
rf_model = train_rf_model(X_train, y_train)

# Evaluate the Random Forest on the held-out test split.
# NOTE: `y_pred` and `accuracy` are rebound here, replacing the values
# computed in the XGBoost evaluation cell.
print("\nEvaluating Random Forest model...")
y_pred = rf_model.label_encoder_.inverse_transform(rf_model.predict(X_test))
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy of the model is: {accuracy:.4f}")
print("\nClassification Report:")
print(classification_report(y_test, y_pred))




Training Random Forest model...

Evaluating Random Forest model...
Accuracy of the model is: 0.7356

Classification Report:
              precision    recall  f1-score   support

         0.0       0.99      0.80      0.89    106118
         1.0       0.94      0.68      0.79     30898
         2.0       0.60      0.67      0.63     19389
         3.0       0.74      0.65      0.69     20606
         4.0       0.27      0.64      0.38      6342
         5.0       0.31      0.66      0.42      6322
         6.0       0.37      0.64      0.47      6300
         7.0       0.29      0.62      0.40      5716
         8.0       0.17      0.71      0.27       222
         9.0       0.14      0.66      0.22       214
        10.0       0.25      0.65      0.36       162
        11.0       0.26      0.75      0.38       141
        12.0       1.00      1.00      1.00         1
        13.0       0.00      0.00      0.00         2

    accuracy                           0.74    202433
   macro 

In [15]:
# Save ONNX artifacts function and execution
def save_onnx_artifacts(model, scaler, selector, original_feature_names):
    """Save the model as ONNX and the preprocessing info as JSON.

    The ONNX model is written to ``ONNX_MODEL_SAVE_PATH`` and the config to
    ``CONFIG_SAVE_PATH``.

    Args:
        model: Fitted XGBoost model carrying a ``label_encoder_`` attribute.
        scaler: Fitted scaler exposing ``mean_`` and ``scale_``.
        selector: Fitted feature selector exposing ``get_support()``, or
            ``None`` if no selection step was used.
        original_feature_names: Names of the features the model was trained
            on, in model input order (mapped to ``f0, f1, ...``).

    Raises:
        Exception: Any error raised during conversion or writing is reported
            on stdout and then re-raised unchanged.
    """
    try:
        # Convert model to ONNX
        initial_type = [('float_input', FloatTensorType([None, len(original_feature_names)]))]
        onnx_model = convert_xgboost(model, initial_types=initial_type, target_opset=12)

        # Save ONNX model
        with open(ONNX_MODEL_SAVE_PATH, "wb") as f:
            f.write(onnx_model.SerializeToString())

        # The scaler statistics cover every column the scaler was fit on,
        # while feature_mapping only covers the selected ones. Record the
        # scaler's input names and the selection mask so the two can be
        # lined up again even when selection dropped some columns.
        scaler_feature_names = getattr(scaler, 'feature_names_in_', None)
        if scaler_feature_names is not None:
            scaler_feature_names = [str(name) for name in scaler_feature_names]
        selected_feature_mask = None
        if selector is not None:
            selected_feature_mask = [bool(flag) for flag in selector.get_support()]

        # Save config with feature mapping and preprocessing info
        config = {
            'feature_mapping': {f'f{i}': name for i, name in enumerate(original_feature_names)},
            'scaler_mean': scaler.mean_.tolist(),
            'scaler_scale': scaler.scale_.tolist(),
            'scaler_feature_names': scaler_feature_names,
            'selected_feature_mask': selected_feature_mask,
            'label_encoder_classes': model.label_encoder_.classes_.tolist()
        }

        with open(CONFIG_SAVE_PATH, 'w') as f:
            json.dump(config, f, indent=4)

        print(f"Model saved to {ONNX_MODEL_SAVE_PATH}")
        print(f"Config saved to {CONFIG_SAVE_PATH}")

    except Exception as e:
        print(f"Error during ONNX conversion: {e}")
        raise

# Export the XGBoost model (`model`, not `rf_model`) together with the
# scaler statistics and feature mapping.
print("\nSaving ONNX artifacts...")
save_onnx_artifacts(model, scaler, selector, original_features)

print("\nTraining completed successfully!")


Saving ONNX artifacts...
Model saved to model.onnx
Config saved to model_config.json

Training completed successfully!
