# Loading and Analyzing Dataset

In [2]:
import pandas as pd
# Read the RTL analysis dataset from the working directory and preview it.
df = pd.read_csv("RTL Analysis.csv")
df.head()

Unnamed: 0,num_wires,num_cells,num_memories,has_warnings,has_errors,RTL Code,num_lines,sample_id
0,317,434,0,True,False,module axi_protocol_converter_v2_1_b2s_simple_...,73,0
1,252,365,0,False,False,module altera_avalon_sc_fifo\n#(\n // -----...,895,4
2,32,8,0,False,False,module axi_protocol_converter_v2_1_r_axi3_conv...,153,5
3,34,64,0,False,False,"module e0 (x, y);\n\n\tinput [31:0] x;\n\toutp...",8,6
4,34,64,0,False,False,"module e1 (x, y);\n\n\tinput [31:0] x;\n\toutp...",8,7


In [3]:
# Summary statistics (count, mean, std, min, quartiles, max) per numeric column.
df.describe()

Unnamed: 0,num_wires,num_cells,num_memories,num_lines,sample_id
count,177.0,177.0,177.0,177.0,177.0
mean,600.033898,783.672316,0.0,94.237288,485.090395
std,2196.051824,2816.281209,0.0,119.453967,310.660006
min,2.0,1.0,0.0,7.0,0.0
25%,8.0,3.0,0.0,22.0,155.0
50%,43.0,61.0,0.0,55.0,500.0
75%,240.0,331.0,0.0,118.0,755.0
max,19541.0,20385.0,0.0,895.0,997.0


# Feature Engineering

Derive additional features from the RTL source text: the number of `input`/`output` declarations, `assign` statements, `always` blocks, and `+`/`*` operators.

In [76]:
import re

def extract_rtl_features(rtl_code):
    """Count simple structural keywords and operators in RTL source text.

    Parameters
    ----------
    rtl_code : str
        RTL source text. Any non-string value (for example ``None`` or the
        float NaN that pandas produces for an empty CSV cell) is treated as
        empty source, so every count is 0 instead of ``re`` raising
        ``TypeError``.

    Returns
    -------
    dict
        ``num_ports``         -- whole-word occurrences of ``input`` plus
                                 ``output`` (keyword occurrences, not the
                                 number of declared signal names).
        ``num_assigns``       -- whole-word occurrences of ``assign``.
        ``num_always_blocks`` -- whole-word occurrences of ``always``.
        ``num_operators``     -- number of ``+`` and ``*`` characters.

    Notes
    -----
    Matching is purely textual: keywords and operator characters inside
    comments or string literals are counted as well.
    """
    if not isinstance(rtl_code, str):
        # Missing source: fall back to empty text so all counts are zero.
        rtl_code = ""

    def count_keyword(keyword):
        # \b anchors keep identifiers such as "input_valid" from matching.
        return len(re.findall(rf'\b{keyword}\b', rtl_code))

    return {
        'num_ports': count_keyword('input') + count_keyword('output'),
        'num_assigns': count_keyword('assign'),
        'num_always_blocks': count_keyword('always'),
        'num_operators': rtl_code.count('+') + rtl_code.count('*')
    }

# Apply feature extraction to RTL Code.
# Build the frame once from the list of per-row dicts; expanding each dict
# with .apply(pd.Series) creates a Series per row and is needlessly slow.
rtl_features = pd.DataFrame(
    df['RTL Code'].map(extract_rtl_features).tolist(),
    index=df.index,  # keep row alignment with df for the concat below
)

# Combine extracted features with existing numerical features
X = pd.concat([df[['num_wires', 'has_warnings', 'has_errors', 'num_lines']], rtl_features], axis=1)
y = df['num_cells']  # regression target
X.describe()

Unnamed: 0,num_wires,num_lines,num_ports,num_assigns,num_always_blocks,num_operators
count,177.0,177.0,177.0,177.0,177.0,177.0
mean,600.033898,94.237288,8.158192,4.502825,2.389831,25.717514
std,2196.051824,119.453967,7.413735,15.41403,3.825826,133.893357
min,2.0,7.0,2.0,0.0,0.0,0.0
25%,8.0,22.0,3.0,0.0,1.0,0.0
50%,43.0,55.0,6.0,1.0,1.0,1.0
75%,240.0,118.0,10.0,4.0,3.0,4.0
max,19541.0,895.0,47.0,189.0,30.0,1253.0


# Testing with a random forest regressor

In [77]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error

# Hold out 20% of the rows for evaluation, using a fixed seed for the split.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Fit a 100-tree random forest on the training portion.
rf_model = RandomForestRegressor(random_state=42, n_estimators=100)
rf_model.fit(X_train, y_train)

# Predict the held-out rows and report the mean absolute error.
y_pred_rf = rf_model.predict(X_test)
mae_rf = mean_absolute_error(y_test, y_pred_rf)
print("Random Forest MAE:", mae_rf)


Random Forest MAE: 455.197175925926


# Grid search to tune hyperparameters

In [None]:
from sklearn.model_selection import GridSearchCV

# Candidate random-forest settings; every combination is cross-validated.
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [None, 10, 20],
    'min_samples_split': [2, 5],
    'min_samples_leaf': [1, 2]
}

# Rank candidates by (negated) mean absolute error so that model selection
# uses the same metric the notebook reports, matching the XGBoost search.
grid_search = GridSearchCV(
    RandomForestRegressor(random_state=42),
    param_grid,
    scoring='neg_mean_absolute_error',
    cv=3,
    n_jobs=-1,  # use all CPU cores
)
grid_search.fit(X_train, y_train)

print("Best Parameters:", grid_search.best_params_)
best_rf_model = grid_search.best_estimator_


In [78]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error

# Recreate the same 80/20 split (same seed as before).
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Larger, depth-limited forest with minimum split/leaf sizes.
rf_model = RandomForestRegressor(
    n_estimators=1000,
    max_depth=20,
    min_samples_split=5,
    min_samples_leaf=2,
    random_state=42,
)
rf_model.fit(X_train, y_train)

# Predict the held-out rows and report the mean absolute error.
y_pred_rf = rf_model.predict(X_test)
mae_rf = mean_absolute_error(y_test, y_pred_rf)
print("Random Forest MAE:", mae_rf)


Random Forest MAE: 353.997062253796


In [47]:
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error

# Load dataset and extract features
df = pd.read_csv("RTL Analysis.csv")
# Build the feature frame once from the per-row dicts (faster than
# expanding each dict with .apply(pd.Series)).
rtl_features = pd.DataFrame(
    df['RTL Code'].map(extract_rtl_features).tolist(),
    index=df.index,
)
X = pd.concat([df[['num_wires', 'num_memories', 'has_warnings', 'has_errors', 'num_lines']], rtl_features], axis=1)
y = df['num_cells']

# Split data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train optimized model
best_model = RandomForestRegressor(
    n_estimators=300,
    max_depth=10,
    min_samples_split=5,
    min_samples_leaf=2,
    random_state=42
)
best_model.fit(X_train, y_train)

# Evaluate on the held-out rows
y_pred = best_model.predict(X_test)
mae = mean_absolute_error(y_test, y_pred)
print(f"Optimized MAE: {mae}")

# Training-set error, shown next to the test error to expose the gap
# between the two. Arguments follow the (y_true, y_pred) order used above.
y_pred_train = best_model.predict(X_train)
mae2 = mean_absolute_error(y_train, y_pred_train)
print(f"Train MAE: {mae2}")

Optimized MAE: 343.75448205989363
144.1455159733894


# Training with XGBRegressor

In [65]:
from xgboost import XGBRegressor

from sklearn.metrics import r2_score

# Gradient-boosted model: 200 estimators at a learning rate of 0.1.
model = XGBRegressor(learning_rate=0.1, n_estimators=200)
model.fit(X_train, y_train)

# Coefficient of determination on the held-out rows.
y_pred = model.predict(X_test)
print("R² Score:", r2_score(y_test, y_pred))

R² Score: 0.7059230097605793


In [66]:
# Pass the ground truth first, as in the other evaluation cells, and label
# the printed value so the output is self-explanatory.
mae = mean_absolute_error(y_test, y_pred)
print("XGBoost MAE:", mae)

289.979911448227


In [63]:
from xgboost import XGBRegressor
from sklearn.model_selection import GridSearchCV

# Search space: three candidate values for each of seven hyperparameters.
param_grid = dict(
    learning_rate=[0.01, 0.05, 0.1],
    max_depth=[3, 5, 7],
    n_estimators=[100, 200, 300],
    subsample=[0.6, 0.8, 1.0],
    colsample_bytree=[0.6, 0.8, 1.0],
    reg_alpha=[0, 0.1, 1],
    reg_lambda=[0, 0.1, 1],
)

# Base estimator with a fixed seed.
model = XGBRegressor(random_state=42)

# Exhaustive 5-fold search, ranked by negated MAE, run on all CPU cores.
grid_search = GridSearchCV(
    model,
    param_grid,
    cv=5,
    n_jobs=-1,
    scoring='neg_mean_absolute_error',
)

# Fit every candidate on the training split only.
grid_search.fit(X_train, y_train)

# Report the winning combination and keep the corresponding estimator.
print("Best Parameters:", grid_search.best_params_)
best_model = grid_search.best_estimator_


Best Parameters: {'colsample_bytree': 1.0, 'learning_rate': 0.05, 'max_depth': 3, 'n_estimators': 200, 'reg_alpha': 0.1, 'reg_lambda': 1, 'subsample': 0.8}


In [64]:
# Score the tuned estimator on the held-out rows.
y_pred = best_model.predict(X_test)
mae, r2 = mean_absolute_error(y_test, y_pred), r2_score(y_test, y_pred)

# Report both metrics.
print(f"Tuned MAE: {mae}")
print(f"Tuned R²: {r2}")


Tuned MAE: 291.8251991007063
Tuned R²: 0.7062976831094305


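# XGBoost improves on the Random Forest baseline

On the held-out split, XGBoost reaches an MAE of about 290 (R² ≈ 0.71), compared with about 344 for the best random forest. Grid-search tuning of XGBoost leaves the test scores essentially unchanged (MAE 291.8 vs. 290.0).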

Save the tuned model with joblib, then reload it and run predictions on the test set as a sanity check.

In [70]:
from joblib import dump


# Serialize the estimator currently bound to best_model to a file in the
# working directory.
# NOTE(review): the file name says "timing analysis", but the models in this
# notebook are fitted on y = df['num_cells'] -- confirm the intended name.
dump(best_model, "timing_analysis_model.joblib")


['timing_analysis_model.joblib']

In [72]:
from joblib import load


# Reload the model written by the previous cell.
# joblib files are pickle-based: only load files from a trusted source.
loaded_model = load("timing_analysis_model.joblib")


# Predict the held-out rows with the reloaded model; the bare expression on
# the last line displays the prediction array as the cell output.
y_pred = loaded_model.predict(X_test)
y_pred

array([7.87945604e+00, 7.87945604e+00, 1.89367266e+04, 1.12430935e+01,
       3.58062836e+02, 7.87945604e+00, 1.30532562e+02, 4.58745537e+01,
       8.96550751e+00, 1.75657715e+02, 1.62709747e+02, 8.96550751e+00,
       1.89649963e+02, 8.48980713e+01, 7.87945604e+00, 7.19091125e+02,
       4.15757141e+02, 1.00846906e+03, 8.13805618e+01, 4.70160751e+01,
       3.84978363e+02, 2.60539276e+02, 1.06296904e+04, 6.29536926e+02,
       8.96550751e+00, 7.87945604e+00, 4.86833649e+01, 3.70738037e+02,
       8.28069992e+01, 8.23051682e+01, 1.51560841e+01, 7.87945604e+00,
       1.04796623e+02, 5.24552956e+01, 8.56552277e+01, 1.90749847e+02],
      dtype=float32)