In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.neural_network import MLPRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.pipeline import Pipeline

In [2]:
# Read Excel file
df = pd.read_excel(r"C:\Users\HP\Project\SRBCT.xlsx")

In [3]:
df.shape

(85, 2309)

In [4]:
df

Unnamed: 0,21652,25725,26184,22260,22293,22493,23019,23132,24145,25584,...,504207,810448,810402,782503,795277,809383,809815,810483,503033,class
0,continuous,continuous,continuous,continuous,continuous,continuous,continuous,continuous,continuous,continuous,...,continuous,continuous,continuous,continuous,continuous,continuous,continuous,continuous,continuous,EWS BL NB RMS
1,,,,,,,,,,,...,,,,,,,,,,class
2,3.2025,0.0681,1.046,0.1243,0.4941,3.1207,3.7106,1.8416,1.2607,2.9001,...,1.6679,0.1493,0.6918,1.4151,0.2756,0.1521,0.3175,0.724,0.2044,EWS
3,1.6547,0.071,1.0409,0.052,0.2045,2.1609,2.4452,1.1473,0.7371,1.9989,...,3.6014,0.3048,1.7957,1.0701,0.2688,0.1932,0.414,1.2708,0.299,EWS
4,3.2779,0.116,0.8926,0.1014,0.2818,1.9773,3.259,1.4106,0.9548,2.0775,...,1.5152,0.2382,0.872,0.6819,0.3221,0.2156,0.3227,1.2142,0.223,EWS
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
80,0.6403,0.2515,1.3489,0.0646,0.2302,2.6957,7.8286,1.3432,2.1532,1.5941,...,0.1715,0.687,0.2249,0.6267,0.3742,0.3711,1.2332,0.8234,0.3209,NB
81,0.6729,0.3038,2.0071,0.1553,0.2691,2.1064,5.8282,2.8124,2.2321,2.5947,...,0.3845,0.645,0.4294,0.6889,1.0609,0.2984,0.8024,1.5715,0.3265,NB
82,0.8249,0.3454,1.2253,0.1277,0.2764,1.9422,10.579,1.3557,2.6605,0.9183,...,0.1166,0.8923,0.1361,0.5169,0.4861,0.1485,1.1742,0.7052,0.2799,NB
83,0.1181,0.1068,0.3881,0.0622,0.1616,0.6343,12.7753,0.7631,4.2117,1.7225,...,0.5223,0.3137,0.4451,0.0501,0.0879,0.5548,0.4537,0.5848,0.1444,BL


In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 85 entries, 0 to 84
Columns: 2309 entries, 21652 to class
dtypes: object(2309)
memory usage: 1.5+ MB


# 1️ Load & clean dataset

In [6]:
df = pd.read_excel(r"C:\Users\HP\Project\SRBCT.xlsx", sheet_name='in')
df_clean = df.iloc[2:].reset_index(drop=True)  # remove first 2 rows

In [7]:
# Ensure all column names are strings to avoid KeyError later
df_clean.columns = df_clean.columns.astype(str)

# 2️ Separate features and target

In [8]:
X = df_clean.drop(columns=['class']).astype(float)
X.columns = X.columns.astype(str)  # ensure column names are strings
y = df_clean['class']

# 3️ Scale all features for autoencoder

In [9]:
scaler_all = StandardScaler()
X_scaled_all = scaler_all.fit_transform(X)

# 4️ Folded Autoencoder for feature selection

In [10]:
hidden_dim = 5
autoencoder = MLPRegressor(hidden_layer_sizes=(hidden_dim,),
                           max_iter=100,
                           random_state=0)
autoencoder.fit(X_scaled_all, X_scaled_all)



# 5️ Compute feature importance

In [11]:
coefs = autoencoder.coefs_[0]  # weights from input layer to hidden layer
feature_importance = np.sum(np.abs(coefs), axis=1)

# 6️ Select top 20 features

In [12]:
k = 20
top_features_idx = np.argsort(feature_importance)[::-1][:k]
top_features = X.columns[top_features_idx]

print("Top 20 features:")
print(top_features)

Top 20 features:
Index(['627273', '824352', '450854', '727251', '509570', '269354-2', '376516',
       '771323', '813648', '210405', '213607', '782503', '234376', '757404',
       '45632', '882548', '814526', '26418', '789049', '276091'],
      dtype='object')


# 7️ Shuffle & normalize only top 20 features

In [13]:
df_selected = df_clean[top_features.tolist() + ['class']]
df_shuffled = df_selected.sample(frac=1, random_state=42).reset_index(drop=True)

X_top = df_shuffled[top_features].astype(float)
y_top = df_shuffled['class']

scaler_top = StandardScaler()
X_top_scaled = scaler_top.fit_transform(X_top)

# 8️ Optional: combine scaled features with target

In [14]:
df_final = pd.DataFrame(X_top_scaled, columns=top_features)
df_final['class'] = y_top.values

# -----------------------------
# Now ready for modeling
# X_top_scaled -> normalized top 20 features
# y_top -> target labels
# df_final -> shuffled + normalized dataframe

In [15]:
df_final

Unnamed: 0,627273,824352,450854,727251,509570,269354-2,376516,771323,813648,210405,...,782503,234376,757404,45632,882548,814526,26418,789049,276091,class
0,1.419337,0.560146,0.281976,-0.813901,-0.804941,-0.414494,-0.059014,-0.932691,1.096315,0.830259,...,-1.157171,-0.345998,0.178499,-0.450867,-0.985358,1.764010,-0.207523,0.773319,0.333876,BL
1,-0.126236,0.043607,-0.268236,0.196822,-0.171370,-0.679217,-0.719028,-0.340796,0.253241,0.271755,...,0.873204,1.238507,0.099101,0.702499,0.058856,-0.421290,0.619305,0.040959,-0.372655,EWS
2,-1.181128,-1.235751,-1.205742,-0.729253,-1.255163,-0.332949,-0.493817,-0.202473,-1.624020,-1.051184,...,-0.450986,-0.323018,-0.594391,0.039022,-0.035659,0.839630,-0.606795,-1.196402,-0.358515,EWS
3,-0.587069,0.155411,-0.399846,-0.824549,0.271577,0.379672,0.896928,-0.625306,1.227194,-1.303053,...,-0.069224,0.102156,0.326378,-0.530041,-0.589578,-0.586674,-0.670583,0.161283,-1.380036,NB
4,-1.079934,0.007382,-0.446809,-0.867405,0.042077,-0.669174,-0.301549,0.319009,-0.027562,-1.190889,...,0.869761,-0.569169,0.371287,0.072869,0.170414,-0.179255,-0.572194,-0.364191,-0.049864,EWS
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
78,-0.585613,0.050644,0.035859,-0.927431,0.627491,-0.988126,-0.353237,-0.862993,1.446433,-1.372327,...,-0.303671,-0.249163,3.440762,-0.476004,-1.185912,-0.845222,-0.795449,1.410497,-0.694471,EWS
79,0.306932,-0.651192,-0.627990,-0.115951,1.217670,-0.425742,-0.571916,0.352964,0.274501,-0.622819,...,-1.060458,-0.006842,1.758022,0.645692,-0.977756,-0.399141,-0.376018,0.240661,0.266100,RMS
80,0.589402,-0.721819,1.118890,0.169271,-0.834538,-0.651499,0.229530,0.265991,-0.927992,0.119225,...,2.385432,4.340286,-0.998576,1.337474,-0.407908,-0.284231,0.580792,-0.664462,2.664115,EWS
81,0.242867,1.062091,0.988150,-0.379879,-1.105172,-0.806155,-0.292745,-0.807235,-0.121015,-0.278129,...,-0.485869,-0.269606,-0.018259,0.539401,-0.316928,-0.429882,-0.711202,-0.294870,-0.371680,EWS


# 1️ Train-test split

In [16]:
X_train, X_test, y_train, y_test = train_test_split(
    X_top_scaled, y_top, test_size=0.2, stratify=y_top, random_state=42
)

# 2️ Define Random Forest model

In [17]:
rf_model = RandomForestClassifier(
    n_estimators=100,       # number of trees
    max_depth=None,         # grow trees fully
    random_state=42
)

# 3️ Train the model

In [18]:
rf_model.fit(X_train, y_train)

# 4️ Make predictions

In [19]:
y_pred = rf_model.predict(X_test)

# 5️ Evaluate the model

In [20]:
accuracy = accuracy_score(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)
class_report = classification_report(y_test, y_pred)

print(f"Accuracy: {accuracy:.10f}")
# print("Confusion Matrix:")
# print(conf_matrix)
# print("Classification Report:")
# print(class_report)

Accuracy: 0.9411764706


# Performance Evaluation

In [21]:
from sklearn.metrics import confusion_matrix, precision_score, recall_score

# -----------------------------
# Confusion Matrix
# -----------------------------
cm = confusion_matrix(y_test, y_pred)
print("Confusion Matrix:\n", cm)

# -----------------------------
# Precision and Recall (per class)
# -----------------------------
precision = precision_score(y_test, y_pred, average=None)
recall = recall_score(y_test, y_pred, average=None)

print("\nPrecision per class:", precision)
print("Recall (Sensitivity) per class:", recall)

# -----------------------------
# Specificity and Negative Predictive Value (NPV)
# -----------------------------
specificity = []
npv = []
n_classes = len(np.unique(y_test))

for i in range(n_classes):
    TP = cm[i, i]
    FP = cm[:, i].sum() - TP
    FN = cm[i, :].sum() - TP
    TN = cm.sum() - (TP + FP + FN)
    
    spec = TN / (TN + FP) if (TN + FP) != 0 else 0
    npv_val = TN / (TN + FN) if (TN + FN) != 0 else 0
    
    specificity.append(spec)
    npv.append(npv_val)

print("Specificity per class:", specificity)
print("Negative Predictive Value (NPV) per class:", npv)

Confusion Matrix:
 [[2 0 0 0]
 [0 6 0 0]
 [0 1 3 0]
 [0 0 0 5]]

Precision per class: [1.         0.85714286 1.         1.        ]
Recall (Sensitivity) per class: [1.   1.   0.75 1.  ]
Specificity per class: [np.float64(1.0), np.float64(0.9090909090909091), np.float64(1.0), np.float64(1.0)]
Negative Predictive Value (NPV) per class: [np.float64(1.0), np.float64(1.0), np.float64(0.9285714285714286), np.float64(1.0)]


# 1. Fix randomness

In [27]:
SEED = 42
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y, random_state=SEED)

# 2. Wrap preprocessing + model in a Pipeline before GridSearchCV

In [23]:
pipe = Pipeline([
    ('scaler', StandardScaler()),   # if needed
    ('rf', RandomForestClassifier())
])

param_grid = {
    'rf__n_estimators': [100, 200, 300],
    'rf__max_depth': [None, 10, 20],
    'rf__max_features': ['sqrt', 'log2']
}
grid = GridSearchCV(pipe, param_grid, cv=5, scoring='accuracy')
grid.fit(X_train, y_train)

In [24]:
print("Best params:", grid.best_params_)
print("CV accuracy:", grid.best_score_)

test_acc = grid.score(X_test, y_test)
print("Test accuracy:", test_acc)

Best params: {'rf__max_depth': None, 'rf__max_features': 'sqrt', 'rf__n_estimators': 100}
CV accuracy: 1.0
Test accuracy: 1.0


In [25]:
# Create a pandas DataFrame from the cv_results_ dictionary
results_df = pd.DataFrame(grid.cv_results_)

# Sort the DataFrame by the mean test score (accuracy in this case) in descending order
results_df = results_df.sort_values(by='mean_test_score', ascending=False)

# Display the top results
# 'params' shows the hyperparameter combination
# 'mean_test_score' shows the average cross-validated accuracy
# 'rank_test_score' shows the ranking of each result
print(results_df[['params', 'mean_test_score', 'rank_test_score']])

                                               params  mean_test_score  \
0   {'rf__max_depth': None, 'rf__max_features': 's...         1.000000   
2   {'rf__max_depth': None, 'rf__max_features': 's...         1.000000   
12  {'rf__max_depth': 20, 'rf__max_features': 'sqr...         1.000000   
3   {'rf__max_depth': None, 'rf__max_features': 'l...         1.000000   
8   {'rf__max_depth': 10, 'rf__max_features': 'sqr...         1.000000   
7   {'rf__max_depth': 10, 'rf__max_features': 'sqr...         1.000000   
1   {'rf__max_depth': None, 'rf__max_features': 's...         0.985714   
6   {'rf__max_depth': 10, 'rf__max_features': 'sqr...         0.985714   
13  {'rf__max_depth': 20, 'rf__max_features': 'sqr...         0.985714   
14  {'rf__max_depth': 20, 'rf__max_features': 'sqr...         0.984615   
11  {'rf__max_depth': 10, 'rf__max_features': 'log...         0.970330   
4   {'rf__max_depth': None, 'rf__max_features': 'l...         0.956044   
5   {'rf__max_depth': None, 'rf__max_f

In [26]:
# Assuming 'grid' is your fitted GridSearchCV object
# Create a DataFrame from the cv_results_ dictionary
results_df = pd.DataFrame(grid.cv_results_)

# Sort the DataFrame by the mean test score in descending order
results_df_sorted = results_df.sort_values(by='mean_test_score', ascending=False)

# Select and rename the columns for a cleaner display
results_df_sorted = results_df_sorted[['param_rf__n_estimators', 'param_rf__max_depth', 'param_rf__max_features', 'mean_test_score', 'rank_test_score']]
results_df_sorted.columns = ['n_estimators', 'max_depth', 'max_features', 'Mean Accuracy', 'Rank']

# Display the top 5 results as a clean markdown table
print("### Top 5 GridSearchCV Results\n")
print(results_df_sorted.to_markdown(index=False))

### Top 5 GridSearchCV Results

|   n_estimators |   max_depth | max_features   |   Mean Accuracy |   Rank |
|---------------:|------------:|:---------------|----------------:|-------:|
|            100 |             | sqrt           |        1        |      1 |
|            300 |             | sqrt           |        1        |      1 |
|            100 |          20 | sqrt           |        1        |      1 |
|            100 |             | log2           |        1        |      1 |
|            300 |          10 | sqrt           |        1        |      1 |
|            200 |          10 | sqrt           |        1        |      1 |
|            200 |             | sqrt           |        0.985714 |      7 |
|            100 |          10 | sqrt           |        0.985714 |      7 |
|            200 |          20 | sqrt           |        0.985714 |      7 |
|            300 |          20 | sqrt           |        0.984615 |     10 |
|            300 |          10 | log2       