In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [2]:
# Load the MLL_4.xlsx workbook from the local project folder into a DataFrame.
df = pd.read_excel(io=r"C:\Users\HP\Project\MLL_4.xlsx")

In [3]:
# Preview the loaded frame; as the cell's last expression it is rendered by the notebook.
df

Unnamed: 0,31307,31308_at,31309_r_at,31310_at,31311_at,31312_at,31313_at,31314_at,31315_at,31316_at,...,101_at,102_at,103_at,104_at,105_at,106_at,107_at,108_g_at,109_at,class
0,-135.7,-100.1,-94.6,-230,0.6,-50.4,-36.3,139.5,31.6,-32.2,...,-225.2,242.5,101.7,473.1,-59.9,217.9,275.6,-461.6,1115.5,0
1,-80.0,-23.0,-6.0,-145,491.0,290.0,-235.0,41.0,4602.0,-37.0,...,-175.0,143.0,96.0,301.0,-50.0,242.0,222.0,-330.0,2481.0,0
2,-91.0,-130.0,-27.0,-51,236.0,-163.0,-304.0,-35.0,498.0,-56.0,...,-308.0,184.0,-32.0,350.0,-11.0,837.0,174.0,-99.0,376.0,0
3,-144.0,-124.0,-26.0,-139,-88.0,34.0,-411.0,118.0,-239.0,-104.0,...,731.0,106.0,-330.0,-36.0,-190.0,999.0,255.0,-353.0,1603.0,0
4,-89.0,-25.0,-64.0,-112,452.0,183.0,107.0,233.0,38.0,-35.0,...,182.0,426.0,155.0,607.0,50.0,249.0,1635.0,-780.0,1103.0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
67,-324.0,-168.0,-49.0,312,1059.0,-24.0,-404.0,12.0,101.0,-55.0,...,-422.0,528.0,220.0,643.0,187.0,407.0,-564.0,-1736.0,346.0,2
68,-148.0,-104.0,29.0,72,465.0,162.0,-895.0,33.0,1736.0,38.0,...,128.0,94.0,66.0,556.0,63.0,200.0,120.0,-757.0,825.0,2
69,-230.0,-66.0,-69.0,377,686.0,-44.0,-123.0,7.0,310.0,-119.0,...,-230.0,257.0,71.0,581.0,64.0,35.0,829.0,-2015.0,385.0,2
70,-359.0,-52.0,-147.0,120,564.0,-52.0,-584.0,64.0,2528.0,-90.0,...,-236.0,88.0,94.0,143.0,232.0,434.0,-87.0,-2038.0,1228.0,2


In [4]:
# Print a summary of the frame: index range, column count, dtypes and memory usage.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 72 entries, 0 to 71
Columns: 12534 entries, 31307 to class
dtypes: float64(11224), int64(1310)
memory usage: 6.9 MB


In [5]:
from sklearn.preprocessing import StandardScaler
from sklearn.neural_network import MLPRegressor

In [6]:
# -----------------------------
# Split the frame into the label vector and the feature matrix
# -----------------------------
# The label column is pulled out first; everything else becomes a float feature.
y = df['class']
X = df.drop('class', axis=1).astype(float)
# Column labels are forced to strings so every feature name has one uniform type.
X = X.set_axis(X.columns.astype(str), axis=1)

In [7]:
# -----------------------------
# Standardise every feature before training the autoencoder
# -----------------------------
# Fit the scaler on the full feature matrix, then apply it to that same matrix.
scaler_all = StandardScaler().fit(X)
X_scaled_all = scaler_all.transform(X)

In [8]:
# -----------------------------
# Autoencoder used for feature selection
# -----------------------------
# A single-hidden-layer MLP regressor is trained to reconstruct its own input,
# so the (features -> hidden_dim) weights can later be inspected per feature.
hidden_dim = 5
autoencoder = MLPRegressor(
    random_state=0,
    max_iter=100,
    hidden_layer_sizes=(hidden_dim,),
)
# Input and target are the same scaled matrix (reconstruction objective).
autoencoder.fit(X=X_scaled_all, y=X_scaled_all)



In [9]:
# -----------------------------
# Score each input feature by its first-layer weights
# -----------------------------
# coefs_[0] holds the input-to-hidden weight matrix (one row per input feature).
coefs = autoencoder.coefs_[0]
# Importance = sum of absolute outgoing weights of each input feature.
feature_importance = np.abs(coefs).sum(axis=1)

In [10]:
# -----------------------------
# Select the k highest-scoring features
# -----------------------------
# k controls how many features are kept; change it here only.
k = 20
# argsort is ascending, so reverse it and keep the first k positions.
top_features_idx = np.argsort(feature_importance)[::-1][:k]
top_features = X.columns[top_features_idx]

# Report the number actually selected instead of a hard-coded "20", so the
# header stays correct when k changes (or exceeds the number of features).
print(f"Top {len(top_features)} features:")
print(top_features)

Top 20 features:
Index(['39139_at', '32059_at', '35755_at', '41589_at', '34874_at', '36167_at',
       '32363_at', '39105_at', '36309_at', '33256_at', '35531_at', '37747_at',
       '34892_at', '41325_at', '35299_at', '37345_at', '34332_at', '37742_at',
       '39220_at', '31551_at'],
      dtype='object')


In [11]:
# -----------------------------
# Shuffle the rows and normalise only the selected features
# -----------------------------
# Select from X rather than df: top_features holds the *stringified* column
# names taken from X.columns, so indexing the original df would raise KeyError
# for any selected column whose original label is not a string.
df_selected = X[top_features].copy()
df_selected['class'] = y
# Shuffle all rows with a fixed seed and renumber the index from 0.
df_shuffled = df_selected.sample(frac=1, random_state=42).reset_index(drop=True)

X_top = df_shuffled[top_features].astype(float)
y_top = df_shuffled['class']

# NOTE(review): this scaler is fitted on every row, i.e. before any
# train/test split is made further down the notebook.
scaler_top = StandardScaler()
X_top_scaled = scaler_top.fit_transform(X_top)

In [12]:
# -----------------------------
# Optional: reassemble scaled features and labels into one frame
# -----------------------------
# The label column is attached by position (raw values), not by index alignment.
df_final = pd.DataFrame(X_top_scaled, columns=top_features).assign(
    **{'class': y_top.values}
)

# -----------------------------
# Objects available for modelling from here on:
#   X_top_scaled -> standardised selected features
#   y_top        -> target labels (same row order as X_top_scaled)
#   df_final     -> shuffled + standardised frame including the labels

In [13]:
# Preview the shuffled, scaled top-feature frame together with its class column.
df_final

Unnamed: 0,39139_at,32059_at,35755_at,41589_at,34874_at,36167_at,32363_at,39105_at,36309_at,33256_at,...,37747_at,34892_at,41325_at,35299_at,37345_at,34332_at,37742_at,39220_at,31551_at,class
0,-0.444764,-0.094886,-0.750208,0.770023,0.162103,-0.759744,-0.321952,-0.080934,-1.039196,0.321397,...,-0.929786,0.027972,-0.824832,0.680110,-0.739954,-0.954860,0.128414,-0.004838,1.105267,0
1,0.609420,-0.098337,0.854900,-0.158633,0.162103,0.646933,-0.176745,0.777468,-1.062000,0.555860,...,0.325872,-0.308111,-0.194581,0.390439,-1.108465,0.350370,-0.358807,0.989254,-2.612549,2
2,-0.304280,-0.653989,0.384451,0.240213,-0.856850,-0.315248,-0.342996,-0.340037,-0.104262,-0.313609,...,0.855334,0.655101,-0.321586,-0.046751,0.558061,-0.875851,0.126616,-0.466935,0.282410,0
3,-0.342189,0.251966,-0.793532,0.092283,-0.992276,-1.002225,-0.113611,0.152852,1.291299,-0.320447,...,-0.108569,0.017557,0.642030,-0.643529,0.461491,0.006839,-0.813067,0.133791,-0.268904,0
4,0.103791,-0.947345,0.775846,-1.837358,-0.691802,0.523446,-0.193580,-0.245098,0.192181,-0.167069,...,0.948639,-0.389176,-0.176437,0.004209,-0.730083,-0.860049,-0.751341,0.725199,-0.705018,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
67,-0.307625,-0.184618,-1.312112,0.784906,-1.229510,-1.661453,-0.250400,-0.832530,-1.426852,0.428859,...,0.322351,-1.037135,0.114815,-0.784342,1.420114,-0.509249,-1.058775,0.511624,0.046774,0
68,-0.005474,0.088031,-0.192954,-2.861262,-0.670954,-0.732616,-0.214625,-0.450798,0.009755,-0.352686,...,-0.509913,-1.276953,-0.293893,0.017620,-1.129852,-0.578777,-0.561966,0.791212,0.323553,0
69,0.063653,-2.217407,0.964181,-0.408656,-1.091391,-0.730073,-0.088358,0.253329,-1.632082,-0.489456,...,0.501039,1.623380,-0.178347,-1.917549,-1.350301,-1.266156,-0.792692,-3.239520,-3.895457,2
70,-0.352781,-0.022409,0.348024,-1.152772,-0.502431,-0.073925,-0.532397,-0.462666,-1.267229,0.077164,...,-0.110286,-1.217280,-0.545039,-0.695831,-0.665923,-0.450782,-0.307268,-0.816421,0.435761,0


In [14]:
# List the distinct labels present in the 'class' column.
unique_values = pd.unique(df['class'])

print(unique_values)

[0 1 2]


In [15]:
# Per-column count of missing (NaN) cells.
empty_counts = df.isnull().sum()

# Header and counts are emitted on separate lines.
print("Number of empty cells in each column:", empty_counts, sep="\n")

Number of empty cells in each column:
31307         0
31308_at      0
31309_r_at    0
31310_at      0
31311_at      0
             ..
106_at        0
107_at        0
108_g_at      0
109_at        0
class         0
Length: 12534, dtype: int64


In [16]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

In [17]:
# -----------------------------
# 1️⃣ Train-test split
# -----------------------------
# Split the *unscaled* selected features (80/20, stratified by class) so that
# the scaler fitted below never sees the held-out rows. Fitting the scaler on
# all rows before splitting leaks test-set statistics into training.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X_top, y_top, test_size=0.2, random_state=42, stratify=y_top
)

# Fit scaling statistics on the training rows only, then apply to both parts.
split_scaler = StandardScaler().fit(X_train_raw)
X_train = split_scaler.transform(X_train_raw)
X_test = split_scaler.transform(X_test_raw)

# NOTE(review): the top features themselves were chosen by an autoencoder
# fitted on all rows (including the rows held out here) — confirm whether
# feature selection should also be restricted to the training rows.

# -----------------------------
# 2️⃣ Define Random Forest model
# -----------------------------
rf_model = RandomForestClassifier(
    n_estimators=100,       # number of trees
    max_depth=None,         # grow trees fully
    random_state=42         # fixed seed for reproducible forests
)

# -----------------------------
# 3️⃣ Train the model
# -----------------------------
rf_model.fit(X_train, y_train)

# -----------------------------
# 4️⃣ Make predictions
# -----------------------------
y_pred = rf_model.predict(X_test)

# -----------------------------
# 5️⃣ Evaluate the model
# -----------------------------
accuracy = accuracy_score(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)
class_report = classification_report(y_test, y_pred)

print(f"Accuracy: {accuracy:.4f}")
print("Confusion Matrix:")
print(conf_matrix)
print("Classification Report:")
print(class_report)

Accuracy: 0.5333
Confusion Matrix:
[[3 1 1]
 [1 2 1]
 [2 1 3]]
Classification Report:
              precision    recall  f1-score   support

           0       0.50      0.60      0.55         5
           1       0.50      0.50      0.50         4
           2       0.60      0.50      0.55         6

    accuracy                           0.53        15
   macro avg       0.53      0.53      0.53        15
weighted avg       0.54      0.53      0.53        15



In [18]:
from sklearn.metrics import confusion_matrix, precision_score, recall_score

# -----------------------------
# Confusion Matrix
# -----------------------------
# Rows are true classes, columns are predicted classes.
cm = confusion_matrix(y_test, y_pred)
print("Confusion Matrix:\n", cm)

# -----------------------------
# Precision and Recall (per class)
# -----------------------------
# average=None returns one score per class instead of a single aggregate.
precision = precision_score(y_test, y_pred, average=None)
recall = recall_score(y_test, y_pred, average=None)

print("\nPrecision per class:", precision)
print("Recall (Sensitivity) per class:", recall)

# -----------------------------
# Specificity and Negative Predictive Value (NPV)
# -----------------------------
specificity = []
npv = []
# Take the class count from the matrix itself: confusion_matrix covers every
# label seen in y_test OR y_pred, which can exceed the labels in y_test alone.
n_classes = cm.shape[0]

for i in range(n_classes):
    # One-vs-rest counts for class i.
    TP = cm[i, i]
    FP = cm[:, i].sum() - TP
    FN = cm[i, :].sum() - TP
    TN = cm.sum() - (TP + FP + FN)

    # Guard against empty denominators; store plain floats so the printed
    # lists are readable and uniformly typed.
    spec = float(TN / (TN + FP)) if (TN + FP) != 0 else 0.0
    npv_val = float(TN / (TN + FN)) if (TN + FN) != 0 else 0.0

    specificity.append(spec)
    npv.append(npv_val)

print("Specificity per class:", specificity)
print("Negative Predictive Value (NPV) per class:", npv)

Confusion Matrix:
 [[3 1 1]
 [1 2 1]
 [2 1 3]]

Precision per class: [0.5 0.5 0.6]
Recall (Sensitivity) per class: [0.6 0.5 0.5]
Specificity per class: [np.float64(0.7), np.float64(0.8181818181818182), np.float64(0.7777777777777778)]
Negative Predictive Value (NPV) per class: [np.float64(0.7777777777777778), np.float64(0.8181818181818182), np.float64(0.7)]


In [19]:
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

In [20]:
# -----------------------------
# 1️⃣ Define RandomForest and parameter grid
# -----------------------------
# Seed the forest so every candidate fit, the selected parameters and the
# refitted best model are reproducible across runs.
rf = RandomForestClassifier(random_state=42)

# 3 * 3 * 2 * 3 = 54 parameter combinations.
param_grid = {
    'n_estimators': [100, 300, 500],
    'max_depth': [None, 10, 20],
    'max_features': ['sqrt', 'log2'],
    'min_samples_split': [2, 5, 10],
}

# -----------------------------
# 2️⃣ Grid Search with 5-fold cross-validation
# -----------------------------
grid_search = GridSearchCV(
    estimator=rf,
    param_grid=param_grid,
    scoring='accuracy',
    cv=5,
    n_jobs=-1,  # use all cores
    verbose=2
)

# Only the training split is used for model selection.
grid_search.fit(X_train, y_train)

print("Best Parameters:", grid_search.best_params_)
print("Best Cross-Validation Accuracy:", grid_search.best_score_)

# -----------------------------
# 3️⃣ Evaluate on test set
# -----------------------------
# best_estimator_ is the best configuration as refitted by GridSearchCV.
best_rf = grid_search.best_estimator_
y_pred_best = best_rf.predict(X_test)

test_acc = accuracy_score(y_test, y_pred_best)
print("\nTest Accuracy with Best RF:", test_acc)
print("\nConfusion Matrix:")
print(confusion_matrix(y_test, y_pred_best))
print("\nClassification Report:")
print(classification_report(y_test, y_pred_best))

Fitting 5 folds for each of 54 candidates, totalling 270 fits
Best Parameters: {'max_depth': 10, 'max_features': 'sqrt', 'min_samples_split': 10, 'n_estimators': 100}
Best Cross-Validation Accuracy: 0.6303030303030303

Test Accuracy with Best RF: 0.6666666666666666

Confusion Matrix:
[[4 1 0]
 [0 3 1]
 [2 1 3]]

Classification Report:
              precision    recall  f1-score   support

           0       0.67      0.80      0.73         5
           1       0.60      0.75      0.67         4
           2       0.75      0.50      0.60         6

    accuracy                           0.67        15
   macro avg       0.67      0.68      0.66        15
weighted avg       0.68      0.67      0.66        15

