In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

In [2]:
# --- Step 1: Import the dataset ---


# Load Excel file (make sure it's in the same folder)
file_path = "MLL_4.xlsx"
df = pd.read_excel(file_path)

In [3]:
df

Unnamed: 0,31307,31308_at,31309_r_at,31310_at,31311_at,31312_at,31313_at,31314_at,31315_at,31316_at,...,101_at,102_at,103_at,104_at,105_at,106_at,107_at,108_g_at,109_at,class
0,-135.7,-100.1,-94.6,-230,0.6,-50.4,-36.3,139.5,31.6,-32.2,...,-225.2,242.5,101.7,473.1,-59.9,217.9,275.6,-461.6,1115.5,0
1,-80.0,-23.0,-6.0,-145,491.0,290.0,-235.0,41.0,4602.0,-37.0,...,-175.0,143.0,96.0,301.0,-50.0,242.0,222.0,-330.0,2481.0,0
2,-91.0,-130.0,-27.0,-51,236.0,-163.0,-304.0,-35.0,498.0,-56.0,...,-308.0,184.0,-32.0,350.0,-11.0,837.0,174.0,-99.0,376.0,0
3,-144.0,-124.0,-26.0,-139,-88.0,34.0,-411.0,118.0,-239.0,-104.0,...,731.0,106.0,-330.0,-36.0,-190.0,999.0,255.0,-353.0,1603.0,0
4,-89.0,-25.0,-64.0,-112,452.0,183.0,107.0,233.0,38.0,-35.0,...,182.0,426.0,155.0,607.0,50.0,249.0,1635.0,-780.0,1103.0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
67,-324.0,-168.0,-49.0,312,1059.0,-24.0,-404.0,12.0,101.0,-55.0,...,-422.0,528.0,220.0,643.0,187.0,407.0,-564.0,-1736.0,346.0,2
68,-148.0,-104.0,29.0,72,465.0,162.0,-895.0,33.0,1736.0,38.0,...,128.0,94.0,66.0,556.0,63.0,200.0,120.0,-757.0,825.0,2
69,-230.0,-66.0,-69.0,377,686.0,-44.0,-123.0,7.0,310.0,-119.0,...,-230.0,257.0,71.0,581.0,64.0,35.0,829.0,-2015.0,385.0,2
70,-359.0,-52.0,-147.0,120,564.0,-52.0,-584.0,64.0,2528.0,-90.0,...,-236.0,88.0,94.0,143.0,232.0,434.0,-87.0,-2038.0,1228.0,2


In [4]:
target_col = 'class'
X_df = df.drop(columns=[target_col])   # keep as DataFrame for names
y = df[target_col]

feature_names = X_df.columns            # save names BEFORE converting
X = X_df.values                         # raw numpy matrix for model

# scale (good practice)
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# fit autoencoder-like MLPRegressor
from sklearn.neural_network import MLPRegressor
hidden_dim = 5
autoencoder = MLPRegressor(hidden_layer_sizes=(hidden_dim,),
                           max_iter=500,
                           random_state=0)
autoencoder.fit(X_scaled, X_scaled)

# compute importance and top-k names
coefs = autoencoder.coefs_[0]                       # (n_features, hidden_dim)
feature_importance = np.sum(np.abs(coefs), axis=1)  # importance per input feature

k = 20
top_features_idx = np.argsort(feature_importance)[::-1][:k]
top_features = np.array(feature_names)[top_features_idx]

print("Top 20 feature names:")
print(top_features)


Top 20 feature names:
['39139_at' '32059_at' '34874_at' '39105_at' '32363_at' '41589_at'
 '41325_at' '35755_at' '36167_at' '36309_at' '37747_at' '33256_at'
 '35299_at' '35531_at' '34332_at' '37742_at' '34892_at' '34602_at'
 '39220_at' '1038_s_at']


In [5]:
# Create a new dataset with only the top selected features + target column
selected_df = df[top_features.tolist() + [target_col]]

# Display the dataset in output
print("âœ… Dataset with selected features only:")
display(selected_df.head(10))   # show first 10 rows (you can change number)

âœ… Dataset with selected features only:


Unnamed: 0,39139_at,32059_at,34874_at,39105_at,32363_at,41589_at,41325_at,35755_at,36167_at,36309_at,...,33256_at,35299_at,35531_at,34332_at,37742_at,34892_at,34602_at,39220_at,1038_s_at,class
0,2142,303.5,442.1,577.2,140,-275.7,1540.1,995.1,4211.9,26.2,...,168.3,512,-250.8,527.6,700,1834.5,135.6,-73.3,940.2,0
1,3896,221.0,1590.0,846.0,85,-62.0,2368.0,1951.0,7373.0,-60.0,...,249.0,846,-435.0,2178.0,2899,4805.0,52.0,-260.0,2120.0,0
2,723,32.0,277.0,42.0,70,-799.0,793.0,300.0,3369.0,-47.0,...,112.0,136,-247.0,-10.0,56,2627.0,110.0,-173.0,1161.0,0
3,4153,-52.0,645.0,561.0,8,-578.0,252.0,3288.0,4567.0,-38.0,...,98.0,1300,-89.0,-15.0,1949,5771.0,-168.0,-178.0,3363.0,0
4,1958,203.0,1771.0,459.0,41,-48.0,4.0,1051.0,5070.0,-76.0,...,234.0,1499,-168.0,-81.0,2271,1853.0,-217.0,-109.0,869.0,0
5,2603,115.0,829.0,506.0,52,-129.0,1384.0,1106.0,5301.0,-61.0,...,179.0,937,-150.0,1105.0,1739,1423.0,189.0,-229.0,1517.0,0
6,1540,240.0,853.0,331.0,19,-107.0,5135.0,912.0,4862.0,-34.0,...,318.0,929,-76.0,1400.0,1480,2906.0,-54.0,-54.0,2082.0,0
7,2294,203.0,1782.0,228.0,139,-93.0,430.0,1568.0,6672.0,-5.0,...,64.0,1041,-120.0,1175.0,2046,2614.0,20.0,-109.0,1442.0,0
8,1195,273.0,746.0,-1.0,147,-14.0,464.0,1341.0,4059.0,14.0,...,349.0,1154,-184.0,130.0,1109,-159.0,-433.0,-122.0,1000.0,0
9,2898,210.0,1049.0,335.0,117,106.0,696.0,1158.0,5967.0,59.0,...,400.0,1905,-395.0,-255.0,3540,4179.0,-309.0,475.0,1646.0,0


In [6]:
display(selected_df)

Unnamed: 0,39139_at,32059_at,34874_at,39105_at,32363_at,41589_at,41325_at,35755_at,36167_at,36309_at,...,33256_at,35299_at,35531_at,34332_at,37742_at,34892_at,34602_at,39220_at,1038_s_at,class
0,2142,303.5,442.1,577.2,140,-275.7,1540.1,995.1,4211.9,26.2,...,168.3,512,-250.8,527.6,700,1834.5,135.6,-73.3,940.2,0
1,3896,221.0,1590.0,846.0,85,-62.0,2368.0,1951.0,7373.0,-60.0,...,249.0,846,-435.0,2178.0,2899,4805.0,52.0,-260.0,2120.0,0
2,723,32.0,277.0,42.0,70,-799.0,793.0,300.0,3369.0,-47.0,...,112.0,136,-247.0,-10.0,56,2627.0,110.0,-173.0,1161.0,0
3,4153,-52.0,645.0,561.0,8,-578.0,252.0,3288.0,4567.0,-38.0,...,98.0,1300,-89.0,-15.0,1949,5771.0,-168.0,-178.0,3363.0,0
4,1958,203.0,1771.0,459.0,41,-48.0,4.0,1051.0,5070.0,-76.0,...,234.0,1499,-168.0,-81.0,2271,1853.0,-217.0,-109.0,869.0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
67,956,55.0,469.0,47.0,175,-159.0,503.0,1188.0,9533.0,-143.0,...,202.0,964,-299.0,91.0,356,520.0,-373.0,-491.0,1911.0,2
68,2430,178.0,3905.0,1041.0,158,-350.0,1325.0,2991.0,9985.0,-1.0,...,91.0,780,-263.0,430.0,4161,2777.0,-529.0,-266.0,4169.0,2
69,1225,154.0,1115.0,382.0,166,-310.0,1146.0,722.0,11375.0,6.0,...,-115.0,259,-528.0,799.0,1227,358.0,87.0,32.0,1073.0,2
70,2309,118.0,1098.0,816.0,84,-320.0,853.0,2665.0,6419.0,30.0,...,173.0,405,-329.0,1326.0,2860,544.0,72.0,-508.0,1073.0,2


In [7]:
# Convert all column names to strings (important fix)
X_df.columns = X_df.columns.astype(str)

# Initialize scaler with range (-1, 1)
scaler = MinMaxScaler(feature_range=(-1, 1))

# Fit and transform
X_scaled = scaler.fit_transform(X_df)

# Convert back to DataFrame with same column names
X_scaled_df = pd.DataFrame(X_scaled, columns=X_df.columns)

# Show scaled dataset
print("âœ… Dataset scaled between (-1, 1):")
display(X_scaled_df)

âœ… Dataset scaled between (-1, 1):


Unnamed: 0,31307,31308_at,31309_r_at,31310_at,31311_at,31312_at,31313_at,31314_at,31315_at,31316_at,...,100_g_at,101_at,102_at,103_at,104_at,105_at,106_at,107_at,108_g_at,109_at
0,0.109091,0.539027,-0.548548,-0.316703,-0.718182,0.3192,0.765435,0.365149,-0.810159,0.607821,...,-0.909981,-0.270129,-0.010619,-0.440078,-0.104485,-0.102482,-0.940869,-0.438199,0.733893,-0.090334
1,0.344609,0.692155,0.186722,-0.132321,0.077922,1.0000,0.516594,0.192644,0.627300,0.581006,...,-0.517064,-0.218509,-0.362832,-0.447471,-0.407212,-0.067376,-0.934504,-0.472669,0.812837,0.732008
2,0.298097,0.479643,0.012448,0.071584,-0.336039,0.0940,0.430182,0.059545,-0.663469,0.474860,...,-1.000000,-0.355270,-0.217699,-0.613489,-0.321020,0.070922,-0.777367,-0.503537,0.951410,-0.535682
3,0.073996,0.491559,0.020747,-0.119306,-0.862013,0.4880,0.296180,0.327496,-0.895267,0.206704,...,-0.802962,0.713111,-0.493805,-1.000000,-1.000000,-0.563830,-0.734583,-0.451447,0.799040,0.203252
4,0.306554,0.688183,-0.294606,-0.060738,0.014610,0.7860,0.944897,0.528897,-0.808146,0.592179,...,-0.832153,0.148586,0.638938,-0.370947,0.131047,0.287234,-0.932655,0.436013,0.542891,-0.097862
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
67,-0.687104,0.404171,-0.170124,0.859002,1.000000,0.3720,0.304947,0.141856,-0.788331,0.480447,...,-0.369393,-0.472494,1.000000,-0.286641,0.194371,0.773050,-0.890928,-0.978135,-0.030594,-0.553749
68,0.057082,0.531281,0.477178,0.338395,0.035714,0.7440,-0.309956,0.178634,-0.274100,1.000000,...,0.104958,0.093059,-0.536283,-0.486381,0.041337,0.333333,-0.945596,-0.538264,0.556689,-0.265282
69,-0.289641,0.606753,-0.336100,1.000000,0.394481,0.3320,0.656857,0.133100,-0.722598,0.122905,...,-0.048723,-0.275064,0.040708,-0.479896,0.085312,0.336879,-0.989172,-0.082315,-0.197960,-0.530262
70,-0.835095,0.634558,-0.983402,0.442516,0.196429,0.3160,0.079524,0.232925,-0.025004,0.284916,...,-0.294698,-0.281234,-0.557522,-0.450065,-0.685136,0.932624,-0.883798,-0.671383,-0.211758,-0.022584


In [8]:
# Assuming your full dataset is in df
# Shuffle all rows randomly
df_randomized = df.sample(frac=1, random_state=42).reset_index(drop=True)

# Show first few randomized rows
print("âœ… Randomized dataset:")
display(df_randomized.head(10))


âœ… Randomized dataset:


Unnamed: 0,31307,31308_at,31309_r_at,31310_at,31311_at,31312_at,31313_at,31314_at,31315_at,31316_at,...,101_at,102_at,103_at,104_at,105_at,106_at,107_at,108_g_at,109_at,class
0,-89.0,-25.0,-64.0,-112,452.0,183.0,107.0,233.0,38.0,-35.0,...,182.0,426.0,155.0,607.0,50.0,249.0,1635.0,-780.0,1103.0,0
1,-362.0,-163.0,-40.0,283,980.0,-325.0,-429.0,210.0,611.0,-114.0,...,-935.0,420.0,27.0,1101.0,78.0,1407.0,736.0,-3042.0,723.0,2
2,-155.0,24.0,-51.0,-221,310.0,97.0,-243.0,214.0,-96.0,-59.0,...,944.0,142.0,44.0,412.0,34.0,620.0,-151.0,-474.0,1826.0,0
3,-135.7,-100.1,-94.6,-230,0.6,-50.4,-36.3,139.5,31.6,-32.2,...,-225.2,242.5,101.7,473.1,-59.9,217.9,275.6,-461.6,1115.5,0
4,-139.0,8.0,-72.0,-128,140.0,-47.0,-408.0,-25.0,-266.0,-41.0,...,-23.0,120.0,70.0,170.0,52.0,408.0,-140.0,-443.0,1064.0,1
5,-126.0,-147.0,-61.0,103,533.0,104.0,-53.0,-160.0,280.0,-67.0,...,-386.0,98.0,146.0,650.0,83.0,462.0,-326.0,-1799.0,1584.0,2
6,-195.0,-149.0,26.0,-120,329.0,136.0,-96.0,179.0,-17.0,-17.0,...,-149.0,74.0,39.0,623.0,-81.0,488.0,538.0,-842.0,1263.0,0
7,-74.0,-9.0,-33.0,-60,308.0,37.0,-149.0,-8.0,-88.0,-63.0,...,26.0,87.0,88.0,204.0,-34.0,203.0,380.0,-41.0,914.0,1
8,-103.0,-101.0,-37.0,-113,798.0,-368.0,-236.0,67.0,5787.0,-28.0,...,521.0,140.0,53.0,455.0,-29.0,1355.0,523.0,-1466.0,817.0,0
9,-389.0,-152.0,-40.0,-254,814.0,176.0,-708.0,-16.0,-399.0,-77.0,...,-402.0,59.0,26.0,836.0,215.0,108.0,31.0,-646.0,1681.0,2


In [9]:
# Convert all column names to strings
X_df.columns = X_df.columns.astype(str)

# Step 1: Scale the dataset between (-1, 1)
scaler = MinMaxScaler(feature_range=(-1, 1))
X_scaled = scaler.fit_transform(X_df)
X_scaled_df = pd.DataFrame(X_scaled, columns=X_df.columns)

# Step 2: Combine scaled features with class/target column
full_scaled_df = pd.concat([X_scaled_df, y.reset_index(drop=True)], axis=1)

# Step 3: Randomize (shuffle) the combined dataset
full_randomized_df = full_scaled_df.sample(frac=1, random_state=42).reset_index(drop=True)

# Step 4: Split again if needed
X_final = full_randomized_df.drop(columns=[y.name])
y_final = full_randomized_df[y.name]

# Show final randomized dataset
print("âœ… Randomized scaled dataset (features + class):")
display(full_randomized_df)

âœ… Randomized scaled dataset (features + class):


Unnamed: 0,31307,31308_at,31309_r_at,31310_at,31311_at,31312_at,31313_at,31314_at,31315_at,31316_at,...,101_at,102_at,103_at,104_at,105_at,106_at,107_at,108_g_at,109_at,class
0,0.306554,0.688183,-0.294606,-0.060738,0.014610,0.7860,0.944897,0.528897,-0.808146,0.592179,...,0.148586,0.638938,-0.370947,0.131047,0.287234,-0.932655,0.436013,0.542891,-0.097862,0
1,-0.847780,0.414101,-0.095436,0.796095,0.871753,-0.2300,0.273638,0.488616,-0.627929,0.150838,...,-1.000000,0.617699,-0.536965,1.000000,0.386525,-0.626832,-0.142122,-0.814037,-0.326709,2
2,0.027484,0.785501,-0.186722,-0.297180,-0.215909,0.6140,0.506575,0.495622,-0.850291,0.458101,...,0.932134,-0.366372,-0.514916,-0.211961,0.230496,-0.834676,-0.712540,0.726455,0.337549,0
3,0.109091,0.539027,-0.548548,-0.316703,-0.718182,0.3192,0.765435,0.365149,-0.810159,0.607821,...,-0.270129,-0.010619,-0.440078,-0.104485,-0.102482,-0.940869,-0.438199,0.733893,-0.090334,0
4,0.095137,0.753724,-0.360996,-0.095445,-0.491883,0.3260,0.299937,0.077058,-0.903758,0.558659,...,-0.062211,-0.444248,-0.481193,-0.637643,0.294326,-0.890664,-0.705466,0.745051,-0.121349,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
67,0.251586,0.932473,0.410788,0.180043,-0.462662,0.7360,0.546650,0.222417,-0.831106,0.513966,...,0.321337,-0.415929,-0.465629,-0.563764,0.262411,-0.973590,-0.572347,0.800840,-0.486299,0
68,0.128964,0.624628,-0.452282,-0.067245,-0.574675,0.3580,0.469004,0.007005,-0.894638,0.715084,...,0.204113,-0.015929,-0.534371,-0.271768,0.021277,-0.954311,0.272026,0.705459,-0.330924,0
69,-0.699789,-1.000000,0.128631,0.360087,0.884740,-0.3540,-0.793363,0.623468,-0.430099,-1.000000,...,-0.580463,-0.589381,-0.815824,0.751979,0.081560,0.469695,-0.722830,-1.000000,-1.000000,2
70,1.000000,0.024826,0.452282,-1.000000,-0.441558,-1.0000,-0.206011,-1.000000,-0.818525,-0.234637,...,-0.388175,-0.688496,-0.367056,-0.217238,-0.131206,-0.989172,-0.566559,0.364127,0.164709,0


In [10]:
# Step 1: Split into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(
    X_final, y_final, test_size=0.7, random_state=42
)

# Step 2: Initialize and train the Random Forest model
model = RandomForestClassifier(random_state=42)
model.fit(X_train, y_train)

# Step 3: Make predictions
y_pred = model.predict(X_test)

# Step 4: Calculate accuracy
accuracy = accuracy_score(y_test, y_pred)

# Step 5: Show result
print("âœ… Model training completed.")
print(f"ðŸŽ¯ Accuracy on test set: {accuracy:.4f}")

âœ… Model training completed.
ðŸŽ¯ Accuracy on test set: 0.9216
