In [1]:
import pandas as pd

# Read the workbook into a DataFrame.
# The path is relative, so the file must sit in the same folder as this notebook.
file_path = "MLL_4.xlsx"
df = pd.read_excel(io=file_path)

In [2]:
# Preview the top of the table (five rows, which is also head()'s default)
print(df.head(n=5))

   31307  31308_at  31309_r_at  31310_at  31311_at  31312_at  31313_at  \
0 -135.7    -100.1       -94.6      -230       0.6     -50.4     -36.3   
1  -80.0     -23.0        -6.0      -145     491.0     290.0    -235.0   
2  -91.0    -130.0       -27.0       -51     236.0    -163.0    -304.0   
3 -144.0    -124.0       -26.0      -139     -88.0      34.0    -411.0   
4  -89.0     -25.0       -64.0      -112     452.0     183.0     107.0   

   31314_at  31315_at  31316_at  ...  101_at  102_at  103_at  104_at  105_at  \
0     139.5      31.6     -32.2  ...  -225.2   242.5   101.7   473.1   -59.9   
1      41.0    4602.0     -37.0  ...  -175.0   143.0    96.0   301.0   -50.0   
2     -35.0     498.0     -56.0  ...  -308.0   184.0   -32.0   350.0   -11.0   
3     118.0    -239.0    -104.0  ...   731.0   106.0  -330.0   -36.0  -190.0   
4     233.0      38.0     -35.0  ...   182.0   426.0   155.0   607.0    50.0   

   106_at  107_at  108_g_at  109_at  class  
0   217.9   275.6    -461.6  

In [3]:
# Structural overview: index range, column count, dtypes and memory usage.
# DataFrame.info() writes its report directly to stdout and returns None,
# so it is called bare -- wrapping it in print() adds a stray "None" line.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 72 entries, 0 to 71
Columns: 12534 entries, 31307 to class
dtypes: float64(11224), int64(1310)
memory usage: 6.9 MB
None


In [4]:
# Descriptive statistics (count, mean, std, min, quartiles, max) for the numeric columns
summary_stats = df.describe()
print(summary_stats)

            31307    31308_at  31309_r_at    31310_at     31311_at  \
count   72.000000   72.000000   72.000000   72.000000    72.000000   
mean  -188.495833  -94.098611  -31.716667  -37.708333   376.938889   
std     97.886654  149.780756   45.519842  167.468596   274.066937   
min   -398.000000 -875.000000 -149.000000 -545.000000  -173.000000   
25%   -251.000000 -152.000000  -64.000000 -130.500000   178.500000   
50%   -167.000000  -65.000000  -30.000000  -74.500000   340.500000   
75%   -115.750000  -19.500000   -8.250000   79.000000   537.750000   
max     75.000000  132.000000   92.000000  377.000000  1059.000000   

         31312_at     31313_at    31314_at     31315_at    31316_at  ...  \
count   72.000000    72.000000   72.000000    72.000000   72.000000  ...   
mean    -7.130556  -397.726389   27.729167   538.091667  -67.266667  ...   
std    190.598033   323.552802  196.632494  1230.456868   53.191909  ...   
min   -710.000000 -1446.000000 -640.000000  -572.000000 -320.0000

In [5]:
# Count missing entries per column (isna is the canonical name; isnull is its alias)
print(df.isna().sum())

31307         0
31308_at      0
31309_r_at    0
31310_at      0
31311_at      0
             ..
106_at        0
107_at        0
108_g_at      0
109_at        0
class         0
Length: 12534, dtype: int64


In [6]:
# Show the column labels of the loaded frame
column_labels = df.columns
print(column_labels)

Index([       31307,   '31308_at', '31309_r_at',   '31310_at',   '31311_at',
         '31312_at',   '31313_at',   '31314_at',   '31315_at',   '31316_at',
       ...
           '101_at',     '102_at',     '103_at',     '104_at',     '105_at',
           '106_at',     '107_at',   '108_g_at',     '109_at',      'class'],
      dtype='object', length=12534)


In [7]:
# Inspect the target column: how often each label occurs, then the distinct labels
class_labels = df['class']
print(class_labels.value_counts())
print(class_labels.unique())

class
2    28
0    24
1    20
Name: count, dtype: int64
[0 1 2]


In [8]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
import joblib

In [9]:
# End-to-end experiment: prepare the table, hold out a test set, scale the
# features, train a random forest, report metrics, and persist the fitted objects.

# Tunable settings, gathered in one place instead of repeated as magic numbers.
RANDOM_STATE = 42
TEST_SIZE = 0.5   # fraction of rows held out for evaluation

# 1) Ensure column names are strings (important for scikit-learn)
df.columns = df.columns.astype(str)

# 2) Shuffle / randomize the dataset (the split below is additionally stratified on the target)
df_shuffled = df.sample(frac=1, random_state=RANDOM_STATE).reset_index(drop=True)

# 3) Separate features and target
target_col = 'class'   # as confirmed earlier
X = df_shuffled.drop(columns=[target_col])
y = df_shuffled[target_col]

# 4) Train/test split FIRST (stratify to preserve class distribution).
#    Splitting before scaling keeps the held-out rows out of the scaler's
#    mean/variance estimates, so nothing leaks from the test set into preprocessing.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y, test_size=TEST_SIZE, random_state=RANDOM_STATE, stratify=y
)
assert len(X_train_raw) + len(X_test_raw) == len(X), "split lost or duplicated rows"

# 5) Normalize features
#    StandardScaler (zero mean, unit variance) is fitted on the training rows only and
#    then applied unchanged to the test rows. If you prefer MinMax use MinMaxScaler instead.
scaler = StandardScaler()
X_train = pd.DataFrame(
    scaler.fit_transform(X_train_raw), columns=X.columns, index=X_train_raw.index
)
X_test = pd.DataFrame(
    scaler.transform(X_test_raw), columns=X.columns, index=X_test_raw.index
)

# Whole feature matrix under the train-fitted scaler, kept under the names this
# cell has always defined in case they are used further down the notebook.
X_scaled = scaler.transform(X)  # returns numpy array
X_scaled_df = pd.DataFrame(X_scaled, columns=X.columns)

# 6) Train Random Forest
rf_model = RandomForestClassifier(n_estimators=100, random_state=RANDOM_STATE, n_jobs=-1)
rf_model.fit(X_train, y_train)

# 7) Predict & Evaluate (on the held-out rows only)
y_pred = rf_model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
cm = confusion_matrix(y_test, y_pred)
report = classification_report(y_test, y_pred, zero_division=0)

print("âœ… Accuracy:", accuracy)
print("\nðŸ“Š Confusion Matrix:\n", cm)
print("\nðŸ“ˆ Classification Report:\n", report)

# 8) Save model and scaler for later use
#    The persisted scaler now carries training-set statistics only.
joblib.dump(rf_model, "rf_model.pkl")
joblib.dump(scaler, "scaler.pkl")
print("\nSaved: rf_model.pkl and scaler.pkl")

âœ… Accuracy: 0.9722222222222222

ðŸ“Š Confusion Matrix:
 [[11  0  1]
 [ 0 10  0]
 [ 0  0 14]]

ðŸ“ˆ Classification Report:
               precision    recall  f1-score   support

           0       1.00      0.92      0.96        12
           1       1.00      1.00      1.00        10
           2       0.93      1.00      0.97        14

    accuracy                           0.97        36
   macro avg       0.98      0.97      0.97        36
weighted avg       0.97      0.97      0.97        36


Saved: rf_model.pkl and scaler.pkl
