# Model Training

## Importing Libraries

In [13]:
import pandas as pd
import numpy as np
from sklearn.feature_selection import SelectKBest, mutual_info_classif, f_classif
from sklearn.ensemble import RandomForestClassifier
from sklearn.impute import SimpleImputer
import matplotlib.pyplot as plt
import pickle
from sklearn.model_selection import train_test_split
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import accuracy_score, precision_score, f1_score, classification_report, confusion_matrix

## Importing Data

In [2]:
# Read the gene-expression table from disk and take the label column as the target.
DATA_PATH = '/content/cancer_gene_expression.csv'  # absolute path; adjust when running elsewhere
TARGET_COLUMN = 'Cancer_Type'  # column of df that holds the class label

df = pd.read_csv(DATA_PATH)
y = df[TARGET_COLUMN]


## Selected Features

In [3]:
# Names of the gene-expression columns used as model inputs.
# Written as one whitespace-separated block and split into a list of strings,
# so the order below is the column order of the feature matrix.
selected_features = """
    gene_7705 gene_6199 gene_6763 gene_6963 gene_6707 gene_6721
    gene_7583 gene_7619 gene_7913 gene_7850 gene_7071 gene_5788
    gene_7764 gene_6185 gene_7090 gene_7394 gene_4593 gene_7774
    gene_7755 gene_7224 gene_6922 gene_7954 gene_6271 gene_4867
    gene_7813 gene_7976 gene_5809 gene_6719 gene_6988 gene_7428
    gene_7773 gene_7922 gene_7651 gene_5603 gene_4079 gene_7502
    gene_5544 gene_7787 gene_7236 gene_7953 gene_6882 gene_7565
    gene_7703 gene_7685 gene_7333 gene_6134 gene_7896 gene_7277
    gene_7043 gene_6445 gene_6649 gene_6747 gene_7307 gene_7415
    gene_6403 gene_7614 gene_7205 gene_7766 gene_7284 gene_7760
    gene_7821 gene_5812 gene_7898 gene_7944 gene_7854 gene_4338
    gene_7594 gene_7048 gene_7215 gene_7785 gene_7864 gene_7733
    gene_4937 gene_6652 gene_7359 gene_7422 gene_7259 gene_6408
    gene_7273 gene_6289 gene_7297 gene_7178 gene_7659 gene_6182
    gene_4874 gene_7476 gene_7931 gene_7509 gene_6377 gene_5623
    gene_6827 gene_7570 gene_7990 gene_7212 gene_7294 gene_7416
    gene_7634 gene_7335 gene_7838 gene_6107 gene_7219 gene_5549
    gene_7554 gene_6256 gene_5208 gene_7504 gene_7871 gene_7788
    gene_6421 gene_7989 gene_7805 gene_7835 gene_6131 gene_7376
    gene_7832 gene_7032 gene_7361 gene_6575 gene_4570 gene_6904
    gene_6916 gene_7720 gene_7231 gene_7725 gene_5861 gene_5853
    gene_7563 gene_7985
""".split()


## Extract Features

In [4]:
# Build the label vector and the feature matrix from df:
# y is the 'Cancer_Type' column, X holds only the columns named in selected_features.
y = df.loc[:, 'Cancer_Type']
X = df.loc[:, selected_features]

## Handling Null Values

In [5]:
# Handle missing values in features: replace each NaN with the mean of its column.
# fit_transform returns a bare NumPy array, so the frame is rebuilt with X's column
# names AND X's row index. Without the index the rows would be relabelled 0..n-1 and
# the label-based alignment with y below would only be correct for a default RangeIndex.
# NOTE(review): the imputer is fitted on every row, before the train/test split, so
# rows that end up in the test set contribute to the column means. Fit on the training
# split only if a strictly leak-free evaluation is required.
imputer = SimpleImputer(strategy='mean')
X_imputed = pd.DataFrame(imputer.fit_transform(X), columns=X.columns, index=X.index)

# Handle missing values in target variable (if any).
# Rows with a missing label are dropped rather than imputed.
y = y.dropna()
X_imputed = X_imputed.loc[y.index]  # keep only the feature rows that still have a label

# Report whether any NaN is left in either the features or the target.
if np.isnan(X_imputed).sum().sum() == 0 and y.isnull().sum() == 0:
    print("No NaN values remain after handling missing data.")
else:
    print("There are still NaN values in the dataset.")

No NaN values remain after handling missing data.


## Data Splitting

In [8]:
# Split the data into training (70%) and testing (30%) sets with a fixed seed.
# The imputed matrix is used instead of the raw X: X_imputed has had its NaNs filled
# and has been restricted to the rows whose label survived y.dropna(), so it matches y
# row for row. Splitting raw X would discard the imputation and would raise on a
# length mismatch whenever a label had been dropped.
X_train, X_test, y_train, y_test = train_test_split(
    X_imputed, y, test_size=0.3, random_state=42
)

## Training the Model

In [11]:
# Initialize and train the GradientBoostingClassifier with adjusted parameters.
# random_state is fixed (same seed as the train/test split) so that re-running the
# notebook yields the same fitted model and therefore the same reported metrics.
clf = GradientBoostingClassifier(
    n_estimators=200,    # number of boosting stages
    max_depth=4,         # depth limit of each individual tree
    learning_rate=0.05,  # shrinkage applied to each tree's contribution
    random_state=42,     # makes training reproducible across runs
)
clf.fit(X_train, y_train)

# Predict class labels for the held-out test rows.
y_pred = clf.predict(X_test)

## Model Evaluation

In [14]:
# Score the test-set predictions.
# Precision and F1 are averaged over classes weighted by each class's support.
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, average='weighted')
f1 = f1_score(y_test, y_pred, average='weighted')

# Per-class breakdown and the confusion matrix.
conf_matrix = confusion_matrix(y_test, y_pred)
report = classification_report(y_test, y_pred)

# Emit everything in one call; sep='\n' puts each item on its own line.
print(
    f'Accuracy: {accuracy:.2f}',
    f'Precision: {precision:.2f}',
    f'F1 Score: {f1:.2f}',
    'Classification Report:',
    report,
    'Confusion Matrix:',
    conf_matrix,
    sep='\n',
)

Accuracy: 0.95
Precision: 0.95
F1 Score: 0.95
Classification Report:
              precision    recall  f1-score   support

        BRCA       0.93      0.98      0.95        86
        COAD       0.94      0.73      0.82        22
        KIRC       0.98      0.98      0.98        44
        LUAD       0.98      0.95      0.96        43
        PRAD       0.94      0.98      0.96        46

    accuracy                           0.95       241
   macro avg       0.95      0.92      0.93       241
weighted avg       0.95      0.95      0.95       241

Confusion Matrix:
[[84  0  0  0  2]
 [ 4 16  1  1  0]
 [ 0  0 43  0  1]
 [ 1  1  0 41  0]
 [ 1  0  0  0 45]]
