In [13]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.impute import SimpleImputer
from sklearn.metrics import classification_report, accuracy_score

# Load the dataset
data = pd.read_csv('C:/Users/balam/Documents/AIcancerACC/prepared_dataset.csv')

# Drop unnecessary columns
data = data.drop(columns=['Unnamed: 9', 'Unnamed: 10'])

# Handle categorical data (if necessary)
data['gene_type'] = LabelEncoder().fit_transform(data['gene_type'])

# Define features and target
X = data[['unstranded', 'stranded_first', 'stranded_second', 
          'tpm_unstranded', 'fpkm_unstranded', 'fpkm_uq_unstranded']]
y = data['label']

# Impute missing values in X
imputer = SimpleImputer(strategy='mean')  # Replace 'mean' with 'median' or 'most_frequent' if appropriate
X = imputer.fit_transform(X)

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Scale the features
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Build the Random Forest Classifier
model = RandomForestClassifier(random_state=42)
model.fit(X_train, y_train)

# Make predictions
y_pred = model.predict(X_test)

# Evaluate the model
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Classification Report:\n", classification_report(y_test, y_pred))


Accuracy: 0.7727343716107761
Classification Report:
               precision    recall  f1-score   support

           0       0.90      0.59      0.71    108759
           1       0.72      0.94      0.81    121751

    accuracy                           0.77    230510
   macro avg       0.81      0.76      0.76    230510
weighted avg       0.80      0.77      0.76    230510



In [14]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.impute import SimpleImputer
from sklearn.metrics import classification_report, accuracy_score
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.utils import to_categorical

# Load the dataset
data = pd.read_csv('C:/Users/balam/Documents/AIcancerACC/prepared_dataset.csv')

# Drop unnecessary columns
data = data.drop(columns=['Unnamed: 9', 'Unnamed: 10'])

# Handle categorical data (if necessary)
data['gene_type'] = LabelEncoder().fit_transform(data['gene_type'])

# Define features and target
X = data[['unstranded', 'stranded_first', 'stranded_second', 
          'tpm_unstranded', 'fpkm_unstranded', 'fpkm_uq_unstranded']]
y = data['label']

# Impute missing values in X
imputer = SimpleImputer(strategy='mean')  # Replace 'mean' with 'median' or 'most_frequent' if appropriate
X = imputer.fit_transform(X)

# Encode the target variable if it has multiple classes
y = LabelEncoder().fit_transform(y)
y = to_categorical(y)  # Convert to one-hot encoding for neural networks

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Scale the features
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Build the Neural Network model
model = Sequential([
    Dense(64, input_dim=X_train.shape[1], activation='relu'),  # Input layer
    Dropout(0.2),  # Dropout to prevent overfitting
    Dense(32, activation='relu'),  # Hidden layer
    Dropout(0.2),
    Dense(y_train.shape[1], activation='softmax')  # Output layer
])

# Compile the model
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

# Train the model
history = model.fit(X_train, y_train, validation_split=0.2, epochs=50, batch_size=32, verbose=1)

# Evaluate the model
loss, accuracy = model.evaluate(X_test, y_test)
print("Test Accuracy:", accuracy)

# Make predictions
y_pred = model.predict(X_test)
y_pred_classes = y_pred.argmax(axis=1)  # Convert probabilities to class labels
y_test_classes = y_test.argmax(axis=1)

# Classification report
print("Classification Report:\n", classification_report(y_test_classes, y_pred_classes))


Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50
Test Accuracy: 0.5712680816650391
Classification Report:
               precision    recall  f1-score   support

           0       0.53      0.81      0.64    108759
           1       0.68      0.36      0.47    121751

    accuracy                           0.57    230510
   macro avg       0.60      0.58      0.56    230510
weighted avg       0.61      0.57      0.55    230510



In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.impute import SimpleImputer
from sklearn.metrics import classification_report, accuracy_score
import optuna

# Load the dataset
data = pd.read_csv('C:/Users/balam/Documents/AIcancerACC/prepared_dataset.csv')

# Drop unnecessary columns
data = data.drop(columns=['Unnamed: 9', 'Unnamed: 10'])

# Handle categorical data (if necessary)
data['gene_type'] = LabelEncoder().fit_transform(data['gene_type'])

# Define features and target
X = data[['unstranded', 'stranded_first', 'stranded_second', 
          'tpm_unstranded', 'fpkm_unstranded', 'fpkm_uq_unstranded']]
y = data['label']

# Impute missing values in X
imputer = SimpleImputer(strategy='mean')
X = imputer.fit_transform(X)

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Scale the features
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Define the objective function for Optuna
def objective(trial):
    n_estimators = trial.suggest_int('n_estimators', 50, 300)
    max_depth = trial.suggest_int('max_depth', 5, 30)
    min_samples_split = trial.suggest_int('min_samples_split', 2, 20)
    min_samples_leaf = trial.suggest_int('min_samples_leaf', 1, 10)
    max_features = trial.suggest_categorical('max_features', ['sqrt', 'log2', None])

    # Train the Random Forest Classifier
    model = RandomForestClassifier(
        n_estimators=n_estimators,
        max_depth=max_depth,
        min_samples_split=min_samples_split,
        min_samples_leaf=min_samples_leaf,
        max_features=max_features,
        random_state=42
    )
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    
    # Return the accuracy score
    return accuracy_score(y_test, y_pred)

# Run the optimization
study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=50)

# Best hyperparameters
best_params = study.best_params
print("Best Parameters:", best_params)

# Train the final model with the best parameters
final_model = RandomForestClassifier(**best_params, random_state=42)
final_model.fit(X_train, y_train)

# Make predictions
y_pred = final_model.predict(X_test)

# Evaluate the final model
print("Final Accuracy:", accuracy_score(y_test, y_pred))
print("Final Classification Report:\n", classification_report(y_test, y_pred))


  from pandas.core import (
[I 2025-01-09 20:44:14,870] A new study created in memory with name: no-name-2c08b6d5-2b37-49b1-a189-abacf56172ba
[I 2025-01-09 21:16:41,553] Trial 0 finished with value: 0.7230922736540714 and parameters: {'n_estimators': 263, 'max_depth': 17, 'min_samples_split': 10, 'min_samples_leaf': 9, 'max_features': None}. Best is trial 0 with value: 0.7230922736540714.
[I 2025-01-09 21:19:42,449] Trial 1 finished with value: 0.5897748470782178 and parameters: {'n_estimators': 103, 'max_depth': 9, 'min_samples_split': 19, 'min_samples_leaf': 8, 'max_features': 'sqrt'}. Best is trial 0 with value: 0.7230922736540714.
[I 2025-01-09 21:25:39,634] Trial 2 finished with value: 0.715821439416945 and parameters: {'n_estimators': 111, 'max_depth': 24, 'min_samples_split': 8, 'min_samples_leaf': 8, 'max_features': 'sqrt'}. Best is trial 0 with value: 0.7230922736540714.
[I 2025-01-09 21:49:14,377] Trial 3 finished with value: 0.7272179081167846 and parameters: {'n_estimators'