W8 exercises

>Q Implement & compare 2 other methods (QDA/LDA/Naive Bayes) with your previous favorite classifier.
Compare the performance of LDA, QDA, and the method you used for your first round of predictions (i.e. logistic regression or KNN). Compare their ability to classify Normal/Cancer by feeding them the same set of predictors as the one you used for your first submission round. Note that you could use any predictors from the TCGA dataset (i.e. individual gene expression predictors and/or PC1 to PC3).

In [21]:
import pyreadr
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis, QuadraticDiscriminantAnalysis
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Load the .rds file
file_path = 'miniTCGA.3349x4006.rds'  # Replace with your file path
data = pyreadr.read_r(file_path)

# Assuming the data is in the first element of the dictionary
df = data[next(iter(data))]

# Drop rows with any missing values
df = df.dropna()

# Assuming 'response' is the target variable and the rest are predictors
X = df.drop(columns=['response'])
y = df['response']

# Convert string labels to binary numeric labels
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(y)

# Identify predictor columns by dtype *family* rather than by exact dtype.
# ColumnTransformer drops every column it is not handed (remainder='drop' is
# its default), so listing only float64/int64/object would silently discard
# other numeric widths (e.g. int32/float32) and pandas 'category' columns.
# NOTE(review): R factors are expected to arrive as 'category' via pyreadr -
# confirm against the pyreadr type-mapping table.
numeric_cols = X.select_dtypes(include='number').columns
categorical_cols = X.select_dtypes(include=['object', 'category']).columns

# Preprocessing pipeline: standardise numeric predictors, one-hot encode the
# categorical ones (categories unseen at fit time are encoded as all zeros).
numeric_transformer = Pipeline(steps=[
    ('scaler', StandardScaler())
])

categorical_transformer = Pipeline(steps=[
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_cols),
        ('cat', categorical_transformer, categorical_cols)
    ])

# Split the data into training and testing sets (70% / 30%, fixed seed)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Create pipelines for each classifier. The preprocessor is a pipeline step, so
# it is fitted on the training rows only when the pipeline is fitted.
lda_pipeline = Pipeline(steps=[('preprocessor', preprocessor),
                               ('classifier', LinearDiscriminantAnalysis())])

# NOTE(review): QDA estimates one covariance matrix per class. With this many
# predictors relative to the samples available per class those estimates are
# presumably rank-deficient, which would explain QDA scoring far below LDA.
# Consider a smaller predictor set (e.g. PC1-PC3) or regularisation - TODO confirm.
qda_pipeline = Pipeline(steps=[('preprocessor', preprocessor),
                               ('classifier', QuadraticDiscriminantAnalysis())])

logistic_pipeline = Pipeline(steps=[('preprocessor', preprocessor),
                                    ('classifier', LogisticRegression(max_iter=1000))])

# Train the classifiers
lda_pipeline.fit(X_train, y_train)
qda_pipeline.fit(X_train, y_train)
logistic_pipeline.fit(X_train, y_train)

# Make predictions
y_pred_lda = lda_pipeline.predict(X_test)
y_pred_qda = qda_pipeline.predict(X_test)
y_pred_logistic = logistic_pipeline.predict(X_test)

# Evaluate the performance
def evaluate_classifier(y_true, y_pred):
    """Score one set of predictions against the true labels.

    Precision, recall and F1 are computed in binary mode with the label
    encoded as 1 treated as the positive class.

    Parameters:
        y_true: true class labels.
        y_pred: predicted class labels, aligned with ``y_true``.

    Returns:
        A tuple ``(accuracy, precision, recall, f1)``.
    """
    # Shared settings for the three positive-class metrics.
    binary_kwargs = {'average': 'binary', 'pos_label': 1}
    return (
        accuracy_score(y_true, y_pred),
        precision_score(y_true, y_pred, **binary_kwargs),
        recall_score(y_true, y_pred, **binary_kwargs),
        f1_score(y_true, y_pred, **binary_kwargs),
    )

# Score each classifier's test-set predictions; every result is the
# (accuracy, precision, recall, f1) tuple returned by evaluate_classifier.
lda_metrics, qda_metrics, logistic_metrics = (
    evaluate_classifier(y_test, predictions)
    for predictions in (y_pred_lda, y_pred_qda, y_pred_logistic)
)

# Print one summary line per classifier, rounded to two decimals.
report_lines = (
    ("LDA Metrics: Accuracy = {:.2f}, Precision = {:.2f}, Recall = {:.2f}, F1-Score = {:.2f}", lda_metrics),
    ("QDA Metrics: Accuracy = {:.2f}, Precision = {:.2f}, Recall = {:.2f}, F1-Score = {:.2f}", qda_metrics),
    ("Logistic Regression Metrics: Accuracy = {:.2f}, Precision = {:.2f}, Recall = {:.2f}, F1-Score = {:.2f}", logistic_metrics),
)
for template, metrics in report_lines:
    print(template.format(*metrics))




LDA Metrics: Accuracy = 0.99, Precision = 1.00, Recall = 0.99, F1-Score = 1.00
QDA Metrics: Accuracy = 0.52, Precision = 0.94, Recall = 0.49, F1-Score = 0.64
Logistic Regression Metrics: Accuracy = 0.99, Precision = 1.00, Recall = 0.99, F1-Score = 1.00


In [None]:
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis, QuadraticDiscriminantAnalysis
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression

# Define the models
models = {
    "Logistic Regression": LogisticRegression(max_iter=1000),
    "LDA": LinearDiscriminantAnalysis(),
    "QDA": QuadraticDiscriminantAnalysis(),
    "Naive Bayes": GaussianNB()
}

# Train each model.
# The previous version fitted on `X_scaled`, a name that is never defined in
# this notebook, so the cell failed with a NameError. Each model is instead
# wrapped in a pipeline with the `preprocessor` built in the first cell and
# fitted on that cell's training split (`X_train`, `y_train`), so scaling and
# encoding are learned from the training rows only.
# `Pipeline`, `preprocessor`, `X_train` and `y_train` come from the first cell,
# which must be run before this one.
fitted_models = {}
for name, model in models.items():
    fitted_models[name] = Pipeline(steps=[('preprocessor', preprocessor),
                                          ('classifier', model)]).fit(X_train, y_train)


>Q Use cross-validation to examine how accurate the predictions you can make with LDA and QDA are.

In [22]:
import pyreadr
import pandas as pd
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis, QuadraticDiscriminantAnalysis
from sklearn.metrics import accuracy_score

# Load the .rds file
file_path = 'miniTCGA.3349x4006.rds'  # Replace with your file path
data = pyreadr.read_r(file_path)

# Assuming the data is in the first element of the dictionary
df = data[next(iter(data))]

# Drop rows with any missing values
df = df.dropna()

# Assuming 'response' is the target variable and the rest are predictors
X = df.drop(columns=['response'])
y = df['response']

# Convert string labels to binary numeric labels
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(y)

# Identify predictor columns by dtype *family* rather than by exact dtype.
# ColumnTransformer drops every column it is not handed (remainder='drop' is
# its default), so listing only float64/int64/object would silently discard
# other numeric widths (e.g. int32/float32) and pandas 'category' columns.
# NOTE(review): R factors are expected to arrive as 'category' via pyreadr -
# confirm against the pyreadr type-mapping table.
numeric_cols = X.select_dtypes(include='number').columns
categorical_cols = X.select_dtypes(include=['object', 'category']).columns

# Preprocessing pipeline: standardise numeric predictors, one-hot encode the
# categorical ones (categories unseen at fit time are encoded as all zeros).
numeric_transformer = Pipeline(steps=[
    ('scaler', StandardScaler())
])

categorical_transformer = Pipeline(steps=[
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_cols),
        ('cat', categorical_transformer, categorical_cols)
    ])

# Create pipelines for each classifier. Keeping the preprocessor inside the
# pipeline means it is re-fitted on the training part of every CV fold.
lda_pipeline = Pipeline(steps=[('preprocessor', preprocessor),
                               ('classifier', LinearDiscriminantAnalysis())])

qda_pipeline = Pipeline(steps=[('preprocessor', preprocessor),
                               ('classifier', QuadraticDiscriminantAnalysis())])

# Evaluate using 5-fold cross-validation, scored by accuracy
cv_results_lda = cross_val_score(lda_pipeline, X, y, cv=5, scoring='accuracy')
cv_results_qda = cross_val_score(qda_pipeline, X, y, cv=5, scoring='accuracy')

# Print the cross-validation results (mean and spread across the folds)
print("LDA Cross-Validation Accuracy: Mean = {:.2f}, Std = {:.2f}".format(cv_results_lda.mean(), cv_results_lda.std()))
print("QDA Cross-Validation Accuracy: Mean = {:.2f}, Std = {:.2f}".format(cv_results_qda.mean(), cv_results_qda.std()))




LDA Cross-Validation Accuracy: Mean = 0.99, Std = 0.00
QDA Cross-Validation Accuracy: Mean = 0.54, Std = 0.07




>Q Get ready to submit your second-best predictions as a TEAM using the rds format
Now you are ahead of the next assignment :0).

In [23]:
import pyreadr
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis, QuadraticDiscriminantAnalysis
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report

# Load the .rds file
file_path = 'miniTCGA.3349x4006.rds'  # Replace with your file path
data = pyreadr.read_r(file_path)

# Assuming the data is in the first element of the dictionary
df = data[next(iter(data))]

# Drop rows with any missing values
# NOTE(review): this also removes rows whose 'response' is missing. If those
# are the samples to be predicted for the submission, they must be set aside
# before this step - TODO confirm.
df = df.dropna()

# Assuming 'response' is the target variable and the rest are predictors
X = df.drop(columns=['response'])
y = df['response']

# Convert string labels to binary numeric labels
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(y)

# Identify predictor columns by dtype *family* rather than by exact dtype.
# ColumnTransformer drops every column it is not handed (remainder='drop' is
# its default), so listing only float64/int64/object would silently discard
# other numeric widths (e.g. int32/float32) and pandas 'category' columns.
# NOTE(review): R factors are expected to arrive as 'category' via pyreadr -
# confirm against the pyreadr type-mapping table.
numeric_cols = X.select_dtypes(include='number').columns
categorical_cols = X.select_dtypes(include=['object', 'category']).columns

# Preprocessing pipeline: standardise numeric predictors, one-hot encode the
# categorical ones (categories unseen at fit time are encoded as all zeros).
numeric_transformer = Pipeline(steps=[
    ('scaler', StandardScaler())
])

categorical_transformer = Pipeline(steps=[
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_cols),
        ('cat', categorical_transformer, categorical_cols)
    ])

# Split the data into training and testing sets (70% / 30%, fixed seed)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Create pipelines for each classifier. The preprocessor is a pipeline step, so
# it is fitted on the training rows only when the pipeline is fitted.
lda_pipeline = Pipeline(steps=[('preprocessor', preprocessor),
                               ('classifier', LinearDiscriminantAnalysis())])

qda_pipeline = Pipeline(steps=[('preprocessor', preprocessor),
                               ('classifier', QuadraticDiscriminantAnalysis())])

# Train the classifiers
lda_pipeline.fit(X_train, y_train)
qda_pipeline.fit(X_train, y_train)

# Make predictions on the test set
# NOTE(review): nothing here writes a predictions file yet, although the
# exercise heading asks for an .rds submission - TODO add the export once the
# required format is confirmed.
y_pred_lda = lda_pipeline.predict(X_test)
y_pred_qda = qda_pipeline.predict(X_test)

# Evaluate the performance
def evaluate_classifier(y_true, y_pred):
    """Return ``(accuracy, precision, recall, f1)`` for a set of predictions.

    Precision, recall and F1 use binary averaging with the label encoded
    as 1 as the positive class.

    Parameters:
        y_true: true class labels.
        y_pred: predicted class labels, aligned with ``y_true``.
    """
    # Settings shared by the three positive-class metrics.
    positive_class = dict(average='binary', pos_label=1)
    scores = [accuracy_score(y_true, y_pred)]
    for metric in (precision_score, recall_score, f1_score):
        scores.append(metric(y_true, y_pred, **positive_class))
    return tuple(scores)

# Score both classifiers on the held-out test split; each result is the
# (accuracy, precision, recall, f1) tuple returned by evaluate_classifier.
lda_metrics, qda_metrics = (
    evaluate_classifier(y_test, predictions)
    for predictions in (y_pred_lda, y_pred_qda)
)

# Shared pieces of the printed report.
metrics_template = "Accuracy = {:.2f}, Precision = {:.2f}, Recall = {:.2f}, F1-Score = {:.2f}"
class_names = label_encoder.classes_  # original string labels, in encoded order

# LDA summary line followed by its per-class report
print("LDA Test Performance:")
print(metrics_template.format(*lda_metrics))
lda_report = classification_report(y_test, y_pred_lda, target_names=class_names)
print("\nClassification Report for LDA:\n", lda_report)

# QDA summary line followed by its per-class report
print("\nQDA Test Performance:")
print(metrics_template.format(*qda_metrics))
qda_report = classification_report(y_test, y_pred_qda, target_names=class_names)
print("\nClassification Report for QDA:\n", qda_report)




LDA Test Performance:
Accuracy = 0.99, Precision = 1.00, Recall = 0.99, F1-Score = 1.00

Classification Report for LDA:
               precision    recall  f1-score   support

      Normal       0.95      0.99      0.97        70
       Tumor       1.00      0.99      1.00       584

    accuracy                           0.99       654
   macro avg       0.97      0.99      0.98       654
weighted avg       0.99      0.99      0.99       654


QDA Test Performance:
Accuracy = 0.52, Precision = 0.94, Recall = 0.49, F1-Score = 0.64

Classification Report for QDA:
               precision    recall  f1-score   support

      Normal       0.15      0.76      0.25        70
       Tumor       0.94      0.49      0.64       584

    accuracy                           0.52       654
   macro avg       0.55      0.62      0.45       654
weighted avg       0.86      0.52      0.60       654



