In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from lightgbm import LGBMClassifier
from sklearn.metrics import accuracy_score
import pyreadr

In [4]:
# Read the data
r = pyreadr.read_r("miniTCGA.3349x4006.rds")
df = r[None]

# Split into training and prediction set
data_train = df.dropna(subset=['response'])
data_predict = df[df['response'].isna()]

# Split into train and test fold
trainfold, testfold = train_test_split(data_train, test_size=0.20, random_state=0)

In [5]:

#stupid classifier


# We fit our model (simple logistic regression on pc2 and pc3 with interaction)
model = LGBMClassifier()
model.fit(trainfold[['pc2', 'pc3']], trainfold['response'])



[LightGBM] [Info] Number of positive: 1554, number of negative: 187
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000177 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 510
[LightGBM] [Info] Number of data points in the train set: 1741, number of used features: 2
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.892590 -> initscore=2.117479
[LightGBM] [Info] Start training from score 2.117479


In [7]:
# smart classifier

from sklearn.model_selection import GridSearchCV

# Define the parameter grid
param_grid = {
	'num_leaves': [31, 127],
	'reg_alpha': [0.1, 0.5],
	'min_data_in_leaf': [30, 50, 100, 300, 400],
	'lambda_l1': [0, 1, 1.5],
	'lambda_l2': [0, 1]
	}

# Create a base model
lgbm = LGBMClassifier()

# Instantiate the grid search model
grid_search = GridSearchCV(estimator = lgbm, param_grid = param_grid, 
						  cv = 3, n_jobs = -1, verbose = 2)

# Fit the grid search to the data
grid_search.fit(trainfold[['pc2', 'pc3']], trainfold['response'])

# Print the best parameters
print(grid_search.best_params_)

# Use the best model to make predictions
best_grid = grid_search.best_estimator_
predicted = best_grid.predict(testfold[['pc2', 'pc3']])

# Calculate the test error
test_error = np.sum(observed != predicted) / len(observed)
print(f"Test error: {test_error}")

Fitting 3 folds for each of 120 candidates, totalling 360 fits
[LightGBM] [Info] Number of positive: 1554, number of negative: 187
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000109 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 510
[LightGBM] [Info] Number of data points in the train set: 1741, number of used features: 2
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.892590 -> initscore=2.117479
[LightGBM] [Info] Start training from score 2.117479
{'lambda_l1': 1.5, 'lambda_l2': 1, 'min_data_in_leaf': 100, 'num_leaves': 31, 'reg_alpha': 0.1}
Test error: 0.08944954128440367


In [6]:
# We predict on the test fold
predicted = model.predict(testfold[['pc2', 'pc3']])

# We compare with the observed values and calculate error rate
observed = testfold['response']

# Our guess on the general error rate of the model (very unprecise!)
test_error = np.sum(observed != predicted) / len(observed)
print(f"Test error: {test_error}")



Test error: 0.11238532110091744


In [None]:
# Predict the real unknown data
# First we fit the model to all of our known data
model.fit(data_train[['pc2', 'pc3']], data_train['response'])

# Then we predict on the unknown data
predicted = model.predict(data_predict[['pc2', 'pc3']])

# The predictions must have the following column and the row order must be the same as the original!
submission = pd.DataFrame({'predicted': predicted})

print(submission.head())

In [8]:
# Fit the model to all of your known data
best_grid.fit(data_train[['pc2', 'pc3']], data_train['response'])

# Predict on the real unknown data
predicted_real = best_grid.predict(data_predict[['pc2', 'pc3']])

# Create a DataFrame for the predicted values
submission = pd.DataFrame({'predicted': predicted_real})

# Print the first few rows of the submission DataFrame
print(submission.head())

[LightGBM] [Info] Number of positive: 1944, number of negative: 233
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000036 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 510
[LightGBM] [Info] Number of data points in the train set: 2177, number of used features: 2
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.892972 -> initscore=2.121465
[LightGBM] [Info] Start training from score 2.121465
  predicted
0     Tumor
1     Tumor
2     Tumor
3     Tumor
4     Tumor


In [9]:
import pyreadr

# Define the team name, team people, and team error
team_name = "The_Brogrammers"
team_people = ["Ane", "Peter"]
team_error = test_error

# Extract the predicted column from the submission DataFrame
team_predictions = submission['predicted']

# Save all the stuff in one object
result = {"team_name": team_name, "team_people": team_people, 
		  "team_error": team_error, "team_predictions": team_predictions}

# Write the object to an RData file
pyreadr.write_rds(f"minitcga_cancer_classification.{team_name}.rds", result)

PyreadrError: df must be a pandas data frame

In [10]:
import pandas as pd

# Create a DataFrame from the data
df = pd.DataFrame({
    'team_name': [team_name],
    'team_people': [team_people],
    'team_error': [team_error],
    'team_predictions': [team_predictions.tolist()]  # Convert to list because pandas doesn't support array columns
})

# Write the DataFrame to an RDS file
pyreadr.write_rds(f"minitcga_cancer_classification.{team_name}.rds", df)