<a href="https://colab.research.google.com/github/VinayakKumarSingh/vinayak_git/blob/main/MainEL.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Install necessary libraries
!pip install transformers
!pip install torch
!pip install pandas
!pip install scikit-learn
!pip install joblib



In [None]:
from google.colab import drive
drive.mount('/content/drive')

import json
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score
from transformers import RobertaTokenizer, RobertaForSequenceClassification, Trainer, TrainingArguments, EarlyStoppingCallback
import torch
from torch.utils.data import TensorDataset
import joblib
from sklearn.model_selection import ParameterGrid
import numpy as np

# Path to your JSON file in Google Drive
json_file_path = '/content/drive/MyDrive/Colab Notebooks/geeks_for_geeks.json'

# Load JSON data
with open(json_file_path, 'r') as file:
    data = json.load(file)

# Extract relevant information and create a DataFrame
rows = []
for entry in data:
    row = {
        'name': entry.get('name', '').strip(),
        'time_complexity': entry.get('time_complexity', ''),
        'trustable_time_complexity': entry.get('trustable_time_complexity', False),
        'space_complexity': entry.get('space_complexity', ''),
        'trustable_space_complexity': entry.get('trustable_space_complexity', False),
        'code_CPP': entry['codes'].get('C++', {}).get('code', '')
    }
    rows.append(row)

df = pd.DataFrame(rows)
df.fillna('', inplace=True)

# Encode the target variables
label_encoder_time = LabelEncoder()
df['time_complexity_encoded'] = label_encoder_time.fit_transform(df['time_complexity'])

label_encoder_space = LabelEncoder()
df['space_complexity_encoded'] = label_encoder_space.fit_transform(df['space_complexity'])

# Save the label encoders
joblib.dump(label_encoder_time, '/content/drive/MyDrive/Colab Notebooks/label_encoder_time.pkl')
joblib.dump(label_encoder_space, '/content/drive/MyDrive/Colab Notebooks/label_encoder_space.pkl')

# Vectorize code snippets
tokenizer = RobertaTokenizer.from_pretrained("microsoft/codebert-base")
encodings_cpp = tokenizer(df['code_CPP'].tolist(), truncation=True, padding=True, return_tensors="pt")

# Split the data into training and testing sets for time complexity prediction
X_train_time_cpp, X_test_time_cpp, y_train_time_cpp, y_test_time_cpp = train_test_split(
    encodings_cpp['input_ids'], df['time_complexity_encoded'].values, test_size=0.2, random_state=42
)

# Split the data into training and testing sets for space complexity prediction
X_train_space_cpp, X_test_space_cpp, y_train_space_cpp, y_test_space_cpp = train_test_split(
    encodings_cpp['input_ids'], df['space_complexity_encoded'].values, test_size=0.2, random_state=42
)

# Create datasets for time complexity prediction
train_dataset_time_cpp = TensorDataset(
    X_train_time_cpp,
    encodings_cpp['attention_mask'][:len(X_train_time_cpp)],
    torch.tensor(y_train_time_cpp, dtype=torch.long)
)

test_dataset_time_cpp = TensorDataset(
    X_test_time_cpp,
    encodings_cpp['attention_mask'][len(X_train_time_cpp):],
    torch.tensor(y_test_time_cpp, dtype=torch.long)
)

# Create datasets for space complexity prediction
train_dataset_space_cpp = TensorDataset(
    X_train_space_cpp,
    encodings_cpp['attention_mask'][:len(X_train_space_cpp)],
    torch.tensor(y_train_space_cpp, dtype=torch.long)
)

test_dataset_space_cpp = TensorDataset(
    X_test_space_cpp,
    encodings_cpp['attention_mask'][len(X_train_space_cpp):],
    torch.tensor(y_test_space_cpp, dtype=torch.long)
)

# Define a custom data collator
def custom_data_collator(features):
    batch = {
        'input_ids': torch.stack([f[0] for f in features]),
        'attention_mask': torch.stack([f[1] for f in features]),
        'labels': torch.stack([f[2] for f in features]),
    }
    return batch

# Define metrics function for accuracy
def compute_metrics(pred):
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)
    accuracy = accuracy_score(labels, preds)
    return {"accuracy": accuracy}

# Initialize the models for time and space complexity prediction
model_time_cpp = RobertaForSequenceClassification.from_pretrained(
    "microsoft/codebert-base", num_labels=len(label_encoder_time.classes_)
).to('cuda')  # Move model to GPU

model_space_cpp = RobertaForSequenceClassification.from_pretrained(
    "microsoft/codebert-base", num_labels=len(label_encoder_space.classes_)
).to('cuda')  # Move model to GPU

# Define a grid of hyperparameters to search
param_grid = {
    'num_train_epochs': [5, 10],
    'per_device_train_batch_size': [8, 16],
    'learning_rate': [5e-5, 3e-5, 2e-5]
}

# Initialize variables to store the best results
best_accuracy_time = 0
best_accuracy_space = 0
best_params_time = None
best_params_space = None

# Perform hyperparameter tuning
for params in ParameterGrid(param_grid):
    training_args = TrainingArguments(
        output_dir='./results',
        num_train_epochs=params['num_train_epochs'],
        per_device_train_batch_size=params['per_device_train_batch_size'],
        per_device_eval_batch_size=16,
        warmup_steps=500,
        weight_decay=0.01,
        logging_dir='./logs',
        logging_steps=10,
        evaluation_strategy="epoch",
        save_strategy="epoch",
        load_best_model_at_end=True,
        metric_for_best_model="accuracy",
        greater_is_better=True,
        learning_rate=params['learning_rate']
    )

    # Initialize Trainer for time complexity prediction
    trainer_time_cpp = Trainer(
        model=model_time_cpp,
        args=training_args,
        train_dataset=train_dataset_time_cpp,
        eval_dataset=test_dataset_time_cpp,
        data_collator=custom_data_collator,
        compute_metrics=compute_metrics,
        callbacks=[EarlyStoppingCallback(early_stopping_patience=3)]
    )

    # Train the model for time complexity
    trainer_time_cpp.train()
    results_time_cpp = trainer_time_cpp.evaluate()
    accuracy_time_cpp = results_time_cpp['eval_accuracy']

    # Update the best results for time complexity
    if accuracy_time_cpp > best_accuracy_time:
        best_accuracy_time = accuracy_time_cpp
        best_params_time = params

    # Initialize Trainer for space complexity prediction
    trainer_space_cpp = Trainer(
        model=model_space_cpp,
        args=training_args,
        train_dataset=train_dataset_space_cpp,
        eval_dataset=test_dataset_space_cpp,
        data_collator=custom_data_collator,
        compute_metrics=compute_metrics,
        callbacks=[EarlyStoppingCallback(early_stopping_patience=3)]
    )

    # Train the model for space complexity
    trainer_space_cpp.train()
    results_space_cpp = trainer_space_cpp.evaluate()
    accuracy_space_cpp = results_space_cpp['eval_accuracy']

    # Update the best results for space complexity
    if accuracy_space_cpp > best_accuracy_space:
        best_accuracy_space = accuracy_space_cpp
        best_params_space = params

print(f"Best Time Complexity Accuracy: {best_accuracy_time}")
print(f"Best Time Complexity Params: {best_params_time}")

print(f"Best Space Complexity Accuracy: {best_accuracy_space}")
print(f"Best Space Complexity Params: {best_params_space}")

# Save the best models after training
model_time_cpp.save_pretrained("/content/drive/MyDrive/Colab Notebooks/best_model_time_cpp")
model_space_cpp.save_pretrained("/content/drive/MyDrive/Colab Notebooks/best_model_space_cpp")
tokenizer.save_pretrained("/content/drive/MyDrive/Colab Notebooks/tokenizer")

# Function to predict time and space complexity
def predict_complexity(code_sample):
    # Tokenize the input code sample
    inputs = tokenizer(code_sample, return_tensors='pt', padding=True, truncation=True).to('cuda')

    # Predict time complexity
    with torch.no_grad():
        outputs_time = model_time_cpp(**inputs)
    predictions_time = torch.argmax(outputs_time.logits, dim=1)
    time_complexity = label_encoder_time.inverse_transform(predictions_time.cpu().numpy())

    # Predict space complexity
    with torch.no_grad():
        outputs_space = model_space_cpp(**inputs)
    predictions_space = torch.argmax(outputs_space.logits, dim=1)
    space_complexity = label_encoder_space.inverse_transform(predictions_space.cpu().numpy())

    return time_complexity[0], space_complexity[0]

# Example usage
code_sample = """
def binary_search(arr, target):
    left, right = 0, len(arr) - 1
    while left <= right:
        mid = (left + right) // 2
        if arr[mid] == target:
            return mid
        elif arr[mid] < target:
            left = mid + 1
        else:
            right = mid - 1
    return -1
"""

time_complexity, space_complexity = predict_complexity(code_sample)
print(f"Predicted Time Complexity: {time_complexity}")
print(f"Predicted Space Complexity: {space_complexity}")

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


pytorch_model.bin:   0%|          | 0.00/499M [00:00<?, ?B/s]

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at microsoft/codebert-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at microsoft/codebert-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
wandb: Paste an API key from your profile and hit enter, or press ctrl+c to quit:[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


Epoch,Training Loss,Validation Loss,Accuracy
1,4.802,4.747822,0.158416
2,4.3368,4.008028,0.207921
3,3.896,3.736701,0.207921
4,3.8961,3.646073,0.207921
5,3.9123,3.62664,0.168317


Epoch,Training Loss,Validation Loss,Accuracy
1,4.0601,3.846943,0.217822
2,3.3236,2.801179,0.356436
3,2.781,2.494963,0.316832
4,3.0276,2.404837,0.29703
5,2.7745,2.314127,0.386139




Epoch,Training Loss,Validation Loss,Accuracy
1,4.1005,3.97044,0.207921
2,4.1177,3.884907,0.207921
3,4.0279,3.803873,0.207921
4,4.0005,3.717773,0.207921


Epoch,Training Loss,Validation Loss,Accuracy
1,2.3565,2.298368,0.405941
2,2.5121,2.296427,0.386139
3,2.49,2.27135,0.435644
4,2.7154,2.39796,0.386139
5,2.2478,2.52425,0.405941




Epoch,Training Loss,Validation Loss,Accuracy
1,4.0316,3.891952,0.207921
2,4.1013,3.762872,0.207921
3,3.7832,3.681849,0.19802
4,3.8441,3.63246,0.19802


Epoch,Training Loss,Validation Loss,Accuracy
1,2.6612,2.245971,0.455446
2,2.6726,2.473174,0.346535
3,2.3454,2.296443,0.435644
4,2.4573,2.253924,0.39604




Epoch,Training Loss,Validation Loss,Accuracy
1,3.9873,3.868744,0.207921
2,4.0524,3.810055,0.207921
3,3.969,3.755038,0.207921
4,3.9647,3.691625,0.19802


Epoch,Training Loss,Validation Loss,Accuracy
1,2.1751,2.250681,0.465347
2,2.3526,2.272183,0.455446
3,2.3368,2.256029,0.445545
4,2.5997,2.383338,0.425743




Epoch,Training Loss,Validation Loss,Accuracy
1,3.9545,3.836554,0.207921
2,4.0873,3.770232,0.207921
3,3.7994,3.704503,0.19802
4,3.8595,3.650124,0.19802


Epoch,Training Loss,Validation Loss,Accuracy
1,2.6229,2.242985,0.465347
2,2.6546,2.317497,0.435644
3,2.2736,2.319615,0.445545
4,2.4653,2.277948,0.415842




Epoch,Training Loss,Validation Loss,Accuracy
1,3.928,3.826536,0.207921
2,4.027,3.796109,0.207921
3,3.9605,3.762162,0.207921
4,3.9773,3.714345,0.19802


Epoch,Training Loss,Validation Loss,Accuracy
1,2.1284,2.264513,0.465347


Epoch,Training Loss,Validation Loss,Accuracy
1,2.1284,2.264513,0.465347
2,2.3295,2.269794,0.465347
3,2.3161,2.243595,0.455446
4,2.6066,2.395478,0.435644




Epoch,Training Loss,Validation Loss,Accuracy
1,3.9084,3.800742,0.207921
2,4.069,3.745777,0.207921
3,3.7758,3.691865,0.188119
4,3.8485,3.64449,0.188119


Epoch,Training Loss,Validation Loss,Accuracy
1,2.6039,2.243045,0.475248
2,2.6306,2.336016,0.435644
3,2.2556,2.319252,0.445545
4,2.4393,2.267804,0.39604




Epoch,Training Loss,Validation Loss,Accuracy
1,3.8838,3.791633,0.207921
2,4.0002,3.764937,0.207921
3,3.9316,3.739068,0.207921
4,3.9556,3.700839,0.19802


Epoch,Training Loss,Validation Loss,Accuracy
1,2.0912,2.269392,0.465347
2,2.3071,2.267488,0.455446
3,2.2922,2.24489,0.445545
4,2.5789,2.401165,0.445545




Epoch,Training Loss,Validation Loss,Accuracy
1,3.8711,3.774919,0.207921
2,4.0521,3.735209,0.207921
3,3.7819,3.701051,0.19802
4,3.8654,3.666019,0.19802


Epoch,Training Loss,Validation Loss,Accuracy
1,2.578,2.24688,0.475248
2,2.6031,2.290069,0.425743
3,2.2407,2.307805,0.465347
4,2.4554,2.298674,0.405941




Epoch,Training Loss,Validation Loss,Accuracy
1,3.8517,3.771371,0.207921
2,3.9858,3.755563,0.207921
3,3.9234,3.737129,0.207921
4,3.9591,3.708943,0.19802


Epoch,Training Loss,Validation Loss,Accuracy
1,2.061,2.27367,0.465347
2,2.2929,2.261594,0.455446
3,2.2784,2.255998,0.455446
4,2.5719,2.33415,0.475248
5,2.179,2.285336,0.465347




Epoch,Training Loss,Validation Loss,Accuracy
1,3.8463,3.757827,0.207921
2,4.0401,3.720053,0.207921
3,3.7664,3.692249,0.188119
4,3.8518,3.656737,0.188119


Epoch,Training Loss,Validation Loss,Accuracy
1,2.4719,2.245076,0.485149
2,2.5017,2.318864,0.445545
3,2.1573,2.306138,0.455446
4,2.362,2.286188,0.405941




Epoch,Training Loss,Validation Loss,Accuracy
1,3.827,3.754605,0.207921
2,3.9705,3.740555,0.207921
3,3.9072,3.723963,0.207921
4,3.945,3.69882,0.19802


Epoch,Training Loss,Validation Loss,Accuracy
1,1.9665,2.264006,0.475248
2,2.1928,2.260755,0.475248
3,2.1858,2.263792,0.475248
4,2.4693,2.367574,0.445545


Best Time Complexity Accuracy: 0.2079207920792079
Best Time Complexity Params: {'learning_rate': 5e-05, 'num_train_epochs': 5, 'per_device_train_batch_size': 8}
Best Space Complexity Accuracy: 0.48514851485148514
Best Space Complexity Params: {'learning_rate': 2e-05, 'num_train_epochs': 10, 'per_device_train_batch_size': 8}
Predicted Time Complexity: O(N)
Predicted Space Complexity: O(1)


In [None]:
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

import json
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, classification_report
from transformers import RobertaTokenizer, RobertaForSequenceClassification, Trainer, TrainingArguments, EarlyStoppingCallback
import torch
from torch.utils.data import TensorDataset
import joblib
from sklearn.model_selection import ParameterGrid
import numpy as np

# Path to your JSON file in Google Drive
json_file_path = '/content/drive/MyDrive/Colab Notebooks/geeks_for_geeks.json'

# Load JSON data
with open(json_file_path, 'r') as file:
    data = json.load(file)

# Extract relevant information and create a DataFrame
rows = []
for entry in data:
    row = {
        'name': entry.get('name', '').strip(),
        'time_complexity': entry.get('time_complexity', ''),
        'trustable_time_complexity': entry.get('trustable_time_complexity', False),
        'space_complexity': entry.get('space_complexity', ''),
        'trustable_space_complexity': entry.get('trustable_space_complexity', False),
        'code_CPP': entry['codes'].get('C++', {}).get('code', '')
    }
    rows.append(row)

df = pd.DataFrame(rows)
df.fillna('', inplace=True)

# Check class distribution
print("Time Complexity Distribution:\n", df['time_complexity'].value_counts())
print("Space Complexity Distribution:\n", df['space_complexity'].value_counts())

# Handle rare classes
# Example: Combine rare classes into 'Other' or remove them
df['time_complexity'] = df['time_complexity'].apply(lambda x: x if df['time_complexity'].value_counts()[x] > 1 else 'Other')
df['space_complexity'] = df['space_complexity'].apply(lambda x: x if df['space_complexity'].value_counts()[x] > 1 else 'Other')

# Encode the target variables
label_encoder_time = LabelEncoder()
df['time_complexity_encoded'] = label_encoder_time.fit_transform(df['time_complexity'])

label_encoder_space = LabelEncoder()
df['space_complexity_encoded'] = label_encoder_space.fit_transform(df['space_complexity'])

# Save the label encoders
joblib.dump(label_encoder_time, '/content/drive/MyDrive/Colab Notebooks/label_encoder_time.pkl')
joblib.dump(label_encoder_space, '/content/drive/MyDrive/Colab Notebooks/label_encoder_space.pkl')

# Define number of labels for classification tasks
number_of_time_labels = len(label_encoder_time.classes_)
number_of_space_labels = len(label_encoder_space.classes_)

# Vectorize code snippets
tokenizer = RobertaTokenizer.from_pretrained("microsoft/graphcodebert-base")
encodings_cpp = tokenizer(df['code_CPP'].tolist(), truncation=True, padding=True, return_tensors="pt")

# Split the data into training and testing sets for time complexity prediction
X_train_time_cpp, X_test_time_cpp, y_train_time_cpp, y_test_time_cpp = train_test_split(
    encodings_cpp['input_ids'], df['time_complexity_encoded'].values, test_size=0.2, random_state=42, stratify=df['time_complexity_encoded']
)

# Split the data into training and testing sets for space complexity prediction
X_train_space_cpp, X_test_space_cpp, y_train_space_cpp, y_test_space_cpp = train_test_split(
    encodings_cpp['input_ids'], df['space_complexity_encoded'].values, test_size=0.2, random_state=42, stratify=df['space_complexity_encoded']
)

# Create datasets for time complexity prediction
train_dataset_time_cpp = TensorDataset(
    X_train_time_cpp,
    encodings_cpp['attention_mask'][:len(X_train_time_cpp)],
    torch.tensor(y_train_time_cpp, dtype=torch.long)
)

test_dataset_time_cpp = TensorDataset(
    X_test_time_cpp,
    encodings_cpp['attention_mask'][len(X_train_time_cpp):],
    torch.tensor(y_test_time_cpp, dtype=torch.long)
)

# Create datasets for space complexity prediction
train_dataset_space_cpp = TensorDataset(
    X_train_space_cpp,
    encodings_cpp['attention_mask'][:len(X_train_space_cpp)],
    torch.tensor(y_train_space_cpp, dtype=torch.long)
)

test_dataset_space_cpp = TensorDataset(
    X_test_space_cpp,
    encodings_cpp['attention_mask'][len(X_train_space_cpp):],
    torch.tensor(y_test_space_cpp, dtype=torch.long)
)

# Define a custom data collator
def custom_data_collator(features):
    batch = {
        'input_ids': torch.stack([f[0] for f in features]),
        'attention_mask': torch.stack([f[1] for f in features]),
        'labels': torch.stack([f[2] for f in features]),
    }
    return batch

# Define metrics function for accuracy
def compute_metrics(pred):
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)
    accuracy = accuracy_score(labels, preds)
    return {"accuracy": accuracy, "report": classification_report(labels, preds, target_names=label_encoder_time.classes_)}

# Initialize the models for time and space complexity prediction
model_time_cpp = RobertaForSequenceClassification.from_pretrained(
    "microsoft/graphcodebert-base", num_labels=number_of_time_labels
).to('cuda')  # Move model to GPU

model_space_cpp = RobertaForSequenceClassification.from_pretrained(
    "microsoft/graphcodebert-base", num_labels=number_of_space_labels
).to('cuda')  # Move model to GPU
# Define a grid of hyperparameters to search
param_grid = {
    'num_train_epochs': [5, 10, 20],
    'per_device_train_batch_size': [8, 16, 32],
    'learning_rate': [5e-5, 3e-5, 2e-5, 1e-5]
}

# Initialize variables to store the best results
best_accuracy_time = 0
best_accuracy_space = 0
best_params_time = None
best_params_space = None

# Perform hyperparameter tuning
for params in ParameterGrid(param_grid):
    training_args = TrainingArguments(
        output_dir='./results',
        num_train_epochs=params['num_train_epochs'],
        per_device_train_batch_size=params['per_device_train_batch_size'],
        per_device_eval_batch_size=16,
        warmup_steps=500,
        weight_decay=0.01,
        logging_dir='./logs',
        logging_steps=10,
        evaluation_strategy="epoch",
        save_strategy="epoch",
        load_best_model_at_end=True,
        metric_for_best_model="accuracy",
        greater_is_better=True,
        learning_rate=params['learning_rate']
    )

    # Initialize Trainer for time complexity prediction
    trainer_time_cpp = Trainer(
        model=model_time_cpp,
        args=training_args,
        train_dataset=train_dataset_time_cpp,
        eval_dataset=test_dataset_time_cpp,
        data_collator=custom_data_collator,
        compute_metrics=compute_metrics,
        callbacks=[EarlyStoppingCallback(early_stopping_patience=3)]
    )

    # Train the model for time complexity
    trainer_time_cpp.train()
    results_time_cpp = trainer_time_cpp.evaluate()
    accuracy_time_cpp = results_time_cpp['eval_accuracy']

    # Update the best results for time complexity
    if accuracy_time_cpp > best_accuracy_time:
        best_accuracy_time = accuracy_time_cpp
        best_params_time = params

    # Initialize Trainer for space complexity prediction
    trainer_space_cpp = Trainer(
        model=model_space_cpp,
        args=training_args,
        train_dataset=train_dataset_space_cpp,
        eval_dataset=test_dataset_space_cpp,
        data_collator=custom_data_collator,
        compute_metrics=compute_metrics,
        callbacks=[EarlyStoppingCallback(early_stopping_patience=3)]
    )

    # Train the model for space complexity
    trainer_space_cpp.train()
    results_space_cpp = trainer_space_cpp.evaluate()
    accuracy_space_cpp = results_space_cpp['eval_accuracy']

    # Update the best results for space complexity
    if accuracy_space_cpp > best_accuracy_space:
        best_accuracy_space = accuracy_space_cpp
        best_params_space = params

print(f"Best Time Complexity Accuracy: {best_accuracy_time}")
print(f"Best Time Complexity Params: {best_params_time}")

print(f"Best Space Complexity Accuracy: {best_accuracy_space}")
print(f"Best Space Complexity Params: {best_params_space}")

# Save the best models after training
model_time_cpp.save_pretrained("/content/drive/MyDrive/Colab Notebooks/best_model_time_cpp")
model_space_cpp.save_pretrained("/content/drive/MyDrive/Colab Notebooks/best_model_space_cpp")
tokenizer.save_pretrained("/content/drive/MyDrive/Colab Notebooks/tokenizer")

# Function to predict time and space complexity
def predict_complexity(code_sample):
    # Tokenize the input code sample
    inputs = tokenizer(code_sample, return_tensors='pt', padding=True, truncation=True).to('cuda')

    # Predict time complexity
    with torch.no_grad():
        outputs_time = model_time_cpp(**inputs)
    predictions_time = torch.argmax(outputs_time.logits, dim=1)
    time_complexity = label_encoder_time.inverse_transform(predictions_time.cpu().numpy())

    # Predict space complexity
    with torch.no_grad():
        outputs_space = model_space_cpp(**inputs)
    predictions_space = torch.argmax(outputs_space.logits, dim=1)
    space_complexity = label_encoder_space.inverse_transform(predictions_space.cpu().numpy())

    return time_complexity[0], space_complexity[0]

# Example usage
code_sample = """
def binary_search(arr, target):
    left, right = 0, len(arr) - 1
    while left <= right:
        mid = (left + right) // 2
        if arr[mid] == target:
            return mid
        elif arr[mid] < target:
            left = mid + 1
        else:
            right = mid - 1
    return -1
"""

time_complexity, space_complexity = predict_complexity(code_sample)
print(f"Predicted Time Complexity: {time_complexity}")
print(f"Predicted Space Complexity: {space_complexity}")

Mounted at /content/drive
Time Complexity Distribution:
 time_complexity
O(N)              89
O(n)              70
O(1)              57
                  18
O(N2)             18
                  ..
O(num * |num|)     1
O(N * logN)        1
O(m log (m+n))     1
O(m+n*logm)        1
O(N * log(N))      1
Name: count, Length: 133, dtype: int64
Space Complexity Distribution:
 space_complexity
O(1)               169
O(N)                88
O(n)                64
                    40
O(V)                10
                  ... 
O(k)                 1
O(N log N)           1
O(d)                 1
O(N + K)             1
O(N * W) + O(N)      1
Name: count, Length: 76, dtype: int64


pytorch_model.bin:   0%|          | 0.00/499M [00:00<?, ?B/s]

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at microsoft/graphcodebert-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at microsoft/graphcodebert-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss


ValueError: Number of classes, 32, does not match size of target_names, 45. Try specifying the labels parameter

In [6]:
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

import json
import pandas as pd
from transformers import RobertaTokenizer, RobertaForSequenceClassification
import torch
import joblib

# Load the tokenizer and models from Google Drive
tokenizer = RobertaTokenizer.from_pretrained("/content/drive/MyDrive/Colab Notebooks/tokenizer")
model_time_cpp = RobertaForSequenceClassification.from_pretrained("/content/drive/MyDrive/Colab Notebooks/best_model_time_cpp").to('cuda')
model_space_cpp = RobertaForSequenceClassification.from_pretrained("/content/drive/MyDrive/Colab Notebooks/best_model_space_cpp").to('cuda')

# Load the label encoders
label_encoder_time = joblib.load('/content/drive/MyDrive/Colab Notebooks/label_encoder_time.pkl')
label_encoder_space = joblib.load('/content/drive/MyDrive/Colab Notebooks/label_encoder_space.pkl')

# Function to predict time and space complexity
def predict_complexity(code_sample):
    # Tokenize the input code sample
    inputs = tokenizer(code_sample, return_tensors='pt', padding=True, truncation=True).to('cuda')

    # Predict time complexity
    with torch.no_grad():
        outputs_time = model_time_cpp(**inputs)
    predictions_time = torch.argmax(outputs_time.logits, dim=1)
    time_complexity = label_encoder_time.inverse_transform(predictions_time.cpu().numpy())

    # Predict space complexity
    with torch.no_grad():
        outputs_space = model_space_cpp(**inputs)
    predictions_space = torch.argmax(outputs_space.logits, dim=1)
    space_complexity = label_encoder_space.inverse_transform(predictions_space.cpu().numpy())

    return time_complexity[0], space_complexity[0]

# Prompt the user for code input
user_code = input("Please enter your code snippet:\n")

# Predict the complexities for the user-provided code
time_complexity, space_complexity = predict_complexity(user_code)
print(f"Predicted Time Complexity: {time_complexity}")
print(f"Predicted Space Complexity: {space_complexity}")



Mounted at /content/drive
Please enter your code snippet:
#include <bits/stdc++.h> using namespace std;  // Merges two subarrays of arr[]. // First subarray is arr[left..mid] // Second subarray is arr[mid+1..right] void merge(vector<int>& arr, int left,                       int mid, int right) {     int n1 = mid - left + 1;     int n2 = right - mid;      // Create temp vectors     vector<int> L(n1), R(n2);      // Copy data to temp vectors L[] and R[]     for (int i = 0; i < n1; i++)         L[i] = arr[left + i];     for (int j = 0; j < n2; j++)         R[j] = arr[mid + 1 + j];      int i = 0, j = 0;     int k = left;      // Merge the temp vectors back      // into arr[left..right]     while (i < n1 && j < n2) {         if (L[i] <= R[j]) {             arr[k] = L[i];             i++;         }         else {             arr[k] = R[j];             j++;         }         k++;     }      // Copy the remaining elements of L[],      // if there are any     while (i < n1) {         arr[k] =

In [5]:
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

import json
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score
from transformers import RobertaTokenizer, RobertaForSequenceClassification, Trainer, TrainingArguments, EarlyStoppingCallback
import torch
from torch.utils.data import TensorDataset
import joblib

# Load existing and new JSON data
with open('/content/drive/MyDrive/Colab Notebooks/geeks_for_geeks.json', 'r') as file:
    existing_data = json.load(file)

with open('/content/drive/MyDrive/Colab Notebooks/new_data.json', 'r') as file:
    new_data = json.load(file)

# Combine existing and new data
data = existing_data + new_data

# Extract relevant information and create a DataFrame
rows = []
for entry in data:
    row = {
        'name': entry.get('name', '').strip(),
        'time_complexity': entry.get('time_complexity', ''),
        'trustable_time_complexity': entry.get('trustable_time_complexity', False),
        'space_complexity': entry.get('space_complexity', ''),
        'trustable_space_complexity': entry.get('trustable_space_complexity', False),
        'code_CPP': entry['codes'].get('C++', {}).get('code', '')
    }
    rows.append(row)

df = pd.DataFrame(rows)
df.fillna('', inplace=True)

# Load and update existing label encoders
label_encoder_time = joblib.load('/content/drive/MyDrive/Colab Notebooks/label_encoder_time.pkl')
label_encoder_space = joblib.load('/content/drive/MyDrive/Colab Notebooks/label_encoder_space.pkl')

label_encoder_time.fit(df['time_complexity'])
label_encoder_space.fit(df['space_complexity'])

# Encode the target variables
df['time_complexity_encoded'] = label_encoder_time.transform(df['time_complexity'])
df['space_complexity_encoded'] = label_encoder_space.transform(df['space_complexity'])

# Save the updated label encoders
joblib.dump(label_encoder_time, '/content/drive/MyDrive/Colab Notebooks/label_encoder_time.pkl')
joblib.dump(label_encoder_space, '/content/drive/MyDrive/Colab Notebooks/label_encoder_space.pkl')

# Vectorize code snippets
tokenizer = RobertaTokenizer.from_pretrained("microsoft/codebert-base")
encodings_cpp = tokenizer(df['code_CPP'].tolist(), truncation=True, padding=True, return_tensors="pt")

# Split the data into training and testing sets for time complexity prediction
X_train_time_cpp, X_test_time_cpp, y_train_time_cpp, y_test_time_cpp = train_test_split(
    encodings_cpp['input_ids'], df['time_complexity_encoded'].values, test_size=0.2, random_state=42
)

# Split the data into training and testing sets for space complexity prediction
X_train_space_cpp, X_test_space_cpp, y_train_space_cpp, y_test_space_cpp = train_test_split(
    encodings_cpp['input_ids'], df['space_complexity_encoded'].values, test_size=0.2, random_state=42
)

# Create datasets for time complexity prediction
train_dataset_time_cpp = TensorDataset(
    X_train_time_cpp,
    encodings_cpp['attention_mask'][:len(X_train_time_cpp)],
    torch.tensor(y_train_time_cpp, dtype=torch.long)
)

test_dataset_time_cpp = TensorDataset(
    X_test_time_cpp,
    encodings_cpp['attention_mask'][len(X_train_time_cpp):],
    torch.tensor(y_test_time_cpp, dtype=torch.long)
)

# Create datasets for space complexity prediction
train_dataset_space_cpp = TensorDataset(
    X_train_space_cpp,
    encodings_cpp['attention_mask'][:len(X_train_space_cpp)],
    torch.tensor(y_train_space_cpp, dtype=torch.long)
)

test_dataset_space_cpp = TensorDataset(
    X_test_space_cpp,
    encodings_cpp['attention_mask'][len(X_train_space_cpp):],
    torch.tensor(y_test_space_cpp, dtype=torch.long)
)

# Define a custom data collator
def custom_data_collator(features):
    batch = {
        'input_ids': torch.stack([f[0] for f in features]),
        'attention_mask': torch.stack([f[1] for f in features]),
        'labels': torch.stack([f[2] for f in features]),
    }
    return batch

# Define metrics function for accuracy
def compute_metrics(pred):
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)
    accuracy = accuracy_score(labels, preds)
    return {"accuracy": accuracy}

# Define training arguments
training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=16,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=10,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    metric_for_best_model="accuracy",
    greater_is_better=True,
)

# Load or initialize the model for time complexity prediction
model_time_cpp = RobertaForSequenceClassification.from_pretrained(
    "microsoft/codebert-base", num_labels=len(label_encoder_time.classes_)
)

# Load or initialize the model for space complexity prediction
model_space_cpp = RobertaForSequenceClassification.from_pretrained(
    "microsoft/codebert-base", num_labels=len(label_encoder_space.classes_)
)

# Initialize Trainer for time complexity prediction
trainer_time_cpp = Trainer(
    model=model_time_cpp,
    args=training_args,
    train_dataset=train_dataset_time_cpp,
    eval_dataset=test_dataset_time_cpp,
    data_collator=custom_data_collator,
    compute_metrics=compute_metrics,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=3)]
)

# Initialize Trainer for space complexity prediction
trainer_space_cpp = Trainer(
    model=model_space_cpp,
    args=training_args,
    train_dataset=train_dataset_space_cpp,
    eval_dataset=test_dataset_space_cpp,
    data_collator=custom_data_collator,
    compute_metrics=compute_metrics,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=3)]
)

# Train the models
trainer_time_cpp.train()
trainer_space_cpp.train()

# Save the models after training
model_time_cpp.save_pretrained("/content/drive/MyDrive/Colab Notebooks/best_model_time_cpp")
model_space_cpp.save_pretrained("/content/drive/MyDrive/Colab Notebooks/best_model_space_cpp")
tokenizer.save_pretrained("/content/drive/MyDrive/Colab Notebooks/tokenizer")

# Evaluate the models
results_time_cpp = trainer_time_cpp.evaluate()
results_space_cpp = trainer_space_cpp.evaluate()

print(f"Time Complexity Prediction Results (C++): {results_time_cpp}")
print(f"Space Complexity Prediction Results (C++): {results_space_cpp}")


Mounted at /content/drive


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/150 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/498 [00:00<?, ?B/s]



pytorch_model.bin:   0%|          | 0.00/499M [00:00<?, ?B/s]

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at microsoft/codebert-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at microsoft/codebert-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
wandb: Paste an API key from your profile and hit enter, or press ctrl+c to quit:

 ··········


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


Epoch,Training Loss,Validation Loss,Accuracy
1,4.7706,4.680598,0.118812
2,4.1853,4.021014,0.207921
3,3.8998,3.781063,0.217822
4,4.0317,3.728506,0.188119
5,3.8032,3.679388,0.207921


Epoch,Training Loss,Validation Loss,Accuracy
1,4.0376,3.973434,0.29703
2,2.8957,2.986118,0.326733
3,2.6778,2.811381,0.326733
4,2.7592,2.767263,0.326733
5,2.662,2.754905,0.346535


Time Complexity Prediction Results (C++): {'eval_loss': 3.7810630798339844, 'eval_accuracy': 0.21782178217821782, 'eval_runtime': 2.9149, 'eval_samples_per_second': 34.65, 'eval_steps_per_second': 2.401, 'epoch': 5.0}
Space Complexity Prediction Results (C++): {'eval_loss': 2.7549045085906982, 'eval_accuracy': 0.3465346534653465, 'eval_runtime': 2.8519, 'eval_samples_per_second': 35.415, 'eval_steps_per_second': 2.454, 'epoch': 5.0}


In [1]:
from google.colab import drive
drive.mount('/content/drive')

import json
import pandas as pd
from sklearn.model_selection import train_test_split, KFold
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, classification_report
from transformers import RobertaTokenizer, RobertaForSequenceClassification, Trainer, TrainingArguments, EarlyStoppingCallback
import torch
from torch.utils.data import TensorDataset
import joblib
from sklearn.model_selection import ParameterGrid
import numpy as np

# Path to your JSON file in Google Drive
json_file_path = '/content/drive/MyDrive/Colab Notebooks/geeks_for_geeks.json'

# Load JSON data
with open(json_file_path, 'r') as file:
    data = json.load(file)

# Extract relevant information and create a DataFrame
rows = []
for entry in data:
    row = {
        'name': entry.get('name', '').strip(),
        'time_complexity': entry.get('time_complexity', ''),
        'trustable_time_complexity': entry.get('trustable_time_complexity', False),
        'space_complexity': entry.get('space_complexity', ''),
        'trustable_space_complexity': entry.get('trustable_space_complexity', False),
        'code_CPP': entry['codes'].get('C++', {}).get('code', '')
    }
    rows.append(row)

df = pd.DataFrame(rows)
df.fillna('', inplace=True)

# Encode the target variables
label_encoder_time = LabelEncoder()
df['time_complexity_encoded'] = label_encoder_time.fit_transform(df['time_complexity'])

label_encoder_space = LabelEncoder()
df['space_complexity_encoded'] = label_encoder_space.fit_transform(df['space_complexity'])

# Save the label encoders
joblib.dump(label_encoder_time, '/content/drive/MyDrive/Colab Notebooks/label_encoder_time.pkl')
joblib.dump(label_encoder_space, '/content/drive/MyDrive/Colab Notebooks/label_encoder_space.pkl')

# Vectorize code snippets
tokenizer = RobertaTokenizer.from_pretrained("microsoft/graphcodebert-base")
encodings_cpp = tokenizer(df['code_CPP'].tolist(), truncation=True, padding=True, return_tensors="pt")

# Define cross-validation and parameter grid
kf = KFold(n_splits=5, shuffle=True, random_state=42)

param_grid = {
    'num_train_epochs': [5, 10],
    'per_device_train_batch_size': [8, 16],
    'learning_rate': [5e-5, 3e-5, 2e-5]
}

# Initialize variables to store the best results
best_accuracy_time = 0
best_accuracy_space = 0
best_params_time = None
best_params_space = None

# Custom data collator
def custom_data_collator(features):
    batch = {
        'input_ids': torch.stack([f[0] for f in features]),
        'attention_mask': torch.stack([f[1] for f in features]),
        'labels': torch.stack([f[2] for f in features]),
    }
    return batch

# Metrics function for accuracy
def compute_metrics(pred):
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)
    accuracy = accuracy_score(labels, preds)
    return {"accuracy": accuracy}

# Perform grid search with cross-validation
for params in ParameterGrid(param_grid):
    accuracies_time = []
    accuracies_space = []

    for train_index, val_index in kf.split(encodings_cpp['input_ids']):
        X_train, X_val = encodings_cpp['input_ids'][train_index], encodings_cpp['input_ids'][val_index]
        y_train_time, y_val_time = df['time_complexity_encoded'].values[train_index], df['time_complexity_encoded'].values[val_index]
        y_train_space, y_val_space = df['space_complexity_encoded'].values[train_index], df['space_complexity_encoded'].values[val_index]

        train_dataset_time = TensorDataset(X_train, encodings_cpp['attention_mask'][train_index], torch.tensor(y_train_time, dtype=torch.long))
        val_dataset_time = TensorDataset(X_val, encodings_cpp['attention_mask'][val_index], torch.tensor(y_val_time, dtype=torch.long))

        train_dataset_space = TensorDataset(X_train, encodings_cpp['attention_mask'][train_index], torch.tensor(y_train_space, dtype=torch.long))
        val_dataset_space = TensorDataset(X_val, encodings_cpp['attention_mask'][val_index], torch.tensor(y_val_space, dtype=torch.long))

        model_time = RobertaForSequenceClassification.from_pretrained(
            "microsoft/graphcodebert-base", num_labels=len(label_encoder_time.classes_)
        ).to('cuda')

        model_space = RobertaForSequenceClassification.from_pretrained(
            "microsoft/graphcodebert-base", num_labels=len(label_encoder_space.classes_)
        ).to('cuda')

        training_args = TrainingArguments(
            output_dir='./results',
            num_train_epochs=params['num_train_epochs'],
            per_device_train_batch_size=params['per_device_train_batch_size'],
            per_device_eval_batch_size=16,
            warmup_steps=500,
            weight_decay=0.01,
            logging_dir='./logs',
            logging_steps=10,
            evaluation_strategy="epoch",
            save_strategy="epoch",
            load_best_model_at_end=True,
            metric_for_best_model="accuracy",
            greater_is_better=True,
            learning_rate=params['learning_rate']
        )

        trainer_time = Trainer(
            model=model_time,
            args=training_args,
            train_dataset=train_dataset_time,
            eval_dataset=val_dataset_time,
            data_collator=custom_data_collator,
            compute_metrics=compute_metrics,
            callbacks=[EarlyStoppingCallback(early_stopping_patience=3)]
        )

        trainer_space = Trainer(
            model=model_space,
            args=training_args,
            train_dataset=train_dataset_space,
            eval_dataset=val_dataset_space,
            data_collator=custom_data_collator,
            compute_metrics=compute_metrics,
            callbacks=[EarlyStoppingCallback(early_stopping_patience=3)]
        )

        # Train and evaluate for time complexity
        trainer_time.train()
        results_time = trainer_time.evaluate()
        accuracies_time.append(results_time['eval_accuracy'])

        # Train and evaluate for space complexity
        trainer_space.train()
        results_space = trainer_space.evaluate()
        accuracies_space.append(results_space['eval_accuracy'])

    # Calculate average accuracy for current set of parameters
    avg_accuracy_time = np.mean(accuracies_time)
    avg_accuracy_space = np.mean(accuracies_space)

    if avg_accuracy_time > best_accuracy_time:
        best_accuracy_time = avg_accuracy_time
        best_params_time = params

    if avg_accuracy_space > best_accuracy_space:
        best_accuracy_space = avg_accuracy_space
        best_params_space = params

print(f"Best Time Complexity Accuracy: {best_accuracy_time}")
print(f"Best Time Complexity Params: {best_params_time}")

print(f"Best Space Complexity Accuracy: {best_accuracy_space}")
print(f"Best Space Complexity Params: {best_params_space}")

# Retrain on the full dataset with best parameters for time complexity
training_args_best_time = TrainingArguments(
    output_dir='./results',
    num_train_epochs=best_params_time['num_train_epochs'],
    per_device_train_batch_size=best_params_time['per_device_train_batch_size'],
    per_device_eval_batch_size=16,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=10,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    metric_for_best_model="accuracy",
    greater_is_better=True,
    learning_rate=best_params_time['learning_rate']
)

model_time_best = RobertaForSequenceClassification.from_pretrained(
    "microsoft/graphcodebert-base", num_labels=len(label_encoder_time.classes_)
).to('cuda')

train_dataset_full_time = TensorDataset(
    encodings_cpp['input_ids'], encodings_cpp['attention_mask'], torch.tensor(df['time_complexity_encoded'].values, dtype=torch.long)
)

trainer_best_time = Trainer(
    model=model_time_best,
    args=training_args_best_time,
    train_dataset=train_dataset_full_time,
    data_collator=custom_data_collator,
    compute_metrics=compute_metrics,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=3)]
)

trainer_best_time.train()

# Retrain on the full dataset with best parameters for space complexity
training_args_best_space = TrainingArguments(
    output_dir='./results',
    num_train_epochs=best_params_space['num_train_epochs'],
    per_device_train_batch_size=best_params_space['per_device_train_batch_size'],
    per_device_eval_batch_size=16,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=10,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    metric_for_best_model="accuracy",
    greater_is_better=True,
    learning_rate=best_params_space['learning_rate']
)

model_space_best = RobertaForSequenceClassification.from_pretrained(
    "microsoft/graphcodebert-base", num_labels=len(label_encoder_space.classes_)
).to('cuda')

train_dataset_full_space = TensorDataset(
    encodings_cpp['input_ids'], encodings_cpp['attention_mask'], torch.tensor(df['space_complexity_encoded'].values, dtype=torch.long)
)

trainer_best_space = Trainer(
    model=model_space_best,
    args=training_args_best_space,
    train_dataset=train_dataset_full_space,
    data_collator=custom_data_collator,
    compute_metrics=compute_metrics,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=3)]
)

trainer_best_space.train()

# Save the best models
trainer_best_time.save_model('/content/drive/MyDrive/Colab Notebooks/best_model_time_complexity')
trainer_best_space.save_model('/content/drive/MyDrive/Colab Notebooks/best_model_space_complexity')

Mounted at /content/drive


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/772 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/539 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/499M [00:00<?, ?B/s]

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at microsoft/graphcodebert-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


RuntimeError: Found no NVIDIA driver on your system. Please check that you have an NVIDIA GPU and installed a driver from http://www.nvidia.com/Download/index.aspx