# Readability Prediction

In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory tree and print the full path of every file in it.
for folder, _subfolders, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(folder, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/code-snippets-insights-and-readability/data_cpp.csv
/kaggle/input/code-snippets-insights-and-readability/data_python.csv


In [2]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.pipeline import make_pipeline

In [3]:
# Load the Python-snippet table of the Kaggle dataset into df1.
python_csv_path = "/kaggle/input/code-snippets-insights-and-readability/data_python.csv"
df1 = pd.read_csv(python_csv_path)

In [4]:
# Drop the first 20 characters of every entry in the "python_solutions" column.
# NOTE(review): the column is overwritten in place, so running this cell a
# second time trims another 20 characters — confirm what the 20-character
# prefix is and whether a separate column would be safer.
df1["python_solutions"] = df1["python_solutions"].map(lambda snippet: snippet[20:])

In [5]:
# Code-metric columns used as predictors; 'readability' is the regression target.
feature_columns = [
    'num_of_lines',
    'code_length',
    'comments',
    'cyclomatic_complexity',
    'indents',
    'loop_count',
    'identifiers',
]
X = df1[feature_columns]
y = df1['readability']

# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable.
# The features are passed as a plain array (X.values), the target stays a Series.
X_train, X_test, y_train, y_test = train_test_split(
    X.values,
    y,
    test_size=0.2,
    random_state=42,
)

In [6]:
import sklearn

# Record the scikit-learn version this notebook was run with.
print(f"Scikit-learn version: {sklearn.__version__}")

Scikit-learn version: 1.2.2


In [7]:
# Readability regressor: a 50-tree random forest with a fixed seed, wrapped in
# a one-step scikit-learn pipeline and fitted on the training split.
forest = RandomForestRegressor(random_state=0, n_estimators=50)
model_read = make_pipeline(forest)
model_read.fit(X_train, y_train)

In [8]:
# Predict on the held-out split and report the mean absolute error.
y_pred = model_read.predict(X_test)
mae = mean_absolute_error(y_true=y_test, y_pred=y_pred)
print('Mean Absolute Error: {}'.format(mae))

Mean Absolute Error: 0.29876820914776364


In [9]:
from sklearn.pipeline import make_pipeline
from sklearn.ensemble import RandomForestRegressor
from joblib import dump

# Saving the model to a file.
# `model_read` is already fitted by the training cell earlier in this notebook.
# Rebuilding and refitting a forest with the same hyper-parameters, seed and
# training data here would only repeat that work, so the fitted pipeline is
# written to disk directly.
dump(model_read, 'random_forest_model.joblib')

['random_forest_model.joblib']

# Task Complexity Regression

In [3]:
import tensorflow as tf
from transformers import RobertaTokenizer, TFRobertaModel

2024-05-12 15:24:39.812731: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-05-12 15:24:39.812829: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-05-12 15:24:39.927912: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


In [4]:
# Load the Python-snippet table of the Kaggle dataset into df2.
python_csv_path = "/kaggle/input/code-snippets-insights-and-readability/data_python.csv"
df2 = pd.read_csv(python_csv_path)

In [5]:
# Drop the first 20 characters of every entry in the "python_solutions" column.
# NOTE(review): the column is overwritten in place, so running this cell a
# second time trims another 20 characters — confirm what the 20-character
# prefix is and whether a separate column would be safer.
df2["python_solutions"] = df2["python_solutions"].map(lambda snippet: snippet[20:])

In [6]:
# Ordinal encoding of the difficulty labels.
difficulty_dict = {
    "Easy": 1,
    "Medium": 2,
    "Hard": 3
}

# Replace each label in the "difficulty" column with its numeric code.
# The column is overwritten in place, so values that are already encoded
# (1 / 2 / 3) are passed through unchanged; this keeps the cell safe to re-run
# instead of failing with KeyError on its own output. Any other unknown value
# still raises KeyError, exactly like a plain dictionary lookup.
_encoded_levels = set(difficulty_dict.values())
df2["difficulty"] = [
    x if x in _encoded_levels else difficulty_dict[x]
    for x in df2["difficulty"]
]

In [7]:
# Load the CodeBERT checkpoint: its tokenizer and the TensorFlow encoder.
codebert_checkpoint = "microsoft/codebert-base"
tokenizer = RobertaTokenizer.from_pretrained(codebert_checkpoint)
model = TFRobertaModel.from_pretrained(codebert_checkpoint)

tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/150 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/498 [00:00<?, ?B/s]

tf_model.h5:   0%|          | 0.00/499M [00:00<?, ?B/s]

All model checkpoint layers were used when initializing TFRobertaModel.

All the layers of TFRobertaModel were initialized from the model checkpoint at microsoft/codebert-base.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFRobertaModel for predictions without further training.


In [8]:
def create_model(max_length=128, pretrained_name="roberta-base"):
    """Build a Keras regression model on top of a pretrained RoBERTa-style encoder.

    Token ids are fed to the encoder, its per-token hidden states are averaged
    over the sequence axis, and a single dense unit produces the prediction.
    No attention mask is passed to the encoder or to the pooling layer.

    Args:
        max_length: Fixed number of token ids per input sequence.
        pretrained_name: Checkpoint name handed to
            ``TFRobertaModel.from_pretrained``.
            NOTE(review): the default stays "roberta-base" so existing calls
            behave as before, but the rest of this notebook loads
            "microsoft/codebert-base" — confirm which checkpoint is intended.

    Returns:
        An uncompiled ``tf.keras.Model`` that maps int32 ids of shape
        ``(batch, max_length)`` to an output of shape ``(batch, 1)``.
    """
    input_ids = tf.keras.layers.Input(shape=(max_length,), dtype=tf.int32)
    roberta_model = TFRobertaModel.from_pretrained(pretrained_name)
    # Element 0 of the encoder output holds the per-token hidden states.
    embedding_layer = roberta_model(input_ids)[0]
    pooled_output = tf.keras.layers.GlobalAveragePooling1D()(embedding_layer)
    output = tf.keras.layers.Dense(1)(pooled_output)
    model_output = tf.keras.Model(inputs=input_ids, outputs=output)
    return model_output

In [9]:
# Model input is the snippet text; the target is the numeric difficulty code.
X, y = df2['python_solutions'], df2['difficulty']

In [10]:
import tensorflow as tf
from transformers import TFAutoModel, AutoTokenizer
from sklearn.model_selection import train_test_split
import numpy as np

# CodeBERT checkpoint shared by the encoder and its tokenizer.
model_name = "microsoft/codebert-base"
codebert_model = TFAutoModel.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Inputs for the regressor: Python source strings and their numeric labels.
code_samples = X
labels = y

# Tokenize all snippets in one batched call rather than one tokenizer call
# (and one TensorFlow tensor) per row. Every sequence is padded or truncated
# to exactly 128 tokens, so both results are already 2-D arrays of shape
# (n_samples, 128) and no per-row squeeze is needed.
encoded = tokenizer(
    list(code_samples),
    padding='max_length',
    truncation=True,
    max_length=128,
    return_tensors='np',
)

# int32 matches the dtype declared on the Keras Input layers of the model.
input_ids = encoded['input_ids'].astype(np.int32)
attention_masks = encoded['attention_mask'].astype(np.int32)
labels = np.array(labels)

# Hold out 20% of the samples for validation. Splitting the three arrays in a
# single call keeps ids, masks and labels aligned row by row; the fixed seed
# makes the split repeatable.
(
    train_input_ids,
    val_input_ids,
    train_attention_masks,
    val_attention_masks,
    train_labels,
    val_labels,
) = train_test_split(input_ids, attention_masks, labels, test_size=0.2, random_state=42)

# Network inputs: token ids and the matching attention mask, 128 positions each.
input_ids_input = tf.keras.layers.Input(name="input_ids", shape=(128,), dtype=tf.int32)
attention_masks_input = tf.keras.layers.Input(name="attention_masks", shape=(128,), dtype=tf.int32)

# Run CodeBERT and keep element 0 of its output (the per-token hidden states).
codebert_output = codebert_model(input_ids_input, attention_mask=attention_masks_input)[0]

# Average the token vectors over the sequence axis, then map the pooled vector
# to one unbounded value (linear activation, i.e. a regression head).
average_pool = tf.keras.layers.GlobalAveragePooling1D()
regression_head = tf.keras.layers.Dense(1, activation='linear')
pooled_output = average_pool(codebert_output)
output = regression_head(pooled_output)

# Assemble the two-input, one-output Keras model.
model = tf.keras.Model(
    inputs=[input_ids_input, attention_masks_input],
    outputs=output,
)

# Compile the model.
# The CodeBERT encoder is not frozen, so fitting updates the pretrained
# transformer weights as well as the new dense head. The string 'adam' selects
# Adam's default learning rate of 1e-3, far above the 2e-5 to 5e-5 range
# normally used when fine-tuning BERT-style encoders, so a small explicit
# learning rate is set instead.
FINE_TUNE_LEARNING_RATE = 3e-5
model.compile(
    optimizer=tf.keras.optimizers.Adam(learning_rate=FINE_TUNE_LEARNING_RATE),
    loss='mean_squared_error',
    metrics=['mae', 'mse'],
)

# Train the model for one epoch, reporting loss/MAE/MSE on the validation split.
model.fit(
    [train_input_ids, train_attention_masks],
    train_labels,
    epochs=1,
    batch_size=32,
    validation_data=([val_input_ids, val_attention_masks], val_labels),
)


All model checkpoint layers were used when initializing TFRobertaModel.

All the layers of TFRobertaModel were initialized from the model checkpoint at microsoft/codebert-base.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFRobertaModel for predictions without further training.
I0000 00:00:1715527588.232975     115 device_compiler.h:186] Compiled cluster using XLA!  This line is logged at most once for the lifetime of the process.




<tf_keras.src.callbacks.History at 0x7ef41ac8c8e0>

In [None]:
# Write the trained Keras model to a single HDF5 file.
model.save(filepath="codebert_model.h5")

# Store the tokenizer files in the 'codebert_tokenizer' directory.
tokenizer.save_pretrained(save_directory='codebert_tokenizer')