In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense
from tensorflow.keras.utils import to_categorical

# Load the dataset
df = pd.read_csv('Code_Comment_Seed_Data.csv')

# Preprocess the data: map every distinct value of 'Class' to an integer id
le = LabelEncoder()
df['class_label'] = le.fit_transform(df['Class'])

# Split the data into training and testing sets
train_data, test_data = train_test_split(df, test_size=0.3, random_state=42)


def combine_text(frame):
    """Return one text per row: the comment followed by its surrounding code.

    Missing values in either column are replaced by empty strings (and every
    value is cast to ``str``) so that a single NaN cannot turn the whole
    concatenated text into NaN before it reaches the tokenizer.

    Parameters
    ----------
    frame : pandas.DataFrame
        Must contain the columns 'Comments' and 'Surrounding Code Context'.

    Returns
    -------
    pandas.Series
        ``comment + ' ' + code context`` for each row, indexed like ``frame``.
    """
    comments = frame['Comments'].fillna('').astype(str)
    context = frame['Surrounding Code Context'].fillna('').astype(str)
    return comments + ' ' + context


# Build the model input text once per split instead of repeating the
# concatenation for every tokenizer call.
train_texts = combine_text(train_data)
test_texts = combine_text(test_data)

# Tokenize the comments and code context
max_words = 10000  # adjust based on your data
max_sequence_length = 100  # adjust based on your data

# The vocabulary is learned from the training split only; test-only words are
# mapped to the '<OOV>' token.
tokenizer = Tokenizer(num_words=max_words, oov_token='<OOV>')
tokenizer.fit_on_texts(train_texts)
word_index = tokenizer.word_index

train_sequences = tokenizer.texts_to_sequences(train_texts)
test_sequences = tokenizer.texts_to_sequences(test_texts)

# Pad / cut every sequence at the end so all rows have max_sequence_length ids
train_padded = pad_sequences(train_sequences, maxlen=max_sequence_length, padding='post', truncating='post')
test_padded = pad_sequences(test_sequences, maxlen=max_sequence_length, padding='post', truncating='post')

# Build the model
embedding_dim = 100  # adjust based on your data
model = Sequential()
model.add(Embedding(max_words, embedding_dim, input_length=max_sequence_length))
model.add(LSTM(100))
# Single sigmoid unit + binary cross-entropy: this setup assumes that 'Class'
# holds exactly two distinct labels.
model.add(Dense(1, activation='sigmoid'))

model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Train the model (the last 20% of the training rows are held out for validation)
labels_train = train_data['class_label']
model.fit(train_padded, labels_train, epochs=5, validation_split=0.2)

# Evaluate the model on the held-out test split and report the result
labels_test = test_data['class_label']
test_loss, test_accuracy = model.evaluate(test_padded, labels_test)
print(f'Test loss: {test_loss:.4f}, test accuracy: {test_accuracy:.4f}')

# Make predictions: threshold the sigmoid outputs at 0.5 in one vectorised step
predictions = model.predict(test_padded)
predicted_labels = (predictions > 0.5).astype(int).ravel()

# Decode labels back to original class names
predicted_class_names = le.inverse_transform(predicted_labels)
actual_class_names = le.inverse_transform(labels_test)

# Display the classification report
print(classification_report(actual_class_names, predicted_class_names))

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
              precision    recall  f1-score   support

  Not Useful       0.59      0.80      0.68      1316
      Useful       0.84      0.66      0.74      2120

    accuracy                           0.71      3436
   macro avg       0.72      0.73      0.71      3436
weighted avg       0.75      0.71      0.72      3436



In [None]:
import ast
import re
import tokenize
from io import BytesIO

import pandas as pd
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.layers import Embedding, LSTM, Dense
from tensorflow.keras.models import Sequential
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer

# Read the labelled comment dataset from disk
file_path = "Code_Comment_Seed_Data.csv"
df = pd.read_csv(file_path)

# Turn the string labels in 'Class' into integer ids
le = LabelEncoder()
y = le.fit_transform(df['Class'])

# Keep the two text columns that feed the model as separate series
X_comment = df['Comments']
X_code = df['Surrounding Code Context']

# Tokenize the code
# Lexer for C-like source text. The alternatives are tried in order at each
# position; anything that is not whitespace always matches the final branch,
# so lexing cannot fail on unbalanced brackets or truncated snippets.
_CODE_TOKEN_RE = re.compile(
    "|".join(
        (
            r"(?P<comment>//[^\n]*|/\*.*?(?:\*/|\Z))",  # line / block comment (block may be unterminated)
            r'"(?:\\.|[^"\\\n])*"',                     # string literal
            r"'(?:\\.|[^'\\\n])*'",                     # character literal
            r"[A-Za-z_]\w*",                            # identifier or keyword
            r"\d[\w.]*",                                # numeric literal
            r"<<=|>>=|\+\+|--|->|::|<<|>>|&&|\|\||[-+*/%&|^=!<>]=",  # multi-character operators
            r"\S",                                      # any other single symbol
        )
    ),
    re.DOTALL,
)


def tokenize_code(code):
    """Split a C-like code snippet into space-separated lexical tokens.

    The snippets passed to this function in the notebook are C/C++ fragments,
    which Python's ``tokenize`` module rejects ("EOF in multi-line statement"),
    so a small language-agnostic regex lexer is used instead. ``//`` and
    ``/* */`` comments are removed; identifiers, literals and operators are
    kept in their original order.

    Parameters
    ----------
    code : str
        Source snippet. Any non-string value (for example a NaN from a missing
        CSV cell) is treated as "no code".

    Returns
    -------
    str
        The tokens joined by single spaces, or ``''`` when there is no code.
    """
    if not isinstance(code, str):
        return ''
    tokens = [
        match.group(0)
        for match in _CODE_TOKEN_RE.finditer(code)
        if match.lastgroup != 'comment'  # drop comments, keep everything else
    ]
    return ' '.join(tokens)

X_code = X_code.apply(tokenize_code)

# Combine code and comments (a missing comment becomes an empty string so the
# combined text is always a string)
X = X_code + ' ' + X_comment.fillna('').astype(str)

# Train-test split, done on row positions first so that the tokenizer below
# can be fitted on the training rows only. The same test_size / random_state
# are used for the positions as would be used for the arrays themselves.
train_idx, test_idx = train_test_split(list(range(len(X))), test_size=0.2, random_state=42)

# Tokenize the text
max_words = 10000  # Adjust as needed
tokenizer = Tokenizer(num_words=max_words, oov_token='<OOV>')
# Learn the vocabulary from training rows only; words that occur only in the
# test rows are mapped to '<OOV>' instead of leaking into the vocabulary.
tokenizer.fit_on_texts(X.iloc[train_idx])
X_sequences = tokenizer.texts_to_sequences(X)
# Padded together so every row (train and test) shares one sequence width;
# predict_usefulness reads that width from X_padded.shape[1].
X_padded = pad_sequences(X_sequences)

X_train, X_test = X_padded[train_idx], X_padded[test_idx]
y_train, y_test = y[train_idx], y[test_idx]

# Build the model
embedding_dim = 16  # Adjust as needed
model = Sequential([
    Embedding(input_dim=max_words, output_dim=embedding_dim, input_length=X_padded.shape[1]),
    LSTM(100),
    # Single sigmoid unit: assumes the encoded labels are binary (0 / 1)
    Dense(1, activation='sigmoid')
])

model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Train the model. Validation rows are carved out of the training data so the
# test split stays untouched until the final evaluation.
model.fit(X_train, y_train, epochs=5, validation_split=0.2)

# Evaluate the model
y_pred = (model.predict(X_test) > 0.5).astype(int).ravel()
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy}')
# Report per-class metrics under the original class names rather than 0 / 1
class_names = [str(name) for name in le.classes_]
print(classification_report(y_test, y_pred, labels=list(range(len(class_names))), target_names=class_names))

# Make predictions on new input
def predict_usefulness(new_code, new_comment):
    """Classify a single (code, comment) pair with the trained model.

    The code is passed through ``tokenize_code``, joined to the comment with a
    space, converted to an index sequence by the notebook-level ``tokenizer``,
    padded to the width of ``X_padded`` and scored by the notebook-level
    ``model``.

    Returns "Useful" when the model output is above 0.5, otherwise
    "Not Useful".
    """
    text = tokenize_code(new_code) + ' ' + new_comment
    encoded = tokenizer.texts_to_sequences([text])
    width = X_padded.shape[1]
    score = model.predict(pad_sequences(encoded, maxlen=width))
    if score > 0.5:
        return "Useful"
    return "Not Useful"

# Example usage: classify one hand-written (code, comment) pair
new_comment_input = "this is an input for an array function"
new_code_input = "for(int i=0;i<n;i++)cin>>arr[i]"
result = predict_usefulness(new_code=new_code_input, new_comment=new_comment_input)
print(f'The input is predicted to be: {result}')

Streaming output truncated to the last 5000 lines.
Error tokenizing code: ('EOF in multi-line statement', (8, 0))
Error tokenizing code: ('EOF in multi-line statement', (6, 0))
Error tokenizing code: ('EOF in multi-line statement', (6, 0))
Error tokenizing code: ('EOF in multi-line statement', (6, 0))
Error tokenizing code: ('EOF in multi-line statement', (10, 0))
Error tokenizing code: ('EOF in multi-line statement', (5, 0))
Error tokenizing code: ('EOF in multi-line statement', (11, 0))
Error tokenizing code: ('EOF in multi-line statement', (11, 0))
Error tokenizing code: ('EOF in multi-line statement', (13, 0))
Error tokenizing code: ('EOF in multi-line statement', (13, 0))
Error tokenizing code: ('EOF in multi-line statement', (13, 0))
Error tokenizing code: ('EOF in multi-line statement', (13, 0))
Error tokenizing code: ('EOF in multi-line statement', (9, 0))
Error tokenizing code: ('EOF in multi-line statement', (9, 0))
Error tokenizing code: ('EOF in multi-line sta

In [None]:
# Probe: nonsense comment that still mentions "array", same input loop as code
new_comment_input = "aandu paandu bakchodi of array"
new_code_input = "for(int i=0;i<n;i++)cin>>arr[i]"
result = predict_usefulness(new_code=new_code_input, new_comment=new_comment_input)
print(f'The input is predicted to be: {result}')

The input is predicted to be: Useful


In [None]:
# Probe: nonsense comment with no code-related word, same input loop as code
new_comment_input = "aandu paandu bakchodi"
new_code_input = "for(int i=0;i<n;i++)cin>>arr[i]"
result = predict_usefulness(new_code=new_code_input, new_comment=new_comment_input)
print(f'The input is predicted to be: {result}')

The input is predicted to be: Not Useful


In [None]:
# Probe: one-word comment "array", same input loop as code
new_comment_input = "array"
new_code_input = "for(int i=0;i<n;i++)cin>>arr[i]"
result = predict_usefulness(new_code=new_code_input, new_comment=new_comment_input)
print(f'The input is predicted to be: {result}')

The input is predicted to be: Useful


In [None]:
# Probe: one-word comment "insertion", same input loop as code
new_comment_input = "insertion"
new_code_input = "for(int i=0;i<n;i++)cin>>arr[i]"
result = predict_usefulness(new_code=new_code_input, new_comment=new_comment_input)
print(f'The input is predicted to be: {result}')

The input is predicted to be: Not Useful


In [None]:
# Probe: short comment "insertion in vector", same input loop as code
new_comment_input = "insertion in vector"
new_code_input = "for(int i=0;i<n;i++)cin>>arr[i]"
result = predict_usefulness(new_code=new_code_input, new_comment=new_comment_input)
print(f'The input is predicted to be: {result}')

The input is predicted to be: Not Useful


In [None]:
# C snippet (prime-number check) used as the code input. It is written as a
# triple-single-quoted literal because the text spans several lines and
# contains double quotes, which a plain "..." literal cannot hold.
new_code_input = '''#include <stdio.h>int main() {int n, i, flag = 0;printf("Enter a positive integer: ");scanf("%d", &n);if (n == 0 || n == 1){flag = 1;}for (i = 2; i <= n / 2; ++i) {    if (n % i == 0) {flag = 1;break;}}if (flag == 0){printf("%d is a prime number.", n);}else{printf("%d is not a prime number.", n);}

  return 0;
}'''