In [1]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import pandas as pd
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.optimizers import Adam
from sklearn.preprocessing import LabelEncoder
import numpy as np




In [9]:
# Load dataset
data = pd.read_csv('sample.csv')
data = data[['description','score']]
data['score'] = data['score'].apply(lambda x: x[8:11])

# Clean the 'score' column by removing non-numeric values
data['score'] = data['score'].apply(lambda x: x if x.replace('.', '', 1).isdigit() else None)

# Convert 'score' column to float, handling errors
data['score'] = pd.to_numeric(data['score'], errors='coerce')

# Round 'score' column to integers
data['score'] = data['score'].round().astype('Int64')


data

Unnamed: 0,description,score
0,Improper Restriction of Excessive Authenticati...,4
1,"Deserialization of Untrusted Data, Improper In...",9
2,An issue was discovered in Samsung Mobile Proc...,8
3,An issue was discovered in Samsung Mobile Proc...,8
4,Cross Site Scripting vulnerability in timetec ...,5
...,...,...
10638,confirmed,9
10639,Chat functionality in Schoolbox application be...,9
10640,News functionality in Schoolbox application be...,7
10641,Class functionality in Schoolbox application \...,7


In [10]:
# Preprocessing
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

def preprocess_text(text):
    tokens = text.split()
    tokens = [lemmatizer.lemmatize(token) for token in tokens if token.lower() not in stop_words]
    return ' '.join(tokens)

data['description'] = data['description'].apply(preprocess_text)
data

Unnamed: 0,description,score
0,Improper Restriction Excessive Authentication ...,4
1,"Deserialization Untrusted Data, Improper Input...",9
2,"issue discovered Samsung Mobile Processor, Wea...",8
3,"issue discovered Samsung Mobile Processor, Wea...",8
4,Cross Site Scripting vulnerability timetec AWD...,5
...,...,...
10638,confirmed,9
10639,Chat functionality Schoolbox application versi...,9
10640,News functionality Schoolbox application versi...,7
10641,Class functionality Schoolbox application vers...,7


In [11]:
# Feature Extraction
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(data['description'])
y = data['score']

# Splitting the data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Convert sparse matrix to NumPy array
X_train = X_train.toarray()

# Convert labels to numerical format
label_encoder = LabelEncoder()
y_train = label_encoder.fit_transform(y_train)
y_test = label_encoder.transform(y_test)

In [12]:
# Model Building
model = Sequential([
    Dense(128, activation='relu', input_shape=(X_train.shape[1],)),
    Dropout(0.5),
    Dense(64, activation='relu'),
    Dropout(0.5),
    Dense(1, activation='sigmoid')
])

model.compile(optimizer=Adam(learning_rate=0.001), loss='binary_crossentropy', metrics=['accuracy'])

# Training the model
history = model.fit(X_train, y_train, epochs=10, batch_size=32, validation_split=0.1)

# Evaluation
loss, accuracy = model.evaluate(X_test, y_test)
print("Test Accuracy:", accuracy)


Epoch 1/10


Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


InvalidArgumentError: {{function_node __wrapped__SerializeManySparse_device_/job:localhost/replica:0/task:0/device:CPU:0}} indices[1] = [0,8412] is out of order. Many sparse ops require sorted indices.
    Use `tf.sparse.reorder` to create a correctly ordered copy.

 [Op:SerializeManySparse] name: 