In [None]:
import pandas as pd
import numpy as np
import re
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from gensim.models import Word2Vec

In [None]:
# Download necessary NLTK data
# Recent NLTK releases load the 'punkt_tab' resource in word_tokenize instead of
# the pickled 'punkt' models; fetch both so tokenization works on a fresh kernel
# regardless of the installed NLTK version.
nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...


True

In [None]:
# The stop-word set and the lemmatizer are the same for every row, so they are
# built once on first use (after the NLTK downloads have run) and then reused.
_PREPROCESS_CACHE = {}


def _preprocess_resources():
    """Return the cached (stop_words, lemmatizer) pair, creating it on first use."""
    if not _PREPROCESS_CACHE:
        _PREPROCESS_CACHE['stop_words'] = frozenset(stopwords.words('english'))
        _PREPROCESS_CACHE['lemmatizer'] = WordNetLemmatizer()
    return _PREPROCESS_CACHE['stop_words'], _PREPROCESS_CACHE['lemmatizer']


def preprocess_text(text):
    """Normalize one sentence into a comma-joined string of lemmatized tokens.

    Steps: tokenize, lowercase, drop non-alphabetic tokens, drop English stop
    words, lemmatize.

    Args:
        text: The raw sentence. Null values (None, NaN, pd.NA) and empty
            strings are accepted; other non-string scalars are converted
            with ``str`` before tokenization.

    Returns:
        The surviving tokens joined with ``','``, or ``''`` when the input is
        null/empty or no token survives the filters.
    """
    if not isinstance(text, str):
        # Test for null before any truthiness check: NaN is truthy and
        # `not pd.NA` raises TypeError.
        if text is None or pd.isnull(text):
            return ''
        # word_tokenize only accepts strings (e.g. a numeric cell from the CSV).
        text = str(text)
    if not text:
        return ''

    stop_words, lemmatizer = _preprocess_resources()

    # Tokenization + lowercasing
    tokens = [word.lower() for word in word_tokenize(text)]

    # Keep alphabetic tokens that are not stop words, then lemmatize them
    tokens = [
        lemmatizer.lemmatize(word)
        for word in tokens
        if word.isalpha() and word not in stop_words
    ]

    return ','.join(tokens)

In [None]:
# Read the input table from Google Drive into a DataFrame
file_path = '/content/drive/MyDrive/BugSum-master/excel sheet/output.csv'
df = pd.read_csv(filepath_or_buffer=file_path)

In [None]:
# Run preprocess_text element-wise over the 'Sentence' column and keep the
# cleaned, comma-joined tokens in a new 'Processed_Sentence' column
df['Processed_Sentence'] = df['Sentence'].map(preprocess_text)

In [None]:
# Prepare sentences for Word2Vec model training
processed_sentences = df['Processed_Sentence'].tolist()


In [None]:
# Train Word2Vec model
word2vec_model = Word2Vec(sentences=processed_sentences, vector_size=100, window=5, min_count=1, workers=4)



In [None]:
# Sentence embedding function
def sentence_embedding(sentence, model):
    if not sentence:
        return np.zeros(model.vector_size)
    embeddings = [model.wv[word] for word in sentence if word in model.wv]
    return np.mean(embeddings, axis=0) if embeddings else np.zeros(model.vector_size)

In [None]:
# Apply sentence embedding
df['Sentence_Embeddings'] = df['Processed_Sentence'].apply(lambda x: sentence_embedding(x, word2vec_model))


In [None]:
# Write the DataFrame (without the index column) to Google Drive as CSV
# NOTE(review): 'Sentence_Embeddings' holds NumPy arrays, which CSV stores as
# their string form — confirm downstream readers can parse that.
output_file_path = '/content/drive/MyDrive/BugSum-master/excel sheet/processed_embeddings.csv'
df.to_csv(path_or_buf=output_file_path, index=False)