In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score

# Read the processed sentence-embedding table into a DataFrame
embeddings_csv_path = '/content/drive/MyDrive/BugSum-master/excel sheet/processed_embeddings.csv'
df = pd.read_csv(embeddings_csv_path)

# Robust conversion of 'Sentence_Embeddings' from string representations to numpy arrays
def convert_embeddings(embeddings_str):
    """Parse the text form of an embedding vector into a float32 numpy array.

    Accepts a bracketed, flat sequence of numbers separated by whitespace
    (including line breaks) and/or commas, e.g. ``"[0.1 0.2 0.3]"`` or
    ``"[0.1, 0.2, 0.3]"``. Whitespace around the brackets is ignored.

    Args:
        embeddings_str: String representation of a 1-D vector.

    Returns:
        np.ndarray: 1-D array of dtype float32, one element per number
        (empty for ``"[]"``).

    Raises:
        ValueError: If any token cannot be parsed as a float.
    """
    # Drop outer whitespace first, otherwise strip('[]') leaves a bracket
    # behind whenever the string has a trailing newline or space.
    body = embeddings_str.strip().strip('[]')
    # Commas are treated as separators too; split() without arguments never
    # yields empty tokens, so no extra filtering is needed.
    tokens = body.replace(',', ' ').split()
    return np.array([float(token) for token in tokens], dtype=np.float32)

# Swap each embedding's string form for its parsed numpy array
df['Sentence_Embeddings'] = df['Sentence_Embeddings'].apply(
    convert_embeddings
)

# Define a function to label sentences based on heuristic rules
def label_sentence(sentence):
    """Assign a heuristic importance label to a sentence by keyword matching.

    Matching is a case-insensitive substring search. When a sentence contains
    both kinds of keyword, the "important" label wins.

    Args:
        sentence: Sentence text. Non-string values (e.g. NaN produced by an
            empty CSV cell) contain no keywords and are labelled 1.

    Returns:
        int: 0 for 'important', 2 for 'not important', 1 for 'medium important'.
    """
    important_keywords = ['fixed', 'resolved', 'correct', 'implemented']
    not_important_keywords = ['issue', 'problem', 'incorrect', 'not working', 'failed']

    if not isinstance(sentence, str):
        return 1  # 'medium important'

    sentence_lower = sentence.lower()

    # 'incorrect' contains 'correct', so a plain substring test would label
    # every "incorrect" sentence as important and the 'incorrect' keyword could
    # never match. Blank out negative keywords that embed a positive keyword
    # before searching for the positive ones.
    positive_text = sentence_lower
    for negative in not_important_keywords:
        if any(positive in negative for positive in important_keywords):
            positive_text = positive_text.replace(negative, ' ')

    if any(keyword in positive_text for keyword in important_keywords):
        return 0  # 'important'
    elif any(keyword in sentence_lower for keyword in not_important_keywords):
        return 2  # 'not important'
    else:
        return 1  # 'medium important'

# Apply the labeling function to each sentence
df['ApprovalCategory'] = df['Sentence'].apply(label_sentence)

# Prepare data for SVM: stack the per-row embedding arrays into one 2-D feature matrix
X = np.vstack(df['Sentence_Embeddings'].values)
y = df['ApprovalCategory'].values

# Split the data (20% held out for testing, fixed seed so the split is repeatable)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Scale the features; the scaler is fitted on the training split only and then
# re-used unchanged for the test split
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Train an SVM classifier (probability=True enables predict_proba)
svm_clf = SVC(kernel='linear', decision_function_shape='ovo', probability=True, random_state=42)
svm_clf.fit(X_train_scaled, y_train)

# Predict class probabilities for the test split
probabilities = svm_clf.predict_proba(X_test_scaled)

# Use the probability of the "not important" class (label 2) as the believability score.
# predict_proba columns follow the order of svm_clf.classes_, so look the column up by
# label instead of assuming label 2 sits at position 2; this raises a clear ValueError
# if label 2 never occurred in the training split.
not_important_col = list(svm_clf.classes_).index(2)
believability_scores = probabilities[:, not_important_col]

# Optionally, evaluate the model
y_pred = svm_clf.predict(X_test_scaled)
print("Accuracy:", accuracy_score(y_test, y_pred))

# Append the believability scores to the dataframe
# (left disabled: these scores cover only the test split, so their length does not
# match the number of rows in df)
#df['Believability Score'] = believability_scores
print("Sample believability scores:", believability_scores[:10])

# Save the updated dataframe to a new CSV file
#df.to_csv('/content/drive/MyDrive/BugSum-master/excel sheet/scores.csv', index=False)

Accuracy: 0.9095816464237517
Sample believability scores: [0.04831069 0.07888245 0.0670999  0.06840869 0.01441611 0.04009848
 0.03701878 0.02327576 0.05239211 0.06861062]


In [None]:
import numpy as np
import pandas as pd
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC

# Read the processed sentence-embedding table into a DataFrame
embeddings_csv_path = '/content/drive/MyDrive/BugSum-master/excel sheet/processed_embeddings.csv'
df = pd.read_csv(embeddings_csv_path)

# Robust conversion of 'Sentence_Embeddings' from string representations to numpy arrays
def convert_embeddings(embeddings_str):
    """Parse the text form of an embedding vector into a float32 numpy array.

    Accepts a bracketed, flat sequence of numbers separated by whitespace
    (including line breaks) and/or commas, e.g. ``"[0.1 0.2 0.3]"`` or
    ``"[0.1, 0.2, 0.3]"``. Whitespace around the brackets is ignored.

    Args:
        embeddings_str: String representation of a 1-D vector.

    Returns:
        np.ndarray: 1-D array of dtype float32, one element per number
        (empty for ``"[]"``).

    Raises:
        ValueError: If any token cannot be parsed as a float.
    """
    # Drop outer whitespace first, otherwise strip('[]') leaves a bracket
    # behind whenever the string has a trailing newline or space.
    body = embeddings_str.strip().strip('[]')
    # Commas are treated as separators too; split() without arguments never
    # yields empty tokens, so no extra filtering is needed.
    tokens = body.replace(',', ' ').split()
    return np.array([float(token) for token in tokens], dtype=np.float32)

# Swap each embedding's string form for its parsed numpy array
df['Sentence_Embeddings'] = df['Sentence_Embeddings'].apply(
    convert_embeddings
)

# Define a function to label sentences based on heuristic rules
def label_sentence(sentence):
    """Assign a heuristic importance label to a sentence by keyword matching.

    Matching is a case-insensitive substring search. When a sentence contains
    both kinds of keyword, the "important" label wins.

    Args:
        sentence: Sentence text. Non-string values (e.g. NaN produced by an
            empty CSV cell) contain no keywords and are labelled 1.

    Returns:
        int: 0 for 'important', 2 for 'not important', 1 for 'medium important'.
    """
    important_keywords = ['fixed', 'resolved', 'correct', 'implemented']
    not_important_keywords = ['issue', 'problem', 'incorrect', 'not working', 'failed']

    if not isinstance(sentence, str):
        return 1  # 'medium important'

    sentence_lower = sentence.lower()

    # 'incorrect' contains 'correct', so a plain substring test would label
    # every "incorrect" sentence as important and the 'incorrect' keyword could
    # never match. Blank out negative keywords that embed a positive keyword
    # before searching for the positive ones.
    positive_text = sentence_lower
    for negative in not_important_keywords:
        if any(positive in negative for positive in important_keywords):
            positive_text = positive_text.replace(negative, ' ')

    if any(keyword in positive_text for keyword in important_keywords):
        return 0  # 'important'
    elif any(keyword in sentence_lower for keyword in not_important_keywords):
        return 2  # 'not important'
    else:
        return 1  # 'medium important'

# Apply the labeling function to each sentence
df['ApprovalCategory'] = df['Sentence'].apply(label_sentence)

# Prepare data for SVM: stack the per-row embedding arrays into one 2-D feature matrix
X = np.vstack(df['Sentence_Embeddings'].values)
y = df['ApprovalCategory'].values

# Split the data inside this cell so it does not rely on X_train / y_train / y_test
# variables left in the kernel by another cell (20% held out, fixed seed)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Scale the features; the scaler is fitted on the training split only
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Train an SVM classifier (probability=True enables predict_proba)
svm_clf = SVC(kernel='linear', decision_function_shape='ovo', probability=True, random_state=42)
svm_clf.fit(X_train_scaled, y_train)

# Scale the entire dataset with the scaler already fitted on the training split.
# Calling fit_transform here would refit the scaler on different statistics than
# the ones the classifier was trained with.
X_scaled = scaler.transform(X)

# Predict class probabilities for the entire dataset
probabilities = svm_clf.predict_proba(X_scaled)

# Use the probability of the "not important" class (label 2) as the believability score
# for the entire dataset. predict_proba columns follow the order of svm_clf.classes_, so
# look the column up by label instead of assuming label 2 sits at position 2; this raises
# a clear ValueError if label 2 never occurred in the training split.
not_important_col = list(svm_clf.classes_).index(2)
believability_scores = probabilities[:, not_important_col]

# Optionally, evaluate the model on the held-out test split
y_pred = svm_clf.predict(X_test_scaled)
print("Accuracy:", accuracy_score(y_test, y_pred))

# Append the believability scores to the dataframe (one score per row of df)
df['Believability Score'] = believability_scores
print("Sample believability scores:", believability_scores[:10])

# Save the updated dataframe to a new CSV file
df.to_csv('/content/drive/MyDrive/BugSum-master/excel sheet/scores.csv', index=False)

Accuracy: 0.9095816464237517
Sample believability scores: [0.05778307 0.07718918 0.01977399 0.06685629 0.07290404 0.04423179
 0.04749297 0.05652807 0.0506163  0.01960746]
