In [1]:
import warnings
# Settings the warnings to be ignored
warnings.filterwarnings('ignore')

In [2]:
import openai
# load and set our key
openai.api_key = open("key.txt", "r").read().strip("\n")

In [3]:
# imports libraries
import pandas as pd
import tiktoken
from openai.embeddings_utils import get_embedding

In [4]:
# embedding model parameters
embedding_model = "text-embedding-ada-002"
embedding_encoding = "cl100k_base"  # this is the encoding for text-embedding-ada-002
max_tokens = 8000  # the maximum for text-embedding-ada-002 is 8191

In [5]:
# load & inspect dataset
input_datapath = "Code_Review.csv"  # to save space, provide a pre-filtered dataset
Code_df = pd.read_csv(input_datapath, index_col=0)
Code_df = Code_df.dropna()
Code_df["combined"] = (Code_df['Comment by Developer A'].str.strip() + Code_df['Comment by Developer B'].str.strip())
Code_df.head()

Unnamed: 0_level_0,Title,File,Code Changes,Comment by Developer A,Comment by Developer B,Label,combined
Code Review,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
CR123,Bug Fix in Authentication Module,auth_utils.py,Fixed a bug in the authentication logic ...,Looks good to me!,"I agree, the fix seems correct.",Normal,"Looks good to me!I agree, the fix seems correct."
CR124,Feature Enhancement: User Profile Image Upload,user_profile.py,Added functionality to upload profile images ...,Nice addition!,This will improve user experience.,Normal,Nice addition!This will improve user experience.
CR125,Refactoring: Database Connections,database.py,Reorganized database connection code for bett...,Clean and organized!,Makes the code easier to understand.,Normal,Clean and organized!Makes the code easier to u...
CR126,API Endpoint Bug Fix,api_endpoints.py,Fixed a bug causing incorrect response in API...,Bug squashed!,"Good catch, thank you.",Normal,"Bug squashed!Good catch, thank you."
CR127,Performance Optimization: Cache Implementation,caching.py,Implemented caching mechanism for improved pe...,Performance boost!,Cache will speed up responses.,Normal,Performance boost!Cache will speed up responses.


In [6]:
encoding = tiktoken.get_encoding(embedding_encoding)
# omit reviews that are too long to embed
Code_df["n_tokens"] = Code_df.combined.apply(lambda x: len(encoding.encode(x)))
Code_df = Code_df[Code_df.n_tokens <= max_tokens]
len(Code_df)

200

In [7]:
Code_df["embedding"] = Code_df.combined.apply(lambda x: get_embedding(x, engine=embedding_model))
Code_df.to_csv("code_reviews_with_embeddings_.csv")

### <font color='blue'> Zero-Shot Classification

<font color='purple'>To predict labels of samples without any training embed short descriptions of each label, such as Normal and anamoly, and then compare the cosine distance between embeddings of samples and label descriptions. The highest similarity label to the sample input is the predicted label

In [8]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from openai.embeddings_utils import get_embedding
from sklearn.preprocessing import LabelEncoder

# Load the embeddings CSV file
embedding_datapath = "code_reviews_with_embeddings_.csv"
embeddings_df = pd.read_csv(embedding_datapath, index_col=0)

# Load your dataset and split it into features (X) and labels (y)
# features are 'embedding' and labels are 'Label'
X = embeddings_df['embedding'].apply(eval).tolist() 
y = embeddings_df['Label']

# Convert string labels to numerical labels
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(y)

# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [9]:
from sklearn.ensemble import IsolationForest
# Train the Isolation Forest model
clf = IsolationForest(contamination=0.05, random_state=42)
clf.fit(X_train)

# Define a function to predict anomalies
def predict_anomaly_score(embedding):
    anomaly_score = clf.decision_function([embedding])[0]
    return anomaly_score

# Example usage
user_combined_review = "This code change breaks the entire application functionality."
user_embedding = get_embedding(user_combined_review, engine=embedding_model)
user_anomaly_score = predict_anomaly_score(user_embedding)

if user_anomaly_score < 0:
    print("Anomaly Detected!")
else:
    print("Normal")

Anomaly Detected!


### <font color='blue'> With Gradio Interface

In [10]:
import gradio as gr
import numpy as np
import pandas as pd
from sklearn.ensemble import IsolationForest
from openai.embeddings_utils import get_embedding

# Load the embeddings CSV file
embedding_datapath = "code_reviews_with_embeddings_.csv"
embeddings_df = pd.read_csv(embedding_datapath, index_col=0)

# Load your dataset and features (X), assuming 'embedding' is the feature column
X = embeddings_df['embedding'].apply(eval).tolist()

# Train the Isolation Forest model
clf = IsolationForest(contamination=0.05, random_state=42)
clf.fit(X)

def predict_anomaly_score(embedding):
    anomaly_score = clf.decision_function([embedding])[0]
    return anomaly_score

def anomaly_detection(review):
    embedding = get_embedding(review, engine=embedding_model)
    anomaly_score = predict_anomaly_score(embedding)
    
    if anomaly_score < 0:
        result = "Anomaly Detected!"
    else:
        result = "Normal"
    return result

iface = gr.Interface(
    fn=anomaly_detection,
    inputs=gr.inputs.Textbox(lines=2, label="Enter Combined Review"),
    outputs=gr.outputs.Textbox(label="Review Status"))

iface.launch()


Running on local URL:  http://127.0.0.1:7860

To create a public link, set `share=True` in `launch()`.


