In [19]:
import pandas as pd

from pygments.lexers import get_lexer_by_name
from pygments.token import Token

from math import log2

# Load Data and Split into Train and Test Splits

In [2]:
student_dataset = pd.read_csv("student_dataset.csv")
student_dataset.head(3)

Unnamed: 0,File Name,Method Name,Method Code XML,Method Java,Method Java Formatted
0,./GitHub Repos/twitter4j/twitter4j/twitter4j-c...,createRelationshipList,"<function pos:start=""81:5"" pos:end=""104:5""><ty...",static ResponseList < Relationship > creat...,static ResponseList<Relationship> createRelati...
1,./GitHub Repos/twitter4j/twitter4j/twitter4j-c...,isSourceBlockingTarget,"<function pos:start=""117:5"" pos:end=""120:5""><a...",@ Override \n public boolean isSourceBlock...,@Override\n public boolean isSourceBlockingTa...
2,./GitHub Repos/twitter4j/twitter4j/twitter4j-c...,isSourceFollowingTarget,"<function pos:start=""132:5"" pos:end=""135:5""><a...",@ Override \n public boolean isSourceFollo...,@Override\n public boolean isSourceFollowingT...


In [87]:
# Select 100 methods to serve as the test set
test_set = student_dataset.sample(n=100, random_state=12)

# Remove the methods from the training set
student_training_set = student_dataset.drop(test_set.index).reset_index(drop=True)

# Select 100 methods for the evaluation set
evaluation_set = student_dataset.sample(n=100, random_state=12)

# Remove the methods from the training set
student_training_set = student_dataset.drop(evaluation_set.index).reset_index(drop=True)

# Reset index of test set
test_set = test_set.reset_index()

# Reset index of evaluation set
evaluation_set = evaluation_set.reset_index()

# Build Model

In [91]:
from collections import defaultdict, Counter

class N_Gram_Code_Model:
    def __init__(self, N: int):
        """
        Args:
            N (int): The number of tokens the model uses to make predictions
        """
        self.N = N

        # The model is stored as a defaultdict of Counters
        # The defaultdict maps tuples of tokens to the counters of tokens that follow them
        # The Counters act as dicts that map tokens that follow tuples to the number of times that token occurs after the first sequence of tokens
        self.model = defaultdict(Counter)

        # The total number of context windows of length self.N - 1 that the model has encountered
        self.total_strings_seen = 0



    def train_model(self, training_set: pd.DataFrame, method_column: str = "Method Java Formatted"):

        lexer = get_lexer_by_name("Java")

        # Keeps track of how many methods were skipped because they were too short
        skipped_methods = 0

        for i, method in enumerate(training_set[method_column].array):
            # if i % 10000 == 0 and i != 0:
            #     print(f"Processed Method {i}")

            # Tokenize method - Add <START> and <END> to help model learn where method boundaries are
            tokens = ["<START>"] + list(lexer.get_tokens(method)) + ["<END>"]

            # # If the method doesn't have enough tokens, skip it
            if len(tokens) <= self.N:
                # print("Warning, method too short to analyze")
                skipped_methods += 1
                continue
            
            # For each string of N concurrent tokens, see what token follows
            for i in range(len(tokens)-(self.N-1)):
                key = tuple(token[1] for token in tokens[i:i+(self.N-1)])

                # Update the counter for this token occurring after this sequence of N-1 tokens
                self.model[key][tokens[i+self.N-1][1]] += 1

        self.total_strings_seen = sum(counter.total() for counter in self.model.values())
        
        print(f"Number of methods that were too short to analyze: {skipped_methods}")



    def make_prediction(self, code: str) -> str:
        """
        Given the input code, returns the most likely token to appear after the given string

        If the model cannot predict any tokens <NULL> will be returned
        """
        tokens = []

        # Since <START> is not a token we want the Java lexer to process, remove it if present
        if code.startswith("<START>"):
            tokens.append("<START>")
            code = code[8:]
        
        # If we're at the end of the method or if there is an unknown token at the end, the model cannot make a prediction
        if code.endswith("<END>") or code.endswith("<NULL>"):
            return "<NULL>"

        lexer = get_lexer_by_name("Java")
        tokens = tokens + list(lexer.get_tokens(code))

        if tokens[-1][1] == "\n" and code[-1] != "\n":
            # Pygments likes to add new lines to my strings, which I don't want because it changes the predictions
            tokens = tokens[:-1]

        if len(tokens) < self.N -1:
            print("Error: Input string not long enough to predict new token")
            return "<NULL>"

        context = tuple(token[1] for token in tokens[-(self.N-1):])


        # Predict the most likely token - if multiple tokens have the same likelihood, the first one will be chosen
        predicted_token = self.model[context].most_common(1)

        # Return <NULL> if the model has not seen this context
        if len(predicted_token) == 0:
            return "<NULL>"
        
        # Now we are guaranteed to have a result
        return predicted_token[0][0]



    def get_perplexity(self, code: str) -> float:
        """
        Computes the perplexity of the model given the input code
        """

        tokens = []
        end_token = []

        # Since <START> is not a token we want the Java lexer to process, remove it if present
        if code.startswith("<START>"):
            tokens.append(("START", "<START>"))
            code = code[8:]
        
        # If we're at the end of the method, strip off the <END> for the lexer
        if code.endswith("<END>"):
            end_token.append(("End", "<END>"))
            code = code[:-7]

        lexer = get_lexer_by_name("Java")
        tokens = tokens + list(lexer.get_tokens(code)) + end_token

        if tokens[-1][1] == "\n" and code[-1] != "\n":
            # Pygments likes to add new lines to my strings, which I don't want because it changes the predictions
            tokens = tokens[:-1]

        if len(tokens) < self.N -1:
            print("Error: Input string not long enough to predict new token")
            return "<NULL>"
        
        total_log_probability = 0.0
        
        for i in range(len(tokens) - self.N + 1):

            context = tuple(token[1] for token in tokens[i:i+(self.N-1)])

            # Retrieve the Counter from the dict
            potential_tokens = self.model[context]

            if len(potential_tokens) == 0:
                # If the model has never seen this context before, add a small value since probability shouldn't be 0
                probability = 1e-7

            else:
                next_token = tokens[i+self.N-1][1]
                probability = max(potential_tokens[next_token] / potential_tokens.total(), 1e-7)
            
            total_log_probability += log2(probability)

        total_perplexity = 2 ** (-total_log_probability / (len(tokens) - self.N + 1))
        
        return total_perplexity

# SELECT STUDENT MODEL

In [92]:
models = []
for i in range(3,15,2):
    print(f"Training Model with N = {i}")
    model = N_Gram_Code_Model(i)
    model.train_model(student_training_set)
    # Use evaluation set to evaluate the model
    perplexity = sum([model.get_perplexity(code) for code in evaluation_set["Method Java Formatted"].array]) / len(evaluation_set)
    models.append([model, perplexity])
    print(f"Perplexity: {perplexity}\n")

models.sort(key=lambda m: m[1])
print(models)

Training Model with N = 3
Number of methods that were too short to analyze: 0


Training Model with N = 5
Number of methods that were too short to analyze: 0


Training Model with N = 7
Number of methods that were too short to analyze: 0


Training Model with N = 9
Number of methods that were too short to analyze: 0


Training Model with N = 11
Number of methods that were too short to analyze: 0


Training Model with N = 13
Number of methods that were too short to analyze: 0


[[<__main__.N_Gram_Code_Model object at 0x379ee8c20>, 38.86954648826244], [<__main__.N_Gram_Code_Model object at 0x3b0035090>, 5582.823142147847], [<__main__.N_Gram_Code_Model object at 0x3b0035310>, 170877.53428005692], [<__main__.N_Gram_Code_Model object at 0x377ba1cd0>, 358494.08111486735], [<__main__.N_Gram_Code_Model object at 0x377ba23f0>, 725892.3668465437], [<__main__.N_Gram_Code_Model object at 0x32820dc70>, 1248740.1487453976]]


In [93]:
best_model = models[0][0]
print(best_model.N)

3


# INSTRUCTOR DATA SET

In [98]:
with open("training.txt", "r") as instructor_file:
    instructor_data = pd.DataFrame(instructor_file.readlines(), columns=["Method Java"])
instructor_data

Unnamed: 0,Method Java
0,boolean function ( ) { return isParsed ; }\n
1,File function ( ) { return libraryFile ; }\n
2,"void function ( Directory arg0 , Collection < ..."
3,"byte [ ] function ( Class < ? > arg0 , Configu..."
4,void function ( Binder arg0 ) { EventBus loc0 ...
...,...
99995,void function ( ) { synchronized ( this ) { th...
99996,String function ( ) { return this . server ; }\n
99997,int function ( int arg0 ) { final int loc0 = M...
99998,String function ( IFileSystemPath arg0 ) { ret...


In [None]:
instructor_models = []
for i in range(3,16,2):
    print(f"Training Model with N = {i}")
    model = N_Gram_Code_Model(i)
    model.train_model(instructor_data, method_column="Method Java")
    # Use evaluation set to evaluate the model
    perplexity = sum([model.get_perplexity(code) for code in evaluation_set["Method Java Formatted"].array]) / len(evaluation_set)
    instructor_models.append([model, perplexity])
    print(f"Perplexity: {perplexity}\n")

instructor_models.sort(key=lambda m: m[1])
print(instructor_models)

Training Model with N = 3
Number of methods that were too short to analyze: 0
Perplexity: 1715719.9194230684

Training Model with N = 5
Number of methods that were too short to analyze: 0
Perplexity: 7462017.7141283

Training Model with N = 7
Number of methods that were too short to analyze: 0
Perplexity: 9587695.196506225

Training Model with N = 9
Number of methods that were too short to analyze: 0
Perplexity: 9925968.679919245

Training Model with N = 11
Number of methods that were too short to analyze: 0
Perplexity: 9999999.999999948

Training Model with N = 13
Number of methods that were too short to analyze: 0
Perplexity: 9999999.999999948

Training Model with N = 15
Number of methods that were too short to analyze: 819
Perplexity: 9999999.999999948

[[<__main__.N_Gram_Code_Model object at 0x379ee8c20>, 38.86954648826244], [<__main__.N_Gram_Code_Model object at 0x3b0035090>, 5582.823142147847], [<__main__.N_Gram_Code_Model object at 0x3b0035310>, 170877.53428005692], [<__main__.N

In [103]:
best_instructor_model = instructor_models[0][0]
print(best_model.N)

3


# Save the Best Instructor and Student Models

In [104]:
import pickle

In [105]:
with open("best_student_model.pkl", "wb") as file:
    pickle.dump(best_model, file)

In [109]:
with open("best_instructor_model.pkl", "wb") as instructor_file:
    pickle.dump(best_instructor_model, instructor_file)

In [None]:
# Example of how to recover the two models
with open("best_student_model.pkl") as file:
    loaded_student_model = pickle.load(file)
with open("best_instructor_model.pkl") as file:
    loaded_instructor_model = pickle.load(file)

# Comparison

In [110]:
student_perplexity = sum([best_model.get_perplexity(code) for code in test_set["Method Java Formatted"].array]) / len(test_set)
instructor_perplexity = sum([best_instructor_model.get_perplexity(code) for code in test_set["Method Java Formatted"].array]) / len(test_set)
print("Student Perplexity:", student_perplexity)
print("Instructor Perplexity:", instructor_perplexity)

Student Perplexity: 38.86954648826244
Instructor Perplexity: 1715719.9194230684


# IF TIME, CHAIN PREDICTIONS TOGETHER