Main project file.
It needs a custom path to a source file in order to run.

In [None]:
from transformers import AutoTokenizer, AutoModel
import torch
from google.colab import drive
# drive.mount('/content/drive')
class CodeAIAnalyser:
    """Heuristically extract class, method and attribute names from source code.

    The code is tokenized with a Hugging Face tokenizer (CodeBERT by default)
    and the token stream is scanned for ``class`` / ``def`` keywords and ``=``
    assignments. ``analyze_code`` also computes the model embeddings, but the
    extraction heuristic itself only looks at the tokens.
    """

    # Byte-level BPE markers produced by the tokenizer: "Ġ" (U+0120) prefixes a
    # token that was preceded by a space and "Ċ" (U+010A) stands for a newline.
    _SPACE_MARK = "\u0120"
    _NEWLINE_MARK = "\u010a"

    def __init__(self, model_name="microsoft/codebert-base"):
        """Load the tokenizer and model identified by ``model_name``."""
        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
        self.model = AutoModel.from_pretrained(model_name)

    def analyze_code(self, text):
        """Tokenize ``text`` and run it through the model.

        Returns:
            A ``(embeddings, input_ids)`` tuple: the model's last hidden state
            and the token ids produced by the tokenizer.
        """
        inputs = self.tokenizer(text, return_tensors="pt", padding=True, truncation=True)
        # Inference only: skip building the autograd graph.
        with torch.no_grad():
            outputs = self.model(**inputs)
        embeddings = outputs.last_hidden_state
        return embeddings, inputs['input_ids']

    @classmethod
    def _strip_markers(cls, token):
        """Return ``token`` without its leading space/newline markers."""
        return token.lstrip(cls._SPACE_MARK + cls._NEWLINE_MARK)

    @classmethod
    def _is_name_piece(cls, piece):
        """Tell whether ``piece`` is a marker-free run of letters, digits or underscores."""
        if not piece or cls._SPACE_MARK in piece or cls._NEWLINE_MARK in piece:
            return False
        return piece.replace("_", "a").isalnum()

    def _name_after(self, tokens, index):
        """Rebuild the identifier that starts right after ``tokens[index]``, or return None."""
        position = index + 1
        # The name must start a new word; this rejects e.g. "class" + "method"
        # coming from "@classmethod" and also guards the end of the sequence.
        if position >= len(tokens) or not tokens[position].startswith(self._SPACE_MARK):
            return None
        pieces = [self._strip_markers(tokens[position])]
        if not self._is_name_piece(pieces[0]):
            return None
        position += 1
        # Subword continuations carry no marker; glue them back together.
        while position < len(tokens) and self._is_name_piece(tokens[position]):
            pieces.append(tokens[position])
            position += 1
        name = "".join(pieces)
        return name if name.isidentifier() else None

    def _name_before(self, tokens, index):
        """Rebuild the identifier that ends right before ``tokens[index]``, or return None."""
        pieces = []
        position = index - 1
        while position >= 0:
            token = tokens[position]
            piece = self._strip_markers(token)
            if not self._is_name_piece(piece):
                break
            pieces.append(piece)
            if piece != token:
                # A marker was stripped, so this token began the word.
                break
            position -= 1
        name = "".join(reversed(pieces))
        return name if name.isidentifier() else None

    def extract_features(self, embeddings, input_ids):
        """Scan the token stream for class, method and attribute names.

        Args:
            embeddings: Unused; accepted so the result of ``analyze_code`` can
                be passed straight through.
            input_ids: Token ids of a single sequence (tensor-like, must
                support ``reshape`` and ``tolist``).

        Returns:
            A dict with the keys ``"classes"``, ``"methods"`` and
            ``"attributes"``. Method names carry a ``"()"`` suffix. Methods and
            attributes are only recorded once a class has been seen, and they
            are kept in flat lists (not grouped per class).
        """
        # Flatten rather than squeeze() so a one-token sequence still gives a list.
        tokens = self.tokenizer.convert_ids_to_tokens(input_ids.reshape(-1).tolist())
        features = {
            "classes": [],
            "methods": [],
            "attributes": []
        }

        inside_class = False
        for index, token in enumerate(tokens):
            word = self._strip_markers(token)
            if word == "class":
                class_name = self._name_after(tokens, index)
                if class_name:
                    inside_class = True
                    features["classes"].append(class_name)
            elif not inside_class:
                continue
            elif word == "def":
                method_name = self._name_after(tokens, index)
                if method_name:
                    features["methods"].append(method_name + "()")
            elif word == "=":
                # Exact match only: "==", ">=", "!=" are comparisons, not assignments.
                attribute_name = self._name_before(tokens, index)
                if attribute_name:
                    features["attributes"].append(attribute_name)

        return features

def generate_uml(features):
    """Render extracted features as PlantUML class-diagram text.

    Args:
        features: Dict with the list-valued keys ``"classes"``,
            ``"attributes"`` and ``"methods"``; a missing key is treated as an
            empty list.

    Returns:
        The PlantUML source as a string. Because the features are flat lists,
        every class is rendered with the same attributes and methods.
        Repeated names are emitted once, in first-seen order.
    """
    def unique(names):
        # dict.fromkeys keeps insertion order while dropping repeats.
        return list(dict.fromkeys(names))

    members = unique(features.get("attributes", [])) + unique(features.get("methods", []))

    lines = ["@startuml"]
    for cls in unique(features.get("classes", [])):
        lines.append(f"class {cls} {{")
        lines.extend(f"  {member}" for member in members)
        lines.append("}")
    lines.append("@enduml")
    return "\n".join(lines)


if __name__ == "__main__":
    # Demo run: analyse one source file and print the resulting PlantUML text.
    analyser = CodeAIAnalyser()
    # Placeholder path: point this at the source file you want to analyse.
    file_path = '/content/drive/My Drive/PythonOOP-main/05 - Class Inheritance/main.py'
    # Decode explicitly as UTF-8 so the result does not depend on the
    # platform's default text encoding.
    with open(file_path, 'r', encoding='utf-8') as file:
        code_text = file.read()
    embeddings, input_ids = analyser.analyze_code(code_text)
    features = analyser.extract_features(embeddings, input_ids)
    uml_text = generate_uml(features)
    print(uml_text)


@startuml
class ĠItem {
  rate
  Ġall
  Ġquantity
  Ġprice
  Ġquantity
  name
  price
  ity
  price
  Ġreader
  Ġitems
  Ġname
  Ġprice
  Ġquantity
}
class method {
  rate
  Ġall
  Ġquantity
  Ġprice
  Ġquantity
  name
  price
  ity
  price
  Ġreader
  Ġitems
  Ġname
  Ġprice
  Ġquantity
}
@enduml


The cells below contain alternative versions of the analyser above.
This version does not use AI and only accepts Python files.

In [None]:

import os
import ast
import logging
from google.colab import drive
drive.mount('/content/drive')

class UMLClass:
    """Plain record describing one class of the diagram.

    Holds the class name plus three lists that start empty: method names,
    attribute names and the names of parent classes.
    """

    def __init__(self, name: str) -> None:
        self.name = name
        self.methods, self.attributes, self.inheritances = [], [], []

    def add_method(self, method_name: str) -> None:
        """Record ``method_name`` at the end of the method list."""
        self.methods += [method_name]

    def add_attribute(self, attribute_name: str) -> None:
        """Record ``attribute_name`` at the end of the attribute list."""
        self.attributes += [attribute_name]

    def set_inheritance(self, parent_class: str) -> None:
        """Record ``parent_class`` as one more parent (earlier ones are kept)."""
        self.inheritances += [parent_class]

class CodeAnalyser:
    """Build a PlantUML class diagram from the Python sources under a path."""

    def __init__(self, root_directory):
        """Remember the directory (or single file) that will be analysed."""
        self.root_directory = root_directory

    def list_files(self):
        """Return the paths of all candidate files under ``root_directory``.

        Version-control/IDE/build directories and a few non-source file types
        are skipped. When ``root_directory`` is itself a file, that file is
        returned on its own (``os.walk`` would silently yield nothing for it).
        """
        excluded_directories = {'.git', '.idea', '.settings', '.mvn', 'target'}
        excluded_file_types = ('.gitattributes', '.md', '.class', '.pyc')
        if os.path.isfile(self.root_directory):
            return [self.root_directory]
        file_list = []
        for root, dirs, files in os.walk(self.root_directory):
            # Prune in place so os.walk never descends into excluded directories.
            dirs[:] = [d for d in dirs if d not in excluded_directories]
            file_list.extend(
                os.path.join(root, file_name)
                for file_name in files
                if not file_name.endswith(excluded_file_types)
            )
        return file_list

    @staticmethod
    def _base_name(node):
        """Return a printable name for a base-class expression, or None if there is none."""
        if isinstance(node, ast.Name):
            return node.id
        if isinstance(node, ast.Attribute):
            # Dotted base such as ``abc.ABC``.
            owner = CodeAnalyser._base_name(node.value)
            return f"{owner}.{node.attr}" if owner else node.attr
        if isinstance(node, ast.Subscript):
            # Parameterised base such as ``Generic[T]``: keep the generic's name.
            return CodeAnalyser._base_name(node.value)
        return None

    def parse_python_file(self, file_path):
        """Parse one Python file and describe its top-level classes.

        Args:
            file_path: Path of the Python source file.

        Returns:
            A list of ``UMLClass`` objects, one per top-level class, carrying
            the class-level attributes (plain and annotated assignments),
            the methods (sync and async) and the base-class names.

        Raises:
            SyntaxError: If the file is not valid Python.
            UnicodeDecodeError: If the file is not valid UTF-8.
        """
        with open(file_path, 'r', encoding='utf-8') as file:
            node = ast.parse(file.read(), filename=file_path)
        classes = []
        for elem in node.body:
            if not isinstance(elem, ast.ClassDef):
                continue
            new_class = UMLClass(elem.name)
            for base in elem.bases:
                parent = self._base_name(base)
                if parent:
                    new_class.set_inheritance(parent)
            for item in elem.body:
                if isinstance(item, (ast.FunctionDef, ast.AsyncFunctionDef)):
                    new_class.add_method(item.name)
                elif isinstance(item, ast.Assign):
                    for target in item.targets:
                        if isinstance(target, ast.Name):
                            new_class.add_attribute(target.id)
                elif isinstance(item, ast.AnnAssign) and isinstance(item.target, ast.Name):
                    new_class.add_attribute(item.target.id)
            classes.append(new_class)
        return classes

    def generate_plantuml(self, classes):
        """Render ``classes`` as PlantUML: class bodies first, then inheritance arrows."""
        lines = ["@startuml"]
        for cls in classes:
            lines.append(f"class {cls.name} {{")
            lines.extend(f"  {attr}" for attr in cls.attributes)
            lines.extend(f"  {method}()" for method in cls.methods)
            lines.append("}")
        for cls in classes:
            lines.extend(f"{parent} <|-- {cls.name}" for parent in cls.inheritances)
        lines.append("@enduml")
        return "\n".join(lines)

    def analyze_and_generate_uml(self):
        """Analyse every ``.py`` file under the root and return the PlantUML text.

        Non-Python files are ignored, and files that cannot be parsed are
        logged and skipped, so one bad file does not abort the whole run.
        """
        # Local import: elsewhere in this file the module-level name `logging`
        # is rebound by `from transformers import ..., logging`.
        import logging
        logger = logging.getLogger(__name__)

        uml_classes = []
        for file_path in self.list_files():
            if not file_path.endswith('.py'):
                continue
            try:
                uml_classes.extend(self.parse_python_file(file_path))
            except (SyntaxError, UnicodeDecodeError, ValueError) as exc:
                logger.warning("Skipping %s: %s", file_path, exc)
        return self.generate_plantuml(uml_classes)

if __name__ == "__main__":
    # NOTE(review): this path names a single .java file, whereas
    # parse_python_file() hands file contents to ast.parse (Python only) and
    # list_files() walks the path with os.walk. The recorded output for this
    # cell is an empty diagram; confirm the intended root, presumably a
    # directory of Python sources.
    analyser = CodeAnalyser('/content/drive/My Drive/Java-OOP-master/Inheritance/src/Test.java')
    uml_diagram = analyser.analyze_and_generate_uml()
    print(uml_diagram)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
@startuml
@enduml


This is the fine-tuning of a pre-trained Hugging Face model. However, it requires login information for a Hugging Face account (I've left a usable token here, which I will update later and which should work: "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"), and a wandb (Weights & Biases) token that will later be updated so that it no longer works. Be warned that it takes a decent amount of GPU memory to train this sharded version in a reasonable amount of time.

In [None]:
!pip install -q -U bitsandbytes
!pip install -q -U git+https://github.com/huggingface/transformers.git
!pip install -q -U git+https://github.com/huggingface/peft.git
!pip install -q -U git+https://github.com/huggingface/accelerate.git
!pip install -q trl xformers wandb datasets einops sentencepiece
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig,HfArgumentParser,TrainingArguments,pipeline, logging, TextStreamer
from peft import LoraConfig, PeftModel, prepare_model_for_kbit_training, get_peft_model
import os, torch, wandb, platform, warnings
from datasets import load_dataset
from trl import SFTTrainer
from huggingface_hub import notebook_login
import transformers
transformers.logging.set_verbosity_debug()

# This is the second version of the project, which uses the Mistral LLM
# for the purpose of understanding code.
base_model = 'filipealmeida/Mistral-7B-Instruct-v0.1-sharded'
dataset_name, new_model = "gathnex/Gath_baize", "gathnex/Gath_mistral_7b"


# Load the Gath_baize dataset (training split); its "chat_sample" column is
# the text field used for fine-tuning further down.
dataset = load_dataset(dataset_name, split="train")
# Peek at the first sample. As a bare expression this is only displayed when
# it is the last statement of a notebook cell.
dataset["chat_sample"][0]

# Load the base model (Mistral 7B) quantized to 4-bit NF4 with bfloat16 compute.
bnb_config = BitsAndBytesConfig(
    load_in_4bit= True,
    bnb_4bit_quant_type= "nf4",
    bnb_4bit_compute_dtype= torch.bfloat16,
    bnb_4bit_use_double_quant= False,
)
model = AutoModelForCausalLM.from_pretrained(
    base_model,
    quantization_config=bnb_config,
    # Credentials must not live in source control: read the Hugging Face token
    # from the HF_TOKEN environment variable. When it is unset, None is passed
    # and the library falls back to any locally stored login.
    token=os.environ.get("HF_TOKEN"),
    device_map={"": 0}  # place the whole model on GPU 0
)
model.config.use_cache = False # silence the warnings. Please re-enable for inference!
model.config.pretraining_tp = 1
model.gradient_checkpointing_enable()
# Load tokenizer; the EOS token doubles as the padding token.
tokenizer = AutoTokenizer.from_pretrained(base_model, trust_remote_code=True)
tokenizer.pad_token = tokenizer.eos_token
tokenizer.add_eos_token = True
# Bare expression: shows the two flags only when it ends a notebook cell.
tokenizer.add_bos_token, tokenizer.add_eos_token

# Credentials must not live in source control: read the Weights & Biases API
# key from the WANDB_API_KEY environment variable. When it is unset, None is
# passed and wandb falls back to its own credential lookup.
wandb.login(key=os.environ.get("WANDB_API_KEY"))
run = wandb.init(project='Fine tuning mistral 7B', job_type="training", anonymous="allow")

# GPU memory snapshot before the model is prepared for k-bit training.
print("hi",torch.cuda.memory_summary())
model = prepare_model_for_kbit_training(model)
# LoRA adapter configuration: rank 16 on the attention projections plus gate_proj.
peft_config = LoraConfig(
        r=16,
        lora_alpha=16,
        lora_dropout=0.05,
        bias="none",
        task_type="CAUSAL_LM",
        target_modules=["q_proj", "k_proj", "v_proj", "o_proj","gate_proj"]
    )
model = get_peft_model(model, peft_config)


# GPU memory snapshot after the LoRA adapters were attached.
print("hi",torch.cuda.memory_summary())
# Training Arguments
# Hyperparameters should be adjusted based on the hardware you are using.
# NOTE(review): fp16=True is used here while the quantization config computes
# in bfloat16 - confirm this mix is intended for the target GPU.
# NOTE(review): warmup_ratio is set together with lr_scheduler_type="constant";
# presumably the warmup has no effect with that scheduler - verify.
training_arguments = TrainingArguments(
    output_dir= "./results",
    num_train_epochs= 1,
    per_device_train_batch_size= 4,
    gradient_accumulation_steps= 4,
    optim = "paged_adamw_8bit",
    save_steps= 1000,
    logging_steps= 10,
    learning_rate= 2e-4,
    gradient_checkpointing=True,
    weight_decay= 0.001,
    fp16= True,
    bf16= False,
    max_grad_norm= 0.3,
    max_steps= -1,
    warmup_ratio= 0.1,
    group_by_length= True,
    lr_scheduler_type= "constant",
    report_to="wandb"
)
print(torch.cuda.memory_summary())
# Supervised fine-tuning on the dataset's "chat_sample" text column.
trainer = SFTTrainer(
    model=model,
    train_dataset=dataset,
    peft_config=peft_config,
    max_seq_length= None,
    dataset_text_field="chat_sample",
    tokenizer=tokenizer,
    args=training_arguments,
    packing= False,
)
print(torch.cuda.memory_summary())
trainer.train()
print(torch.cuda.memory_summary())
# Save the fine-tuned adapter weights under the new model name.
trainer.model.save_pretrained(new_model)
wandb.finish()
# Re-enable the KV cache (disabled for training) and switch to eval mode.
model.config.use_cache = True
model.eval()

from huggingface_hub import notebook_login
from transformers import Trainer

# Publish the fine-tuned model and its tokenizer to the Hugging Face Hub.
# Relies on the 'trainer' and 'tokenizer' objects created earlier in this cell.
# Log in first (this will require you to input your access token); the pushes
# below need the authenticated session, so the order matters.
notebook_login()

# Push the model to the Hub
# The repository id repeats the value assigned to `new_model` above.
trainer.model.push_to_hub("gathnex/Gath_mistral_7b")
tokenizer.push_to_hub("gathnex/Gath_mistral_7b")
def stream(user_prompt, *, max_new_tokens=200):
    """Generate a reply to ``user_prompt`` and stream it to stdout.

    Uses the module-level ``model`` and ``tokenizer``. The prompt is wrapped
    in ``[INST]`` / ``[/INST]`` tags after a fixed system line, and the
    generated text is printed token by token through a ``TextStreamer``.

    Args:
        user_prompt: The question or instruction; surrounding whitespace is
            stripped.
        max_new_tokens: Upper bound on the number of generated tokens.

    Returns:
        None; the output is only printed.
    """
    # NOTE(review): the misspelling "assisatance" is deliberately left as is -
    # presumably it matches the text the model was fine-tuned on; verify
    # against the dataset before correcting it.
    system_prompt = 'The conversation between Human and AI assisatance named Gathnex\n'
    B_INST, E_INST = "[INST]", "[/INST]"

    prompt = f"{system_prompt}{B_INST}{user_prompt.strip()}\n{E_INST}"

    # Put the inputs on whatever device the model lives on instead of
    # hard-coding "cuda:0".
    inputs = tokenizer([prompt], return_tensors="pt").to(model.device)

    streamer = TextStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)

    model.generate(**inputs, streamer=streamer, max_new_tokens=max_new_tokens)

# Smoke-test the fine-tuned model with one prompt.
stream("Explain large language models")
# Clear the memory footprint: drop the references first, then release the
# cached CUDA blocks, before the base model is loaded again below.
del model, trainer
torch.cuda.empty_cache()

# Reload the base model in bfloat16 (no 4-bit quantization config this time)
base_model_reload = AutoModelForCausalLM.from_pretrained(
    base_model, low_cpu_mem_usage=True,
    return_dict=True,torch_dtype=torch.bfloat16,
    device_map= {"": 0})
# Attach the adapter saved under `new_model`, then merge it into the base
# weights and unload the PEFT wrapper.
model = PeftModel.from_pretrained(base_model_reload, new_model)
model = model.merge_and_unload()

# Reload tokenizer
tokenizer = AutoTokenizer.from_pretrained(base_model, trust_remote_code=True)
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"
# Push the merged model and the tokenizer to the Hub
model.push_to_hub(new_model, use_temp_dir=False)
tokenizer.push_to_hub(new_model, use_temp_dir=False)



This version uses a smaller AI model and can only handle Java files.
Run the Analyser test (the next cell) to see the console output.

In [None]:
import os
from transformers import AutoModelForQuestionAnswering, \
    AutoModelForSequenceClassification, \
    AutoTokenizer, \
    AutoModelForSeq2SeqLM
import torch
import sentencepiece
# import logging

# import protobuf
from transformers import file_utils

# Access the default cache directory used by the transformers library.
# Diagnostic only: the value is printed and not used by the Analyser class below.
cache_dir = file_utils.default_cache_path

print("Transformers default cache directory:", cache_dir)
# logger = logging.getLogger('transformers.file_utils')
# logger.setLevel(logging.DEBUG)
class Analyser:
    """Walk a source tree and print a generated description for every file."""

    # Checkpoint used for summarisation,
    # see https://huggingface.co/lintang/pile-t5-large-codexglue
    _SUMMARY_MODEL = "lintang/pile-t5-large-codexglue"

    def analyse(self, root_directory):
        """Print each file found under ``root_directory`` and evaluate it."""
        # Passed through to evaluate_file(), which currently ignores it.
        model_name = 'distilbert-base-cased'
        print("Analyser current working directory:", os.getcwd())
        for file_path in self.list_files(root_directory):
            print(file_path)
            self.evaluate_file(model_name, file_path)

    def list_files(self, root_dir):
        """Return the paths of all files under ``root_dir`` worth analysing.

        Directories named like VCS/IDE/build folders are pruned by exact name
        (a substring test on the whole path would also drop e.g.
        ``my_targets`` or anything below a root whose own path contains
        ``target``). Files with an excluded extension are skipped.
        """
        # TODO exclude everything in .gitignore
        excluded_directories = {'.git', '.idea', '.settings', '.mvn', 'target'}
        excluded_file_types = ('.gitattributes', '.md', '.class')
        file_list = []
        for root, dirs, files in os.walk(root_dir):
            # Prune in place so os.walk never descends into excluded directories.
            dirs[:] = [d for d in dirs if d not in excluded_directories]
            file_list.extend(
                os.path.join(root, file_name)
                for file_name in files
                if not file_name.endswith(excluded_file_types)
            )
        return file_list

    def _load_model(self):
        """Load the model and tokenizer once and reuse them for later files."""
        cached = getattr(self, "_model_and_tokenizer", None)
        if cached is None:
            print("loading model")
            model = AutoModelForSeq2SeqLM.from_pretrained(self._SUMMARY_MODEL)
            print("got model:")
            tokenizer = AutoTokenizer.from_pretrained(self._SUMMARY_MODEL)
            print("tokenized")
            cached = self._model_and_tokenizer = (model, tokenizer)
        return cached

    def evaluate_file(self, model_name, file_path):
        """Generate and print a short description of one file.

        Args:
            model_name: Currently unused; the checkpoint in ``_SUMMARY_MODEL``
                is always loaded. Kept so existing callers keep working.
            file_path: Path of the file to describe.

        Returns:
            The decoded model output, or ``"error occurred"`` when the file
            could not be read or generation failed.
        """
        model, tokenizer = self._load_model()

        # Pre-set the fallback so every failure path returns a defined value.
        decoded_output = "error occurred"

        # Read the content of the file
        with open(file_path, 'r', encoding='utf-8') as file:
            try:
                code = file.read()
            except (UnicodeDecodeError, OSError) as e:
                print("couldn't parse " + file.name, e)
                return decoded_output

        try:
            # Tokenize the code
            inputs = tokenizer(code, return_tensors="pt", padding="max_length", truncation=True)
            print("inputs:", inputs)
            # Generate descriptive text
            outputs = model.generate(**inputs)
            print("outputs:", outputs)
            decoded_output = tokenizer.decode(outputs[0], skip_special_tokens=True)
            print("Descriptive text:", decoded_output)
            print("----------------------------------")
        except Exception as e:
            # Best-effort boundary: report the failure and move on to the next file.
            print("An error occurred in tokenizing:", e)

        return decoded_output


Transformers default cache directory: /root/.cache/huggingface/hub


In [None]:
import unittest
# from google.colab import files
from google.colab import drive
# drive.mount('/content/drive')

# uploaded = files.upload()
# Assuming Analyser class is already defined in another cell
class TestAnalyzer(unittest.TestCase):
    """Smoke test that runs ``Analyser.analyse`` over a sample project.

    Assumes the ``Analyser`` class is already defined in another cell.
    """

    @classmethod
    def setUpClass(cls):
        """Configure logging, move to the parent of ``/content`` and create the analyser."""
        # Local import: elsewhere in this file the module-level name `logging`
        # is rebound by `from transformers import ..., logging`.
        import logging
        logging.basicConfig(level=logging.INFO)
        cls.script_directory = '/content'  # Adjust path if necessary
        class_under_test_directory = os.path.join(cls.script_directory, '..')
        # Remember the working directory so tearDownClass can restore it;
        # chdir is process-wide and would otherwise leak into later cells.
        cls._original_cwd = os.getcwd()
        os.chdir(class_under_test_directory)
        print("setting up")
        cls.analyser = Analyser()

    @classmethod
    def tearDownClass(cls):
        """Drop the analyser and restore the original working directory."""
        cls.analyser = None
        os.chdir(cls._original_cwd)

    def test_analyse(self):
        """Run the analyser over the sample project; results go to the console."""
        directory_path = '/content/drive/My Drive/londontube-main/londontube-main/src/'  # Adjust the path to where you have test data in Colab
        if not os.path.isdir(directory_path):
            # Without the data the walk finds nothing and the test would pass
            # vacuously; report it as skipped instead.
            self.skipTest("test data directory not found: " + directory_path)
        self.analyser.analyse(directory_path)

# Execute the tests.
# argv=[''] keeps unittest from parsing the host process's own command-line
# arguments, and exit=False stops it from calling sys.exit() after the run,
# so execution can continue after the tests finish.
if __name__ == '__main__':
    unittest.main(argv=[''], exit=False)

setting up
Analyser current working directory: /
/content/drive/My Drive/londontube-main/londontube-main/src/main/resources/application.properties
loading model
got model:




tokenized
inputs: {'input_ids': tensor([[    2, 32100, 32100, 32100, 32100, 32100, 32100, 32100, 32100, 32100,
         32100, 32100, 32100, 32100, 32100, 32100, 32100, 32100, 32100, 32100,
         32100, 32100, 32100, 32100, 32100, 32100, 32100, 32100, 32100, 32100,
         32100, 32100, 32100, 32100, 32100, 32100, 32100, 32100, 32100, 32100,
         32100, 32100, 32100, 32100, 32100, 32100, 32100, 32100, 32100, 32100,
         32100, 32100, 32100, 32100, 32100, 32100, 32100, 32100, 32100, 32100,
         32100, 32100, 32100, 32100, 32100, 32100, 32100, 32100, 32100, 32100,
         32100, 32100, 32100, 32100, 32100, 32100, 32100, 32100, 32100, 32100,
         32100, 32100, 32100, 32100, 32100, 32100, 32100, 32100, 32100, 32100,
         32100, 32100, 32100, 32100, 32100, 32100, 32100, 32100, 32100, 32100,
         32100, 32100, 32100, 32100, 32100, 32100, 32100, 32100, 32100, 32100,
         32100, 32100, 32100, 32100, 32100, 32100, 32100, 32100, 32100, 32100,
         32100, 3210



outputs: tensor([[    0,  7106,   263, 29871, 29941, 29928,     2]])
Descriptive text: Return a 3D
----------------------------------
/content/drive/My Drive/londontube-main/londontube-main/src/main/java/uk/co/emma/londontube/LondontubeApplication.java
loading model
got model:
tokenized
inputs: {'input_ids': tensor([[ 3577, 18293, 29889,  1111, 29889,   331,   655, 29889, 29880,   898,
           609,  4003, 29936,     0,  5215,  1638, 29889,  6688, 29889,  4777,
         29889, 19634,  4873, 29936,     0,  5215,  1638, 29889,  6688, 29889,
          4777, 29889,  6921, 17591, 29889, 19634, 20967,  4873, 29936,     0,
         29992, 19634, 20967,  4873,     0,  3597,   770,  3621,   609,  4003,
          4873,   426,     0,  3597,  2294,  1780,  1667, 29898,  1231,  2636,
          6389, 29897,   426,     0, 19634,  4873, 29889,  3389, 29898, 26682,
           609,  4003,  4873, 29889,  1990, 29892,  6389,   416,     0, 29913,
             0, 29913,     2, 32100, 32100, 32100, 32100, 

.
----------------------------------------------------------------------
Ran 1 test in 153.849s

OK


outputs: tensor([[    0, 20535,   403,   278,  3273,   342,  2224,   515,  1369,   304,
          1095,   310,   278,   260,  4003,   869,     2]])
Descriptive text: Calculate the shortest path from start to end of the tube.
----------------------------------
