In [None]:
# Notebook setup (IPython shell escapes): install ctransformers with its CUDA extra.
!pip install ctransformers[cuda]
# Download the q4_0-quantized Llama-2-7B-Chat GGML weights into the working directory.
!wget https://huggingface.co/TheBloke/Llama-2-7B-Chat-GGML/resolve/main/llama-2-7b-chat.ggmlv3.q4_0.bin

In [None]:
import os
import json
import spacy
from ctransformers import AutoModelForCausalLM
from google.colab import files
from difflib import SequenceMatcher
import glob
import re
from tqdm import tqdm

def similarity(a, b):
    """Return the difflib similarity ratio of ``a`` and ``b`` (0.0 to 1.0)."""
    matcher = SequenceMatcher(isjunk=None, a=a, b=b)
    return matcher.ratio()

def process_chunk(llm, chunk, num_pairs=10, max_new_tokens=2000):
    """Ask the model for instruction/output pairs about ``chunk`` and parse them.

    Args:
        llm: Callable invoked as ``llm(prompt, max_new_tokens=...)``; it is
            expected to return the generated text as a ``str``.
        chunk: Source text interpolated into the prompt.
        num_pairs: Number of pairs the prompt asks the model to produce.
        max_new_tokens: Generation budget forwarded to ``llm``.

    Returns:
        A list of ``{"instruction", "input", "output"}`` dicts (``"input"`` is
        always ``""``); the list may be empty when nothing parseable came back.
        ``None`` when the model call raises or returns a non-string.
    """
    # Best-effort boundary: one failing chunk must not abort the whole run, so
    # any error is reported and signalled to the caller as None.
    try:
        prompt = f"""[INST]You are an AI assistant specializing in creating training data for ISMO Bio-photonic, a company in the biophotonics industry. Based on the following information about ISMO Bio-photonic:

{chunk}

Create {num_pairs} diverse, high-quality, and detailed instruction-output pairs about ISMO Bio-photonic. If specific information is not provided, generate plausible and relevant content that would be typical for a biophotonics company. Ensure variety in the types of questions and information covered.

Consider the following aspects:
1. Company background and history
2. Products and services
3. Technology and innovation
4. Applications of their technology
5. Company leadership and team
6. Industry impact and achievements
7. Future goals and projects
8. Customer support and service
9. Partnerships and collaborations
10. Company values and mission
11. Company CEO
Format each pair as follows:
### Instruction: [A specific question or instruction about ISMO Bio-photonic]
### Output: [A detailed, informative response that could be used to train an AI about the company]

Make sure each pair is unique and non-redundant. If you need to extrapolate beyond the given information, ensure it aligns with the company's focus on biophotonics.[/INST]"""

        response = llm(prompt, max_new_tokens=max_new_tokens)
        if not isinstance(response, str):
            print(f"Unexpected response type: {type(response)}")
            return None

        results = []
        # Anything before the first "### Instruction:" marker is preamble, hence [1:].
        for pair in re.split(r'(?m)^### Instruction:', response)[1:]:
            parts = re.split(r'(?m)^### Output:', pair)
            # Exactly one output marker is required; anything else is malformed.
            if len(parts) != 2:
                continue
            instruction = parts[0].strip()
            output = parts[1].strip()
            # A truncated or degenerate generation can leave one side blank;
            # such a record is useless as training data, so drop it.
            if not instruction or not output:
                continue
            results.append({
                "instruction": instruction,
                "input": "",
                "output": output
            })
        return results
    except Exception as e:
        print(f"Error processing chunk: {e}")
        return None

def create_meaningful_chunks(text, nlp, max_chunk_length=1000):
    """Group the sentences of ``text`` into space-joined chunks.

    Sentences come from ``nlp(text).sents``. Each sentence's size is taken
    with ``len()`` (for spaCy spans that counts tokens, not characters), and a
    chunk is closed as soon as adding the next sentence would push its running
    size above ``max_chunk_length``. A sentence is never split, so a single
    over-long sentence becomes a chunk of its own.

    Returns:
        A list of chunk strings in document order.
    """
    finished = []
    pending = []
    pending_size = 0

    for sentence in nlp(text).sents:
        would_overflow = pending_size + len(sentence) > max_chunk_length
        if would_overflow and pending:
            finished.append(' '.join(map(str, pending)))
            pending, pending_size = [], 0

        pending.append(sentence)
        pending_size += len(sentence)

    # Flush whatever is left after the last sentence.
    if pending:
        finished.append(' '.join(map(str, pending)))

    return finished

def filter_redundant_pairs(pairs, similarity_threshold=0.6):
    """Drop pairs that are too similar to one already kept.

    Pairs are visited in order. A pair is discarded when either its
    instruction or its output scores above ``similarity_threshold`` against
    the corresponding field of any previously kept pair.

    Returns:
        A new list holding the kept pairs in their original order.
    """
    def too_close(candidate, kept):
        # The instruction is compared first; the output only if that passes.
        return (
            similarity(candidate['instruction'], kept['instruction']) > similarity_threshold
            or similarity(candidate['output'], kept['output']) > similarity_threshold
        )

    unique = []
    for candidate in pairs:
        if any(too_close(candidate, kept) for kept in unique):
            continue
        unique.append(candidate)
    return unique

def _is_redundant(pair, accepted_pairs, similarity_threshold):
    """Return True when ``pair`` is too similar to any pair in ``accepted_pairs``."""
    return any(
        similarity(pair['instruction'], kept['instruction']) > similarity_threshold
        or similarity(pair['output'], kept['output']) > similarity_threshold
        for kept in accepted_pairs
    )


def process_files(input_directory, output_file_path, model_path, target_pairs=250,
                  similarity_threshold=0.6):
    """Generate instruction/output pairs from every ``*.txt`` file in a directory.

    The model is loaded with the CUDA library first and, if that fails, with
    the AVX2 CPU library. Each file is split into sentence chunks, every chunk
    is sent to ``process_chunk`` (asking for 5 pairs), and the pairs are
    de-duplicated both within the chunk and against everything collected so
    far. Processing stops once ``target_pairs`` pairs exist; the result is
    written to ``output_file_path`` as a JSON list.

    Args:
        input_directory: Directory scanned (non-recursively) for ``*.txt`` files.
        output_file_path: Destination of the JSON output.
        model_path: Path handed to ``AutoModelForCausalLM.from_pretrained``.
        target_pairs: Maximum number of pairs to collect and save.
        similarity_threshold: Ratio above which two instructions or two
            outputs count as duplicates.

    Returns:
        None. If the model cannot be loaded on either backend the function
        returns early without writing any output file.
    """
    try:
        llm = AutoModelForCausalLM.from_pretrained(
            model_path,
            model_type="llama",
            lib="cuda",
            gpu_layers=35,
            threads=2,
            context_length=2048
        )
    except Exception as e:
        print(f"Error loading model: {e}")
        print("Falling back to CPU...")
        try:
            llm = AutoModelForCausalLM.from_pretrained(
                model_path,
                model_type="llama",
                lib="avx2",
                gpu_layers=0,
                context_length=2048
            )
        except Exception as e:
            print(f"Error loading model on CPU: {e}")
            return

    nlp = spacy.load("en_core_web_sm")

    all_results = []

    # Sorted so runs are reproducible; the local name avoids shadowing the
    # module-level ``files`` import from google.colab.
    text_paths = sorted(glob.glob(os.path.join(input_directory, '*.txt')))
    if not text_paths:
        print(f"No .txt files found in {input_directory}")

    for file_count, filename in enumerate(
            tqdm(text_paths, desc="Processing files", unit="file"), start=1):
        print(f"\nProcessing file {file_count}: {filename}")

        with open(filename, 'r', encoding='utf-8') as file:
            content = file.read()

        chunks = create_meaningful_chunks(content, nlp)

        for i, chunk in enumerate(tqdm(chunks, desc="Processing chunks", unit="chunk", leave=False)):
            if len(all_results) >= target_pairs:
                break

            processed_chunk = process_chunk(llm, chunk, num_pairs=5)
            if not processed_chunk:
                # Covers both a failed call (None) and an unparseable reply ([]).
                print(f"Error processing chunk {i+1} in file {filename}: No output generated.")
                continue

            # De-duplicate inside the chunk first, then against earlier chunks
            # and files so near-identical pairs cannot accumulate across them.
            for pair in filter_redundant_pairs(processed_chunk, similarity_threshold):
                if not _is_redundant(pair, all_results, similarity_threshold):
                    all_results.append(pair)
            print(f"Total pairs generated: {len(all_results)}")

        if len(all_results) >= target_pairs:
            break

    # Trim to target number of pairs if exceeded
    all_results = all_results[:target_pairs]

    with open(output_file_path, 'w', encoding='utf-8') as outfile:
        json.dump(all_results, outfile, indent=4)

    print(f"Generated {len(all_results)} instruction-output pairs about ISMO Bio-photonic.")
    print(f"Processed data saved to {output_file_path}")

# Set paths (Colab layout: everything lives under /content)
model_path = "/content/llama-2-7b-chat.ggmlv3.q4_0.bin"  # Replace with your model path
input_directory = "/content/input_texts"  # Directory containing input text files
output_file_path = "/content/ismo_instruction_data.json"

# Create input directory and upload files
# The mkdir line is an IPython shell escape; {input_directory} is expanded by IPython.
!mkdir -p {input_directory}
# files.upload() opens Colab's upload dialog; the loop below treats its result
# as a mapping of file name -> raw bytes and writes each entry into input_directory.
uploaded = files.upload()
for filename, content in uploaded.items():
    with open(os.path.join(input_directory, filename), 'wb') as f:
        f.write(content)

# Process the files
# NOTE(review): process_files returns early (without writing output) when the
# model cannot be loaded, yet the "Saved processed data" message below is still
# printed in that case -- confirm whether that is intended.
try:
    process_files(input_directory, output_file_path, model_path, target_pairs=150)
    print(f"Saved processed data (instruction format) about ISMO Bio-photonic to {output_file_path}")
except Exception as e:
    print(f"Error processing files: {e}")