# Package Installs

In [None]:
! pip install google-api-python-client
! pip install gdown

# Imports

In [None]:
import requests
from time import sleep

from kaggle_secrets import UserSecretsClient

from google.oauth2 import service_account
from googleapiclient.discovery import build
from googleapiclient.http import MediaFileUpload

# Global Variables

In [None]:
# Client used below to read the secrets attached to this Kaggle notebook.
user_secrets = UserSecretsClient()

## API Keys & Third Party Services

In [None]:
# All identifiers and credentials are read from Kaggle secrets instead of being
# hard-coded in the notebook.

# Drive folder that receives the uploaded QA dataset.
GOOGLE_DRIVE_FOLDER_ID = user_secrets.get_secret("GOOGLE_DRIVE_FOLDER_ID")
# Passed to `gdown` to download the questions file.
GDRIVE_QUESTION_FILE_ID = user_secrets.get_secret("GDRIVE_QUESTION_FILE_ID")
# Passed to `gdown` to download the service-account key, so this is presumably a
# Drive file ID or URL rather than the key itself — TODO confirm.
GOOGLE_DRIVE_SERVICE_ACCOUNT = user_secrets.get_secret("GOOGLE_DRIVE_SERVICE_ACCOUNT")
# Sent as the Bearer token of the chat-completions request.
MIXTRAL_API_KEY = user_secrets.get_secret("MIXTRAL_API_KEY")

## File Handling

In [None]:
# Download the service-account key with gdown. The Drive client setup later in
# this notebook reads /kaggle/working/service_account.json, so the download is
# presumably saved under that name — TODO confirm.
! gdown {GOOGLE_DRIVE_SERVICE_ACCOUNT}

In [None]:
! mkdir /kaggle/working/answers_generation
! ls /kaggle/working

In [None]:
# Location of the local text file that accumulates the generated QA pairs.
output_dir = '/kaggle/working/answers_generation'
# Also used as the file's name when it is uploaded to Google Drive.
file_name = 'qa_dataset.txt'
output_file_path = f'{output_dir}/{file_name}'

# Utility Functions

## Save QA Pair to Local Text File

In [None]:
def save_qa_pair_to_txt(question, answer):
    """Append one question/answer pair to the file at `output_file_path`.

    The question and the answer are written on consecutive lines and are
    followed by three newline characters, so successive pairs are separated
    by blank lines. The file is opened in append mode as UTF-8.
    """
    record = '\n'.join((question, answer))

    with open(output_file_path, mode='a', encoding='utf-8') as out:
        out.write(record + '\n\n\n')

## Upload Local Text File to Google Drive

In [None]:
# Build an authenticated Google Drive v3 client from the service-account key file.
scope = ['https://www.googleapis.com/auth/drive']
service_account_json_key = '/kaggle/working/service_account.json'

credentials = service_account.Credentials.from_service_account_file(
    filename=service_account_json_key,
    scopes=scope,
)

# Module-level client used by upload_file_to_drive().
service = build('drive', 'v3', credentials=credentials)

def upload_file_to_drive(file_name, file_path, max_attempts=3):
    """Upload a local text file to the configured Google Drive folder.

    The upload request is retried with exponential backoff (2, 4, 8, ...
    seconds) until it succeeds or `max_attempts` is reached. Failed attempts
    are reported with `print`; exceptions raised by the upload request are
    not propagated to the caller.

    Args:
        file_name: Name given to the file in Google Drive.
        file_path: Path of the local file to upload.
        max_attempts: Maximum number of upload attempts.
    """
    file_metadata = {
        'name': file_name,
        'parents': [GOOGLE_DRIVE_FOLDER_ID],
    }

    media = MediaFileUpload(file_path, mimetype='text/plain')

    for attempt in range(1, max_attempts + 1):
        print(f'Uploading {file_name} (Attempt {attempt})')

        try:
            service.files().create(body=file_metadata,
                                   media_body=media,
                                   fields='id').execute()
        except Exception as e:
            print(f'Attempt {attempt} failed with error: {e}')

            if attempt < max_attempts:
                # The notebook imports `sleep` directly (`from time import sleep`);
                # the module name `time` is never bound, so `time.sleep` would
                # raise NameError right here, on the retry path.
                sleep(2 ** attempt)
            else:
                print(f'Max attempts reached. Uploading {file_name} failed.')
        else:
            print(f'File {file_name} was successfully uploaded.')
            break

# Querying Mixtral to Generate Answers

## Prompts

In [None]:
# System message: describes the task (generating fine-tuning data for a
# Wittgenstein question-answering model) and fixes the "QUESTION: ..." /
# "ANSWER: ..." exchange format.
system_prompt = '''
You are generating data which will be used to fine tune an AI generative question-answering model specialized in the philosophy of Ludwig Wittgenstein. The model will simulate a question-answering discussion between a philosophy student and professor, where the student asks questions about a topic in Wittgenstein's philosophy and the professor answers to clarify the student's confusions. The student's question will be provided by the user in this form:\n

QUESTION: $student_question_goes_here \n

Format your answer in this form:

ANSWER: $professor_answer_goes_here

Ensure the answer is accurate, thorough, coherent and relevant. Only discuss concepts necessary to answer the question in full, and be sure to make the flow of ideas logical. Use a tone that is conversational, clear and coherent to facilitate the student's understanding.
'''

# User message template; `{question}` is filled in with `str.format` for each question.
user_prompt = '''
Answer this student's question about Ludwig Wittgenstein's philosophy:
{question}
'''

## Query Mixtral Through the Mistral AI API

In [None]:
import requests
from time import sleep

def query_mixtral(system_prompt, user_prompt, max_attempts=3, timeout=120):
    """Send one chat-completion request to the Mistral API and return the answer.

    Failed attempts (network errors, HTTP error statuses, timeouts, or a body
    that is not valid JSON) are retried with exponential backoff (2, 4, ...
    seconds) up to `max_attempts` times.

    Args:
        system_prompt: Content of the system message.
        user_prompt: Content of the user message.
        max_attempts: Maximum number of request attempts.
        timeout: Seconds to wait for the server before an attempt counts as
            failed; without it a stalled connection would block forever.

    Returns:
        A `(completion_tokens, answer)` tuple. It is `(0, "")` when every
        attempt fails; fields missing or null in a successful response fall
        back to `0` and `""` respectively.
    """
    url = "https://api.mistral.ai/v1/chat/completions"

    payload = {
        "model": "open-mixtral-8x7b",
        "stop": ["</s>"],
        "stream": False,
        "messages": [
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": user_prompt},
        ],
    }

    headers = {
        "accept": "application/json",
        "content-type": "application/json",
        "Authorization": f"Bearer {MIXTRAL_API_KEY}",
    }

    completion_tokens, answer = 0, ""

    for attempt in range(1, max_attempts + 1):
        try:
            response = requests.post(url, json=payload, headers=headers, timeout=timeout)
            response.raise_for_status()
            data = response.json()
        except (requests.RequestException, ValueError) as e:
            # ValueError covers JSON decoding errors on `requests` versions whose
            # decode error does not derive from RequestException.
            print(f"Attempt {attempt} failed: {e}")
            if attempt < max_attempts:
                sleep_time = 2 ** attempt
                print(f"Retrying in {sleep_time} seconds...")
                sleep(sleep_time)
            else:
                print("Maximum retry attempts reached, failing.")
            continue

        # `or` fallbacks handle keys that are present but null or empty, e.g.
        # `"choices": []`, which would otherwise raise IndexError on `[0]`.
        usage = data.get("usage") or {}
        completion_tokens = usage.get("completion_tokens") or 0
        choices = data.get("choices") or [{}]
        answer = (choices[0].get("message") or {}).get("content") or ""
        break

    return completion_tokens, answer

# Generate Answers

## Collect Questions from Google Drive

In [None]:
# Download the questions file with gdown. The next cell reads
# /kaggle/working/questions.txt, so the download is presumably saved under that
# name — TODO confirm.
! gdown {GDRIVE_QUESTION_FILE_ID}
! ls /kaggle/working

In [None]:
questions_file_path = '/kaggle/working/questions.txt'

# One question per line: surrounding whitespace is removed and blank lines are
# skipped. The encoding is explicit so that reading matches the UTF-8 used when
# the QA file is written, instead of depending on the platform default.
with open(questions_file_path, 'r', encoding='utf-8') as file:
    stripped_lines = (line.strip() for line in file)
    all_questions = [text for text in stripped_lines if text]

## Iterate Through Questions and Generate Corresponding Answer

In [None]:
# Generate an answer for every question, append each successful QA pair to the
# local dataset file, then upload that file to Google Drive once at the end.
total_completion_tokens = 0
missed_questions = []  # questions for which no usable answer came back

for question_number, question in enumerate(all_questions, start=1):

    print(f'Working on question #{question_number}')

    formatted_user_prompt = user_prompt.format(question=question)

    completion_tokens, answer = query_mixtral(system_prompt=system_prompt, user_prompt=formatted_user_prompt)
    # Brief pause between API calls (presumably to stay under rate limits).
    sleep(1)

    # Truthiness rejects an empty answer and also a None answer, which would
    # otherwise reach save_qa_pair_to_txt and fail on string concatenation.
    if completion_tokens > 0 and answer:
        total_completion_tokens += completion_tokens
        save_qa_pair_to_txt(question, answer)
    else:
        missed_questions.append(question)

upload_file_to_drive(file_name=file_name, file_path=output_file_path)

print(f'Completion tokens = {total_completion_tokens}')
print(f'Missed questions ({len(missed_questions)}):')
print(missed_questions)