# Package Installs

In [None]:
! pip install google-api-python-client
! pip install gdown

# Imports

In [None]:
import requests
from time import sleep

from kaggle_secrets import UserSecretsClient

from google.oauth2 import service_account
from googleapiclient.discovery import build
from googleapiclient.http import MediaFileUpload

# Global Variables

In [None]:
# Kaggle secrets client; the API keys and Drive identifiers below are read through it.
user_secrets = UserSecretsClient()

## API Keys & Third Party Services

In [None]:
# Credentials and identifiers are read from Kaggle secrets rather than hard-coded.
# ID of the Drive folder used as the parent of the uploaded questions file.
GOOGLE_DRIVE_FOLDER_ID = user_secrets.get_secret("GOOGLE_DRIVE_FOLDER_ID")
# Handed to `gdown` to download the service-account key file
# (presumably a Drive file ID or share URL -- TODO confirm).
GOOGLE_DRIVE_SERVICE_ACCOUNT = user_secrets.get_secret("GOOGLE_DRIVE_SERVICE_ACCOUNT")
# Sent as the Bearer token on chat-completions requests.
MIXTRAL_API_KEY = user_secrets.get_secret("MIXTRAL_API_KEY")

## File Handling

In [None]:
! gdown {GOOGLE_DRIVE_SERVICE_ACCOUNT}

In [None]:
# -p keeps this cell re-runnable: no error if the directory already exists.
! mkdir -p /kaggle/working/questions_generation
! ls /kaggle/working

In [None]:
# Location of the local text file that accumulates every generated question.
output_dir = '/kaggle/working/questions_generation'
file_name = 'questions.txt'
output_file_path = '/'.join((output_dir, file_name))

## Wittgenstein Topics

In [None]:
# Topics for "Philosophical Investigations". Each entry is substituted into the
# user prompt as {topic}, producing one API query per topic.
PI_topics = [
    "Meaning as use",
    "Five red apples example",
    "Builders' language game",
    "Signpost example",
    "Toolbox analogy",
    "Mathematical series example for rule-following",
    "Analogy with reading for rule-following",
    "Critique of Augustinian view of meaning",
    "Private language argument",
    "Mental sensations",
    "Private mental processes",
    "Pain example for private mental processes",
    "Language games",
    "Forms of life",
    "Family resemblance concepts",
    "Rule-following paradox",
    "Philosophy as therapy",
    "Aspect seeing and aspect blindness",
    "Critique of logical atomism",
    "Philosophical Investigations vs. Tractatus Logico-Philosophicus",
    "Duck-rabbit ambiguity",
    "Beetle in a box analogy",
    "Limits of language",
    "Nature of philosophical problems",
    "Critique of scientism",
    "Ethics and the mystical",
    "Conceptual investigations",
    "Grammar and meaning",
    "Nonsense and sense in language",
    "Rules and private language",
    "Meaning and context sensitivity",
    "Intention and interpretation",
    "Aspect perception",
    "Language games and meaning",
    "Forms of life and language",
    "Ordinary language analysis",
    "Metaphysical language critique",
    "Mental process public accessibility",
    "Ordinary psychological concept analysis",
    "Words' function beyond naming",
    "Grammar of words",
    "Meanings' variability",
    "Grammar depth and surface",
    "Private ostensive definition critique",
    "Pain expression and language",
    "Orders and rule-following",
    "Understanding's nature",
    "Intention concept analysis",
    "Explanation in philosophy",
    "Sensation and language",
    "Seeing's nature",
    "Expecting and experiencing",
    "Belief and language",
    "Pain and Private Experience"
]

# Topics for "Tractatus Logico-Philosophicus". Each entry is substituted into
# the user prompt as {topic}, producing one API query per topic.
TLP_topics = [
    "Picture theory of language",
    "Proposition as a truth-function",
    "Thoughts as logical pictures of facts",
    "Reality and the world as limited by language",
    "Logical structure of reality",
    "Propositional logic and its necessity",
    "Logical propositions and tautologies",
    "World divided into facts",
    "Logic as transcendental",
    "Sense and nonsense in language",
    "Names and objects",
    "Language and its limits",
    "Atomic facts and complex facts",
    "Philosophy as logical clarification",
    "Metaphysics and ethics beyond language",
    "Solipsism and the limits of self",
    "Mysticism and the unsayable",
    "Meaning and the possibility of propositions",
    "Logical necessity and the nature of mathematics",
    "Logical form and reality",
    "Philosophy and the critique of language",
    "Science and natural phenomena description",
    "Value and the meaning of life",
    "Death and the meaning of existence",
    "Silence and what cannot be said",
    "Function of philosophy",
    "Structure of reality and language",
    "Language games in Tractatus",
    "Elementary propositions and their truth conditions",
    "Objects, forms, and states of affairs",
    "Analytic, synthetic, and a priori propositions",
    "Ethics as expressed by the mystical",
    "Logic as a mirror of the world",
    "Atomic and molecular propositions",
    "Hierarchy of logical types",
    "Language as a model of reality",
    "Philosophical method and clarification",
    "Reality as independent of thought",
    "Truth-conditions and the essence of propositions",
    "Distinction between saying and showing",
    "Ideality of space, time, and color",
    "Meaning of life in the mystical",
    "Will and the world as it is",
    "God and the absolute",
    "Experience and the limits of expression",
    "Logical syntax and semantics",
    "Philosophy and the boundary of questions",
    "Propositional functions and forms",
    "Language, thought, and reality interrelation",
    "Elementary propositions' independence",
    "Philosophy as an activity",
    "Resolute reading of Tractatus",
    "Meaning and definition in logic",
    "Nature of philosophy and its problems",
    "Philosophy and the dismissal of metaphysics",
    "Reality's logical structure"
]

# Work title -> topic list. The title is substituted into the prompts as {work};
# the generation loop iterates this mapping in insertion order.
wittgenstein_works = {}
wittgenstein_works["Philosophical Investigations"] = PI_topics
wittgenstein_works["Tractatus Logico-Philosophicus"] = TLP_topics

# Sanity check: how many topics (one API query each) are about to be processed.
print(f"Number of topics = {len(PI_topics) + len(TLP_topics)}")

# Utility Functions

## Save Questions to Local Text File

In [None]:
def save_questions_to_txt(questions):
    """Append a batch of generated questions to the local output file.

    The text is appended to the file at the module-level ``output_file_path``
    (UTF-8). A trailing newline is added when the batch does not already end
    with one, so the last question of one batch is never fused with the first
    ``QUESTION:`` line of the next batch.

    Args:
        questions: Text to append. An empty string writes nothing.
    """
    if not questions:
        return

    with open(output_file_path, 'a', encoding='utf-8') as file:
        file.write(questions)
        # Model responses frequently stop without a final line break.
        if not questions.endswith('\n'):
            file.write('\n')

## Upload Local Text File to Google Drive

In [None]:
# OAuth scope requested for the Drive service-account credentials.
scope = ['https://www.googleapis.com/auth/drive']
# Path of the service-account key file
# (presumably the file fetched by the earlier `gdown` cell -- TODO confirm the downloaded name matches).
service_account_json_key = '/kaggle/working/service_account.json'
credentials = service_account.Credentials.from_service_account_file(filename=service_account_json_key,
                                                                    scopes=scope)
# Drive v3 API client; upload_file_to_drive() uses this module-level object.
service = build('drive', 'v3', credentials=credentials)

def upload_file_to_drive(file_name, file_path, max_attempts=3):
    """Upload a local text file into the configured Google Drive folder.

    The file is created under ``GOOGLE_DRIVE_FOLDER_ID`` using the
    module-level ``service`` client. A failed upload is retried up to
    ``max_attempts`` times, waiting ``2 ** attempt`` seconds between tries.
    Upload errors are printed rather than raised; errors raised while
    building the ``MediaFileUpload`` (before the retry loop) still propagate.

    Args:
        file_name: Name the file gets in Drive.
        file_path: Path of the local file to upload.
        max_attempts: Maximum number of upload attempts.
    """
    file_metadata = {
        'name': file_name,
        'parents': [GOOGLE_DRIVE_FOLDER_ID],
    }

    media = MediaFileUpload(file_path, mimetype='text/plain')

    for attempt in range(1, max_attempts + 1):
        print(f'Uploading {file_name} (Attempt {attempt})')

        try:
            service.files().create(body=file_metadata,
                                   media_body=media,
                                   fields='id').execute()
        except Exception as e:
            # Best-effort boundary: report the failure and retry instead of aborting the run.
            print(f'Attempt {attempt} failed with error: {e}')

            if attempt < max_attempts:
                # The file imports `sleep` directly (`from time import sleep`);
                # the previous `time.sleep(...)` raised NameError on the first retry.
                sleep(2 ** attempt)
            else:
                print(f'Max attempts reached. Uploading {file_name} failed.')
        else:
            print(f'File {file_name} was successfully uploaded.')
            break

# Querying Mixtral to Generate Questions

## Prompts

In [None]:
# System prompt template; {work} is filled with the work's title via str.format().
# NOTE(review): the prompt text has grammatical slips ("the Ludwig Wittgenstein's",
# "be specified by the user"); left untouched here because it is runtime text.
system_prompt = '''
You are generating data which will be used to fine tune an AI generative question-answering model specialized in the Ludwig Wittgenstein's {work}. The model will simulate a question-answering discussion between a philosophy student and professor, where the student asks questions about a topic in Wittgenstein's philosophy and the professor answers them in a conversational, clear and coherent tone to clarify the student's confusions. Your task is to generate the student's potential questions only. The student's questions are centralized only on the topic in Wittgenstein's philosophy be specified by the user. Your questions should cover every angle of this topic and can be general or very specific. Format the questions in this manner:

QUESTION: $student_question_goes_here \n

For each data sample, make the question slightly more complex than the last, while ensuring diversity. Make sure your samples are unique and diverse, yet high-quality and complex enough to train a well-performing model. Provide as many questions on the given topic as you can while maintaining relevancy and not digressing.
'''

# User prompt template; {work} and {topic} are filled per topic via str.format().
user_prompt = '''
Focus the student's questions exclusively on this topic of Wittgenstein's {work}: {topic}.
'''

## Query Mixtral Through the Mistral AI API

In [None]:
import requests
from time import sleep

def query_mixtral(system_prompt, user_prompt, max_attempts=3, timeout=120):
    """Request a chat completion from the ``open-mixtral-8x7b`` model.

    Posts the two prompts to the chat-completions endpoint. Failed attempts
    (network errors, HTTP error statuses, timeouts, undecodable or malformed
    response bodies) are retried with exponential backoff of ``2 ** attempt``
    seconds.

    Args:
        system_prompt: Content of the ``system`` message.
        user_prompt: Content of the ``user`` message.
        max_attempts: Maximum number of requests before giving up.
        timeout: Seconds to wait for the server on each request, so a stalled
            connection cannot block the notebook indefinitely.

    Returns:
        A ``(completion_tokens, questions)`` tuple. ``questions`` is always a
        string. ``(0, "")`` is returned when every attempt failed.
    """
    url = "https://api.mistral.ai/v1/chat/completions"

    payload = {
        "model": "open-mixtral-8x7b",
        "stop": ["</s>"],
        "stream": False,
        "messages": [
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": user_prompt},
        ],
    }

    headers = {
        "accept": "application/json",
        "content-type": "application/json",
        "Authorization": f"Bearer {MIXTRAL_API_KEY}",
    }

    for attempt in range(1, max_attempts + 1):
        try:
            response = requests.post(url, json=payload, headers=headers, timeout=timeout)
            response.raise_for_status()
            data = response.json()
            if not isinstance(data, dict):
                raise ValueError("unexpected response shape (expected a JSON object)")

            # `or` fallbacks cover keys that are present but null or empty
            # (e.g. "choices": []), which would otherwise raise or leak None.
            usage = data.get("usage") or {}
            choices = data.get("choices") or [{}]
            message = choices[0].get("message") or {}
            completion_tokens = usage.get("completion_tokens") or 0
            questions = message.get("content") or ""
            return completion_tokens, questions
        except (requests.RequestException, ValueError) as e:
            # ValueError covers JSON decoding failures on requests versions whose
            # decode error is not a RequestException, plus the shape check above.
            print(f"Attempt {attempt} failed: {e}")
            if attempt < max_attempts:
                sleep_time = 2 ** attempt
                print(f"Retrying in {sleep_time} seconds...")
                sleep(sleep_time)
            else:
                print("Maximum retry attempts reached, failing.")

    return 0, ""

# Generate Questions

## Iterate Through Wittgenstein Topics and Generate Corresponding Questions

In [None]:
# Running total of the completion tokens reported across all successful queries.
total_completion_tokens = 0
# Topics that produced no usable response, grouped by work title.
missed_topics = {}

for work_name, work_topics in wittgenstein_works.items():
    # The system prompt depends only on the work, so it is formatted once per work.
    formatted_system_prompt = system_prompt.format(work=work_name)

    for topic in work_topics:
        formatted_user_prompt = user_prompt.format(work=work_name, topic=topic)
        print(f'Working on topic "{topic}" from {work_name}...')

        completion_tokens, questions = query_mixtral(
            system_prompt=formatted_system_prompt,
            user_prompt=formatted_user_prompt,
        )
        sleep(1)  # short pause between consecutive API calls

        # No tokens or no text: record the topic as missed and move on.
        if completion_tokens <= 0 or questions == "":
            missed_topics.setdefault(work_name, []).append(topic)
            continue

        total_completion_tokens += completion_tokens
        save_questions_to_txt(questions)

# Upload the accumulated questions file once all topics have been processed.
upload_file_to_drive(file_name=file_name, file_path=output_file_path)

print('Missed Topics:')
print(missed_topics)