In [64]:
import os
import openai
import torch
import google.generativeai as genai
import gc
from accelerate import dispatch_model
from tenacity import retry, stop_after_attempt, wait_exponential
from transformers import AutoModelForCausalLM, AutoTokenizer
from torch.cuda.amp import autocast

In [65]:
# Replace with your actual OpenAI API key
os.environ["OPENAI_API_KEY"] = "OPENAI_API_KEY"
openai.api_key = os.getenv("OPENAI_API_KEY")

os.environ["GEMINI_API_KEY"] = "GEMINI_API_KEY"
genai.api_key=os.getenv("GEMINI_API_KEY")

In [66]:
# ========== Qwen ==========
qwen_model_path = r"cache/model/Qwen_Qwen2.5-72B-Instruct/models--Qwen--Qwen2.5-72B-Instruct/snapshots/495f39366efef23836d0cfae4fbe635880d2be31"
qwen_tokenizer_path = r"cache/tokenizer/Qwen_Qwen2.5-72B-Instruct/models--Qwen--Qwen2.5-72B-Instruct/snapshots/495f39366efef23836d0cfae4fbe635880d2be31"

# ========== Llama ==========
llama_model_path = r"cache/model/meta-llama_Llama-3.3-70B-Instruct/models--meta-llama--Llama-3.3-70B-Instruct/snapshots/6f6073b423013f6a7d4d9f39144961bfbfbc386b"
llama_tokenizer_path = r"cache/tokenizer/meta-llama_Llama-3.3-70B-Instruct/models--meta-llama--Llama-3.3-70B-Instruct/snapshots/6f6073b423013f6a7d4d9f39144961bfbfbc386b"


In [67]:
def clear_memory():
    """Gi·∫£i ph√≥ng b·ªô nh·ªõ GPU v√† CPU"""
    import gc
    import torch
    
    # Gi·∫£i ph√≥ng b·ªô nh·ªõ GPU
    if torch.cuda.is_available():
        torch.cuda.empty_cache()
        torch.cuda.synchronize()
    
    # Gi·∫£i ph√≥ng b·ªô nh·ªõ CPU
    gc.collect()
    
    print("üßπ ƒê√£ gi·∫£i ph√≥ng b·ªô nh·ªõ cache")

: 

In [None]:
 #H√†m load m√¥ h√¨nh v√† tokenizer
def load_model_and_tokenizer(model_path, tokenizer_path, model_name):
    print(f"\nüîÑ ƒêang load m√¥ h√¨nh {model_name}...")
    try:
        # Load tokenizer
        tokenizer = AutoTokenizer.from_pretrained(
            tokenizer_path,
            trust_remote_code=True,
            use_fast=False
        )
        
        # Load model
        model = AutoModelForCausalLM.from_pretrained(
            model_path,
            device_map="balanced",  # T·ª± ƒë·ªông ph√¢n ph·ªëi tr√™n c√°c GPU
            torch_dtype=torch.bfloat16,  # S·ª≠ d·ª•ng half precision ƒë·ªÉ ti·∫øt ki·ªám b·ªô nh·ªõ
            trust_remote_code=True
        )
        
        print(f"‚úÖ ƒê√£ load th√†nh c√¥ng m√¥ h√¨nh {model_name}")
        return model, tokenizer
    except Exception as e:
        print(f"‚ùå L·ªói khi load m√¥ h√¨nh {model_name}: {e}")
        return None, None

# Load Qwen
qwen_model, qwen_tokenizer = load_model_and_tokenizer(qwen_model_path, qwen_tokenizer_path, "Qwen")

# Load Llama
llama_model, llama_tokenizer = load_model_and_tokenizer(llama_model_path, llama_tokenizer_path, "Llama")

# Ki·ªÉm tra xem c√°c m√¥ h√¨nh ƒë√£ ƒë∆∞·ª£c load th√†nh c√¥ng ch∆∞a
if qwen_model is None or qwen_tokenizer is None:
    print("‚ö†Ô∏è Kh√¥ng th·ªÉ load m√¥ h√¨nh Qwen")
if llama_model is None or llama_tokenizer is None:
    print("‚ö†Ô∏è Kh√¥ng th·ªÉ load m√¥ h√¨nh Llama")

INFO:accelerate.utils.modeling:We will use 90% of the memory on device 0 for storing the model, and 10% for the buffer to avoid OOM. You can set `max_memory` in to a higher value to use more memory (at your own risk).
INFO:accelerate.utils.modeling:Based on the current allocation process, no modules could be assigned to the following devices due to insufficient memory:
  - 0: 4982833152 bytes required
  - 1: 2491416576 bytes required
  - 2: 2491416576 bytes required
These minimum requirements are specific to this allocation attempt and may vary. Consider increasing the available memory for these devices to at least the specified minimum, or adjusting the model config.



üîÑ ƒêang load m√¥ h√¨nh Qwen...


Loading checkpoint shards:   0%|          | 0/37 [00:00<?, ?it/s]

In [53]:
def standard_prompt(query, task_type="elementary_math"):
    """
    Create a standardized prompt for elementary school math problems.
    
    Parameters:
    - query (str): The math problem statement
    - task_type (str): Type of task (default: "elementary_math")
    
    Returns:
    - str: Formatted prompt ready to be sent to an LLM
    """
    if task_type == "elementary_math":
        return f"""Please solve this elementary school math problem step by step:

Problem: {query}

Please follow these steps:
1. Read the problem carefully
2. Identify what is given and what needs to be found
3. Write down the equation or method needed
4. Solve step by step
5. Check your answer
6. Write the final answer clearly

Solution:"""
    else:
        return f"""Please solve this problem:

Problem: {query}

Answers:"""

In [54]:
@torch.no_grad()
def clear_gpu_cache():
    """Clear GPU cache and garbage collection"""
    torch.cuda.empty_cache()
    gc.collect()
    print("üßπ ƒê√£ gi·∫£i ph√≥ng b·ªô nh·ªõ cache")

class InferenceContext:
    """Context manager for automatic memory management"""
    def __enter__(self):
        clear_gpu_cache()
        return self
        
    def __exit__(self, exc_type, exc_val, exc_tb):
        clear_gpu_cache()

def optimize_inference(model, tokenizer, problem_text, max_new_tokens=50):
    """
    Optimized inference function with performance improvements
    """
    try:
        # 1. Optimize input processing
        inputs = tokenizer(
            problem_text,
            return_tensors="pt",
            padding=True,
            truncation=True,
            max_length=512
        ).to(model.device)

        # 2. Enable optimizations
        with torch.inference_mode(), torch.amp.autocast('cuda'):
            outputs = model.generate(
                inputs.input_ids,
                attention_mask=inputs.attention_mask,
                # 3. Optimize generation parameters
                max_new_tokens=max_new_tokens,
                min_new_tokens=10,
                temperature=0.7,
                top_p=0.9,
                top_k=50,
                num_beams=1,
                do_sample=False,
                early_stopping=True,
                pad_token_id=model.config.eos_token_id,
                # 4. Enable optimization flags
                use_cache=True,
                repetition_penalty=1.0
            )
        
        result = tokenizer.decode(outputs[0], skip_special_tokens=True)
        return result

    except Exception as e:
        return f"L·ªói: {str(e)}"

def batch_inference(model, tokenizer, questions, batch_size=4):
    """
    Process multiple questions in batches for better performance
    """
    results = []
    for i in range(0, len(questions), batch_size):
        batch = questions[i:i + batch_size]
        inputs = tokenizer(
            batch, 
            return_tensors="pt",
            padding=True,
            truncation=True,
            max_length=512
        ).to(model.device)
        
        with torch.inference_mode():
            outputs = model.generate(**inputs)
            
        decoded = tokenizer.batch_decode(outputs, skip_special_tokens=True)
        results.extend(decoded)
    return results

def fast_inference(model, tokenizer, question):
    """
    Fast inference wrapper with automatic memory management
    """
    with InferenceContext():
        # Clear cache before starting
        clear_gpu_cache()
        
        # Perform inference with optimizations
        result = optimize_inference(
            model,
            tokenizer,
            question,
            max_new_tokens=50
        )
        
        return result

In [55]:
# Function to call GPT API
def gpt_inference(problem_text):
    try:
        response = openai.ChatCompletion.create(
            model="gpt-3.5-turbo",
            messages=[{"role": "user", "content": standard_prompt(problem_text)}],
            max_tokens=50
        )
        return response.choices[0].message['content'].strip()
    except Exception as e:
        return f"API Error: {str(e)}"

In [56]:
import google.generativeai as genai
genai.configure(api_key=os.getenv("GEMINI_API_KEY"))

# ‚úÖ **Load m√¥ h√¨nh Gemini**
try:
    gemini_text_model = genai.GenerativeModel('gemini-1.5-flash-8b')  # Th·ª≠ model kh√°c n·∫øu l·ªói
    gemini_vision_model = genai.GenerativeModel('gemini-1.5-flash')
    print("‚úÖ Gemini models loaded successfully!")
except Exception as e:
    print(f"‚ùå L·ªói khi load Gemini: {e}")


‚úÖ Gemini models loaded successfully!


In [57]:
def gemini_inference_improved(problem_text, model_type='text'):
    """H√†m inference c·∫£i ti·∫øn cho m√¥ h√¨nh Gemini"""
    try:
        prompt = standard_prompt(problem_text, "math")
        
        if model_type == 'text':
            response = gemini_text_model.generate_content(
                prompt,
                generation_config={
                    "temperature": 0.7,
                    "max_output_tokens": 512,
                    "top_p": 0.95
                }
            )
        else:
            # L·ªói: problem_image kh√¥ng ƒë∆∞·ª£c ƒë·ªãnh nghƒ©a
            # S·ª≠a: b·ªè tham chi·∫øu t·ªõi problem_image
            response = gemini_vision_model.generate_content(
                prompt,  # Ch·ªâ s·ª≠ d·ª•ng vƒÉn b·∫£n
                generation_config={
                    "temperature": 0.7,
                    "max_output_tokens": 512,
                    "top_p": 0.95
                }
            )
        
        return response.text.replace("**", "").strip()  # Clean formatting
    
    except Exception as e:
        print(f"‚ö†Ô∏è L·ªói Gemini: {str(e)}")
        return "Kh√¥ng th·ªÉ t·∫°o ph·∫£n h·ªìi t·ª´ Gemini"

In [58]:
def qwen_inference_improved(problem_text):
    """Improved inference function for Qwen model"""
    if qwen_model is None or qwen_tokenizer is None:
        return "L·ªói: Model ho·∫∑c tokenizer Qwen ch∆∞a ƒë∆∞·ª£c load"
    
    try:
        with InferenceContext():
            result = optimize_inference(
                qwen_model,
                qwen_tokenizer,
                problem_text,
                max_new_tokens=50
            )
            return result
    except Exception as e:
        return f"L·ªói Qwen: {str(e)}"


In [59]:

def llama_inference_improved(problem_text):
    """Improved inference function for Llama model"""
    if llama_model is None or llama_tokenizer is None:
        return "L·ªói: Model ho·∫∑c tokenizer Llama ch∆∞a ƒë∆∞·ª£c load"
    
    try:
        with InferenceContext():
            result = optimize_inference(
                llama_model,
                llama_tokenizer,
                problem_text,
                max_new_tokens=50
            )
            return result
    except Exception as e:
        return f"L·ªói Llama: {str(e)}"

In [60]:

print("\n" + "="*60)
print("üöÄ B∆Ø·ªöC 2: TH·ª¨ INFERENCE V·ªöI M√î H√åNH ƒê√É C·∫¢I THI·ªÜN")
print("="*60)


#%% [Example Usage]
problem1 = "ƒê·ªÅ b√†i: M·ªôt con tr√¢u ƒÉn c·ªè trong 5 ng√†y th√¨ h·∫øt m·ªôt ƒë√°m c·ªè. N·∫øu hai con tr√¢u ƒÉn th√¨ ch·ªâ ƒë·ªß trong 3 ng√†y. H·ªèi n·∫øu ba con tr√¢u ƒÉn th√¨ ƒë√°m c·ªè ƒë√≥ ƒë·ªß trong m·∫•y ng√†y?"
problem2 = "ƒê·ªÅ b√†i: C√≥ m·ªôt √¥ng l√£o mu·ªën chia b√≥ ƒë≈©a cho c√°c con sao cho: ‚Ä¢ N·∫øu chia cho 2 ng∆∞·ªùi th√¨ d∆∞ 1 chi·∫øc. ‚Ä¢ N·∫øu chia cho 3 ng∆∞·ªùi th√¨ d∆∞ 1 chi·∫øc. ‚Ä¢ N·∫øu chia cho 4, 5 ho·∫∑c 6 ng∆∞·ªùi th√¨ ƒë·ªÅu d∆∞ 1 chi·∫øc. ‚Ä¢ Nh∆∞ng n·∫øu chia cho 7 ng∆∞·ªùi th√¨ v·ª´a ƒë·ªß. H·ªèi b√≥ ƒë≈©a c√≥ bao nhi√™u chi·∫øc?"
problem3 = "ƒê·ªÅ b√†i: M·ªôt ng∆∞·ªùi g√°nh cam ƒëi b√°n, khi qua ch·ª£, √¥ng b√°n m·ªôt n·ª≠a s·ªë cam v√† th√™m n·ª≠a qu·∫£. ƒêi ti·∫øp m·ªôt ƒëo·∫°n, √¥ng l·∫°i b√°n m·ªôt n·ª≠a s·ªë cam c√≤n l·∫°i v√† th√™m n·ª≠a qu·∫£. Cu·ªëi c√πng, khi ƒë·∫øn ch·ª£ cu·ªëi c√πng, √¥ng l·∫°i b√°n m·ªôt n·ª≠a s·ªë cam c√≤n l·∫°i v√† th√™m n·ª≠a qu·∫£, h·∫øt s·∫°ch cam. H·ªèi ban ƒë·∫ßu √¥ng c√≥ bao nhi√™u qu·∫£ cam?"
problem4 = "ƒê·ªÅ b√†i: V·ª´a g√† v·ª´a ch√≥, B√≥ l·∫°i cho tr√≤n, Ba m∆∞∆°i s√°u con, M·ªôt trƒÉm ch√¢n ch·∫µn. H·ªèi c√≥ bao nhi√™u con g√†, bao nhi√™u con ch√≥?"

# Ch·∫°y inference v·ªõi m√¥ h√¨nh Gemini (th∆∞·ªùng ·ªïn ƒë·ªãnh nh·∫•t)
print("\nü§ñ ƒêang ch·∫°y inference v·ªõi Gemini...")
gemini_result = gemini_inference_improved(problem4)
print("\nüìù K·∫øt qu·∫£ Gemini:")
print(gemini_result)

# Th·ª≠ v·ªõi Llama
print("\nü§ñ ƒêang ch·∫°y inference v·ªõi Llama c·∫£i ti·∫øn...")
# Gi·∫£i ph√≥ng b·ªô nh·ªõ tr∆∞·ªõc khi ch·∫°y
clear_memory()
try:
    llama_result = llama_inference_improved(problem4)
    print("\nüìù K·∫øt qu·∫£ Llama:")
    print(llama_result)
except Exception as e:
    print(f"‚ùå Kh√¥ng th·ªÉ ch·∫°y inference v·ªõi Llama: {e}")

# Th·ª≠ v·ªõi Qwen
print("\nü§ñ ƒêang ch·∫°y inference v·ªõi Qwen c·∫£i ti·∫øn...")
# Gi·∫£i ph√≥ng b·ªô nh·ªõ tr∆∞·ªõc khi ch·∫°y
clear_memory()
try:
    qwen_result = qwen_inference_improved(problem4)
    print("\nüìù K·∫øt qu·∫£ Qwen:")
    print(qwen_result)
except Exception as e:
    print(f"‚ùå Kh√¥ng th·ªÉ ch·∫°y inference v·ªõi Qwen: {e}")

# T·ªïng k·∫øt
# Th√™m ki·ªÉm tra bi·∫øn
llama_result = "Kh√¥ng c√≥ k·∫øt qu·∫£" if 'llama_result' not in locals() else llama_result
qwen_result = "Kh√¥ng c√≥ k·∫øt qu·∫£" if 'qwen_result' not in locals() else qwen_result
gemini_result = "Kh√¥ng c√≥ k·∫øt qu·∫£" if 'gemini_result' not in locals() else gemini_result

# T·ªïng k·∫øt
print("\n" + "="*60)
print("‚úÖ HO√ÄN TH√ÄNH")
print("="*60)
print("üìä T√≥m t·∫Øt k·∫øt qu·∫£:")
print(f"- Gemini: {'Th√†nh c√¥ng' if 'Kh√¥ng th·ªÉ t·∫°o ph·∫£n h·ªìi t·ª´ Gemini' not in gemini_result else 'Th·∫•t b·∫°i'}")
print(f"- Llama: {'Th√†nh c√¥ng' if 'L·ªói Llama' not in llama_result else 'Th·∫•t b·∫°i'}")
print(f"- Qwen: {'Th√†nh c√¥ng' if 'L·ªói Qwen' not in qwen_result else 'Th·∫•t b·∫°i'}")


üöÄ B∆Ø·ªöC 2: TH·ª¨ INFERENCE V·ªöI M√î H√åNH ƒê√É C·∫¢I THI·ªÜN

ü§ñ ƒêang ch·∫°y inference v·ªõi Gemini...

üìù K·∫øt qu·∫£ Gemini:
Let 'g' be the number of chickens and 'h' be the number of dogs.

We have two equations:

1) g + h = 36  (Total animals)
2) 2g + 4h = 100 (Total legs)

From equation 1, we can express g as:

g = 36 - h

Substitute this into equation 2:

2(36 - h) + 4h = 100
72 - 2h + 4h = 100
2h = 28
h = 14

Now substitute the value of h back into the equation for g:

g = 36 - 14
g = 22

Therefore, there are $\boxed{22}$ chickens and $\boxed{14}$ dogs.

ü§ñ ƒêang ch·∫°y inference v·ªõi Llama c·∫£i ti·∫øn...
üßπ ƒê√£ gi·∫£i ph√≥ng b·ªô nh·ªõ cache

üìù K·∫øt qu·∫£ Llama:
L·ªói: Model ho·∫∑c tokenizer Llama ch∆∞a ƒë∆∞·ª£c load

ü§ñ ƒêang ch·∫°y inference v·ªõi Qwen c·∫£i ti·∫øn...
üßπ ƒê√£ gi·∫£i ph√≥ng b·ªô nh·ªõ cache
üßπ ƒê√£ gi·∫£i ph√≥ng b·ªô nh·ªõ cache




üßπ ƒê√£ gi·∫£i ph√≥ng b·ªô nh·ªõ cache

üìù K·∫øt qu·∫£ Qwen:
L·ªói: CUDA out of memory. Tried to allocate 2.32 GiB. GPU 0 has a total capacity of 47.99 GiB of which 0 bytes is free. Of the allocated memory 45.53 GiB is allocated by PyTorch, and 13.69 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)

‚úÖ HO√ÄN TH√ÄNH
üìä T√≥m t·∫Øt k·∫øt qu·∫£:
- Gemini: Th√†nh c√¥ng
- Llama: Th√†nh c√¥ng
- Qwen: Th√†nh c√¥ng


In [61]:
import logging

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

def safe_inference(model, tokenizer, text, model_name):
    """
    Safe inference wrapper with error handling and logging
    """
    try:
        if model is None or tokenizer is None:
            logger.error(f"{model_name}: Model or tokenizer not loaded")
            return f"L·ªói {model_name}: Model ch∆∞a ƒë∆∞·ª£c load"
            
        with InferenceContext():
            result = optimize_inference(
                model,
                tokenizer,
                text,
                max_new_tokens=50
            )
            logger.info(f"{model_name}: Inference successful")
            return result
            
    except Exception as e:
        logger.error(f"{model_name}: Error during inference - {str(e)}")
        return f"L·ªói {model_name}: {str(e)}"

In [62]:
def check_gpu_memory():
    """Check GPU memory before inference"""
    for i in range(torch.cuda.device_count()):
        total = torch.cuda.get_device_properties(i).total_memory / 1024**3
        allocated = torch.cuda.memory_allocated(i) / 1024**3
        free = total - allocated
        print(f"GPU {i}: {torch.cuda.get_device_name(i)}")
        print(f"  Total: {total:.2f} GB")
        print(f"  Allocated: {allocated:.2f} GB")
        print(f"  Free: {free:.2f} GB")

In [63]:
def cot_prompt(problem):
    return f"""You are a helpful assistant tasked with solving the following problem step by step. Explain your reasoning clearly at each stage and provide the final answer.

Problem: {problem}

Follow these steps to solve the problem:
1. **Restate the Problem**: Rephrase the problem in your own words to confirm understanding.
2. **Identify Key Variables**: Define the unknowns (e.g., 'let x be...') and list what needs to be solved.
3. **List Given Information**: Write down all provided data and any assumptions (e.g., units, implied conditions).
4. **Recall Relevant Concepts**: Mention any formulas, rules, or strategies (e.g., algebra, geometry) that apply.
5. **Break Down the Problem**: If complex, split it into smaller sub-problems or logical steps.
6. **Solve Step by Step**: Work through each part, showing all calculations and explaining your logic.
7. **Combine Results**: Integrate the solutions to sub-problems to find the overall answer.
8. **Verify the Solution**: Check the answer by substituting it back into the problem or using an alternative method. Explain why it makes sense.
9. **Present the Answer**: State the final answer clearly and concisely.

Focus on clarity and logical progression. If unsure, explore multiple approaches and select the best one."""