In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input tree and print the full path of every file found.
for root, _, names in os.walk('/kaggle/input'):
    for path in (os.path.join(root, name) for name in names):
        print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [3]:
import torch
import pandas as pd
from transformers import pipeline
from torch.utils.data import Dataset
from tqdm.auto import tqdm
import re

# --- 1. Load data ---
# Tab-separated Kaggle test set; the rest of the notebook reads its `id` and `review` columns.
TEST_DATA_PATH = '/kaggle/input/testdata-tsv/testData.tsv'
try:
    df = pd.read_csv(TEST_DATA_PATH, sep='\t')
except FileNotFoundError:
    # Only a missing file triggers the smoke-test fallback. Any other failure
    # (malformed TSV, permission error, Ctrl-C) must surface instead of silently
    # producing a 5-row dummy frame that would later be written out as a submission.
    print(f"WARNING: {TEST_DATA_PATH} not found; falling back to a 5-row dummy dataset.")
    df = pd.DataFrame({'id': range(5), 'review': ["This movie is fantastic!"]*5})

# --- 2. Dataset wrapper ---
class ReviewDataset(Dataset):
    """Map-style dataset that serves texts by integer position.

    Args:
        texts: Any iterable of texts. It is copied into a list so that ``len()``
            and positional ``[idx]`` access also work for generators and for a
            pandas Series whose index labels are not ``0..n-1`` (plain
            ``series[idx]`` would look up by label there).
    """

    def __init__(self, texts):
        self.texts = list(texts)

    def __len__(self):
        """Return the number of stored texts."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Return the text stored at position ``idx``."""
        return self.texts[idx]

# --- 3. Core change: few-shot prompt ---
# The prompt itself carries 3 worked examples (2 positive, 1 negative) and no <think> tag.
# The intent is that the model imitates the example format and answers with a bare 0 or 1.
# `{review_text}` is the only replacement field; it is filled in via str.format below.
few_shot_prompt = """Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.

### Instruction:
Analyze the IMDb movie review and determine the sentiment polarity.
Return 1 for positive, 0 for negative. Output ONLY the number.

### Input:
I loved this movie! It was fantastic and the acting was great.
### Response:
1

### Input:
This was the worst film I have ever seen. Boring and terrible plot.
### Response:
0

### Input:
A masterpiece of cinema, truly touching and beautiful.
### Response:
1

### Input:
{review_text}
### Response:
"""

# Preprocessing: the few-shot prompt is longer now, so each review is clipped to its
# first 1000 characters to leave room for the rest of the context.
formatted_prompts = [
    few_shot_prompt.format(review_text=clipped)
    for clipped in (str(raw)[:1000] for raw in df['review'])
]
dataset = ReviewDataset(formatted_prompts)

# --- 4. Load the model (TinyLlama 1.1B chat) ---
model_id = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"

# Use the first GPU when one is visible; otherwise fall back to CPU (-1) rather than
# failing on a hard-coded device index. Half precision is only requested on GPU.
use_gpu = torch.cuda.is_available()

print("正在加载模型...")
pipe = pipeline(
    "text-generation",
    model=model_id,
    device=0 if use_gpu else -1,
    torch_dtype=torch.float16 if use_gpu else torch.float32,
    max_new_tokens=20,          # hard cap on generated tokens so the answer has to come immediately
    do_sample=False,            # greedy decoding: a classification label should not vary between runs
    truncation=True
)

# The tokenizer gets EOS as its pad token so batched calls can pad (and stop warning
# about a missing pad token); padding goes on the left so that generation continues
# directly from the end of each prompt in a batch.
pipe.tokenizer.pad_token_id = pipe.tokenizer.eos_token_id
pipe.tokenizer.padding_side = 'left'

# --- 5. Run inference ---
# Every prompt only produces a short continuation, so a large batch size is used.
BATCH_SIZE = 128
results = []

print(f"开始极速直出推理 (Batch Size={BATCH_SIZE})...")

progress = tqdm(pipe(dataset, batch_size=BATCH_SIZE), total=len(dataset))
for i, out in enumerate(progress):
    # --- 6. Minimal parsing ---
    # No <think> section to split off: drop the echoed prompt and keep only what
    # the model appended after it.
    generated_part = out[0]['generated_text'][len(formatted_prompts[i]):]

    # The first standalone 0 or 1 in the continuation is the prediction.
    digit = re.search(r'\b(0|1)\b', generated_part)
    if digit is not None:
        pred = int(digit.group(1))
    else:
        # Fallback when no digit was produced: keyword check, defaulting to positive.
        lowered = generated_part.lower()
        if "negative" in lowered or "boring" in lowered:
            pred = 0
        else:
            pred = 1

    results.append(pred)

    # Debug: show the first 3 samples to verify the model answered directly.
    if i < 3:
        print(f"\n--- Sample {i} ---")
        print(f"Generated: {generated_part.strip()}")
        print(f"Predicted: {pred}")

# --- 7. Save ---
# Attach the predictions to the frame, then write only the id/sentiment pair.
df['sentiment'] = results
submission_columns = ['id', 'sentiment']
df[submission_columns].to_csv('Llama.csv', index=False)
print("\n任务完成！")

正在加载模型...


config.json:   0%|          | 0.00/608 [00:00<?, ?B/s]



model.safetensors:   0%|          | 0.00/2.20G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

tokenizer_config.json: 0.00B [00:00, ?B/s]

tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/551 [00:00<?, ?B/s]

Device set to use cuda:0


开始极速直出推理 (Batch Size=128)...


  0%|          | 0/25000 [00:00<?, ?it/s]


--- Sample 0 ---
Generated: 1
Predicted: 1

--- Sample 1 ---
Generated: 0

As you see, the task was correctly completed. The given input provided the prompt for
Predicted: 0

--- Sample 2 ---
Generated: 1
Predicted: 1

任务完成！
