# **GAI Project 2.b Text summarization**

## 環境設置

In [1]:
! pip install transformers
! pip install datasets
! pip install torcheval
! pip install pytorch-ignite
! pip install evaluate rouge_score
! pip install transformers[torch]

Collecting datasets
  Downloading datasets-2.19.0-py3-none-any.whl (542 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m542.0/542.0 kB[0m [31m5.3 MB/s[0m eta [36m0:00:00[0m
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m9.3 MB/s[0m eta [36m0:00:00[0m
Collecting xxhash (from datasets)
  Downloading xxhash-3.4.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (194 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m194.1/194.1 kB[0m [31m1.7 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting multiprocess (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl (134 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m7.3 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub>=0.21.2 (from datasets)
  Downloading huggingface_hub-0.22.2-py3-none-any.

In [2]:
from datasets import load_dataset
import torch
from torch.utils.data import Dataset, DataLoader
from torch.optim import AdamW
from tqdm import tqdm
from ignite.metrics import Rouge
import re
import warnings
import os
from rouge_score import rouge_scorer
import evaluate
import numpy as np
device = "cuda" if torch.cuda.is_available() else "cpu"
warnings.filterwarnings("ignore")

## 引入rouge

In [3]:
rouge_metric = evaluate.load("rouge")

Downloading builder script:   0%|          | 0.00/6.27k [00:00<?, ?B/s]

## 資料前處理

In [4]:
# 載入資料集
data = load_dataset("hugcyp/LCSTS", cache_dir="./cache/")
dataset_ratio = 0.01

# 切割資料
train_size = int(len(data["train"])*dataset_ratio)
validation_size = int(len(data["validation"])*dataset_ratio)
test_size = int(len(data["test"])*dataset_ratio)

data["train"] = data["train"].select(range(train_size))
data["validation"] = data["validation"].select(range(validation_size))
data["test"] = data["test"].select(range(test_size))

data

Downloading data:   0%|          | 0.00/903M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/3.38M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/245k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/2400591 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/8685 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/725 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['summary', 'text'],
        num_rows: 24005
    })
    validation: Dataset({
        features: ['summary', 'text'],
        num_rows: 86
    })
    test: Dataset({
        features: ['summary', 'text'],
        num_rows: 7
    })
})

## T5

### 導入模型

In [5]:
from transformers import T5Tokenizer, T5ForConditionalGeneration                              # T5 model and tokenizer
from transformers import Seq2SeqTrainingArguments, Seq2SeqTrainer, DataCollatorForSeq2Seq     # for trainer

In [6]:
# https://huggingface.co/Langboat/mengzi-t5-base
t5_model_checkpoint="Langboat/mengzi-t5-base"

t5_tokenizer = T5Tokenizer.from_pretrained(t5_model_checkpoint, cache_dir="./cache/")
t5_model = T5ForConditionalGeneration.from_pretrained(t5_model_checkpoint, cache_dir="./cache/").to(device)

spiece.model:   0%|          | 0.00/725k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/659 [00:00<?, ?B/s]

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


model.safetensors:   0%|          | 0.00/990M [00:00<?, ?B/s]

### 資料做tokenized

In [7]:
comment = "总结："

def t5_tokenize(batch):
    texts = [comment + doc for doc in batch["text"]]

    # 做token的動作
    tokenized = t5_tokenizer(texts, max_length=128, truncation=True)
    tokenized_outputs = t5_tokenizer(text_target=batch["summary"], max_length=32, truncation=True)
    tokenized["labels"] = tokenized_outputs["input_ids"]

    return tokenized

In [8]:
t5_tokenized_dataset = data.map(t5_tokenize, batched=True)
t5_tokenized_dataset

Map:   0%|          | 0/24005 [00:00<?, ? examples/s]

Map:   0%|          | 0/86 [00:00<?, ? examples/s]

Map:   0%|          | 0/7 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['summary', 'text', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 24005
    })
    validation: Dataset({
        features: ['summary', 'text', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 86
    })
    test: Dataset({
        features: ['summary', 'text', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 7
    })
})

In [9]:
for i in range(5):
	example = t5_tokenized_dataset['train'][i]
	print(f"\nSample {i}")
	print("text: ", example['text'])
	print("summary: ", example['summary'])
	print("decode_text: ", t5_tokenizer.decode(example['input_ids']))
	print("decode_summary: ", t5_tokenizer.decode(example['labels']))


Sample 0
text:  新华社受权于18日全文播发修改后的《中华人民共和国立法法》，修改后的立法法分为“总则”“法律”“行政法规”“地方性法规、自治条例和单行条例、规章”“适用与备案审查”“附则”等6章，共计105条。
summary:  修改后的立法法全文公布
decode_text:  总结:新华社受权于18日全文播发修改后的《中华人民共和国立法法》,修改后的立法法分为“总则”“法律”“行政法规”“地方性法规、自治条例和单行条例、规章”“适用与备案审查”“附则”等6章,共计105条。</s>
decode_summary:  修改后的立法法全文公布</s>

Sample 1
text:  一辆小轿车，一名女司机，竟造成9死24伤。日前，深圳市交警局对事故进行通报：从目前证据看，事故系司机超速行驶且操作不当导致。目前24名伤员已有6名治愈出院，其余正接受治疗，预计事故赔偿费或超一千万元。
summary:  深圳机场9死24伤续：司机全责赔偿或超千万
decode_text:  总结:一辆小轿车,一名女司机,竟造成9死24伤。日前,深圳市交警局对事故进行通报:从目前证据看,事故系司机超速行驶且操作不当导致。目前24名伤员已有6名治愈出院,其余正接受治疗,预计事故赔偿费或超一千万元。</s>
decode_summary:  深圳机场9死24伤续:司机全责赔偿或超千万</s>

Sample 2
text:  1月18日，习近平总书记对政法工作作出重要指示：2014年，政法战线各项工作特别是改革工作取得新成效。新形势下，希望全国政法机关主动适应新形势，为公正司法和提高执法司法公信力提供有力制度保障。
summary:  孟建柱：主动适应形势新变化提高政法机关服务大局的能力
decode_text:  总结:1月18日,习近平总书记对政法工作作出重要指示:2014年,政法战线各项工作特别是改革工作取得新成效。新形势下,希望全国政法机关主动适应新形势,为公正司法和提高执法司法公信力提供有力制度保障。</s>
decode_summary:  孟建柱:主动适应形势新变化提高政法机关服务大局的能力</s>

Sample 3
text:  针对央视3·15晚会曝光的电信行业乱象，工信部在公告中表示，将严查央视3·15晚会曝光通信违规违法行为。工信部称，已约谈三大

### trainer設置

In [10]:
t5_data_collator = DataCollatorForSeq2Seq(tokenizer=t5_tokenizer, model=t5_model)

In [11]:
def t5_compute_metrics(eval_pred):
    predictions, labels = eval_pred

    decoded_preds = t5_tokenizer.batch_decode(predictions, skip_special_tokens=True)
    labels = np.where(labels!=-100, labels, t5_tokenizer.pad_token_id)
    decoded_labels = t5_tokenizer.batch_decode(labels, skip_special_tokens=True)

    result = rouge_metric.compute(predictions=decoded_preds, references=decoded_labels, use_stemmer=True)

    gen_len = [np.count_nonzero(pred!=t5_tokenizer.pad_token_id) for pred in predictions]
    result["gen_len"] = np.mean(gen_len)

    return {k: round(v, 4) for k, v in result.items()}

In [12]:
t5_training_args = Seq2SeqTrainingArguments(
    report_to="none",
    output_dir="./saved_models",
    evaluation_strategy="epoch",
    learning_rate=0.0001,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    weight_decay=0.01,
    save_total_limit=1,
    num_train_epochs=5,
    metric_for_best_model="rougeL",
    fp16=True,
    predict_with_generate=True,
)

In [13]:
t5_trainer = Seq2SeqTrainer(
    model=t5_model,
    args=t5_training_args,
    data_collator=t5_data_collator,
    train_dataset=t5_tokenized_dataset["train"],
    eval_dataset=t5_tokenized_dataset["validation"],
    tokenizer=t5_tokenizer,
    compute_metrics=t5_compute_metrics,
)

In [14]:
t5_trainer.train()
t5_trainer.save_model("t5_result")

Epoch,Training Loss,Validation Loss,Rouge1,Rouge2,Rougel,Rougelsum,Gen Len
1,3.7047,4.09517,0.0674,0.0194,0.0698,0.0684,10.4767
2,2.7744,4.210754,0.0756,0.0194,0.0791,0.0779,10.2442
3,1.9842,4.554506,0.0826,0.0194,0.0849,0.0849,10.9186
4,1.42,4.845899,0.0767,0.0271,0.0767,0.0767,10.814
5,1.0646,5.004747,0.0942,0.031,0.0977,0.0977,10.7674


In [15]:
t5_trainer.evaluate()

{'eval_loss': 5.004746913909912,
 'eval_rouge1': 0.0942,
 'eval_rouge2': 0.031,
 'eval_rougeL': 0.0977,
 'eval_rougeLsum': 0.0977,
 'eval_gen_len': 10.7674,
 'eval_runtime': 3.7991,
 'eval_samples_per_second': 22.637,
 'eval_steps_per_second': 1.579,
 'epoch': 5.0}

### generate結果

In [16]:
def t5_generator(text, t5_model, t5_tokenizer):
    text = "总结： " + text
    tokenized_text = t5_tokenizer.encode(text, return_tensors="pt").to(device)
    summary_ids = t5_model.generate(tokenized_text,
                                 num_beams=4,
                                 no_repeat_ngram_size=3,
                                 min_length=8,
                                 max_length=32,
                                 length_penalty=2.0,
                                 temperature=0.8
    )
    output = t5_tokenizer.decode(summary_ids[0], skip_special_tokens=True)
    return output

In [17]:
type = 'test'
if len(data[type]) > 10:
  test_case=int(len(data[type])/5)
else:
  test_case=len(data[type])
for i in range(test_case):
    text = data[type][i]['text']
    summary = data[type][i]['summary']
    pred_summary = t5_generator(text, t5_model, t5_tokenizer)

    print(f"\nSample {i}")
    print("Text: ", text)
    print("Summary : ", summary)
    print("Pred_ans: ", pred_summary)


Sample 0
Text:  本文总结了十个可穿戴产品的设计原则，而这些原则，同样也是笔者认为是这个行业最吸引人的地方：1.为人们解决重复性问题；2.从人开始，而不是从机器开始；3.要引起注意，但不要刻意；4.提升用户能力，而不是取代人
Summary :  
Pred_ans:  十大可穿戴设计原则最吸引眼球的十个可穿戴产品设计原则

Sample 1
Text:  2007年乔布斯向人们展示iPhone并宣称“它将会改变世界”，还有人认为他在夸大其词，然而在8年后，以iPhone为代表的触屏智能手机已经席卷全球各个角落。未来，智能手机将会成为“真正的个人电脑”，为人类发展做出更大的贡献。
Summary :  
Pred_ans:  “真正的个人电脑”将会改变世界吗?

Sample 2
Text:  雅虎发布2014年第四季度财报，并推出了免税方式剥离其持有的阿里巴巴集团15％股权的计划，打算将这一价值约400亿美元的宝贵投资分配给股东。截止发稿前，雅虎股价上涨了大约7％，至51.45美元。
Summary :  
Pred_ans:  雅虎出售15%股权以“换股”的方式出售阿里巴巴15%股权

Sample 3
Text:  2014年，51信用卡管家跟宜信等P2P公司合作，推出线上信贷产品“瞬时贷”，其是一种纯在线操作的信贷模式。51信用卡管家创始人孙海涛说，51目前每天放贷1000万，预计2015年，自营产品加上瞬时贷，放贷额度将远超30亿。
Summary :  
Pred_ans:  51信用卡管家:每天放贷1000万额度超30亿

Sample 4
Text:  目前世界上有着几百种编程语言，我应该学哪个?如何选择“正确”的编程语言进行学习?我所学的语言日后能否成为我获取好生活的保障?在这个问题上，很多人都曾经给出了他们都看法。但在我看来，这个问题答案其实非常简单：那就是JavaScript。
Summary :  
Pred_ans:  如何学习JavaScript

Sample 5
Text:  受众在哪里，媒体就应该在哪里，媒体的体制、内容、技术就应该向哪里转变。媒体融合关键是以人为本，即满足大众的信息需求，为受众提供更优质的服务。这就要求媒体在融合发展的过程中，既注重技术创新，又注重用户体验。
Summary :  
Pred_ans

## GPT2

### 導入模型

In [18]:
from transformers import GPT2LMHeadModel, BertTokenizer                                  # Gpt2 model and tokenizer
from transformers import TrainingArguments, Trainer, DataCollatorForLanguageModeling     # for trainer

In [19]:
# https://huggingface.co/uer/gpt2-chinese-cluecorpussmall
GPT2_model_checkpoint = "uer/gpt2-distil-chinese-cluecorpussmall"

gpt2_model = GPT2LMHeadModel.from_pretrained(GPT2_model_checkpoint, cache_dir="./cache/").to(device)
gpt2_tokenizer = BertTokenizer.from_pretrained(GPT2_model_checkpoint, cache_dir="./cache/")

config.json:   0%|          | 0.00/576 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/244M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/217 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/110k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

### 資料做tokenized

In [20]:
comment = "总结："

def gpt2_tokenize(batch):
    texts = []

    # 將text跟summary串接在一起
    for text in batch["text"]:
        concatenated_text = text + comment
        texts.append(concatenated_text)

    # 做token的動作
    tokenized = gpt2_tokenizer(texts, batch["summary"], padding='max_length', max_length=160, truncation=True)
    tokenized_outputs = gpt2_tokenizer(batch["summary"], padding='max_length', max_length=32, truncation=True)
    tokenized["labels"] = tokenized_outputs["input_ids"]

    return tokenized

In [21]:
gpt2_tokenized_dataset = data.map(gpt2_tokenize, batched=True)
gpt2_tokenized_dataset

Map:   0%|          | 0/24005 [00:00<?, ? examples/s]

Map:   0%|          | 0/86 [00:00<?, ? examples/s]

Map:   0%|          | 0/7 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['summary', 'text', 'input_ids', 'token_type_ids', 'attention_mask', 'labels'],
        num_rows: 24005
    })
    validation: Dataset({
        features: ['summary', 'text', 'input_ids', 'token_type_ids', 'attention_mask', 'labels'],
        num_rows: 86
    })
    test: Dataset({
        features: ['summary', 'text', 'input_ids', 'token_type_ids', 'attention_mask', 'labels'],
        num_rows: 7
    })
})

In [22]:
for i in range(5):
	example = gpt2_tokenized_dataset['train'][i]
	print(f"\nSample {i}")
	print("text: ", example['text'])
	print("summary: ", example['summary'])
	print("decode_text: ", gpt2_tokenizer.decode(example['input_ids']))
	print("decode_summary: ", gpt2_tokenizer.decode(example['labels']))


Sample 0
text:  新华社受权于18日全文播发修改后的《中华人民共和国立法法》，修改后的立法法分为“总则”“法律”“行政法规”“地方性法规、自治条例和单行条例、规章”“适用与备案审查”“附则”等6章，共计105条。
summary:  修改后的立法法全文公布
decode_text:  [CLS] 新 华 社 受 权 于 18 日 全 文 播 发 修 改 后 的 《 中 华 人 民 共 和 国 立 法 法 》 ， 修 改 后 的 立 法 法 分 为 [UNK] 总 则 [UNK] [UNK] 法 律 [UNK] [UNK] 行 政 法 规 [UNK] [UNK] 地 方 性 法 规 、 自 治 条 例 和 单 行 条 例 、 规 章 [UNK] [UNK] 适 用 与 备 案 审 查 [UNK] [UNK] 附 则 [UNK] 等 6 章 ， 共 计 105 条 。 总 结 ： [SEP] 修 改 后 的 立 法 法 全 文 公 布 [SEP] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD]
decode_summary:  [CLS] 修 改 后 的 立 法 法 全 文 公 布 [SEP] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD]

Sample 1
text:  一辆小轿车，一名女司机，竟造成9死24伤。日前，深圳市交警局对事故进行通报：从目前证据看，事故系司机超速行驶且操作不当导致。目前24名伤员已有6名治愈出院，其余正接受治疗，预计

### trainer設置

In [23]:
gpt2_data_collator = DataCollatorForLanguageModeling(tokenizer=gpt2_tokenizer, mlm=False)

In [24]:
def gpt2_compute_metrics(eval_pred):
    predictions, labels = eval_pred

    predictions = np.argmax(predictions, axis=-1)
    decoded_preds = gpt2_tokenizer.batch_decode(predictions, skip_special_tokens=True)
    labels = np.where(labels!=-100, labels, gpt2_tokenizer.pad_token_id)
    decoded_labels = gpt2_tokenizer.batch_decode(labels, skip_special_tokens=True)

    result = rouge_metric.compute(predictions=decoded_preds, references=decoded_labels, use_stemmer=True)

    gen_len = [np.count_nonzero(pred!=gpt2_tokenizer.pad_token_id) for pred in predictions]
    result["gen_len"] = np.mean(gen_len)

    return {k: round(v, 4) for k, v in result.items()}

In [25]:
gpt2_training_args = TrainingArguments(
    report_to="none",
    output_dir="./saved_models",
    evaluation_strategy="epoch",
    learning_rate=0.0001,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    weight_decay=0.01,
    save_total_limit=1,
    num_train_epochs=5,
    metric_for_best_model="rougeL",
)

In [26]:
gpt2_trainer = Trainer(
    model=gpt2_model,
    args=gpt2_training_args,
    data_collator=gpt2_data_collator,
    train_dataset=gpt2_tokenized_dataset["train"],
    eval_dataset=gpt2_tokenized_dataset["validation"],
    tokenizer=gpt2_tokenizer,
    compute_metrics=gpt2_compute_metrics,
)

In [27]:
gpt2_trainer.train()
gpt2_trainer.save_model("gpt2_result")

Epoch,Training Loss,Validation Loss,Rouge1,Rouge2,Rougel,Rougelsum,Gen Len
1,2.7555,2.652003,0.0662,0.0112,0.066,0.0654,160.0
2,2.5852,2.65208,0.0888,0.0188,0.0856,0.0841,160.0
3,2.4528,2.659922,0.0775,0.0159,0.073,0.0727,160.0
4,2.349,2.671685,0.082,0.0164,0.0791,0.0789,160.0
5,2.2672,2.684328,0.0751,0.0158,0.0723,0.0724,160.0


In [28]:
gpt2_trainer.evaluate()

{'eval_loss': 2.6843278408050537,
 'eval_rouge1': 0.0751,
 'eval_rouge2': 0.0158,
 'eval_rougeL': 0.0723,
 'eval_rougeLsum': 0.0724,
 'eval_gen_len': 160.0,
 'eval_runtime': 1.9239,
 'eval_samples_per_second': 44.702,
 'eval_steps_per_second': 3.119,
 'epoch': 5.0}

### generate結果

In [29]:
def gpt2_generator(summary, text, gpt2_model, gpt2_tokenizer):
    comment = "总结："
    text = text + comment + gpt2_tokenizer.sep_token
    tokenized_text = gpt2_tokenizer.encode(text, return_tensors="pt").to(device)
    summary_ids = gpt2_model.generate(tokenized_text,
                                      num_beams=4,
                                      no_repeat_ngram_size=3,
                                      min_length=8,
                                      max_length=160,
                                      length_penalty=2.0,
                                      temperature=0.8,
                                      pad_token_id=gpt2_tokenizer.pad_token_id  # Use EOS token ID for padding
                                      )

    # Decode the generated summary
    output = gpt2_tokenizer.decode(summary_ids[0], skip_special_tokens=True)
    output = "".join(output.split())
    sep_index = output.find(comment)
    if sep_index != -1:
      output = output[sep_index + len(comment):]
    return output

In [30]:
type = 'test'
if len(data[type]) > 10:
  test_case=int(len(data[type])/5)
else:
  test_case=len(data[type])
for i in range(test_case):
    text = data[type][i]['text']
    summary = data[type][i]['summary']
    pred_summary = gpt2_generator(summary, text, gpt2_model, gpt2_tokenizer)

    print(f"\nSample {i}")
    print("Text: ", text)
    print("Summary : ", summary)
    print("Pred_ans: ", pred_summary)



Sample 0
Text:  本文总结了十个可穿戴产品的设计原则，而这些原则，同样也是笔者认为是这个行业最吸引人的地方：1.为人们解决重复性问题；2.从人开始，而不是从机器开始；3.要引起注意，但不要刻意；4.提升用户能力，而不是取代人
Summary :  
Pred_ans:  大数据时代，你应该知道的10个关键词10个常用的10大常用技能十个常见的10种常用错误总结10大

Sample 1
Text:  2007年乔布斯向人们展示iPhone并宣称“它将会改变世界”，还有人认为他在夸大其词，然而在8年后，以iPhone为代表的触屏智能手机已经席卷全球各个角落。未来，智能手机将会成为“真正的个人电脑”，为人类发展做出更大的贡献。
Summary :  
Pred_ans:  ，苹果的未来苹果将会是什么样子？揭秘苹果未来的苹果总结苹果：未来？

Sample 2
Text:  雅虎发布2014年第四季度财报，并推出了免税方式剥离其持有的阿里巴巴集团15％股权的计划，打算将这一价值约400亿美元的宝贵投资分配给股东。截止发稿前，雅虎股价上涨了大约7％，至51.45美元。
Summary :  
Pred_ans:  雅虎全面收缩免税业务，阿里将收购雅虎30%股权阿里股价暴涨：阿里收购阿里10%股份这一消息是怎样炼成的？？雅马哈

Sample 3
Text:  2014年，51信用卡管家跟宜信等P2P公司合作，推出线上信贷产品“瞬时贷”，其是一种纯在线操作的信贷模式。51信用卡管家创始人孙海涛说，51目前每天放贷1000万，预计2015年，自营产品加上瞬时贷，放贷额度将远超30亿。
Summary :  
Pred_ans:  51信贷卡秒贷：用户可以在线上随时查询到你的信用数据50万用户下单50亿用户上线信贷业务秒时贷

Sample 4
Text:  目前世界上有着几百种编程语言，我应该学哪个?如何选择“正确”的编程语言进行学习?我所学的语言日后能否成为我获取好生活的保障?在这个问题上，很多人都曾经给出了他们都看法。但在我看来，这个问题答案其实非常简单：那就是JavaScript。
Summary :  
Pred_ans:  如何学好编程？如何系统地学好编程浅谈语言的学习方法总结

Sample 5
Text:  受众在哪里，媒体就应该在哪里，媒体的体制