In [3]:
from datasets import load_dataset


In [7]:
## load dataset

dataset = load_dataset("imdb", data_dir='./')

Generating train split: 0 examples [00:00, ? examples/s]

Generating test split: 0 examples [00:00, ? examples/s]

In [62]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")


def tokenize_function(examples):
    return tokenizer(examples["text"], max_length=256, truncation=True, padding="max_length")


tokenized_datasets = dataset.map(tokenize_function, batched=True)

Map:   0%|          | 0/25000 [00:00<?, ? examples/s]

Map:   0%|          | 0/25000 [00:00<?, ? examples/s]

In [63]:
small_train_dataset = tokenized_datasets["train"].shuffle(seed=42).select(range(1000))
small_eval_dataset = tokenized_datasets["test"].shuffle(seed=42).select(range(1000))

In [77]:
from peft import LoraConfig, TaskType

lora_config = LoraConfig(
    task_type=TaskType.SEQ_CLS, r=1, lora_alpha=1, lora_dropout=0.1
)
from transformers import BertForSequenceClassification

model = BertForSequenceClassification.from_pretrained(
    'bert-base-cased', 
    num_labels=2
)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [13]:
from peft import get_peft_model
model = get_peft_model(model, lora_config)

In [16]:
model.print_trainable_parameters()

trainable params: 38,402 || all params: 108,350,212 || trainable%: 0.0354


In [79]:
model.to('cuda:0')

38402

In [20]:
import numpy as np
import evaluate

In [21]:
metric = evaluate.load("accuracy")

Downloading builder script: 0.00B [00:00, ?B/s]

In [22]:
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    return metric.compute(predictions=predictions, references=labels)

In [64]:
from transformers import TrainingArguments, Trainer

training_args = TrainingArguments(output_dir="test_trainer", eval_strategy ="epoch", num_train_epochs=25)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=small_train_dataset,
    eval_dataset=small_eval_dataset,
    compute_metrics=compute_metrics,
)

No label_names provided for model class `PeftModelForSequenceClassification`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


In [97]:
training_args.optim

<OptimizerNames.ADAMW_TORCH: 'adamw_torch'>

In [94]:
trainer.optimizer

AcceleratedOptimizer (
Parameter Group 0
    amsgrad: False
    betas: (0.9, 0.999)
    capturable: False
    differentiable: False
    eps: 1e-08
    foreach: None
    fused: None
    initial_lr: 5e-05
    lr: 4.976e-05
    maximize: False
    weight_decay: 0.0

Parameter Group 1
    amsgrad: False
    betas: (0.9, 0.999)
    capturable: False
    differentiable: False
    eps: 1e-08
    foreach: None
    fused: None
    initial_lr: 5e-05
    lr: 4.976e-05
    maximize: False
    weight_decay: 0.0
)

In [None]:
trainer.train()

In [None]:

optimizer=transformers.AdamW(model.parameters(),lr=learning_rate)
linear_scheduler = torch.optim.lr_scheduler.LinearLR(optimizer, total_iters=100)
trainer = transformers.Trainer(
    model=model,
    args=train_args,
    optimizers=(optimizer,linear_scheduler),
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    data_collator=data_collator,
)



In [None]:
from transformers import AutoModelForCausalLM
from peft import get_peft_config, get_peft_model, LoraConfig, TaskType

model_name_or_path = "gpt2"

In [14]:
model = AutoModelForCausalLM.from_pretrained(model_name_or_path)

In [24]:
model.state_dict

<bound method Module.state_dict of GPT2LMHeadModel(
  (transformer): GPT2Model(
    (wte): Embedding(50257, 768)
    (wpe): Embedding(1024, 768)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0-11): 12 x GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2Attention(
          (c_attn): Conv1D(nf=2304, nx=768)
          (c_proj): Conv1D(nf=768, nx=768)
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Conv1D(nf=3072, nx=768)
          (c_proj): Conv1D(nf=768, nx=3072)
          (act): NewGELUActivation()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
    )
    (ln_f): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
  )
  (lm_head): Linear(in_features=768, out_features=50257, bias=False)
)>

In [19]:
for k,v in  enumerate(model.state_dict()):
    print(v)

transformer.wte.weight
transformer.wpe.weight
transformer.h.0.ln_1.weight
transformer.h.0.ln_1.bias
transformer.h.0.attn.c_attn.weight
transformer.h.0.attn.c_attn.bias
transformer.h.0.attn.c_proj.weight
transformer.h.0.attn.c_proj.bias
transformer.h.0.ln_2.weight
transformer.h.0.ln_2.bias
transformer.h.0.mlp.c_fc.weight
transformer.h.0.mlp.c_fc.bias
transformer.h.0.mlp.c_proj.weight
transformer.h.0.mlp.c_proj.bias
transformer.h.1.ln_1.weight
transformer.h.1.ln_1.bias
transformer.h.1.attn.c_attn.weight
transformer.h.1.attn.c_attn.bias
transformer.h.1.attn.c_proj.weight
transformer.h.1.attn.c_proj.bias
transformer.h.1.ln_2.weight
transformer.h.1.ln_2.bias
transformer.h.1.mlp.c_fc.weight
transformer.h.1.mlp.c_fc.bias
transformer.h.1.mlp.c_proj.weight
transformer.h.1.mlp.c_proj.bias
transformer.h.2.ln_1.weight
transformer.h.2.ln_1.bias
transformer.h.2.attn.c_attn.weight
transformer.h.2.attn.c_attn.bias
transformer.h.2.attn.c_proj.weight
transformer.h.2.attn.c_proj.bias
transformer.h.2.ln_2

In [26]:
peft_config = LoraConfig(
    task_type=TaskType.CAUSAL_LM,
    r=8,
    lora_alpha=32,
    lora_dropout=0.1,
)

model_lora = get_peft_model(model, peft_config)

model_lora.print_trainable_parameters()
# output: trainable params: 786,432
#      || all params: 331,982,848
#      || trainable%: 0.2368893467652883

trainable params: 294,912 || all params: 124,734,720 || trainable%: 0.2364


In [27]:
model.state_dict

<bound method Module.state_dict of GPT2LMHeadModel(
  (transformer): GPT2Model(
    (wte): Embedding(50257, 768)
    (wpe): Embedding(1024, 768)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0-11): 12 x GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2Attention(
          (c_attn): lora.Linear(
            (base_layer): Conv1D(nf=2304, nx=768)
            (lora_dropout): ModuleDict(
              (default): Dropout(p=0.1, inplace=False)
            )
            (lora_A): ModuleDict(
              (default): Linear(in_features=768, out_features=8, bias=False)
            )
            (lora_B): ModuleDict(
              (default): Linear(in_features=8, out_features=2304, bias=False)
            )
            (lora_embedding_A): ParameterDict()
            (lora_embedding_B): ParameterDict()
            (lora_magnitude_vector): ModuleDict()
          )
          (c_proj): Conv1D(nf=768, nx=768)
          (attn_d

In [29]:
( 768*8 + 8*2304 ) * 12

294912

In [118]:
import torch
from   torch import nn
import matplotlib.pyplot  as plt 
torch.manual_seed(99)     # 设定随机种子，使得每次运行结果一样

# 定义神经网络的结构
class MLP(nn.Module):
    def __init__(self):
        super(MLP, self).__init__()
        self.stack=nn.Sequential(
            nn.Linear(10, 100),                                                       # 1输入,4输出                                                         # 激活函数用Tanh
            nn.Linear(100, 5)                                                        # 4输入,1输出
            )                                                                      
    def forward(self, x):                                                          
        y = self.stack(x)                                                          # 按stack计算模型输出
        return y                                                                   # 输出y

In [136]:
model = MLP()

model.state_dict

<bound method Module.state_dict of MLP(
  (stack): Sequential(
    (0): Linear(in_features=10, out_features=100, bias=True)
    (1): Linear(in_features=100, out_features=5, bias=True)
  )
)>

In [137]:
for k, v in model.state_dict().items():
    print(k, v.shape)

stack.0.weight torch.Size([100, 10])
stack.0.bias torch.Size([100])
stack.1.weight torch.Size([5, 100])
stack.1.bias torch.Size([5])


In [None]:
x = torch.randn(10).reshape(1,-1)
print(x)

In [138]:
o1 = torch.matmul(x, model.stack[0].weight.T) + model.stack[0].bias
print(o1)
o2 = torch.matmul(o1, model.stack[1].weight.T) + model.stack[1].bias
print(o2)
print(model(x))

tensor([[-0.6936,  0.0557, -0.2061,  0.5119,  0.3736, -0.1172,  0.0070, -0.2080,
          0.3839,  0.2772, -0.4639, -0.1881,  0.4897, -0.0041,  0.8910, -0.4747,
         -0.3514, -0.0925, -0.2778, -0.3885,  0.7064,  0.3316,  0.2468, -0.0603,
         -0.3167,  0.0477, -0.4146, -0.2626,  0.2582,  0.8227,  0.2896,  1.0004,
         -1.0281, -0.2248, -0.6113, -0.0020, -0.4848,  0.4803, -0.8025, -0.1077,
         -1.0056, -0.7415, -0.3965,  0.4914,  0.5028,  1.1994, -0.1226, -0.2988,
         -0.5109,  0.4936,  0.4598,  0.4735, -0.5585, -0.9338, -0.3286, -0.1185,
          0.1571,  0.3732,  0.0097,  0.6241, -0.0412, -0.3226,  0.8426,  0.7844,
         -0.1565, -0.4274, -0.7653, -0.6981,  0.0072, -0.0396,  0.2989, -0.1800,
          1.0150, -0.2771,  0.2229, -0.7489,  0.5670,  0.5488, -0.0384, -0.1355,
         -0.1292, -0.6243, -0.7284, -0.0378, -0.4086, -0.3265, -0.1700,  1.2363,
          0.7028, -0.5351, -0.3893, -0.0365, -1.1572,  0.2844,  1.0685,  0.2853,
         -0.4263,  0.0654,  

In [139]:
peft_config = LoraConfig(
    task_type=None,
    target_modules=['0'],
    r=2,
    lora_alpha=32,
    lora_dropout=0,
    init_lora_weights =False
)

model_lora = get_peft_model(model, peft_config)

model_lora.print_trainable_parameters()

trainable params: 220 || all params: 1,825 || trainable%: 12.0548


In [124]:
for k, v in model.state_dict().items():
    print(k, v.shape)

stack.0.base_layer.weight torch.Size([100, 10])
stack.0.base_layer.bias torch.Size([100])
stack.0.lora_A.default.weight torch.Size([2, 10])
stack.0.lora_B.default.weight torch.Size([100, 2])
stack.1.weight torch.Size([5, 100])
stack.1.bias torch.Size([5])


In [140]:
model_lora(x)

tensor([[-1.0112, -2.3253, -2.4929, -0.9863, -2.1925]],
       grad_fn=<AddmmBackward0>)

In [143]:
o1 = torch.matmul(x, model_lora.stack[0].weight.T) + model.stack[0].bias
print(o1)
o1_lora = torch.matmul( torch.matmul(x, model_lora.stack[0].lora_A.default.weight.T), model_lora.stack[0].lora_B.default.weight.T)

o1_lora *= 32/2 
o1 += o1_lora

o2 = torch.matmul(o1, model.stack[1].weight.T) + model.stack[1].bias
print(o2)

tensor([[-0.6936,  0.0557, -0.2061,  0.5119,  0.3736, -0.1172,  0.0070, -0.2080,
          0.3839,  0.2772, -0.4639, -0.1881,  0.4897, -0.0041,  0.8910, -0.4747,
         -0.3514, -0.0925, -0.2778, -0.3885,  0.7064,  0.3316,  0.2468, -0.0603,
         -0.3167,  0.0477, -0.4146, -0.2626,  0.2582,  0.8227,  0.2896,  1.0004,
         -1.0281, -0.2248, -0.6113, -0.0020, -0.4848,  0.4803, -0.8025, -0.1077,
         -1.0056, -0.7415, -0.3965,  0.4914,  0.5028,  1.1994, -0.1226, -0.2988,
         -0.5109,  0.4936,  0.4598,  0.4735, -0.5585, -0.9338, -0.3286, -0.1185,
          0.1571,  0.3732,  0.0097,  0.6241, -0.0412, -0.3226,  0.8426,  0.7844,
         -0.1565, -0.4274, -0.7653, -0.6981,  0.0072, -0.0396,  0.2989, -0.1800,
          1.0150, -0.2771,  0.2229, -0.7489,  0.5670,  0.5488, -0.0384, -0.1355,
         -0.1292, -0.6243, -0.7284, -0.0378, -0.4086, -0.3265, -0.1700,  1.2363,
          0.7028, -0.5351, -0.3893, -0.0365, -1.1572,  0.2844,  1.0685,  0.2853,
         -0.4263,  0.0654,  

In [154]:
model_lora.stack[0].weight #[100,10]

Parameter containing:
tensor([[-1.3164e-01, -2.7090e-01, -1.4040e-01, -2.5170e-02,  1.0829e-01,
         -2.3462e-01, -3.0765e-01,  1.1448e-01,  2.2267e-01,  3.8857e-02],
        [-1.8009e-03, -7.0505e-02,  9.6488e-02, -2.5924e-02, -5.7820e-02,
         -1.2417e-01, -1.8315e-01,  4.8068e-02, -2.2227e-02,  2.5147e-01],
        [-2.9429e-01,  2.7435e-02, -2.2537e-01,  2.1652e-01,  1.2408e-01,
         -1.4222e-01, -2.8156e-01,  1.5418e-01, -3.3406e-02,  8.0079e-05],
        [-2.8642e-01, -2.4110e-01, -1.2385e-01, -1.3934e-02, -8.7706e-02,
          5.0814e-02,  6.8508e-02,  2.8815e-01,  2.9214e-01, -1.1100e-01],
        [ 1.6898e-01,  2.7366e-01, -1.6364e-01,  2.6126e-01, -1.7704e-01,
         -1.3723e-02,  1.0846e-01,  1.8199e-01,  1.2186e-01,  1.3855e-01],
        [-2.9097e-01,  4.6650e-02, -1.3453e-01, -4.9883e-02,  5.5346e-02,
         -9.8739e-02, -5.8713e-02,  2.9505e-01,  1.2309e-01, -2.9905e-01],
        [ 3.1387e-01,  1.6290e-01,  4.4111e-02, -2.5533e-02, -8.5125e-02,
          

In [155]:
torch.matmul( model_lora.stack[0].lora_A.default.weight.T,  model_lora.stack[0].lora_B.default.weight.T).T

tensor([[ 5.6551e-03,  1.3145e-02,  1.7193e-02,  2.7281e-02,  3.0523e-03,
          8.1973e-03,  1.5850e-02, -6.7362e-05,  2.7634e-02,  1.5841e-02],
        [-8.2217e-02, -4.0279e-02,  1.1269e-01,  1.7742e-01,  2.9219e-02,
         -2.2178e-02,  7.4968e-02,  1.6852e-03,  7.3374e-02,  1.1847e-01],
        [-2.9522e-02,  2.0968e-02,  1.2565e-01,  1.9855e-01,  2.7780e-02,
          1.4821e-02,  9.8658e-02,  7.7089e-04,  1.3796e-01,  1.2447e-01],
        [ 1.1906e-01,  9.9645e-02, -6.3843e-02, -9.9677e-02, -2.2152e-02,
          5.8685e-02, -2.4901e-02, -2.2469e-03,  2.3900e-02, -7.6012e-02],
        [ 1.1510e-01,  1.4357e-01,  5.1827e-02,  8.3375e-02,  1.6277e-03,
          8.7110e-02,  7.1552e-02, -1.9513e-03,  1.7188e-01,  3.5719e-02],
        [ 4.5137e-02,  8.0874e-02,  7.9409e-02,  1.2622e-01,  1.2629e-02,
          4.9963e-02,  7.7817e-02, -6.5020e-04,  1.4481e-01,  7.0832e-02],
        [ 4.8925e-02,  7.5088e-02,  5.5846e-02,  8.8967e-02,  7.5546e-03,
          4.6071e-02,  5.8891e-0

In [157]:
model_lora.stack[0].weight + torch.matmul( model_lora.stack[0].lora_A.default.weight.T,  model_lora.stack[0].lora_B.default.weight.T).T

tensor([[-0.1260, -0.2578, -0.1232,  0.0021,  0.1113, -0.2264, -0.2918,  0.1144,
          0.2503,  0.0547],
        [-0.0840, -0.1108,  0.2092,  0.1515, -0.0286, -0.1464, -0.1082,  0.0498,
          0.0511,  0.3699],
        [-0.3238,  0.0484, -0.0997,  0.4151,  0.1519, -0.1274, -0.1829,  0.1550,
          0.1046,  0.1245],
        [-0.1674, -0.1415, -0.1877, -0.1136, -0.1099,  0.1095,  0.0436,  0.2859,
          0.3160, -0.1870],
        [ 0.2841,  0.4172, -0.1118,  0.3446, -0.1754,  0.0734,  0.1800,  0.1800,
          0.2937,  0.1743],
        [-0.2458,  0.1275, -0.0551,  0.0763,  0.0680, -0.0488,  0.0191,  0.2944,
          0.2679, -0.2282],
        [ 0.3628,  0.2380,  0.1000,  0.0634, -0.0776,  0.1747, -0.0259,  0.1180,
          0.1259,  0.3423],
        [ 0.2091,  0.4326, -0.1744,  0.1936,  0.2286,  0.2691,  0.0975, -0.2155,
          0.0363, -0.0193],
        [-0.4299,  0.0768,  0.0415,  0.3289, -0.1267, -0.3034, -0.1895,  0.0309,
          0.0812,  0.2108],
        [-0.0529, -

In [158]:
model_lora.stack[0].weight + torch.matmul( model_lora.stack[0].lora_A.default.weight.T,  model_lora.stack[0].lora_B.default.weight.T).T * 32/2 
# W0 + Wab * lora_alpha/r

tensor([[-4.1162e-02, -6.0574e-02,  1.3469e-01,  4.1132e-01,  1.5713e-01,
         -1.0346e-01, -5.4048e-02,  1.1341e-01,  6.6482e-01,  2.9232e-01],
        [-1.3173e+00, -7.1497e-01,  1.8995e+00,  2.8129e+00,  4.0969e-01,
         -4.7902e-01,  1.0163e+00,  7.5031e-02,  1.1518e+00,  2.1470e+00],
        [-7.6664e-01,  3.6293e-01,  1.7850e+00,  3.3933e+00,  5.6855e-01,
          9.4925e-02,  1.2970e+00,  1.6651e-01,  2.1739e+00,  1.9915e+00],
        [ 1.6185e+00,  1.3532e+00, -1.1453e+00, -1.6088e+00, -4.4213e-01,
          9.8978e-01, -3.2991e-01,  2.5220e-01,  6.7454e-01, -1.3272e+00],
        [ 2.0106e+00,  2.5707e+00,  6.6559e-01,  1.5953e+00, -1.5100e-01,
          1.3800e+00,  1.2533e+00,  1.5077e-01,  2.8719e+00,  7.1006e-01],
        [ 4.3123e-01,  1.3406e+00,  1.1360e+00,  1.9697e+00,  2.5741e-01,
          7.0067e-01,  1.1864e+00,  2.8465e-01,  2.4401e+00,  8.3426e-01],
        [ 1.0967e+00,  1.3643e+00,  9.3764e-01,  1.3979e+00,  3.5749e-02,
          8.6576e-01,  8.5748e-0

In [159]:
model_new = model_lora.merge_and_unload()
model_new.stack[0].weight

Parameter containing:
tensor([[-4.1162e-02, -6.0574e-02,  1.3469e-01,  4.1132e-01,  1.5713e-01,
         -1.0346e-01, -5.4048e-02,  1.1341e-01,  6.6482e-01,  2.9232e-01],
        [-1.3173e+00, -7.1497e-01,  1.8995e+00,  2.8129e+00,  4.0969e-01,
         -4.7902e-01,  1.0163e+00,  7.5031e-02,  1.1518e+00,  2.1470e+00],
        [-7.6664e-01,  3.6293e-01,  1.7850e+00,  3.3933e+00,  5.6855e-01,
          9.4925e-02,  1.2970e+00,  1.6651e-01,  2.1739e+00,  1.9915e+00],
        [ 1.6185e+00,  1.3532e+00, -1.1453e+00, -1.6088e+00, -4.4213e-01,
          9.8978e-01, -3.2991e-01,  2.5220e-01,  6.7454e-01, -1.3272e+00],
        [ 2.0106e+00,  2.5707e+00,  6.6559e-01,  1.5953e+00, -1.5100e-01,
          1.3800e+00,  1.2533e+00,  1.5077e-01,  2.8719e+00,  7.1006e-01],
        [ 4.3123e-01,  1.3406e+00,  1.1360e+00,  1.9697e+00,  2.5741e-01,
          7.0067e-01,  1.1864e+00,  2.8465e-01,  2.4401e+00,  8.3426e-01],
        [ 1.0967e+00,  1.3643e+00,  9.3764e-01,  1.3979e+00,  3.5749e-02,
          

In [None]:
import torch
from   torch import nn
import matplotlib.pyplot  as plt 
torch.manual_seed(99)     # 设定随机种子，使得每次运行结果一样

# 定义神经网络的结构
class MLP(nn.Module):
    def __init__(self):
        super(MLP, self).__init__()
        self.stack=nn.Sequential(
            nn.Linear(1, 4),                                                       # 1输入,4输出
            nn.Tanh(),                                                             # 激活函数用Tanh
            nn.Linear(4, 1)                                                        # 4输入,1输出
            )                                                                      
    def forward(self, x):                                                          
        y = self.stack(x)                                                          # 按stack计算模型输出
        return y                                                                   # 输出y
    
# ------训练数据----------------
x = torch.linspace(-5,5,20).reshape(20,1)                                          # 在[-5,5]之间生成20个数作为x
y = torch.sin(x)                                                                   # 模型的输出值y

# ------模型训练----------------
model     = MLP()                                                                  # 初始化模型
lossFun   = torch.nn.MSELoss()                                                     # 定义损失函数为MSE
optimizer = torch.optim.SGD(model.parameters(), lr=0.01,momentum =0.9)             # 初始化优化器
for epoch in range(10000):                                                         # 训练10000步
    optimizer.zero_grad()                                                          # 将优化器里的参数梯度清空
    py   = model(x)                                                                # 计算模型的预测值   
    loss = lossFun(py, y)                                                          # 计算损失函数值
    loss.backward()                                                                # 更新参数的梯度
    optimizer.step()                                                               # 更新参数
    # ----计算错误率----                                                           
    print('第',epoch,'步,MSE:' ,loss.item())                                       # 打印MSE
    if(loss<=0.001):                                                               # 检查退出条件
        break   	
		
# -------打印模型训练结果----------                                                
print('\n----模型训练结果---')                                                     # 打印标题    
print('MSE:' ,loss.item())                                                         # 打印MSE   

print('\n----模型参数---')                                                         # 打印标题   
param_dict = dict(model.named_parameters())                                        # 提取模型参数
for key in param_dict:                                                             # 逐层打印参数
    print(key,":",param_dict[key].data)                                            # 打印当前层的参数
	
# 绘制预测结果                                                                    
px = torch.linspace(-5,5,100).reshape(100,1)                                       # 测试数据,用于绘制网络的拟合曲线    
py = model(px).detach().numpy()                                                    # 网络的预测值
plt.scatter(x, y)                                                                  # 绘制样本
plt.plot(px[:,0],py[:,0])                                                          # 绘制拟合曲线 
plt.show()         