In [242]:
import os
import numpy as np
import pandas as pd
import torch
from torch import nn, optim
from torchvision import transforms
from torch.utils.data import Dataset, DataLoader 
from torch.autograd import Variable
import matplotlib.pyplot as plt 
%matplotlib inline

### 参数

In [265]:
dir_path = "../../Problems of Data/Car insurance risk"
train_path = "/train.csv"
test_path = "/test.csv"
submission_path = "/submission.csv"
init_epochs = 10
init_batch_size = 128
init_learning_rate = 1e-4
init_momentum = 0.99
init_feature_size = 111

### 画图 https://matplotlib.org/api/animation_api.html
1. matplotlib.animation.FuncAnimation(
    fig, 
    animate, 
    init_func=init, 
    frames=100, 
    interval=20, 
    blit=true) 函数可以实现动态画图
2. fig：画布, animate: 随时间变化的数值函数, init_func: 初始化设定, frames:时间t的个数, interval:图的刷新频率, blit: 只修改变化的部分，加快绘图速度
  

In [244]:
# from matplotlib.animation import FuncAnimation
# import seaborn as sns # 美化图形
# fig, ax = plt.subplots()
# def init():
#     pass
# ani = FuncAnimation(fig=fig, )
# plt.show()

### 读取数据

In [245]:
def read_data(path): # 注意异常处理
    if os.path.exists:
        return pd.read_csv(path)
    else:
        print("error： the file path is not exist！")
        return None

### tips：处理特征
1. 对于数值数据，需要归一化处理
2. 对于非数值数据，需要进行编码处理 （one-hot编码）

In [246]:
def features_onehot(data): # 对pd.dataframe类型数据处理
    if type(data) is not pd.core.frame.DataFrame:
        print("数据格式不正确，无法处理数据!")
        return None
    cols = [x for x in data.columns if type(data[x][0]) is str]
    data = pd.get_dummies(data, columns=cols)
    return data

### 定义线性模型

In [247]:
class LinearRegression(nn.Module):
    def __init__(self, input_size):
        super(LinearRegression, self).__init__()
        self.lr = nn.Sequential(
            nn.Linear(input_size, 1) # 线性回归是特殊的神经网络
        )
    def forward(self, x):
        x = self.lr(x)
        return x

### 数据集整理

In [248]:
# 定义数据集
class Get_Dataset(Dataset):
    def __init__(self, path):
        self.data = features_onehot(read_data(path)).values
    def __getitem__(self, index):
        return self.data[index][2:], self.data[index][1]
    def __len__(self):
        return len(self.data)
# 获取数据集和批数据集
def get_data(path):
    train_dataset = Get_Dataset(path)
    train_load = DataLoader(train_dataset, shuffle=True, batch_size=init_batch_size)
    return train_dataset, train_load

### 训练

In [264]:
# 构建模型
if torch.cuda.is_available:
    model = LinearRegression(init_feature_size).cuda()
else:
    model = LinearRegression(init_feature_size)
# 定义损失函数
loss_function = nn.MSELoss(reduction="sum") # 不取平均
# 定义优化策略
optimizer = optim.SGD(model.parameters(), lr=init_learning_rate, momentum=init_momentum)
# 加载训练数据集
train_dataset, train_load = get_data(dir_path+train_path)
# 加载测试数据
test_dataset,_ = get_data(dir_path+test_path)
# len(train_load.__iter__().__next__()[0][0])

In [273]:
import time
for epoch in range(init_epochs):
    last_time = time.time()
    train_loss = 0.0
    train_acc = 0.0
    print("*"*20)
    for (i, data) in enumerate(train_load, 1):
        feature, score = data
        
        if torch.cuda.is_available:
            feature, score = Variable(feature).cuda(), Variable(score).cuda()
        else:
            feature, score = Variable(feature), Variable(score)
        feature, score = feature.float(), score.float()
        out = model(feature)
        loss = loss_function(out, score)
        print(out, score)
        break
        train_loss += loss.data
        train_acc += (torch.abs(out - score) <=1).sum().item()
        
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        
        if i%20 == 0:
            print("times is {}".format(time.time() - last_time))
            print("[{}/{}]: loss is {}, acc is {}".format(epoch + 1, init_epochs,
                                                          train_loss, train_acc*100 / len(train_dataset)))
            print("*"*20)
    break    


********************
tensor([[nan],
        [nan],
        [nan],
        [nan],
        [nan],
        [nan],
        [nan],
        [nan],
        [nan],
        [nan],
        [nan],
        [nan],
        [nan],
        [nan],
        [nan],
        [nan],
        [nan],
        [nan],
        [nan],
        [nan],
        [nan],
        [nan],
        [nan],
        [nan],
        [nan],
        [nan],
        [nan],
        [nan],
        [nan],
        [nan],
        [nan],
        [nan],
        [nan],
        [nan],
        [nan],
        [nan],
        [nan],
        [nan],
        [nan],
        [nan],
        [nan],
        [nan],
        [nan],
        [nan],
        [nan],
        [nan],
        [nan],
        [nan],
        [nan],
        [nan],
        [nan],
        [nan],
        [nan],
        [nan],
        [nan],
        [nan],
        [nan],
        [nan],
        [nan],
        [nan],
        [nan],
        [nan],
        [nan],
        [nan],
        [nan],
    

In [None]:
%matplotlib inline
import numpy as np
import matplotlib.pyplot as plt
from matplotlib.animation import FuncAnimation

fig, ax = plt.subplots()
xdata, ydata = [], []
ln, = plt.plot([], [], 'ro', animated=True)

def init():
    ax.set_xlim(0, 100)
    ax.set_ylim(-1, 1)
    return ln,

def update(frame):
    xdata.append(frame)
    ydata.append(np.sin(frame))
    ln.set_data(xdata, ydata)
    return ln,

ani = FuncAnimation(fig, update, frames=np.linspace(0, 100, 10000),interval=0,
                    init_func=init, blit=True)
plt.show()