In [4]:
# 导入数据集
# 数据集 -> 右边图标点击 -> 插入代码


# 全量运行之后，获取训练完成的模型文件或者是中间文件的主要流程：
# 上传模型文件 -> 申报审核文件 -> 审核通过 ->下载文件

# 上传模型文件接口示例：
# import wfio
# server_path是上传到服务器的文件名称，可以自定义；local_path是本地上传的文件路径, 必须是本地已经存在的文件路径。
# wfio.upload_to_oss(server_path,local_path)

In [2]:
import pandas as pd 
import numpy as np 

CHUNK_SIZE = 1000
TRAIN_PARTIAL = 0.3
EPOCH = 5

df = pd.read_csv("ten_million_top1k.csv") # 这行在运行环境时候应该注释掉

# 然后得计算一些变量
length = len(df)
iter_size = int(length / CHUNK_SIZE)

In [3]:
from sklearn import metrics

def get_metric(y_test, y_pred):
    # 设计成 dict 比较好加新的 metric，要不然还得改返回值
    metrics_dict = {} 
    metrics_dict['accuracy'] = metrics.accuracy_score(y_test, y_pred)
    metrics_dict['quadratic_weighted_kappa'] = metrics.cohen_kappa_score(y_test, y_pred, weights= 'quadratic')
    return metrics_dict

def display_metrics(metrics_dict):
    metric_list = list(metrics_dict.keys())
    data_list  = []
    for m in metric_list:
        data_list.append([m, metrics_dict[m]])
    return pd.DataFrame(data_list, columns= ['Evaluation Metric', "Value"])

def get_partial_data(input_X, ratio = 0.7):
    length = len(input_X)
    return input_X[:int(length * ratio)], input_X[int(length * ratio):]



In [4]:
# 然后开始训练模型
from sklearn.linear_model import SGDClassifier
import copy 

model = SGDClassifier()

for i in range(iter_size):
    st_index = i * CHUNK_SIZE
    ed_index = st_index + CHUNK_SIZE
    df_tmp   = df[st_index:ed_index]

for i in range(EPOCH):
    print('Start Epoch: ' + str(i))
    for i in range(iter_size):
        st_index = i * CHUNK_SIZE
        ed_index = st_index + CHUNK_SIZE
        df_tmp   = copy.copy(df[st_index:ed_index])
        # 取数据
        y = df_tmp['y']
        df_tmp.drop(["y"], axis = 1, inplace = True)
        X = df_tmp.to_numpy()
        X_train = get_partial_data(X)[0]
        y_train = get_partial_data(y)[0]
        # 做增量学习
        model.partial_fit(X,y, classes = np.unique(y))

print('Training -> Done.')

Start Epoch: 0
Start Epoch: 1
Start Epoch: 2
Start Epoch: 3
Start Epoch: 4
Training -> Done.


In [5]:
# 训练完了就是预测过程

##  这个不怎么占内存
y_test = []
y_pred = []

## 然后具体进行预测
#df_iterator = pd.read_csv('one_million.csv', chunksize= CHUNK_SIZE)
for i in range(iter_size):
    st_index = i * CHUNK_SIZE
    ed_index = st_index + CHUNK_SIZE
    df_tmp   = copy.copy(df[st_index:ed_index])
    # 取数据
    y = df_tmp['y']
    df_tmp.drop(["y"], axis = 1, inplace = True)
    X = df_tmp.to_numpy()
    X_test_part = get_partial_data(X)[0]
    y_test_part = get_partial_data(y)[0]
    y_pred_part = model.predict(X_test_part)
    # 保存结果
    y_test.extend(y_test_part)
    y_pred.extend(y_pred_part)


In [8]:
from sklearn.metrics import classification_report
classification_report(y_true=y_test, y_pred=y_pred)

'              precision    recall  f1-score   support\n\n           0       0.97      0.91      0.94       350\n           1       0.92      0.97      0.94       350\n\n    accuracy                           0.94       700\n   macro avg       0.94      0.94      0.94       700\nweighted avg       0.94      0.94      0.94       700\n'

In [6]:
print('Done!')

Done!
