In [1]:
import pandas as pd
import numpy as np
import random
from random import randint
from faker import Faker
import json
import matplotlib.pyplot as plt

pd.set_option('max_colwidth',150) # 设置最大宽度

In [2]:
def split_row(df, column):
    '''
    df:原始数据
    column:需要拆分的列
    
    retun:行转多行的df
    '''
    row_len = list(map(len, df[column].values))
    rows = []
    for i in df.columns:
        if i == column:
            row = np.concatenate(df[i].values)
        else:
            row = np.repeat(df[i].values, row_len)
        rows.append(row)
    return pd.DataFrame(np.dstack(tuple(rows))[0], columns=df.columns)

In [3]:
# 构造用户活跃数据 从500个数据中不放回采样100个

uids = [i+1 for i in range(500)]
dts = pd.date_range('2021-01-01', '2021-12-31').strftime("%Y-%m-%d").to_list() # 日期范围

active_uids=[]
active_dt=[]
np.random.seed(0)
for d in dts:
    active_dt.append(d)
    active_uids.append(np.random.choice(uids, size=100, replace=False)) # 不放回采样
    
raw_data= pd.DataFrame({'dt':active_dt,
                        'active_uids':active_uids,
                       })

df_user_active = split_row(raw_data, column='active_uids')
df_user_active.rename(columns={'active_uids':'uid'}, inplace=True)

df_user_active.to_csv('user_active.csv', header=None, index=None)
df_user_active.sample(5, random_state=0)

Unnamed: 0,dt,uid
33348,2021-11-30,245
7302,2021-03-15,143
13859,2021-05-19,481
1273,2021-01-13,334
15537,2021-06-05,342


In [4]:
# 生成练习数据 假设练习在每天是稳定的，服从均匀分布[30,50]
np.random.seed(0)
day_ex_cnt = np.random.uniform(30, 50, len(active_dt)).astype(int)
# 假设练习试卷只有100套
paperids = [i+1 for i in range(100)]
# 构造用户练习数据

ex_dt = []
ex_uids = []
i = 0
np.random.seed(0)
for d,e in zip(active_dt,day_ex_cnt):
    ex_dt.append(d)
    ex_uids.append(np.random.choice(active_uids[i], size=e)) # 有放回采样
    i+=i
    
raw_data= pd.DataFrame({'dt':ex_dt,
                        'ex_uids':ex_uids,
                       })

df_user_exercise = split_row(raw_data, column='ex_uids')
df_user_exercise['exerciesid'] = [i+1 for i in range(df_user_exercise.shape[0])]
np.random.seed(10)
df_user_exercise['paperid'] = np.random.choice(paperids, size=df_user_exercise.shape[0])
df_user_exercise.rename(columns={'ex_uids':'uid',
                              'exerciesid':'id',}, inplace=True)

df_user_exercise.to_csv('user_exercise.csv', header=None, index=None)
df_user_exercise.sample(5, random_state=0)

Unnamed: 0,dt,uid,id,paperid
663,2021-01-17,236,664,15
3019,2021-03-18,492,3020,2
13288,2021-12-04,38,13289,45
11952,2021-10-29,109,11953,29
9770,2021-09-06,180,9771,6


In [5]:
# 构造题库，假设有1000道题
np.random.seed(0)
questionids = [i+1 for i in range(100)]
courses = ['数学', '语文', '英语']

# 构造试卷信息，假设每套试卷含有20道题。
np.random.seed(0)
paper_qids=[]
for i in paperids:
    paper_qids.append(np.random.choice(questionids, size=20, replace=False)) # 不放回采样


# 题目
df_paper = pd.DataFrame({'id':paperids,
                        'paper_qids':paper_qids
                       })
# 科目
np.random.seed(0)
df_paper['course'] = np.random.choice(courses, size=df_paper.shape[0])

# 内容
df_paper_content = pd.read_csv('paper_content.csv')
df_paper['content'] = df_paper_content['content'].copy()

# 将试卷表转为json
dict_init={}
for index, row in df_paper.iterrows():
    dict_init['questionids']=','.join([str(x) for x in row['paper_qids']])
    dict_init['course']=row['course']
    dict_init['content']=row['content']
    df_paper.at[index, 'paper_info_temp']=json.dumps(dict_init,ensure_ascii=False)
df_paper['paper_info']=df_paper['paper_info_temp'].map(lambda x: '['+x+']')
df_paper.drop(['paper_qids', 'course', 'content', 'paper_info_temp'], axis=1, inplace=True)

df_paper.to_csv('paper.csv', header=None, index=None, encoding='utf-8')
df_paper.head()

Unnamed: 0,id,paper_info
0,1,"[{""questionids"": ""27,87,3,56,76,94,17,74,55,96,54,93,79,14,8,31,23,25,34,9"", ""course"": ""数学"", ""content"": ""<p><img src=17f987e258771c2.png></p><p><i..."
1,2,"[{""questionids"": ""19,30,65,93,73,88,6,16,13,18,62,77,10,79,81,8,34,7,38,75"", ""course"": ""语文"", ""content"": ""<p>3000円です。 計算方法は、添付の図を参考にしてください。</p><..."
2,3,"[{""questionids"": ""43,97,59,15,73,5,48,65,58,64,7,39,14,60,67,72,16,23,20,13"", ""course"": ""数学"", ""content"": ""<p>下の画像の通りです。</p><p><img src=17fab9c68dc..."
3,4,"[{""questionids"": ""55,85,34,54,3,50,49,83,42,67,30,38,70,51,98,60,69,76,71,53"", ""course"": ""语文"", ""content"": ""<p><img src=17f9b2a9769337b.jpeg></p><p..."
4,5,"[{""questionids"": ""90,57,38,47,3,91,14,16,59,68,17,18,65,39,41,15,12,63,49,46"", ""course"": ""语文"", ""content"": ""<p>これでどうでしょうか？</p><p><img src=17faa6e7c..."
