In [199]:
import pandas as pd
from pandas.tseries.offsets import *
from sklearn import preprocessing
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import datetime
import math
from copy import deepcopy
import json
from itertools import combinations, permutations
import warnings

% matplotlib inline
pd.set_option('max_columns', 100)
pd.set_option('max_rows', 500)
warnings.filterwarnings('ignore')

In [200]:
# Teacher ids used to pick the needed rows out of the raw tables.
# Stratified sample: teachers whose actual result ranges from 5 stars down to
# 0 stars are all represented, chosen at random.
# Only one teacher with a score of 0 is included.
pick_list = [
    871, 279, 1271, 191, 1907, 556,
    1569, 624, 741, 1826, 100,
]

In [201]:
# Read the raw tables. Later cells keep only the teachers in pick_list and
# rewrite the teacher ids to hide identifying information.
# The student-comment text needs to be reviewed.
# Plan: cast teacher id to int, sort, then substitute ids in order so that the
# tables stay consistent with each other.
path = './FA_Model_data/'
_csv_kwargs = {'encoding': 'utf-8', 'sep': ','}
df_teacher_monitoring = pd.read_csv(path + 'awj_teacher_monitoring.csv', **_csv_kwargs)
df_qc = pd.read_csv(path + '老师qc明细.csv', **_csv_kwargs)
df_teacher_behavior = pd.read_csv(path + '老师行为信息明细.csv', **_csv_kwargs)
df_teacher_info = pd.read_csv(path + '老师基本信息.csv', **_csv_kwargs)
df_stu_comment = pd.read_csv(path + '学生评价明细.csv', **_csv_kwargs)

In [202]:
# Student comments are the only part that could expose a teacher's name or
# personal information; they are not needed, so the "comment" column is dropped.
df_teacher_monitoring = (
    df_teacher_monitoring
    .loc[df_teacher_monitoring["awj_teacher_id"].isin(pick_list)]
    .drop("comment", axis=1)
)

In [203]:
# The review text is the only place in this table that could expose teacher
# information and it is never used, so it is dropped (together with the label
# text).
# The institution column is overwritten with one uniform placeholder string.
df_stu_comment = (
    df_stu_comment
    .loc[df_stu_comment["awj_teacher_id"].isin(pick_list)]
    .drop(["评论", "标签内容"], axis=1)
    .assign(**{"机构": "com"})
)

In [204]:
# QC scores.
# Several columns are never used; drop them all to save space.
df_qc = (
    df_qc
    .loc[df_qc["awj_teacher_id"].isin(pick_list)]
    .drop(["teacher_name", "notes", "qcer", "course_name"], axis=1)
)

In [205]:
# Teacher behavior table.
# The institution column is overwritten with one uniform placeholder string.
df_teacher_behavior = (
    df_teacher_behavior
    .loc[df_teacher_behavior["awj_teacher_id"].isin(pick_list)]
    .drop("教材", axis=1)
    .assign(**{"机构": "com"})
)

In [206]:
# Teacher info table: keep the picked teachers and drop the personal columns.
df_teacher_info = (
    df_teacher_info
    .loc[df_teacher_info["awj_teacher_id"].isin(pick_list)]
    .drop(["country", "timezone", "degree_and_university", "child_exp"], axis=1)
)

In [207]:
# 下面就是伪造测试数据的部分

In [208]:
# Replace each teacher id with a fake code: the position (0, 1, 2, ...) of that
# id inside pick_list.
def process_teacher_id(pick_list, df):
    """Return the rows of ``df`` for the picked teachers with anonymized ids.

    Parameters
    ----------
    pick_list : sequence
        Real ``awj_teacher_id`` values. The i-th entry is replaced by ``i``.
    df : pandas.DataFrame
        Table containing an ``awj_teacher_id`` column. It is not modified.

    Returns
    -------
    pandas.DataFrame
        Rows grouped in ``pick_list`` order (original row order and index
        labels kept inside each group) with ``awj_teacher_id`` replaced by the
        fake code. Rows whose id is not in ``pick_list`` are left out. When
        nothing is selected an empty frame with the columns of ``df`` is
        returned.
    """
    pieces = []
    for fake_id, real_id in enumerate(pick_list):
        # Copy the selection so the id rewrite never touches (a view of) df.
        piece = df[df["awj_teacher_id"] == real_id].copy()
        piece["awj_teacher_id"] = fake_id
        pieces.append(piece)
    if not pieces:
        return df.iloc[0:0].copy()
    # Concatenate once instead of growing a frame inside the loop.
    return pd.concat(pieces)

In [209]:
# Rewrite the teacher ids in every table.
(df_teacher_monitoring,
 df_qc,
 df_teacher_behavior,
 df_teacher_info,
 df_stu_comment) = [
    process_teacher_id(pick_list, _table)
    for _table in (df_teacher_monitoring, df_qc, df_teacher_behavior,
                   df_teacher_info, df_stu_comment)
]

In [210]:
# Tidy up the row index of every table.
for _table in (df_teacher_monitoring, df_qc, df_teacher_behavior,
               df_teacher_info, df_stu_comment):
    _table.reset_index(drop=True, inplace=True)

In [211]:
# 教师的编号伪装完成，需要的已经全部去掉

In [212]:
# Write the anonymized tables to the test-data folder, keeping the original
# file names.
t_path = "./FA_Model_test/"
_exports = [
    (df_teacher_monitoring, 'awj_teacher_monitoring.csv'),
    (df_qc, '老师qc明细.csv'),
    (df_teacher_behavior, '老师行为信息明细.csv'),
    (df_teacher_info, '老师基本信息.csv'),
    (df_stu_comment, '学生评价明细.csv'),
]
for _table, _file_name in _exports:
    _table.to_csv(t_path + _file_name, encoding='utf-8', sep=',', index=False)

In [213]:
def teacher_qc_score(start_time, end_time, df_qc, class_type_name_special, hq_name_special):
    """Aggregate per-teacher QC score features over a time window.

    Parameters
    ----------
    start_time, end_time : datetime-like
        Rows are kept when ``start_time >= start_time`` and
        ``end_time <= end_time`` (column names on the left).
    df_qc : pandas.DataFrame
        Needs the columns ``awj_teacher_id``, ``class_type_name``, ``hq_name``,
        ``start_time``, ``end_time``, ``score_recorded_at`` (datetime) and
        ``score``. The frame is not modified.
    class_type_name_special, hq_name_special : list
        Values of ``class_type_name`` / ``hq_name`` to exclude.

    Returns
    -------
    pandas.DataFrame
        One row per teacher with the columns ``awj_teacher_id``,
        ``teacher_score_max``, ``teacher_score_min``, ``teacher_qc_count``,
        ``teacher_score_std`` and ``log_decay_score_mean``.
    """
    # Filter, then copy so the derived columns are written to a private frame.
    in_scope = (
        (~df_qc['class_type_name'].isin(class_type_name_special)) &
        (~df_qc['hq_name'].isin(hq_name_special)) &
        (df_qc['start_time'] >= start_time) &
        (df_qc['end_time'] <= end_time))
    df_qc = df_qc.loc[in_scope].copy()

    # Age of each QC record in days; a record made after end_time counts as age 0.
    age_days = (end_time - df_qc['score_recorded_at']).dt.total_seconds() / (3600 * 24)
    age_days = age_days.clip(lower=0)
    # Weight = 1 / log10(age + 2). The +2 keeps the log argument >= 2, so the
    # logarithm is strictly positive and the reciprocal is finite.
    df_qc['decay_index'] = 1 / (age_days + 2).apply(lambda x: math.log(x, 10))
    df_qc['log_decay_score'] = df_qc['score'] * df_qc['decay_index']

    # Decay-weighted mean score per teacher.
    df_teacher_log_score = df_qc.groupby('awj_teacher_id')[
        ['log_decay_score', 'decay_index']].sum().reset_index()
    df_teacher_log_score['log_decay_score_mean'] = (
        df_teacher_log_score['log_decay_score'] / df_teacher_log_score['decay_index'])
    df_teacher_log_score = df_teacher_log_score[['awj_teacher_id', 'log_decay_score_mean']]

    # Plain score statistics. Grouping with the default as_index=True and then
    # resetting the index gives the same columns on every pandas version, and
    # the 'std' string pins the sample standard deviation.
    df_teacher_score = df_qc.groupby('awj_teacher_id')['score'].agg(
        ['max', 'min', 'count', 'std']).reset_index()
    # A teacher with a single QC record has no std; use the mean std instead.
    df_teacher_score['std'] = df_teacher_score['std'].fillna(df_teacher_score['std'].mean())
    df_teacher_score = df_teacher_score.rename(columns={
        'max': 'teacher_score_max', 'min': 'teacher_score_min',
        'count': 'teacher_qc_count', 'std': 'teacher_score_std'})

    # merge
    df_qc_res = pd.merge(df_teacher_score, df_teacher_log_score, on='awj_teacher_id', how='left')
    return df_qc_res

In [214]:
def teacher_behavior(start_time, end_time, df_teacher_behavior, 
                                class_type_name_special, hq_name_special):
    """Aggregate per-teacher lesson-behavior features over a time window.

    Parameters
    ----------
    start_time, end_time : datetime-like
        Lessons are kept when their ``start_time`` / ``end_time`` columns fall
        inside this window.
    df_teacher_behavior : pandas.DataFrame
        Needs the columns ``awj_teacher_id``, ``上课类型``, ``机构``,
        ``start_time``, ``end_time`` (datetime), ``teacher_status_for_lesson``
        and ``ask_for_leave_advanced_minutes``. The frame is not modified.
    class_type_name_special, hq_name_special : list
        Values of ``上课类型`` / ``机构`` to exclude.

    Returns
    -------
    (pandas.DataFrame, pandas.DataFrame)
        * one row per teacher with the decay-weighted lesson counts per status
          (``*_log_count``) and the advance-notice statistics
          (``advanced_days_*``), missing values filled with 0;
        * the filtered lesson rows with the helper columns ``lesson_count``,
          ``ask_for_leave_advanced_days`` and ``decay_index`` added.
    """
    # Filter (also drops 'system_failure' lessons), then copy so the helper
    # columns are written to a private frame.
    in_scope = (
        (~df_teacher_behavior['上课类型'].isin(class_type_name_special)) &
        (~df_teacher_behavior['机构'].isin(hq_name_special)) &
        (df_teacher_behavior['start_time'] >= start_time) &
        (df_teacher_behavior['end_time'] <= end_time) &
        (~df_teacher_behavior['teacher_status_for_lesson'].isin(['system_failure'])))
    df_teacher_behavior = df_teacher_behavior.loc[in_scope].copy()

    # Preprocessing: a lesson counts as its duration in half hours.
    df_teacher_behavior['lesson_count'] = (
        (df_teacher_behavior['end_time'] - df_teacher_behavior['start_time'])
        .dt.total_seconds() / 3600 * 2)
    # A no-show is treated as zero advance notice.
    df_teacher_behavior.loc[df_teacher_behavior['teacher_status_for_lesson'] == 'no_show',
                            'ask_for_leave_advanced_minutes'] = 0
    df_teacher_behavior['ask_for_leave_advanced_days'] = (
        df_teacher_behavior['ask_for_leave_advanced_minutes'] / (60 * 24))

    # Day-based decay: weight = 1 / log10(age_in_days + 2). The +2 keeps the
    # log argument >= 2, so the reciprocal is finite.
    age_days = (end_time - df_teacher_behavior['start_time']).dt.total_seconds() / (3600 * 24)
    age_days = age_days.clip(lower=0)
    df_teacher_behavior['decay_index'] = 1 / (age_days + 2).apply(lambda x: math.log(x, 10))

    # Decay-weighted lesson count per status.
    df_teacher_behavior_res = df_teacher_behavior[['awj_teacher_id']].drop_duplicates(keep='first')
    types = ['normal_lesson', 'late', 'no_show', 'abnormal_lesson', 'ask_for_leave']
    for itm in types:
        df_count = df_teacher_behavior.loc[
            df_teacher_behavior['teacher_status_for_lesson'].isin([itm])].copy()
        df_count['log_lesson_count'] = df_count['lesson_count'] * df_count['decay_index']
        df_count = df_count.groupby('awj_teacher_id')['log_lesson_count'].sum().reset_index()
        # Statuses that do not already name a lesson get a '_lesson' infix.
        if ('ask_for_leave' in itm) or ('lesson' in itm):
            target = itm + '_log_count'
        else:
            target = itm + '_lesson_log_count'
        df_count = df_count.rename(columns={'log_lesson_count': target})
        df_teacher_behavior_res = pd.merge(df_teacher_behavior_res, df_count, 
                                           on='awj_teacher_id', how='left')

    # Advance notice (in days) for leave requests and no-shows.
    df_advanced_days = df_teacher_behavior.loc[
        df_teacher_behavior['teacher_status_for_lesson'].isin(['ask_for_leave', 'no_show'])].copy()
    df_advanced_days['ask_for_leave_advanced_log_days'] = (
        df_advanced_days['decay_index'] * df_advanced_days['ask_for_leave_advanced_days'])
    # Decay-weighted mean.
    df_advanced_days_mean = df_advanced_days.groupby('awj_teacher_id')[
        ['ask_for_leave_advanced_log_days', 'decay_index']].sum().reset_index()
    df_advanced_days_mean['advanced_days_log_mean'] = (
        df_advanced_days_mean['ask_for_leave_advanced_log_days'] / df_advanced_days_mean['decay_index'])
    df_advanced_days_mean = df_advanced_days_mean[['awj_teacher_id', 'advanced_days_log_mean']]
    # Min, max and standard deviation. Default as_index=True plus reset_index
    # gives the same columns on every pandas version.
    df_advanced_days_others = df_advanced_days.groupby('awj_teacher_id')[
        'ask_for_leave_advanced_days'].agg(['min', 'max', 'std']).reset_index()
    # A single record has no std; use the mean std instead.
    df_advanced_days_others['std'] = df_advanced_days_others['std'].fillna(
        df_advanced_days_others['std'].mean())
    df_advanced_days_others = df_advanced_days_others.rename(columns={
        'min': 'advanced_days_min', 'max': 'advanced_days_max', 'std': 'advanced_days_std'})
    df_advanced_days = pd.merge(df_advanced_days_mean, df_advanced_days_others, 
                                on='awj_teacher_id', how='left')
    df_teacher_behavior_res = pd.merge(df_teacher_behavior_res, df_advanced_days, 
                                       on='awj_teacher_id', how='left')

    # Fill 0. Assigning the filled block back writes the values reliably; a
    # chained `frame[col].fillna(..., inplace=True)` may act on a temporary.
    columns = ['normal_lesson_log_count', 'late_lesson_log_count', 'no_show_lesson_log_count', 
               'abnormal_lesson_log_count', 'ask_for_leave_log_count', 'advanced_days_log_mean', 
               'advanced_days_min', 'advanced_days_max', 'advanced_days_std']
    df_teacher_behavior_res[columns] = df_teacher_behavior_res[columns].fillna(value=0)
    return df_teacher_behavior_res, df_teacher_behavior

In [215]:
def stu_comment(start_time, end_time, df_stu_comment, 
                class_type_name_special, hq_name_special, df_teacher_behavior_processed_cache):
    """Aggregate decay-weighted good / bad student reviews per teacher.

    Parameters
    ----------
    start_time, end_time : datetime-like
        Reviews are kept when ``课程开始时间 >= start_time`` and
        ``课程结束时间 <= end_time``.
    df_stu_comment : pandas.DataFrame
        Needs the columns ``awj_teacher_id``, ``awjcls_lesson_id``,
        ``课程开始时间``, ``课程结束时间``, ``评价时间`` (datetime), ``机构``,
        ``课程类型``, ``学生评价星级`` and ``lesson_count``. Not modified.
    class_type_name_special, hq_name_special : list
        Values of ``课程类型`` / ``机构`` to exclude.
    df_teacher_behavior_processed_cache : pandas.DataFrame
        Columns ``awj_teacher_id``, ``awj_lesson_id`` and a non-null
        ``is_deleted`` marker; only reviews whose (teacher, lesson) pair
        appears here are kept.

    Returns
    -------
    (pandas.DataFrame, pandas.DataFrame)
        * one row per teacher with ``stu_comment_log_good_behavior`` (4/5-star)
          and ``stu_comment_log_bad_behavior`` (everything else), missing
          values filled with 0;
        * the retained review rows with a ``decay_index`` column added.

    Prints the shape of the review table before and after the lesson match.
    """
    # Filter by window, institution and class type.
    in_scope = (
        (df_stu_comment['课程开始时间'] >= start_time) &
        (df_stu_comment['课程结束时间'] <= end_time) &
        (~df_stu_comment['机构'].isin(hq_name_special)) &
        (~df_stu_comment['课程类型'].isin(class_type_name_special)))
    df_stu_comment = df_stu_comment.loc[in_scope]
    print('df_stu_comment_floater classes existed:', df_stu_comment.shape)

    # Compare with the behavior table and remove floater lessons: a review
    # survives only when its lesson is found in the cache.
    df_stu_comment = pd.merge(df_stu_comment, df_teacher_behavior_processed_cache, 
                              left_on=['awj_teacher_id', 'awjcls_lesson_id'], 
                              right_on=['awj_teacher_id', 'awj_lesson_id'], how='left')
    df_stu_comment = (df_stu_comment
                      .loc[~df_stu_comment['is_deleted'].isnull()]
                      .drop(['awj_lesson_id', 'is_deleted'], axis=1))
    print('df_stu_comment_floater classes deleted:', df_stu_comment.shape)

    # Day-based decay on the review time: weight = 1 / log10(age_in_days + 2).
    # The +2 keeps the log argument >= 2, so the reciprocal is finite.
    age_days = (end_time - df_stu_comment['评价时间']).dt.total_seconds() / (3600 * 24)
    age_days = age_days.clip(lower=0)
    df_stu_comment['decay_index'] = 1 / (age_days + 2).apply(lambda x: math.log(x, 10))

    weighted = df_stu_comment[['awj_teacher_id']].copy()
    weighted['lesson_count_processed'] = df_stu_comment['lesson_count'] * df_stu_comment['decay_index']
    is_good = df_stu_comment['学生评价星级'].isin(['5-star', '4-star'])

    # 4 and 5 star reviews (negative labels on those are system bugs).
    df_good_label = (weighted.loc[is_good]
                     .groupby('awj_teacher_id')['lesson_count_processed'].sum().reset_index()
                     .rename(columns={'lesson_count_processed': 'stu_comment_log_good_behavior'}))
    # 1 to 3 star reviews.
    df_bad_label = (weighted.loc[~is_good]
                    .groupby('awj_teacher_id')['lesson_count_processed'].sum().reset_index()
                    .rename(columns={'lesson_count_processed': 'stu_comment_log_bad_behavior'}))

    # merge; assign the filled columns back instead of a chained inplace
    # fillna, which may act on a temporary.
    df_stu_comment_res = pd.merge(df_good_label, df_bad_label, on='awj_teacher_id', how='outer')
    filled = ['stu_comment_log_bad_behavior', 'stu_comment_log_good_behavior']
    df_stu_comment_res[filled] = df_stu_comment_res[filled].fillna(value=0)
    return df_stu_comment_res, df_stu_comment

In [216]:
# Rolling-window alternative, kept for reference (end of the previous month,
# looking back three months):
#     end_time = pd.to_datetime(datetime.datetime.now().date()) + MonthEnd(n=-1) + DateOffset(hours=23.99999)
#     start_time = end_time - DateOffset(months=3)
# Fixed scoring window used for the test data.
end_time = datetime.datetime(year=2018, month=4, day=30, hour=23, minute=59, second=59)
start_time = datetime.datetime(year=2018, month=1, day=30, hour=23, minute=59, second=59)
# Class types that are excluded from scoring.
class_type_name_special = [
    'Demo',
    '补课(非爱乐奇直属老师)',
    '托福班（30刀）',
    'TOFEL',
    'VIP Writing/TOFEL（35刀）',
    'Elite Pilot',
    'Feeback Session',
    'New Teacher Test Class',
    '补课(爱乐奇直属老师)',
    'Test Class',
    'Academic Meeting (Long)',
    'Cur Experience Session-S',
    'Training-receiving',
    'Cur Experience Session-L',
    'VIP Writing/TOFEL',
    'Orientation Class',
    'Academic Meeting',
    'Experience-receiving',
]
# Institutions that are excluded from scoring.
hq_name_special = ['test']
print('start_time:', start_time, '\n', 'end_time:', end_time, '\n')






##### Teacher monitoring table #####
# dtypes, exact-duplicate removal and sort in one chain
df_teacher_monitoring = (
    df_teacher_monitoring
    .assign(
        awj_teacher_id=lambda frame: frame['awj_teacher_id'].astype('int'),
        created_at=lambda frame: pd.to_datetime(frame['created_at']),
    )
    .drop_duplicates(keep='first')
    .sort_values(by=['awj_teacher_id', 'awjcls_lesson_id', 'created_at'],
                 ascending=[True, True, True])
)
# Only abnormal_type 4 and 5 are kept from this table: they stand for a zoom
# crash and a courseware crash respectively, i.e. not caused by the teacher.
_is_platform_failure = df_teacher_monitoring['abnormal_type'].isin([4, 5])
# One row (the latest after the sort above) per teacher / lesson / type.
df_teacher_monitoring = (
    df_teacher_monitoring
    .loc[_is_platform_failure]
    .groupby(['awj_teacher_id', 'awjcls_lesson_id', 'abnormal_type'], as_index=False)
    .last()
)


#     df_teacher_monitoring.to_csv("教师监控_res.csv", index=False)








##### Teacher QC detail table #####
# dtypes
df_qc['awj_teacher_id'] = df_qc['awj_teacher_id'].astype('int')
for _time_col in ('score_recorded_at', 'assigned_at', 'start_time', 'end_time'):
    df_qc[_time_col] = pd.to_datetime(df_qc[_time_col])
# drop duplicates - the same lesson of a teacher is sometimes QC'd several
# times; the scores were checked to be identical, so de-duplicating on the
# fields below is enough
columns = ['awj_teacher_id', 'awjcls_lesson_id', 'score']
# de-duplicate, then sort
df_qc = (
    df_qc
    .drop_duplicates(subset=columns, keep='last')
    .sort_values(by=['awj_teacher_id', 'start_time'], ascending=[True, True])
)

#     df_qc.to_csv("教师qc_res.csv", index=False)







##### Teacher behavior table #####
# dtypes
df_teacher_behavior['awj_teacher_id'] = df_teacher_behavior['awj_teacher_id'].astype('int')
for _time_col in ('start_time', 'end_time', 'actual_start_time', 'actual_end_time'):
    df_teacher_behavior[_time_col] = pd.to_datetime(df_teacher_behavior[_time_col])
df_teacher_behavior = df_teacher_behavior.drop_duplicates()
# sort
# The last sort field used to be "积分变化"; it was changed to
# teacher_status_for_lesson.
df_teacher_behavior = df_teacher_behavior.sort_values(by=[
        'awj_teacher_id', 'awj_lesson_id', 'start_time', 'teacher_status_for_lesson'],
        ascending=[True, True, True, True])
# teacher id + lesson id should be unique, but duplicates occur because some
# teachers first asked for leave and then taught the lesson after all; the
# leave record must go, so only the last record per lesson (in the sort order
# above) is kept.
df_teacher_behavior = df_teacher_behavior.groupby(
            ['awj_teacher_id', 'awj_lesson_id'], as_index=False).last()
# Compare with the monitoring table and remove 'abnormal_lesson' records whose
# abnormal type is 4 or 5.
# The monitoring table can hold one row per abnormal_type for the same lesson,
# so it is reduced to unique (teacher, lesson) keys before the merge; merging
# the raw rows would duplicate lessons that have both a type-4 and a type-5
# record and double count them downstream.
_platform_failures = (
    df_teacher_monitoring
    .loc[df_teacher_monitoring['abnormal_type'].isin([4, 5]),
         ['awj_teacher_id', 'awjcls_lesson_id']]
    .drop_duplicates()
    .rename(columns={'awjcls_lesson_id': 'awj_lesson_id'})
)
_platform_failures['platform_failure'] = True
df_teacher_behavior = pd.merge(df_teacher_behavior, _platform_failures,
                               on=['awj_teacher_id', 'awj_lesson_id'], how='left')
_excused = (
    (df_teacher_behavior['teacher_status_for_lesson'] == 'abnormal_lesson') &
    (df_teacher_behavior['platform_failure'].notnull()))
df_teacher_behavior = df_teacher_behavior.loc[~_excused].drop('platform_failure', axis=1)

#     df_teacher_behavior.to_csv("教师行为_res.csv", index=False)







##### Teacher info table #####
# dtypes
df_teacher_info['awj_teacher_id'] = df_teacher_info['awj_teacher_id'].astype(int)
for _time_col in ('创建时间', '首次上架时间', '首课时间'):
    df_teacher_info[_time_col] = pd.to_datetime(df_teacher_info[_time_col])
df_teacher_info = df_teacher_info.drop_duplicates(keep='first')
# Per business requirement only these teacher types are kept.
_kept_teacher_types = ['booking&arrangement', 'arrangement_only', 'booking_only']
# filter, sort, and keep only the columns used downstream
df_teacher_info = (
    df_teacher_info
    .loc[df_teacher_info['teacher_type'].isin(_kept_teacher_types)]
    .sort_values(by=['awj_teacher_id'], ascending=[True])
    [['awj_teacher_id', 'state', '创建时间', '首次上架时间', '首课时间']]
)


#     df_teacher_info.to_csv("教师信息_res.csv", index=False)







##### Student review detail table #####
# dtypes
df_stu_comment['awj_teacher_id'] = df_stu_comment['awj_teacher_id'].astype(int)
df_stu_comment['学生评价星级'] = df_stu_comment['学生评价星级'].astype('str')
# keep only rows carrying one of the five valid star ratings
_valid_stars = ['1-star', '2-star', '3-star', '4-star', '5-star']
df_stu_comment = df_stu_comment.loc[df_stu_comment['学生评价星级'].isin(_valid_stars)]
df_stu_comment = df_stu_comment.assign(**{
    _time_col: pd.to_datetime(df_stu_comment[_time_col])
    for _time_col in ('评价时间', '课程开始时间', '课程结束时间')
})
# exact-duplicate removal, then sort
df_stu_comment = (
    df_stu_comment
    .drop_duplicates(keep='first')
    .sort_values(by=['awj_teacher_id', '评价时间'], ascending=[True, True])
)
# Alternative that was tried: when a teacher teaches two lessons in one
# session, a good review should not count just once:
#     df_stu_comment['lesson_count'] = ((df_stu_comment['课程结束时间'] - 
#                               df_stu_comment['课程开始时间']).dt.total_seconds()) / 3600 * 2
# A review still counts once here; counting it twice performed very poorly.
df_stu_comment = (
    df_stu_comment
    .assign(lesson_count=1)
    .groupby(['评价id'], as_index=False)
    .last()
)

#     df_stu_comment.to_csv("学生评价_res.csv", index=False)
    
    
    
    
    
    
    
    
    
    


start_time: 2018-01-30 23:59:59 
 end_time: 2018-04-30 23:59:59 



In [217]:
# Call the feature functions.
_exclusions = (class_type_name_special, hq_name_special)

# QC features for the scoring window.
df_qc_res = teacher_qc_score(start_time, end_time, df_qc, *_exclusions)

#     df_qc_res.to_csv("df_qc_res.csv", index=False)

# Behavior features for the scoring window.
df_teacher_behavior_res, df_teacher_behavior_processed = teacher_behavior(
    start_time, end_time, df_teacher_behavior, *_exclusions)

#     df_teacher_behavior_res.to_csv("df_teacher_behavior_res.csv", index=False)

# Some lessons in the student review table are floater lessons; they have to
# be removed by comparing against the lessons of the behavior table (the
# behavior table does not have this problem). The lookup uses a window that
# starts five years earlier.
df_teacher_behavior_res_cache, df_teacher_behavior_processed_cache = teacher_behavior(
    start_time - DateOffset(years=5), end_time, df_teacher_behavior, *_exclusions)

# Keep only the lesson keys and add the marker column that stu_comment checks.
df_teacher_behavior_processed_cache = (
    df_teacher_behavior_processed_cache[['awj_teacher_id', 'awj_lesson_id']]
    .assign(is_deleted='no')
)

#     df_teacher_behavior_processed_cache.to_csv("df_teacher_behavior_cache_res.csv", index=False)

######
# Student review features for the scoring window.
df_stu_comment_res, df_stu_comment_check = stu_comment(
    start_time, end_time, df_stu_comment,
    class_type_name_special, hq_name_special, df_teacher_behavior_processed_cache)


#     df_stu_comment_res.to_csv("df_stu_comment_res.csv", index=False)








# Author's note: the QC function is called a second time further down, which
# made the results inaccurate and caused the output of the first QC test to be
# entirely wrong.

# Wide table: teacher info joined with the behavior, QC and review features.
df_wide = (
    df_teacher_info
    .merge(df_teacher_behavior_res, on='awj_teacher_id', how='left')
    .merge(df_qc_res, on='awj_teacher_id', how='left')
    .merge(df_stu_comment_res, on='awj_teacher_id', how='left')
)

#     df_wide.to_csv("df_wide_check.csv", index = False)

# Some teachers may have no QC score within the 3-month window, so the
# historical QC records (window starting five years earlier) are processed as
# well, to fill in the 0-score cases later.
df_qc_all = teacher_qc_score(start_time - DateOffset(years=5), end_time, df_qc, 
                             class_type_name_special, hq_name_special)
_all_time_names = {
    'teacher_score_max': 'teacher_score_max_all', 
    'teacher_score_min': 'teacher_score_min_all', 
    'log_decay_score_mean': 'log_decay_score_mean_all',
}
df_qc_all = (
    df_qc_all[['awj_teacher_id', 'teacher_score_max', 
               'teacher_score_min', 'log_decay_score_mean']]
    .rename(columns=_all_time_names)
)
df_wide = df_wide.merge(df_qc_all, on='awj_teacher_id', how='left')
# Some teachers never taught a lesson but did ask for leave. So that they take
# part in the scoring instead of directly getting three stars,
# normal_lesson_log_count is smoothed for them.
_leave_only = (df_wide['normal_lesson_log_count'] == 0) & (df_wide['ask_for_leave_log_count'] > 0)
df_wide.loc[_leave_only, 'normal_lesson_log_count'] = math.log(2, 10)


#     df_wide.to_csv("df_wide_res.csv", index=False)




# Derived fields
df_wide['log_ask_for_leave/log_normal_lesson'] = df_wide['ask_for_leave_log_count'] / df_wide['normal_lesson_log_count']
df_wide['abnormal_all_log_count'] = (df_wide['no_show_lesson_log_count'] + 
                                  df_wide['late_lesson_log_count'] + 
                                  df_wide['abnormal_lesson_log_count'] + 
                                  df_wide['ask_for_leave_log_count'])
df_wide['abnormal_all_log_percent'] = (df_wide['abnormal_all_log_count']) / (
                                  df_wide['normal_lesson_log_count'] + 
                                  df_wide['abnormal_all_log_count'])
# Days between the first lesson and the end of the window.
df_wide['lesson_time_range'] = ((end_time - df_wide['首课时间']).dt.total_seconds()) / (3600 * 24) 
# Fall back to the account creation time when there is no usable first-lesson
# time. A missing first lesson yields NaN (not 0) above, so the NaN case has to
# be tested explicitly; comparing with 0 alone never selects those rows.
_no_first_lesson = df_wide['lesson_time_range'].isnull() | (df_wide['lesson_time_range'] == 0)
df_wide.loc[_no_first_lesson, 
    'lesson_time_range'] = ((end_time - df_wide['创建时间']).dt.total_seconds()) / (3600 * 24)  


#     df_wide.to_csv("df_wide_final_res.csv", index=False)
    


df_stu_comment_floater classes existed: (759, 11)
df_stu_comment_floater classes deleted: (759, 11)


In [218]:
# Author's notes: in PyCharm the later `main` has to init two functions, so
# this code ended up being referenced twice, which is where the problem came
# from (PyCharm seemed to call it in a loop and raised an error).
# This part can only be executed once: it drops columns from df_wide, so a
# second run raises a KeyError.
# Decide whether a teacher is a new teacher.
df_wide['old_new_teacher'] = 'old'
# NOTE(review): 'oboard' looks like a typo for 'onboard' - confirm against the
# actual values of the `state` column before changing it.
df_wide.loc[(df_wide['首课时间'].isnull()) & 
            (df_wide['state'].isin(['oboard', 'active'])), 'old_new_teacher'] = 'new'
# Missing value filling
# Remove the fields that are no longer needed.
df_wide.drop(['创建时间', '首次上架时间', '首课时间', 'state'], axis=1, inplace=True)
columns = list(df_wide.columns)
columns.remove('awj_teacher_id')
columns.remove('old_new_teacher')
# New teachers: every feature is set to the mean over the teachers that have
# normal lessons. The columns are processed one by one, and the mask is
# re-evaluated for each column on purpose (kept exactly as before).
for itm in columns:
    df_wide.loc[df_wide['old_new_teacher'] == 'new', itm] = df_wide.loc[
        (df_wide['normal_lesson_log_count'] > 0), itm].mean()
# Old teachers: missing values become 0.
df_wide.fillna(value=0, inplace=True)
new_teacher = df_wide.loc[df_wide['old_new_teacher'] == 'new', 'awj_teacher_id']
print('new teacher no:', new_teacher.shape[0])
# Some teachers never asked for leave, so their advanced_days fields are 0;
# fill them with the mean of the non-zero values of the teachers that have
# normal lessons.
# The two conditions of the mean are parenthesized explicitly. The previous
# form `(a != 0) & b > 0` was parsed as `((a != 0) & b) > 0` because `&` binds
# tighter than `>`, and only gave a sensible mask because the float column was
# coerced to booleans.
_has_lessons = df_wide['normal_lesson_log_count'] > 0
_never_on_leave = (df_wide['ask_for_leave_log_count'] == 0) & _has_lessons
for _col in ('advanced_days_max', 'advanced_days_log_mean'):
    df_wide.loc[_never_on_leave, _col] = df_wide.loc[
        (df_wide[_col] != 0) & _has_lessons, _col].mean()

# df_wide.to_csv("judge_new_teacher_res.csv", index=False)




# Smoothing
# smooth0: add log10(2) to every "bad behavior" count so that none of them is
# exactly zero. Not idempotent: running this twice adds the offset twice.
smooth = ['late_lesson_log_count', 'no_show_lesson_log_count', 'abnormal_lesson_log_count', 
          'ask_for_leave_log_count', 
          'stu_comment_log_bad_behavior']
df_wide[smooth] = df_wide[smooth] + math.log(2, 10)
# Field processing
# Ratio of each smoothed count to the normal lesson count.
for _status in ('late_lesson', 'no_show_lesson', 'abnormal_lesson', 'ask_for_leave'):
    df_wide[_status + '_log_percent'] = (
        df_wide[_status + '_log_count'] / df_wide['normal_lesson_log_count'])

# df_wide.to_csv('smooth_0_res.csv', index=False)



# smooth1
# When one of a teacher's abnormal behaviors shows a large outlier, or the
# teacher is poor overall, normal_lesson_log_count is lowered to a
# correspondingly smaller value so that the overall performance gets worse.
# counts[i], columns[i], cache_columns[i], quantiles[i] and indexes[i] all
# describe the same behavior (leave, late, no-show, abnormal lesson, all).
counts = ['ask_for_leave_log_count', 'late_lesson_log_count', 
           'no_show_lesson_log_count', 'abnormal_lesson_log_count', 
           'abnormal_all_log_count']
columns = ['ask_for_leave_log_percent', 'late_lesson_log_percent', 
           'no_show_lesson_log_percent', 'abnormal_lesson_log_percent', 
           'abnormal_all_log_percent']
# One working copy of normal_lesson_log_count per behavior; the column names
# (including the 'cahce' spelling) are referenced again further down.
cache_columns = ['normal_lesson_log_count_processed_ask_for_leave_cahce1', 
                 'normal_lesson_log_count_processed_late_lesson_cache2', 
                 'normal_lesson_log_count_processed_no_show_cache3', 
                 'normal_lesson_log_count_processed_abnormal_lesson_cache4', 
                 'normal_lesson_log_count_processed_abnormal_all_cahce5'
                ]
for itm in cache_columns:
    df_wide[itm] = df_wide['normal_lesson_log_count']
df_wide['normal_lesson_log_count_processed'] = df_wide['normal_lesson_log_count']

# Quantile edges per behavior; consecutive edges delimit one penalty bin.
quantiles = [[0.6, 0.7, 0.75, 0.8, 1], [0.8, 0.9, 0.92, 0.95, 1], 
            [0.5, 0.6, 0.66, 0.75, 0.8, 0.85, 0.9, 1], [0.8, 0.85, 0.9, 0.95, 1], 
            [0.55, 0.66, 0.75, 0.85, 1]]
# Reduction factor applied to normal_lesson_log_count in each bin.
indexes = [[0.8, 0.6, 0.3, 0.1], [0.8, 0.7, 0.3, 0.1], 
          [0.8, 0.75, 0.6, 0.5, 0.3, 0.1, 0.03], [0.9, 0.85, 0.7, 0.3], 
          [0.8, 0.7, 0.3, 0.1]]
for i in range(len(columns)):
    for k in range(len(quantiles[i]) - 1):
        # Bin edges are quantiles of the ratio over teachers with normal lessons.
        standard1 = df_wide.loc[df_wide['normal_lesson_log_count'] > 0, 
                                columns[i]].quantile(quantiles[i][k])
        standard2 = df_wide.loc[df_wide['normal_lesson_log_count'] > 0, 
                                columns[i]].quantile(quantiles[i][k + 1])
        # Lower the normal lesson count for ratios in (standard1, standard2].
        df_wide.loc[(df_wide['normal_lesson_log_count'] > 0) & 
            (df_wide[columns[i]] > standard1) & (df_wide[columns[i]] <= standard2), 
            cache_columns[i]] = df_wide['normal_lesson_log_count'] * indexes[i][k]
        # New teachers only land in the bin because of the smoothing offset, so
        # the value is restored and they are not penalized: the count equals
        # exactly log10(2) and the teacher has at most 8 weighted normal lessons.
        # NOTE(review): 'abnormal_all_log_count' is built from the unsmoothed
        # counts elsewhere in the notebook, so the log10(2) equality test may
        # not identify a zero count for i == 4 - confirm.
        df_wide.loc[(df_wide['normal_lesson_log_count'] > 0) & 
            (df_wide[columns[i]] > standard1) & (df_wide[columns[i]] <= standard2) & 
            (df_wide[counts[i]] == math.log(2, 10)) & 
            (df_wide['normal_lesson_log_count'] <= 8), 
            cache_columns[i]] = df_wide['normal_lesson_log_count']

# df_wide.to_csv('smooth_1_res.csv', index=False)




# smooth_final
# Take the smallest of the five cache columns as the penalized lesson count.
df_wide['normal_lesson_log_count_processed'] = df_wide[cache_columns].min(axis=1)
# Make all features point in the same direction.
# Recompute the four ratios against the penalized lesson count.
for _status in ('late_lesson', 'no_show_lesson', 'abnormal_lesson', 'ask_for_leave'):
    df_wide[_status + '_log_percent_processed'] = (
        df_wide[_status + '_log_count'] / df_wide['normal_lesson_log_count_processed'])
# NOTE(review): lesson_time_range is computed in days elsewhere in the
# notebook, so this is a per-day rate despite the "per_week" name - confirm.
df_wide['normal_log_lesson_per_week'] = df_wide['normal_lesson_log_count_processed'] / (df_wide['lesson_time_range']) 
# Take reciprocals, so that larger means better.
rcp = ['late_lesson_log_processed', 'no_show_lesson_log_processed', 
       'abnormal_lesson_log_processed', 'ask_for_leave_log_processed', 
       'stu_comment_log_bad_behavior_processed']
cols = ['late_lesson_log_percent_processed', 'no_show_lesson_log_percent_processed', 
        'abnormal_lesson_log_percent_processed', 'ask_for_leave_log_percent_processed', 
        'stu_comment_log_bad_behavior']
for _target, _source in zip(rcp, cols):
    df_wide[_target] = 1 / df_wide[_source]

# df_wide.to_csv('smooth_final_res.csv', index=False)







# fix_process_0
# Correct the case of a teacher with few lessons but many good reviews
# (e.g. teacher id 642).
columns = ['late_lesson_log_count', 'no_show_lesson_log_count', 
          'ask_for_leave_log_count', 'abnormal_lesson_log_count']
no_smooths = ['late_lesson_log_count_no_smooth', 'no_show_lesson_log_count_no_smooth', 
          'ask_for_leave_log_count_no_smooth', 'abnormal_lesson_log_count_no_smooth']
# Undo the smoothing offset: a count that equals exactly log10(2) was zero
# before smoothing.
for _smoothed, _plain in zip(columns, no_smooths):
    df_wide[_plain] = df_wide[_smoothed].where(df_wide[_smoothed] != math.log(2, 10), 0)
# Good reviews relative to all (unsmoothed) lessons of the teacher.
_all_lessons = (df_wide['normal_lesson_log_count']
                + df_wide['late_lesson_log_count_no_smooth'] 
                + df_wide['no_show_lesson_log_count_no_smooth'] 
                + df_wide['ask_for_leave_log_count_no_smooth'] 
                + df_wide['abnormal_lesson_log_count_no_smooth'])
df_wide['stu_comment_log_good_behavior_processed'] = df_wide['stu_comment_log_good_behavior'] / _all_lessons
# Bad reviews relative to the normal lessons only (this overwrites the
# reciprocal stored under the same name by the smooth_final step).
df_wide['stu_comment_log_bad_behavior_processed'] = (
    df_wide['stu_comment_log_bad_behavior'] / df_wide['normal_lesson_log_count'])


# df_wide.to_csv('fix_process_0_res.csv', index=False)







# fix_process_1
# Correct the case of a teacher who asks for leave too often but whose
# "days of advance notice" figures look too good (e.g. teacher id 642).
# Both advance-notice columns are scaled down by indexes[k] for teachers whose
# leave / normal-lesson ratio falls into the k-th quantile bin.
# Not idempotent: re-running this block scales the columns again.
columns = ['advanced_days_log_mean', 'advanced_days_max']
quantiles = [0.6, 0.66, 0.7, 0.75, 0.77, 0.8, 0.85, 1]
indexes = [0.9, 0.8, 0.7, 0.5, 0.3, 0.1, 0.05]
for i in range(len(columns)):
    for k in range(len(quantiles) - 1):
        # Bin edges: quantiles of the leave ratio over teachers with normal
        # lessons (they do not depend on the column being scaled).
        percent1 = df_wide.loc[df_wide['normal_lesson_log_count'] > 0, 
                            'log_ask_for_leave/log_normal_lesson'].quantile(quantiles[k])
        percent2 = df_wide.loc[df_wide['normal_lesson_log_count'] > 0, 
                            'log_ask_for_leave/log_normal_lesson'].quantile(quantiles[k + 1])
        # Scale the column for leave ratios in (percent1, percent2].
        df_wide.loc[(df_wide['normal_lesson_log_count'] > 0) & 
                    (df_wide['log_ask_for_leave/log_normal_lesson'] > percent1) & 
                    (df_wide['log_ask_for_leave/log_normal_lesson'] <= percent2), 
                    columns[i]] = df_wide[columns[i]] * indexes[k]

# df_wide = df_wide.round(5)
# df_wide.to_csv('fix_process_1_res.csv', index=False)








# fix_process_final
# Correct teachers that had no QC within the past 3 months and therefore a
# score of 0: use the value computed from their whole QC history instead.
columns = ['log_decay_score_mean', 'teacher_score_min', 'teacher_score_max']
for itm in columns:
    _missing_recent_qc = (df_wide['normal_lesson_log_count'] > 0) & (df_wide[itm].isin([np.nan, 0]))
    df_wide.loc[_missing_recent_qc, itm] = df_wide[itm + '_all']
# Teachers without any lesson record: every feature is set to 0.
columns = [name for name in df_wide.columns if name != 'awj_teacher_id']
_no_lessons = df_wide['normal_lesson_log_count'] == 0
for itm in columns:
    df_wide.loc[_no_lessons, itm] = 0
# Keep only the teachers that have behavior data, then
# get_df_wide_final: delete the intermediate columns.
_intermediate_columns = [
    'advanced_days_std',
    'teacher_score_std',
    'advanced_days_min',
    'lesson_time_range',
    'old_new_teacher',
    'teacher_qc_count',
    'normal_lesson_log_count',
    'late_lesson_log_count',
    'no_show_lesson_log_count',
    'abnormal_lesson_log_count',
    'ask_for_leave_log_count',
    'normal_log_lesson_per_week',
    'stu_comment_log_good_behavior',
    'stu_comment_log_bad_behavior',
    'log_ask_for_leave/log_normal_lesson',
    'abnormal_lesson_log_percent',
    'late_lesson_log_percent',
    'no_show_lesson_log_percent',
    'ask_for_leave_log_percent',
    'abnormal_all_log_count',
    'late_lesson_log_percent_processed',
    'no_show_lesson_log_percent_processed',
    'abnormal_lesson_log_percent_processed',
    'ask_for_leave_log_percent_processed',
    'normal_lesson_log_count_processed',
    'abnormal_all_log_percent',
    'normal_lesson_log_count_processed_ask_for_leave_cahce1',
    'normal_lesson_log_count_processed_late_lesson_cache2',
    'normal_lesson_log_count_processed_no_show_cache3',
    'normal_lesson_log_count_processed_abnormal_lesson_cache4',
    'normal_lesson_log_count_processed_abnormal_all_cahce5',
    'log_decay_score_mean_all',
    'teacher_score_max_all',
    'teacher_score_min_all',
    'stu_comment_log_bad_behavior_processed',
    'late_lesson_log_count_no_smooth',
    'no_show_lesson_log_count_no_smooth',
    'ask_for_leave_log_count_no_smooth',
    'abnormal_lesson_log_count_no_smooth',
]
df_wide_final = (
    df_wide
    .loc[df_wide['normal_lesson_log_count'] > 0]
    .drop(_intermediate_columns, axis=1)
)
# save
# df_wide_final = df_wide_final.round(5)
# df_wide_final.to_csv('gen_df_wide_final_res.csv', index=False)


new teacher no: 0


In [265]:
# Show the feature columns that go into the factor scoring.
# NOTE(review): this cell carries execution count 265 while its neighbours are
# 218 and 219, i.e. it was run out of order; re-run the notebook top to bottom
# before relying on the displayed output.
df_wide_final.columns

Index(['awj_teacher_id', 'advanced_days_log_mean', 'advanced_days_max',
       'teacher_score_max', 'teacher_score_min', 'log_decay_score_mean',
       'late_lesson_log_processed', 'no_show_lesson_log_processed',
       'abnormal_lesson_log_processed', 'ask_for_leave_log_processed',
       'stu_comment_log_good_behavior_processed'],
      dtype='object')

In [219]:
#############################################################

In [220]:
# 因子载荷矩阵

In [221]:
# FA_indexs: factor loading matrix, read from the data folder and displayed.
_fa_index_file = path + 'fa_indexs.xlsx'
fa_index = pd.read_excel(_fa_index_file)
fa_index

Unnamed: 0,fa_index1,fa_index2,fa_index3,fa_index4
0,-0.062063,-0.028543,0.530873,0.008978
1,-0.087088,-0.017771,0.545701,-0.019414
2,-0.027714,0.344296,-0.032156,0.028113
3,-0.036995,0.346024,-0.001726,-0.060828
4,-0.032262,0.358875,-0.021149,-0.026614
5,0.294944,-0.023287,-0.051459,-0.005215
6,0.324883,-0.028596,-0.071127,0.020754
7,0.321721,-0.017894,-0.08745,0.007256
8,0.275466,-0.043566,-0.015902,0.034857
9,0.023129,-0.02892,-0.006686,1.000274


In [222]:
# Factor loading matrix as a plain array. np.asarray replaces
# DataFrame.as_matrix(), which no longer exists in current pandas, and it also
# accepts an array, so this cell can be re-run.
fa_index = np.asarray(fa_index)
# Feature matrix: every column of df_wide_final except the teacher id
# (df_wide_final itself is left untouched).
df_wide_matrix = df_wide_final.drop(['awj_teacher_id'], axis=1).values
# Standardize df_wide.
df_wide_matrix = preprocessing.scale(df_wide_matrix)
# Factor scores of every teacher.
fa_score = np.dot(df_wide_matrix, fa_index)
# Contribution rate of each factor, normalized by the cumulative rate 0.87332.
var = np.array([[0.39025 / 0.87332], 
                [0.23851 / 0.87332], 
                [0.14904 / 0.87332], 
                [0.09552 / 0.87332]])
# Final score of every teacher: contribution-weighted sum of the factor scores.
final_score = np.dot(fa_score, var)
# DataFrame form: four factor scores plus the final score.
teacher_fa_score = pd.DataFrame(np.hstack((fa_score, final_score)))
teacher_fa_score['awj_teacher_id'] = list(df_wide_final['awj_teacher_id'])
teacher_fa_score = teacher_fa_score.rename(columns={
    0: 'teacher_behavior_score', 1: 'teacher_qc_score', 
    2: 'teacher_attitude_score', 3: 'student_comment_score', 
    4: 'final_score'})
# Star mapping comes next (teachers without lessons in the past period were
# already removed); sort best score first.
teacher_fa_score = teacher_fa_score.sort_values(by='final_score', ascending=False)

# teacher_fa_score = teacher_fa_score.round(5)
# teacher_fa_score.to_csv("gen_fa_score_dataframe_res.csv", index=False)

# teacher_fa_score = teacher_fa_score.round(5)
# teacher_fa_score.to_csv("gen_fa_score_dataframe_res.csv", index=False)

In [224]:
#############################################################

In [225]:

# Quantile cut-offs required by the business (10% / 20% / 50% / 80% of final_score).
star_2, star_3, star_4, star_5 = [
    teacher_fa_score['final_score'].quantile(level)
    for level in (0.1, 0.2, 0.5, 0.8)
]

# One (star, row mask) pair per band; each band is open on the left and closed
# on the right, except the lowest (<= star_2) and the highest (> star_5).
score_col = teacher_fa_score['final_score']
star_rules = [
    (1, score_col <= star_2),
    (2, (score_col > star_2) & (score_col <= star_3)),
    (3, (score_col > star_3) & (score_col <= star_4)),
    (4, (score_col > star_4) & (score_col <= star_5)),
    (5, score_col > star_5),
]
for star_value, star_mask in star_rules:
    teacher_fa_score.loc[star_mask, 'star'] = star_value

# teacher_fa_score = teacher_fa_score.round(5)
# teacher_fa_score.to_csv("gen_star_rate_res.csv", index=False)

In [226]:
# Join back onto df_wide: the right join keeps every teacher id present in
# df_wide, and rows that have no score/star get 0 in place of NaN.
teacher_fa_score = (
    teacher_fa_score
    .merge(df_wide[['awj_teacher_id', 'normal_lesson_log_count']],
           how='right', on='awj_teacher_id')
    .fillna(0)
)
# Write the result to teacher_star.csv under `path`, floats with 5 decimals.
teacher_fa_score.to_csv(
    path + 'teacher_star.csv',
    sep=',',
    float_format='%.5f',
    encoding='utf-8',
    index=False,
)

# teacher_fa_score = teacher_fa_score.round(5)
# teacher_fa_score.to_csv("gen_final_table_res.csv", index=False)

In [227]:
################################################################################