In [1]:
import pandas as pd
from pandas.tseries.offsets import *
from sklearn import preprocessing
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import datetime
import math
from copy import deepcopy
import json
from itertools import combinations, permutations
import warnings
warnings.filterwarnings("ignore")  # 大量关于pandas函数版本的warnings没有意义，选择屏蔽



% matplotlib inline
# Widen the notebook display limits. The fully-qualified "display." keys are
# used because pandas treats option names as patterns: on releases that also
# define "styler.render.max_columns" the bare 'max_columns' is ambiguous.
pd.set_option('display.max_columns', 100)
pd.set_option('display.max_rows', 500)

In [2]:
def teacher_qc_score(start_time, end_time, df_qc, class_type_name_special, hq_name_special):
    """Summarise each teacher's QC scores inside the scoring window.

    Rows are kept only when ``class_type_name`` is not in
    ``class_type_name_special``, ``hq_name`` is not in ``hq_name_special`` and
    the lesson lies within ``[start_time, end_time]``.  Every kept score is
    weighted by ``1 / log10(days + 2)``, where ``days`` is the time between
    ``score_recorded_at`` and ``end_time`` (floored at 0), so recent scores
    weigh more.

    Parameters
    ----------
    start_time, end_time : Timestamp-like bounds compared against the
        ``start_time`` / ``end_time`` columns.
    df_qc : DataFrame with columns ``awj_teacher_id``, ``class_type_name``,
        ``hq_name``, ``start_time``, ``end_time``, ``score_recorded_at`` and
        ``score``.  It is not modified.
    class_type_name_special, hq_name_special : lists of values to exclude.

    Returns
    -------
    DataFrame with one row per teacher and the columns ``awj_teacher_id``,
    ``teacher_score_max``, ``teacher_score_min``, ``teacher_qc_count``,
    ``teacher_score_std`` and ``log_decay_score_mean``.
    """
    in_scope = (
        ~df_qc['class_type_name'].isin(class_type_name_special)
        & ~df_qc['hq_name'].isin(hq_name_special)
        & (df_qc['start_time'] >= start_time)
        & (df_qc['end_time'] <= end_time)
    )
    # Work on an explicit copy so the new columns are never written onto a
    # slice of the caller's frame (avoids SettingWithCopyWarning).
    qc = df_qc.loc[in_scope].copy()

    # Age of each score in days; QC is sometimes recorded after end_time,
    # which would give a negative age, so floor it at 0.
    age_days = (end_time - qc['score_recorded_at']).dt.total_seconds() / (3600 * 24)
    age_days[age_days < 0] = 0
    # +2 keeps log10 strictly positive, so the reciprocal is always finite.
    qc['decay_index'] = 1 / np.log10(age_days + 2)
    qc['log_decay_score'] = qc['score'] * qc['decay_index']

    # Decay-weighted mean score: sum(score * weight) / sum(weight).
    weighted = qc.groupby('awj_teacher_id')[['log_decay_score', 'decay_index']].sum()
    weighted['log_decay_score_mean'] = weighted['log_decay_score'] / weighted['decay_index']
    weighted = weighted[['log_decay_score_mean']].reset_index()

    # Plain score statistics.  Grouping with the key as index and resetting
    # once gives the same columns on every pandas version.
    stats = qc.groupby('awj_teacher_id')['score'].agg(['max', 'min', 'count', 'std'])
    # A teacher with a single score has no std; use the mean std instead.
    stats.loc[stats['std'].isnull(), 'std'] = stats['std'].mean()
    stats = stats.rename(columns={'max': 'teacher_score_max', 'min': 'teacher_score_min',
                                  'count': 'teacher_qc_count', 'std': 'teacher_score_std'})
    stats = stats.reset_index()

    return pd.merge(stats, weighted, on='awj_teacher_id', how='left')

In [3]:
def teacher_behavior(start_time, end_time, df_teacher_behavior, 
                         class_type_name_special, hq_name_special):
    """Build decay-weighted lesson-behaviour features per teacher.

    Rows are kept when the class type (``上课类型``) and organisation (``机构``)
    are not in the exclusion lists, the lesson lies within
    ``[start_time, end_time]`` and the status is not ``system_failure``.
    Each lesson counts ``duration_in_hours * 2`` (so a 30-minute lesson is 1)
    and is weighted by ``1 / log10(days + 2)``, where ``days`` is the time
    between the lesson start and ``end_time`` (floored at 0).

    Parameters
    ----------
    start_time, end_time : Timestamp-like bounds of the window.
    df_teacher_behavior : DataFrame with columns ``awj_teacher_id``,
        ``上课类型``, ``机构``, ``start_time``, ``end_time``,
        ``teacher_status_for_lesson`` and ``ask_for_leave_advanced_minutes``.
        It is not modified.
    class_type_name_special, hq_name_special : lists of values to exclude.

    Returns
    -------
    DataFrame with one row per teacher present after filtering and the
    columns ``awj_teacher_id``, ``normal_lesson_log_count``,
    ``late_lesson_log_count``, ``no_show_lesson_log_count``,
    ``abnormal_lesson_log_count``, ``ask_for_leave_log_count``,
    ``advanced_days_log_mean``, ``advanced_days_min``, ``advanced_days_max``
    and ``advanced_days_std``; missing values are filled with 0.
    """
    in_scope = (
        ~df_teacher_behavior['上课类型'].isin(class_type_name_special)
        & ~df_teacher_behavior['机构'].isin(hq_name_special)
        & (df_teacher_behavior['start_time'] >= start_time)
        & (df_teacher_behavior['end_time'] <= end_time)
        & ~df_teacher_behavior['teacher_status_for_lesson'].isin(['system_failure'])
    )
    # Explicit copy: the columns added below must not be written onto a slice
    # of the caller's frame (avoids SettingWithCopyWarning).
    lessons = df_teacher_behavior.loc[in_scope].copy()

    # Lesson size in half-hour units.
    lessons['lesson_count'] = (
        (lessons['end_time'] - lessons['start_time']).dt.total_seconds() / 3600 * 2)
    # A no-show gave no advance notice at all.
    lessons.loc[lessons['teacher_status_for_lesson'] == 'no_show',
                'ask_for_leave_advanced_minutes'] = 0
    lessons['ask_for_leave_advanced_days'] = (
        lessons['ask_for_leave_advanced_minutes'] / (60 * 24))

    # Time decay: +2 keeps log10 strictly positive so the reciprocal is finite.
    age_days = (end_time - lessons['start_time']).dt.total_seconds() / (3600 * 24)
    age_days[age_days < 0] = 0
    lessons['decay_index'] = 1 / np.log10(age_days + 2)
    lessons['log_lesson_count'] = lessons['lesson_count'] * lessons['decay_index']

    # One row per teacher, in order of first appearance.
    result = lessons[['awj_teacher_id']].drop_duplicates(keep='first')

    # Decay-weighted lesson count per status; the output column names are the
    # ones the downstream pipeline expects.
    status_columns = [
        ('normal_lesson', 'normal_lesson_log_count'),
        ('late', 'late_lesson_log_count'),
        ('no_show', 'no_show_lesson_log_count'),
        ('abnormal_lesson', 'abnormal_lesson_log_count'),
        ('ask_for_leave', 'ask_for_leave_log_count'),
    ]
    for status, column in status_columns:
        per_teacher = (
            lessons.loc[lessons['teacher_status_for_lesson'] == status]
            .groupby('awj_teacher_id')['log_lesson_count']
            .sum()
            .rename(column)
            .reset_index()
        )
        result = pd.merge(result, per_teacher, on='awj_teacher_id', how='left')

    # Advance-notice statistics over leave requests and no-shows.
    leave = lessons.loc[
        lessons['teacher_status_for_lesson'].isin(['ask_for_leave', 'no_show'])].copy()
    leave['ask_for_leave_advanced_log_days'] = (
        leave['decay_index'] * leave['ask_for_leave_advanced_days'])
    leave_by_teacher = leave.groupby('awj_teacher_id')

    # Decay-weighted mean: sum(days * weight) / sum(weight).
    weighted = leave_by_teacher[['ask_for_leave_advanced_log_days', 'decay_index']].sum()
    weighted['advanced_days_log_mean'] = (
        weighted['ask_for_leave_advanced_log_days'] / weighted['decay_index'])

    # Min / max / std of the raw advance-notice days.
    spread = leave_by_teacher['ask_for_leave_advanced_days'].agg(['min', 'max', 'std'])
    # A teacher with a single record has no std; use the mean std instead.
    spread.loc[spread['std'].isnull(), 'std'] = spread['std'].mean()
    spread = spread.rename(columns={'min': 'advanced_days_min',
                                    'max': 'advanced_days_max',
                                    'std': 'advanced_days_std'})

    advanced = weighted[['advanced_days_log_mean']].join(spread).reset_index()
    result = pd.merge(result, advanced, on='awj_teacher_id', how='left')

    # Teachers without a given behaviour get 0.  Assigning the filled block
    # back (instead of chained inplace fillna) guarantees the frame is updated.
    fill_columns = ['normal_lesson_log_count', 'late_lesson_log_count',
                    'no_show_lesson_log_count', 'abnormal_lesson_log_count',
                    'ask_for_leave_log_count', 'advanced_days_log_mean',
                    'advanced_days_min', 'advanced_days_max', 'advanced_days_std']
    result[fill_columns] = result[fill_columns].fillna(0)
    return result

In [4]:
def stu_comment(start_time, end_time, df_stu_comment, class_type_name_special, hq_name_special):
    """Count decay-weighted positive and negative student comments per teacher.

    Only the organisation filter (``机构`` not in ``hq_name_special``) is
    applied: the time window is deliberately not used because too few
    comments would remain, so ``start_time`` and ``class_type_name_special``
    are accepted for signature parity but ignored.  Each comment is weighted
    by ``1 / log10(days + 2)``, where ``days`` is the time between ``评价时间``
    and ``end_time`` (floored at 0).

    A comment (``评价id``) counts at most once per category no matter how many
    labels it carries, so teachers with few lessons cannot collect more
    praise than lessons taught.

    Parameters
    ----------
    start_time : unused.
    end_time : Timestamp-like reference point for the decay.
    df_stu_comment : DataFrame with columns ``awj_teacher_id``, ``机构``,
        ``评价时间``, ``评价id``, ``学生评价星级`` and ``标签内容``.  Not modified.
    class_type_name_special : unused.
    hq_name_special : list of organisations to exclude.

    Returns
    -------
    DataFrame with columns ``awj_teacher_id``,
    ``stu_comment_log_good_behavior`` and ``stu_comment_log_bad_behavior``
    (0 where a teacher has no comment of that kind).
    """
    high_stars = ['5-star', '4-star']
    good_labels = ['老师有耐心', '课堂氛围好', '课程生动有趣', '互动丰富']
    # Only teacher-attributable complaints; environment issues are left out
    # because they may stem from the video vendor or the student's network.
    bad_labels = ['老师语速过快', '互动较少', '课堂氛围差', '未及时纠正错误', '老师缺乏耐心']

    # Explicit copy so decay_index is not written onto a slice of the input.
    comments = df_stu_comment.loc[~df_stu_comment['机构'].isin(hq_name_special)].copy()

    # Time decay: +2 keeps log10 strictly positive so the reciprocal is finite.
    age_days = (end_time - comments['评价时间']).dt.total_seconds() / (3600 * 24)
    age_days[age_days < 0] = 0
    comments['decay_index'] = 1 / np.log10(age_days + 2)

    def decayed_count(rows, column):
        # Collapse to one row per comment, then sum the weights per teacher.
        per_comment = rows.groupby('评价id')[['awj_teacher_id', 'decay_index']].first()
        return (per_comment.groupby('awj_teacher_id')['decay_index']
                .sum()
                .rename(column)
                .reset_index())

    is_high_star = comments['学生评价星级'].isin(high_stars)
    good = decayed_count(
        comments.loc[is_high_star & comments['标签内容'].isin(good_labels)],
        'stu_comment_log_good_behavior')
    # Negative labels on 4/5-star reviews are treated as system glitches and skipped.
    bad = decayed_count(
        comments.loc[~is_high_star & comments['标签内容'].isin(bad_labels)],
        'stu_comment_log_bad_behavior')

    result = pd.merge(good, bad, on='awj_teacher_id', how='outer')
    # Assign the filled columns back instead of chained inplace fillna, which
    # may act on a temporary and leave the frame unchanged.
    score_columns = ['stu_comment_log_bad_behavior', 'stu_comment_log_good_behavior']
    result[score_columns] = result[score_columns].fillna(0)
    return result

In [5]:
# Folder holding the exported CSV tables.
path = './FA_Model_data/'
# Scoring window: six months ending at the month end preceding today.
end_time = pd.to_datetime(datetime.datetime.now().date()) + MonthEnd(n=-1)
start_time = end_time - DateOffset(months=6)
# Class types and organisations excluded from scoring.
class_type_name_special = ['Demo', '补课(非爱乐奇直属老师)', '托福班（30刀）', 'TOFEL', 
                          'VIP Writing/TOFEL（35刀）', 'Elite Pilot', 'Feeback Session', 
                          'New Teacher Test Class', '补课(爱乐奇直属老师)', 'Test Class', 
                          'Academic Meeting (Long)', 'Cur Experience Session-S', 
                          'Training-receiving', 'Cur Experience Session-L', 
                          'VIP Writing/TOFEL', 'Orientation Class', 'Academic Meeting', 
                          'Experience-receiving']
hq_name_special = ['test']
print('start_time:', start_time, '\n', 'end_time:', end_time)
print(type(start_time), type(end_time))

##### Teacher QC detail table #####
df_qc = pd.read_csv(path + '老师QC明细.csv', encoding='utf-8', sep=',')
df_qc['awj_teacher_id'] = df_qc['awj_teacher_id'].astype('int')
for time_column in ['score_recorded_at', 'assigned_at', 'start_time', 'end_time']:
    df_qc[time_column] = pd.to_datetime(df_qc[time_column])
# Drop fully duplicated rows, then order by teacher and lesson start.
df_qc = df_qc.drop_duplicates(keep='first')
df_qc = df_qc.sort_values(by=['awj_teacher_id', 'start_time'], ascending=[True, True])

##### Teacher behaviour table #####
df_teacher_behavior = pd.read_csv(path + '老师行为信息明细.csv', sep=',', encoding='utf-8')
df_teacher_behavior['awj_teacher_id'] = df_teacher_behavior['awj_teacher_id'].astype('int')
for time_column in ['start_time', 'end_time', 'actual_start_time', 'actual_end_time']:
    df_teacher_behavior[time_column] = pd.to_datetime(df_teacher_behavior[time_column])
df_teacher_behavior = df_teacher_behavior.drop_duplicates(keep='first')
df_teacher_behavior = df_teacher_behavior.sort_values(
    by=['awj_teacher_id', 'start_time'], ascending=[True, True])

##### Teacher info table #####
df_teacher_info = pd.read_csv(path + '老师基本信息.csv', sep=',', encoding='utf-8')
df_teacher_info['awj_teacher_id'] = df_teacher_info['awj_teacher_id'].astype(int)
for time_column in ['创建时间', '首次上架时间', '首课时间']:
    df_teacher_info[time_column] = pd.to_datetime(df_teacher_info[time_column])
# De-duplicate on all columns before narrowing to the ones used downstream.
df_teacher_info = df_teacher_info.drop_duplicates(keep='first')
df_teacher_info = df_teacher_info.sort_values(by=['awj_teacher_id'], ascending=[True])
df_teacher_info = df_teacher_info[['awj_teacher_id', 'state', '创建时间', 
                                   '首次上架时间', '首课时间']]

##### Student comment detail table #####
df_stu_comment = pd.read_csv(path + '学生评价明细.csv', sep=',', encoding='utf-8')
df_stu_comment['awj_teacher_id'] = df_stu_comment['awj_teacher_id'].astype(int)
df_stu_comment['评价时间'] = pd.to_datetime(df_stu_comment['评价时间'])
df_stu_comment = df_stu_comment.drop_duplicates(keep='first')
df_stu_comment = df_stu_comment.sort_values(
    by=['awj_teacher_id', '评价时间'], ascending=[True, True])


start_time: 2017-11-30 00:00:00 
 end_time: 2018-05-31 00:00:00
<class 'pandas._libs.tslibs.timestamps.Timestamp'> <class 'pandas._libs.tslibs.timestamps.Timestamp'>


In [8]:
# Show the start of the scoring window computed in the loading cell.
start_time

Timestamp('2017-11-30 00:00:00')

In [9]:
# Parse a literal date-time string with pandas; the recorded output is a Timestamp.
pd.to_datetime("2017-11-30 00:00:00")

Timestamp('2017-11-30 00:00:00')

In [6]:
# Today's date, the value end_time is derived from.
datetime.datetime.now().date()

datetime.date(2018, 6, 4)

In [7]:
# Check the type returned by .date() (a plain datetime.date, not a Timestamp).
type(datetime.datetime.now().date())

datetime.date

# 提取用于测试的数据

In [194]:
# Compute the three per-teacher feature tables that serve as expected test results.
df_qc_res = teacher_qc_score(
    start_time, end_time, df_qc, class_type_name_special, hq_name_special)
df_teacher_behavior_res = teacher_behavior(
    start_time, end_time, df_teacher_behavior, class_type_name_special, hq_name_special)
df_stu_comment_res = stu_comment(
    start_time, end_time, df_stu_comment, class_type_name_special, hq_name_special)

# Every fixture is written with the same CSV settings.
csv_kwargs = dict(encoding="utf-8", index=False, float_format="%.5f")

# Input tables first, then the result tables, in the original write order.
test_fixtures = [
    (df_qc, "df_qc_test_input.csv"),
    (df_teacher_behavior, "df_teacher_behavior_input.csv"),
    (df_stu_comment, "df_stu_comment_input.csv"),
    (df_teacher_info, "df_teacher_info_input.csv"),
    (df_qc_res, "df_qc_test_result.csv"),
    (df_teacher_behavior_res, "df_teacher_behavior_test_result.csv"),
    (df_stu_comment_res, "df_stu_comment_test_result.csv"),
]
for frame, filename in test_fixtures:
    frame.to_csv(filename, **csv_kwargs)

# Read one result back in.
data_test = pd.read_csv("df_qc_test_result.csv")

In [166]:
# Display the per-teacher behaviour feature table.
df_teacher_behavior_res

Unnamed: 0,awj_teacher_id,normal_lesson_log_count,late_lesson_log_count,no_show_lesson_log_count,abnormal_lesson_log_count,ask_for_leave_log_count,advanced_days_log_mean,advanced_days_min,advanced_days_max,advanced_days_std
0,69,176.411454,1.59241,0.0,0.0,2.401568,44.071698,38.747917,58.752778,8.51811
1,74,258.841721,0.442845,0.517948,0.474386,9.933976,30.883808,0.0,76.145139,26.56615
2,82,18.378615,5.866303,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,95,346.632047,0.515261,0.448737,0.0,5.236736,15.320447,0.0,35.589583,15.682193
4,97,319.11819,2.779069,1.516908,0.0,0.493566,31.241735,0.0,127.259028,63.629514


In [1477]:
if __name__ == '__main__':
    # End-to-end pipeline: load the four CSV tables, build per-teacher
    # features, merge them into one wide table, apply the penalty and
    # normalisation rules and save the result as df_wide_log_final.csv.
    #
    # NOTE(review): absolute, user-specific data folder; consider making it configurable.
    path = '/Users/roger.zhou/Downloads/星级老师/'
    # Scoring window: six months ending at the month end preceding today.
    end_time = pd.to_datetime(datetime.datetime.now().date()) + MonthEnd(n=-1)
    start_time = end_time - DateOffset(months=6)
    # Class types and organisations excluded from scoring.
    class_type_name_special = ['Demo', '补课(非爱乐奇直属老师)', '托福班（30刀）', 'TOFEL', 
                              'VIP Writing/TOFEL（35刀）', 'Elite Pilot', 'Feeback Session', 
                              'New Teacher Test Class', '补课(爱乐奇直属老师)', 'Test Class', 
                              'Academic Meeting (Long)', 'Cur Experience Session-S', 
                              'Training-receiving', 'Cur Experience Session-L', 
                              'VIP Writing/TOFEL', 'Orientation Class', 'Academic Meeting', 
                              'Experience-receiving']
    hq_name_special = ['test']
    print('start_time:', start_time, '\n', 'end_time:', end_time)
    ##### Teacher QC detail table #####
    df_qc = pd.read_csv(path + 'QC明细.csv', encoding='utf-8', sep=',')
    # dtypes
    df_qc['awj_teacher_id'] = df_qc['awj_teacher_id'].astype('int')
    df_qc['score_recorded_at'] = pd.to_datetime(df_qc['score_recorded_at'])
    df_qc['assigned_at'] = pd.to_datetime(df_qc['assigned_at'])
    df_qc['start_time'] = pd.to_datetime(df_qc['start_time'])
    df_qc['end_time'] = pd.to_datetime(df_qc['end_time'])
    # drop duplicates
    df_qc.drop_duplicates(subset=list(df_qc.columns), keep='first', inplace=True)
    # sort
    df_qc = df_qc.sort_values(by=['awj_teacher_id', 'start_time'], ascending=[1, 1])
    ##### Teacher behaviour table #####
    df_teacher_behavior = pd.read_csv(path + '老师行为信息.csv', sep=',', encoding='utf-8')
    # dtypes
    df_teacher_behavior['awj_teacher_id'] = df_teacher_behavior['awj_teacher_id'].astype('int')
    df_teacher_behavior['start_time'] = pd.to_datetime(df_teacher_behavior['start_time'])
    df_teacher_behavior['end_time'] = pd.to_datetime(df_teacher_behavior['end_time'])
    df_teacher_behavior['actual_start_time'] = pd.to_datetime(df_teacher_behavior['actual_start_time'])
    df_teacher_behavior['actual_end_time'] = pd.to_datetime(df_teacher_behavior['actual_end_time'])
    df_teacher_behavior.drop_duplicates(subset=list(df_teacher_behavior.columns), inplace=True)
    # sort
    df_teacher_behavior = df_teacher_behavior.sort_values(by=[
                        'awj_teacher_id', 'start_time'], ascending=[1, 1])
    ##### Teacher info table #####
    df_teacher_info = pd.read_csv(path + '老师基本信息.csv', sep=',', encoding='utf-8')
    # dtypes
    df_teacher_info['awj_teacher_id'] = df_teacher_info['awj_teacher_id'].astype(int)
    df_teacher_info['创建时间'] = pd.to_datetime(df_teacher_info['创建时间'])
    df_teacher_info['首次上架时间'] = pd.to_datetime(df_teacher_info['首次上架时间'])
    df_teacher_info['首课时间'] = pd.to_datetime(df_teacher_info['首课时间'])
    df_teacher_info.drop_duplicates(subset=list(df_teacher_info.columns), 
                                                    keep='first', inplace=True)
    # sort
    df_teacher_info = df_teacher_info.sort_values(by=['awj_teacher_id'], ascending=[1])
    df_teacher_info = df_teacher_info[['awj_teacher_id', 'state', '创建时间', 
                                                           '首次上架时间', '首课时间']]
    ##### Student comment detail table #####
    df_stu_comment = pd.read_csv(path + '学生评价明细.csv', sep=',', encoding='utf-8')
    # dtypes
    df_stu_comment['awj_teacher_id'] = df_stu_comment['awj_teacher_id'].astype(int)
    df_stu_comment['评价时间'] = pd.to_datetime(df_stu_comment['评价时间'])
    df_stu_comment.drop_duplicates(subset=list(df_stu_comment.columns), keep='first', inplace=True)
    # sort
    df_stu_comment = df_stu_comment.sort_values(by=[
                    'awj_teacher_id', '评价时间'], ascending=[1, 1])  
    # Build the per-teacher feature tables.
    df_qc_res = teacher_qc_score(start_time, end_time, df_qc, 
                                 class_type_name_special, hq_name_special)
    df_teacher_behavior_res = teacher_behavior(start_time, 
                        end_time, df_teacher_behavior, class_type_name_special, hq_name_special)
    df_stu_comment_res = stu_comment(start_time, end_time, df_stu_comment, 
                                     class_type_name_special, hq_name_special)
    # Wide table: one row per teacher from the info table, features left-joined on.
    df_wide = pd.merge(df_teacher_info, df_teacher_behavior_res, on='awj_teacher_id', how='left')
    df_wide = pd.merge(df_wide, df_qc_res, on='awj_teacher_id', how='left')
    df_wide = pd.merge(df_wide, df_stu_comment_res, on='awj_teacher_id', how='left')
    # Student comments use the full history, so the lesson counts used to
    # normalise them are recomputed over a 30-year (360-month) lookback.
    df_lesson_count_all = teacher_behavior(end_time - DateOffset(months=360), end_time, 
                            df_teacher_behavior, class_type_name_special, hq_name_special)
    df_lesson_count_all = df_lesson_count_all[['awj_teacher_id', 'normal_lesson_log_count', 
                                        'late_lesson_log_count', 'no_show_lesson_log_count', 
                                        'ask_for_leave_log_count', 'abnormal_lesson_log_count']]
    df_lesson_count_all.rename(columns={'normal_lesson_log_count': 'normal_lesson_log_count_all', 
                                    'late_lesson_log_count': 'late_lesson_log_count_all', 
                                    'no_show_lesson_log_count': 'no_show_lesson_log_count_all', 
                                    'ask_for_leave_log_count': 'ask_for_leave_log_count_all', 
                                    'abnormal_lesson_log_count': 'abnormal_lesson_log_count_all'}, 
                                    inplace=True)
    df_wide = pd.merge(df_wide, df_lesson_count_all, on='awj_teacher_id', how='left')
    # Derived fields.
    df_wide['log_ask_for_leave/log_normal_lesson'] = df_wide['ask_for_leave_log_count'] / df_wide['normal_lesson_log_count']
    # Share of all non-normal lessons among all lessons.
    df_wide['abnormal_all_log_percent'] = (df_wide['no_show_lesson_log_count'] + 
                                      df_wide['late_lesson_log_count'] + 
                                      df_wide['abnormal_lesson_log_count'] + 
                                      df_wide['ask_for_leave_log_count']) / (
                                      df_wide['normal_lesson_log_count'] + 
                                      df_wide['no_show_lesson_log_count'] + 
                                      df_wide['late_lesson_log_count'] + 
                                      df_wide['abnormal_lesson_log_count'] + 
                                      df_wide['ask_for_leave_log_count'])
    # Days between the teacher's first lesson and end_time.
    df_wide['lesson_time_range'] = ((end_time - df_wide['首课时间']).dt.total_seconds()) / (3600 * 24) 
    # NOTE(review): this only fires when the range is exactly 0; a missing
    # first-lesson time yields NaN, not 0 - confirm whether isnull() was meant.
    df_wide.loc[df_wide['lesson_time_range'] == 0, 'lesson_time_range'] = ((end_time - df_wide['创建时间']).dt.total_seconds()) / (3600 * 24)  
    # Flag new teachers: no first lesson yet and an eligible state.
    df_wide['old_new_teacher'] = 'old'
    # NOTE(review): 'oboard' looks like a typo of 'onboard' - verify against the data.
    df_wide.loc[(df_wide['首课时间'].isnull()) & 
                (df_wide['state'].isin(['oboard', 'active'])), 'old_new_teacher'] = 'new'
    # Missing-value imputation.
    # Drop the info columns that are no longer needed.
    df_wide.drop(['创建时间', '首次上架时间', '首课时间', 'state'], axis=1, inplace=True)
    columns = list(df_wide.columns)
    columns.pop(columns.index('awj_teacher_id'))
    columns.pop(columns.index('old_new_teacher'))
    # New teachers: every feature is set to the mean over teachers with lessons.
    for itm in columns:
        df_wide.loc[df_wide['old_new_teacher'] == 'new', itm] = df_wide.loc[
            (df_wide['normal_lesson_log_count'] > 0), itm].mean()
    # Old teachers: remaining gaps become 0.
    df_wide.fillna(value=0, inplace=True)
    new_teacher = df_wide.loc[df_wide['old_new_teacher'] == 'new', 'awj_teacher_id']
    # Teachers who never asked for leave have advanced_days of 0; replace it
    # with the mean over teachers that do have a value.  `&` binds tighter
    # than `>`, so each comparison needs its own parentheses; without them the
    # mask was evaluated as `(flag & count) > 0`.
    df_wide.loc[(df_wide['ask_for_leave_log_count'] == 0) & (df_wide['normal_lesson_log_count'] > 0), 
        'advanced_days_max'] = df_wide.loc[(df_wide['advanced_days_max'] != 0) & 
                        (df_wide['normal_lesson_log_count'] > 0), 'advanced_days_max'].mean()
    df_wide.loc[(df_wide['ask_for_leave_log_count'] == 0) & (df_wide['normal_lesson_log_count'] > 0), 
        'advanced_days_log_mean'] = df_wide.loc[(df_wide['advanced_days_log_mean'] != 0) & 
                        (df_wide['normal_lesson_log_count'] > 0), 'advanced_days_log_mean'].mean()
    # Smoothing: add log10(2) so the reciprocals taken later never divide by 0.
    smooth = ['late_lesson_log_count', 'no_show_lesson_log_count', 'abnormal_lesson_log_count', 
              'ask_for_leave_log_count', 'stu_comment_log_bad_behavior']
    for itm in smooth:
        df_wide[itm] = df_wide[itm] + math.log(2, 10)
    # Ratios of each abnormal behaviour to normal lessons.
    df_wide['late_lesson_log_percent'] = df_wide['late_lesson_log_count'] / df_wide['normal_lesson_log_count']
    df_wide['no_show_lesson_log_percent'] = df_wide['no_show_lesson_log_count'] / df_wide['normal_lesson_log_count']
    df_wide['abnormal_lesson_log_percent'] = df_wide['abnormal_lesson_log_count'] / df_wide['normal_lesson_log_count']
    df_wide['ask_for_leave_log_percent'] = df_wide['ask_for_leave_log_count'] / df_wide['normal_lesson_log_count']
    # Print the quantiles of the abnormal-behaviour ratios for inspection.
    columns = ['no_show_lesson_log_percent', 'late_lesson_log_percent', 
               'abnormal_lesson_log_percent', 'ask_for_leave_log_percent', 
               'abnormal_all_log_percent', 'log_ask_for_leave/log_normal_lesson', 
               'advanced_days_log_mean', 'advanced_days_max']
    quantiles = [0.1, 0.15, 0.2, 0.25, 0.3, 0.33, 0.5, 0.55, 0.6, 0.66, 0.7, 0.75, 
                 0.77, 0.8, 0.83, 0.85, 0.9, 0.92, 0.95, 1]
    for i in range(len(columns)):
        print('\n', columns[i], ':')
        for itm in quantiles:
            print(str(itm), ':', df_wide.loc[df_wide['normal_lesson_log_count'] > 0, 
                                                        columns[i]].quantile(itm))
    # Penalty: when one abnormal-behaviour ratio is an outlier, or the overall
    # ratio is poor, scale normal_lesson_log_count down so the teacher's
    # overall result gets worse.
    columns = ['ask_for_leave_log_percent', 'late_lesson_log_percent', 
               'no_show_lesson_log_percent', 'abnormal_lesson_log_percent', 
               'abnormal_all_log_percent']
    # One scratch column per ratio; the misspelt 'cahce' names are kept as-is
    # because they are repeated in the drop list at the end of this block.
    cache_columns = ['normal_lesson_log_count_processed_ask_for_leave_cahce1', 
                     'normal_lesson_log_count_processed_late_lesson_cache2', 
                     'normal_lesson_log_count_processed_no_show_cache3', 
                     'normal_lesson_log_count_processed_abnormal_lesson_cache4', 
                     'normal_lesson_log_count_processed_abnormal_all_cahce5'
                    ]
    for itm in cache_columns:
        df_wide[itm] = df_wide['normal_lesson_log_count']
    df_wide['normal_lesson_log_count_processed'] = df_wide['normal_lesson_log_count']
    # Quantile boundaries per ratio (parallel to `columns`).
    quantiles = [[0.6, 0.7, 0.75, 0.8, 1], [0.8, 0.9, 0.92, 0.95, 1], 
                [0.5, 0.6, 0.66, 0.75, 0.8, 0.85, 0.9, 1], [0.8, 0.85, 0.9, 0.95, 1], 
                [0.55, 0.66, 0.75, 0.85, 1]]
    # Reduction factor for each quantile band (one fewer than the boundaries).
    indexes = [[0.8, 0.6, 0.3, 0.1], [0.8, 0.7, 0.3, 0.1], 
              [0.8, 0.75, 0.6, 0.5, 0.3, 0.1, 0.03], [0.9, 0.85, 0.7, 0.3], 
              [0.8, 0.7, 0.3, 0.1]]
    for i in range(len(columns)):
        for k in range(len(quantiles[i]) - 1):
            standard1 = df_wide.loc[df_wide['normal_lesson_log_count'] > 0, 
                                    columns[i]].quantile(quantiles[i][k])
            standard2 = df_wide.loc[df_wide['normal_lesson_log_count'] > 0, 
                                    columns[i]].quantile(quantiles[i][k + 1])
            # Scale down the normal-lesson count for teachers inside this band.
            df_wide.loc[(df_wide['normal_lesson_log_count'] > 0) & 
                (df_wide[columns[i]] > standard1) & (df_wide[columns[i]] <= standard2), 
                cache_columns[i]] = df_wide['normal_lesson_log_count'] * indexes[i][k]
    # The harshest penalty wins: take the minimum across the five scratch columns.
    df_wide['normal_lesson_log_count_processed'] = df_wide[cache_columns].min(axis=1)
    # Make every feature point the same way (larger = better).
    # Recompute the four ratios against the penalised normal-lesson count.
    df_wide['late_lesson_log_percent_processed'] = df_wide['late_lesson_log_count'] / df_wide['normal_lesson_log_count_processed']
    df_wide['no_show_lesson_log_percent_processed'] = df_wide['no_show_lesson_log_count'] / df_wide['normal_lesson_log_count_processed']
    df_wide['abnormal_lesson_log_percent_processed'] = df_wide['abnormal_lesson_log_count'] / df_wide['normal_lesson_log_count_processed']
    df_wide['ask_for_leave_log_percent_processed'] = df_wide['ask_for_leave_log_count'] / df_wide['normal_lesson_log_count_processed']
    df_wide['normal_log_lesson_per_week'] = df_wide['normal_lesson_log_count_processed'] / (df_wide['lesson_time_range'] / 7) 
    # Take reciprocals so that fewer incidents give a larger value.
    rcp = ['late_lesson_log_processed', 'no_show_lesson_log_processed', 
           'abnormal_lesson_log_processed', 'ask_for_leave_log_processed', 
           'stu_comment_log_bad_behavior_processed']
    cols = ['late_lesson_log_percent_processed', 'no_show_lesson_log_percent_processed', 
            'abnormal_lesson_log_percent_processed', 'ask_for_leave_log_percent_processed', 
            'stu_comment_log_bad_behavior']
    for i in range(len(rcp)):
        df_wide[rcp[i]] = 1 / df_wide[cols[i]]
    # Normalise comment counts by the full-history lesson total, so teachers
    # with few lessons but many good comments (e.g. teacher id 642) are not favoured.
    df_wide['stu_comment_log_good_behavior_processed'] = df_wide['stu_comment_log_good_behavior'] / (
                                df_wide['normal_lesson_log_count_all'] 
                                + df_wide['late_lesson_log_count_all'] 
                                + df_wide['no_show_lesson_log_count_all'] 
                                + df_wide['ask_for_leave_log_count_all'] 
                                + df_wide['abnormal_lesson_log_count_all'])                    
    # NOTE(review): this overwrites the reciprocal stored in the same column
    # by the loop above, and the column is dropped before saving - confirm
    # whether a bad-comment feature was meant to reach the output.
    df_wide['stu_comment_log_bad_behavior_processed'] = df_wide['stu_comment_log_bad_behavior'] / (
                                df_wide['normal_lesson_log_count_all']
                                + df_wide['late_lesson_log_count_all'] 
                                + df_wide['no_show_lesson_log_count_all'] 
                                + df_wide['ask_for_leave_log_count_all'] 
                                + df_wide['abnormal_lesson_log_count_all'])
    # Teachers who ask for leave very often should not look good just because
    # they give long notice (e.g. teacher id 642): scale the advance-notice
    # features down by leave-ratio quantile band.
    columns = ['advanced_days_log_mean', 'advanced_days_max']
    quantiles = [0.6, 0.66, 0.7, 0.75, 0.77, 0.8, 0.85, 1]
    indexes = [0.9, 0.8, 0.7, 0.5, 0.3, 0.1, 0.05]
    for i in range(len(columns)):
        for k in range(len(quantiles) - 1):
            percent1 = df_wide.loc[df_wide['normal_lesson_log_count'] > 0, 
                                'log_ask_for_leave/log_normal_lesson'].quantile(quantiles[k])
            percent2 = df_wide.loc[df_wide['normal_lesson_log_count'] > 0, 
                                'log_ask_for_leave/log_normal_lesson'].quantile(quantiles[k + 1])
            df_wide.loc[(df_wide['normal_lesson_log_count'] > 0) & 
                        (df_wide['log_ask_for_leave/log_normal_lesson'] > percent1) & 
                        (df_wide['log_ask_for_leave/log_normal_lesson'] <= percent2), 
                        columns[i]] = df_wide[columns[i]] * indexes[k]
    # Teachers without any lesson record: zero out every feature.
    columns = list(df_wide.columns)
    columns.pop(columns.index('awj_teacher_id'))
    for itm in columns:
        df_wide.loc[df_wide['normal_lesson_log_count'] == 0, itm] = 0
    # Keep only teachers that have behaviour data.
    df_wide_final = df_wide.loc[df_wide['normal_lesson_log_count'] > 0]
    # Drop intermediate columns.  A non-inplace drop is used because
    # df_wide_final is a filtered slice of df_wide.
    df_wide_final = df_wide_final.drop(['advanced_days_std', 'teacher_score_std', 
                  'advanced_days_min', 'lesson_time_range', 
                  'normal_lesson_log_count_all', 'no_show_lesson_log_count_all', 
                  'abnormal_lesson_log_count_all', 'late_lesson_log_count_all', 
                  'ask_for_leave_log_count_all', 'old_new_teacher', 
                  'teacher_qc_count', 'normal_lesson_log_count', 'late_lesson_log_count', 
                  'no_show_lesson_log_count', 'abnormal_lesson_log_count', 
                  'ask_for_leave_log_count', 'stu_comment_log_good_behavior', 
                  'stu_comment_log_bad_behavior', 
                  'log_ask_for_leave/log_normal_lesson',  'abnormal_lesson_log_percent',
                  'late_lesson_log_percent', 'no_show_lesson_log_percent', 
                  'ask_for_leave_log_percent', 
                  'late_lesson_log_percent_processed', 'no_show_lesson_log_percent_processed', 
                  'abnormal_lesson_log_percent_processed', 'ask_for_leave_log_percent_processed', 
                  'normal_lesson_log_count_processed', 'abnormal_all_log_percent', 
                  'normal_lesson_log_count_processed_ask_for_leave_cahce1', 
                  'normal_lesson_log_count_processed_late_lesson_cache2', 
                  'normal_lesson_log_count_processed_no_show_cache3', 
                  'normal_lesson_log_count_processed_abnormal_lesson_cache4', 
                  'normal_lesson_log_count_processed_abnormal_all_cahce5', 
                  'stu_comment_log_bad_behavior_processed'], 
                   axis=1)
    # save
    df_wide_final.to_csv(path + 'df_wide_log_final.csv', sep=',', index=False, encoding='utf-8')

start_time: 2017-10-30 00:00:00 
 end_time: 2018-04-30 00:00:00


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  import sys
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[item] = s
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if __name__ == '__main__':
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/


 no_show_lesson_log_percent :
0.1 : 0.0020693902230976813
0.15 : 0.002609482028417154
0.2 : 0.003964195090900766
0.25 : 0.004739624656660977
0.3 : 0.006327321111360524
0.33 : 0.007136674776301296
0.5 : 0.013710653684948665
0.55 : 0.014148120207056225
0.6 : 0.017418628643968127
0.66 : 0.023705240734450295
0.7 : 0.028664188771738246
0.75 : 0.036109686930670355
0.77 : 0.03942409242437905
0.8 : 0.04819983685479876
0.83 : 0.06521305545024908
0.85 : 0.07564749226786593
0.9 : 0.15981524252166543
0.92 : 0.20417720083589402
0.95 : 0.34051246162132914
1 : 11.652416093857537

 late_lesson_log_percent :
0.1 : 0.0019740275570543455
0.15 : 0.002550527802458655
0.2 : 0.0033418253339256546
0.25 : 0.004424202735508408
0.3 : 0.005846869714429388
0.33 : 0.0065814645326652665
0.5 : 0.011874154023525919
0.55 : 0.015700311242986334
0.6 : 0.02342876956514011
0.66 : 0.030686998106074694
0.7 : 0.030686998106074694
0.75 : 0.03903035881575523
0.77 : 0.045535703325956314
0.8 : 0.059702570039255956
0.83 : 0.07263

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


In [1480]:
# Spot-check: show the df_wide row(s) belonging to teacher id 864.
df_wide[df_wide['awj_teacher_id'].eq(864)]

Unnamed: 0,awj_teacher_id,normal_lesson_log_count,late_lesson_log_count,no_show_lesson_log_count,abnormal_lesson_log_count,ask_for_leave_log_count,advanced_days_log_mean,advanced_days_min,advanced_days_max,advanced_days_std,teacher_score_max,teacher_score_min,teacher_qc_count,teacher_score_std,log_decay_score_mean,stu_comment_log_good_behavior,stu_comment_log_bad_behavior,normal_lesson_log_count_all,late_lesson_log_count_all,no_show_lesson_log_count_all,ask_for_leave_log_count_all,abnormal_lesson_log_count_all,log_ask_for_leave/log_normal_lesson,abnormal_all_log_percent,lesson_time_range,old_new_teacher,late_lesson_log_percent,no_show_lesson_log_percent,abnormal_lesson_log_percent,ask_for_leave_log_percent,normal_lesson_log_count_processed_ask_for_leave_cahce1,normal_lesson_log_count_processed_late_lesson_cache2,normal_lesson_log_count_processed_no_show_cache3,normal_lesson_log_count_processed_abnormal_lesson_cache4,normal_lesson_log_count_processed_abnormal_all_cahce5,normal_lesson_log_count_processed,late_lesson_log_percent_processed,no_show_lesson_log_percent_processed,abnormal_lesson_log_percent_processed,ask_for_leave_log_percent_processed,normal_log_lesson_per_week,late_lesson_log_processed,no_show_lesson_log_processed,abnormal_lesson_log_processed,ask_for_leave_log_processed,stu_comment_log_bad_behavior_processed,stu_comment_log_good_behavior_processed
676,864,171.147297,1.250313,3.730602,0.30103,77.142791,17.622861,0.0,42.92375,39.608622,4.67,3.67,34.0,0.214462,4.149045,0.0,0.30103,189.095325,0.949283,3.429572,96.506531,0.0,0.44898,0.321834,281.291667,old,0.007305,0.021798,0.001759,0.450739,17.11473,171.147297,128.360473,171.147297,51.344189,17.11473,0.073055,0.217976,0.017589,4.507392,0.425904,13.688357,4.587659,56.853901,0.221858,0.001038,0.0


In [1481]:
# Load the factor-analysis (FA) coefficient table from the Excel workbook
# stored under `path`, then display it as the cell output.
fa_index_file = path + 'fa_indexs.xlsx'
fa_index = pd.read_excel(fa_index_file)
fa_index

Unnamed: 0,fa_index1,fa_index2,fa_index3,fa_index4
0,-0.069757,-0.009464,0.542764,0.039315
1,-0.085189,-0.009756,0.550396,0.020481
2,-0.012225,0.340463,-0.014839,0.009207
3,-0.051254,0.348222,0.0056,0.016444
4,-0.03004,0.353244,-0.012207,-0.003987
5,0.279596,-0.019408,-0.089962,-0.078174
6,0.238288,-0.024801,-0.018852,0.035737
7,0.247545,-0.0188,-0.012641,0.089345
8,0.26538,-0.01425,-0.040761,0.125086
9,0.222805,-0.039292,-0.085099,-0.15195


In [1482]:
# Factor-score coefficient table as a plain ndarray; np.dot below needs it as
# (features x factors).  DataFrame.as_matrix() was deprecated in pandas 0.23 and
# removed in 1.0, so convert with np.asarray instead: it yields the same array
# and, unlike a DataFrame-only method, also accepts an ndarray, so re-running
# this cell after `fa_index` has already been converted no longer fails.
fa_index = np.asarray(fa_index)
# Feature matrix = every column of df_wide_final except the teacher id.
# drop() without inplace returns a new frame, so df_wide_final is left intact
# and no deepcopy is required.
# NOTE: feature columns are matched to the rows of fa_index purely by position.
df_wide_matrix = df_wide_final.drop(['awj_teacher_id'], axis=1).values
# Standardise the feature matrix
df_wide_matrix = preprocessing.scale(df_wide_matrix)
# Factor scores of every teacher (teachers x factors)
fa_score = np.dot(df_wide_matrix, fa_index)
# Contribution rate of each of the four factors, divided by their total
# (0.38102 + 0.23120 + 0.13673 + 0.08950 = 0.83845) so the weights sum to 1.
# Kept as a (4, 1) column vector so the dot product below returns (teachers x 1).
var = np.array([[0.38102],
                [0.23120],
                [0.13673],
                [0.08950]]) / 0.83845
# Final score of every teacher: contribution-weighted sum of the factor scores
final_score = np.dot(fa_score, var)

In [1484]:
# Build the score table: the factor scores followed by the final score, one row
# per teacher of df_wide_final (ids are attached by position), best score first.
score_columns = {0: 'teacher_behavior_score', 1: 'teacher_qc_score',
                 2: 'teacher_attitude_score', 3: 'student_comment_score',
                 4: 'final_score'}
teacher_fa_score = (
    pd.DataFrame(np.hstack((fa_score, final_score)))
    .rename(columns=score_columns)
    .assign(awj_teacher_id=list(df_wide_final['awj_teacher_id']))
    .sort_values(by='final_score', ascending=False)
)
# Star mapping (teachers without lessons in the period are not part of this table).
# The cut points are the quantiles requested by the business side.
final_scores = teacher_fa_score['final_score']
star_5 = final_scores.quantile(0.8)
star_4 = final_scores.quantile(0.5)
star_3 = final_scores.quantile(0.2)
star_2 = final_scores.quantile(0.1)
print('5:', star_5, '4:', star_4, '3:', star_3, '2:', star_2)
# np.select takes the first matching condition; the quantile cut points are
# non-decreasing, so this cascade is the same as the explicit (lower, upper]
# intervals.  Rows matching nothing (NaN score) stay NaN.
teacher_fa_score['star'] = np.select(
    [final_scores <= star_2,
     final_scores <= star_3,
     final_scores <= star_4,
     final_scores <= star_5,
     final_scores > star_5],
    [1.0, 2.0, 3.0, 4.0, 5.0],
    default=np.nan)
# Merge back onto every teacher in df_wide; teachers that were not scored come
# back as NaN in all score columns and are filled with 0.
lesson_counts = df_wide[['awj_teacher_id', 'normal_lesson_log_count']]
teacher_fa_score = teacher_fa_score.merge(
    lesson_counts, on='awj_teacher_id', how='right').fillna(0)
# Existing teachers who have not taught are uniformly given 3 stars
# (their star is 0 after the fill above).
teacher_fa_score['star'] = teacher_fa_score['star'].replace(0, 3)
teacher_fa_score.to_csv(path + 'teacher_star.csv', sep=',', encoding='utf-8', index=False)

5: 0.4580994798537676 4: -0.07392365883945255 3: -0.39442981768932417 2: -0.4814437906579389
