### 第二版数据在处理时，有些跟第一版的不太一样，比如“爱奇艺播放指数”，之间没做任何处理，现在将“+11111111”用后一位替代；top10导演的名字也发生变化；推广位的值从之前的3列扩充到了7列，生成了一个模型。
### 发现用第二版的模型预测第一版的20%测试集的时候，误差比之前大了10倍，无法解释，故将第二版数据的特征重新处理，与第一版的处理方式保持一致，仅变化一处，将推广位从之前的3个离散值变成7个离散值，其他的不变，再看rmse是否在一个数量级。

### 1. 加载数据

In [1]:
import pandas as pd
import numpy as np
import calendar
import datetime

In [41]:
df = pd.read_csv("./data/filter_time_data.txt_new", sep="\t\t", header=None)

  """Entry point for launching an IPython kernel.


In [42]:
df.columns = ['电影ID', '时间', '当前热度值', '新增热度值', '当前点击量', '新增点击量', '当前评论数', '新增评论数', '当前点赞数', '新增点赞数',
              '当前踩数', '新增踩数', '当前评分数', '推广位', '影片推广位等级', '当前小时弹幕数', '当前小时新增弹幕数', '当前小时评分人数',
              '当前小时新增评分人数', '当前小时五星评分人数', '当前小时四星评分人数', '当前小时三星评分人数', '当前小时二星评分人数',
              '当前小时一星评分人数', '当日爱奇艺播放指数', '影片上线时间', '导演', '编剧', '制片', '美术', '主演']

In [43]:
df.shape

(959866, 31)

In [44]:
df['时间'].unique()

array(['2018-09-04 17:00:00', '2018-09-04 18:00:00',
       '2018-09-04 19:00:00', '2018-09-04 20:00:00',
       '2018-09-04 21:00:00', '2018-09-04 22:00:00',
       '2018-09-04 23:00:00', '2018-09-05 00:00:00',
       '2018-09-05 01:00:00', '2018-09-05 02:00:00',
       '2018-09-05 03:00:00', '2018-09-05 04:00:00',
       '2018-09-05 05:00:00', '2018-09-05 06:00:00',
       '2018-09-05 07:00:00', '2018-09-05 08:00:00',
       '2018-09-05 09:00:00', '2018-09-05 10:00:00',
       '2018-09-05 11:00:00', '2018-09-05 12:00:00',
       '2018-09-05 13:00:00', '2018-09-05 14:00:00',
       '2018-09-05 15:00:00', '2018-09-05 16:00:00',
       '2018-09-05 17:00:00', '2018-09-05 18:00:00',
       '2018-09-05 19:00:00', '2018-09-05 20:00:00',
       '2018-09-05 21:00:00', '2018-09-05 22:00:00',
       '2018-09-05 23:00:00', '2018-09-06 00:00:00',
       '2018-09-06 01:00:00', '2018-09-06 02:00:00',
       '2018-09-06 03:00:00', '2018-09-06 04:00:00',
       '2018-09-06 05:00:00', '2018-09-06 06:0

### 2. 补全“当前小时评分人数”的缺失值

In [5]:
df['当前小时评分人数'].replace(-111111111.0, np.nan, inplace=True)
df['当前小时评分人数'] = df['当前小时评分人数'].fillna(method="bfill")

### 3. 增加特征

#### 3.1 增加“time_interval”

In [6]:
def get_time_interval(x, y):
    '''
        获取影片已经上映多久；x:现在时间； y:影片上映时间；
    '''
    from dateutil.parser import parse
    a = parse(x)
    b = parse(y)
    return (a - b).total_seconds()

In [7]:
df['time_interval'] = df.apply(
        lambda row: get_time_interval(row['时间'], row['影片上线时间']), axis=1)

#### 3.2 高频导演/主演名，按第一次的

In [8]:
top_10_directors = ['巴晨旭', '郭靖', '潘文杰', '朱锐斌', '唐顺风', '干志文', '薛少', '褚会林', '诸佳倩', '胡正兵']
top_8_actors = ['张伟', '刘洋', '张浩', '李伟', '王伟', '张磊', '冯芷墨', '张涛']

In [9]:
for name in top_10_directors + top_8_actors:
    df[name] = 0

In [10]:
combine_name_number = {}
number = -18  # -18取决现在dataframe的列的情况
for name in top_10_directors + top_8_actors:
    combine_name_number[name] = number
    number += 1

In [11]:
def transfer0to1(data, name_type, name_list, dic):
    '''
        将新增特征列中，将属于高频导演/高频主演/推广位的特征，从0转成1
        "name_type":“导演”或“主演”或“”推广位
        "name_list": 高频导演/高频主演/推广位的总名单
        "dic": 新特征列名与数字索引对应的字典
    '''
    for num, item in enumerate(data[name_type]):

        if item and len(str(item).split(",")) == 1:
            if item in name_list:
                data.iloc[num, dic[item]] = 1

        if item and len(str(item).split(",")) > 1:
            for n in item.split(","):
                if n in name_list:
                    data.iloc[num, dic[n]] = 1
                else:
                    continue

In [12]:
transfer0to1(df, "导演", top_10_directors + top_8_actors, combine_name_number)
transfer0to1(df, "主演", top_10_directors + top_8_actors, combine_name_number)

#### 3.3 “is_weekday”判断是否是工作日

In [13]:
def get_weekday(x):
    '''
        将日期转化为对应的星期数
    '''
    from datetime import datetime
    dateString = x.split()[0]
    week_day = datetime.strptime(dateString, "%Y-%m-%d").weekday()
    return calendar.day_name[week_day]

In [14]:
def is_weekday(day):
    '''
    判断是否是工作日,是为1，不是为0。
    '''
    if day == "Saturday" or day == "Sunday":
        return 0
    else:
        return 1

In [15]:
df['weekday'] = df['时间'].apply(get_weekday)
df['is_weekday'] = df['weekday'].apply(is_weekday)

#### 3.4. “推广位”进行one-hot编码

In [18]:
for name in ['VIP-焦点', 'VIP-电影', '无', '电影-焦点', '电影-电影', '电影-网大', '精选-电影']:
    df[name] = 0

In [19]:
# 将列名与数字索引对应起来
combine_name_number_2 = {}
number = -3
for name in ['VIP-焦点', 'VIP-电影', '无', '电影-焦点', '电影-电影', '电影-网大', '精选-电影']:
    combine_name_number_2[name] = number
    number += 1
# 将符合条件的从0置为1
transfer0to1(df,
             "推广位",
             ['VIP-焦点', 'VIP-电影', '无', '电影-焦点', '电影-电影', '电影-网大', '精选-电影'],
             combine_name_number_2)

#### 3.5 “week_day”进行one-hot编码，具体到每周第几天，训练特征时，直接用get_dummies，比transfer0to1快很多，如果需要预测新样本时，再transfer0to1

In [20]:
for col in pd.get_dummies(df['weekday']).columns:
    df[col] = pd.get_dummies(df['weekday'])[col]

#### 3.6“推广位等级”进行one-hot编码，推广位等级分类

In [21]:
for col in pd.get_dummies(df['影片推广位等级']).columns:
    df[col] = pd.get_dummies(df['影片推广位等级'])[col]

#### 3.7 时间区间特征

In [22]:
def get_hour(x):
    '''
        用“时间”的时间戳补充新特征小时"hour"
    '''
    return x.split()[1].split(":")[0]

In [23]:
def get_time_partition(x):
    '''
        将时间点转化为对应的时间区间
    '''
    x = int(x)
    if x in range(1, 11):   # 1:00-10:00
        return 0
    if x in range(11, 21):  # 11:00-20:00
        return 1
    else:                   # 21:00-0:00
        return 2

In [27]:
## 调整列顺序
df = df[['电影ID', '时间', '当前热度值', '新增热度值', '当前点击量', '新增点击量', '当前评论数', '新增评论数',
       '当前点赞数', '新增点赞数', '当前踩数', '新增踩数', '当前评分数', '推广位', '影片推广位等级', '当前小时弹幕数',
       '当前小时新增弹幕数', '当前小时评分人数', '当前小时新增评分人数', '当前小时五星评分人数', '当前小时四星评分人数',
       '当前小时三星评分人数', '当前小时二星评分人数', '当前小时一星评分人数', '当日爱奇艺播放指数', '影片上线时间', '导演',
       '编剧', '制片', '美术', '主演', 'time_interval', '巴晨旭', '郭靖', '潘文杰', '朱锐斌',
       '唐顺风', '干志文', '薛少', '褚会林', '诸佳倩', '胡正兵', '张伟', '刘洋', '张浩', '李伟', '王伟',
       '张磊', '冯芷墨', '张涛', 'weekday', 'is_weekday', 'VIP-焦点', 'VIP-电影', '无',
       '电影-焦点', '电影-电影', '电影-网大', '精选-电影','Monday','Tuesday', 'Wednesday','Thursday','Friday',
       'Saturday', 'Sunday', 'a', 'b', 'c']] 

In [39]:
df2 = df

In [40]:
df2['时间'].unique()

array(['2018-09-04 17:00:00', '2018-09-04 18:00:00',
       '2018-09-04 19:00:00', '2018-09-04 20:00:00',
       '2018-09-04 21:00:00', '2018-09-04 22:00:00',
       '2018-09-04 23:00:00', '2018-09-05 00:00:00',
       '2018-09-05 01:00:00', '2018-09-05 02:00:00',
       '2018-09-05 03:00:00', '2018-09-05 04:00:00',
       '2018-09-05 05:00:00', '2018-09-05 06:00:00',
       '2018-09-05 07:00:00', '2018-09-05 08:00:00',
       '2018-09-05 09:00:00', '2018-09-05 10:00:00',
       '2018-09-05 11:00:00', '2018-09-05 12:00:00',
       '2018-09-05 13:00:00', '2018-09-05 14:00:00',
       '2018-09-05 15:00:00', '2018-09-05 16:00:00',
       '2018-09-05 17:00:00', '2018-09-05 18:00:00',
       '2018-09-05 19:00:00', '2018-09-05 20:00:00',
       '2018-09-05 21:00:00', '2018-09-05 22:00:00',
       '2018-09-05 23:00:00', '2018-09-06 00:00:00',
       '2018-09-06 01:00:00', '2018-09-06 02:00:00',
       '2018-09-06 03:00:00', '2018-09-06 04:00:00',
       '2018-09-06 05:00:00', '2018-09-06 06:0

In [37]:
df[df['时间']==1]

Unnamed: 0,电影ID,时间,当前热度值,新增热度值,当前点击量,新增点击量,当前评论数,新增评论数,当前点赞数,新增点赞数,...,Monday,Tuesday,Wednesday,Thursday,Friday,Saturday,Sunday,a,b,c
944723,57932,1,1429.0,234.0,33107.0,11385.0,0.0,0.0,0.0,0.0,...,0,1,0,0,0,0,0,0,1,0
944724,57932,1,1683.0,254.0,50689.0,17582.0,0.0,0.0,0.0,0.0,...,0,1,0,0,0,0,0,0,1,0
944725,57932,1,1878.0,195.0,69555.0,18866.0,0.0,0.0,0.0,0.0,...,0,1,0,0,0,0,0,0,1,0
944726,57932,1,2073.0,195.0,95962.0,26407.0,0.0,0.0,0.0,0.0,...,0,1,0,0,0,0,0,0,1,0
944727,57932,1,2264.0,191.0,123737.0,27775.0,0.0,0.0,0.0,0.0,...,0,1,0,0,0,0,0,0,1,0
944728,57932,1,2391.0,127.0,159228.0,35491.0,0.0,0.0,0.0,0.0,...,0,1,0,0,0,0,0,0,1,0
944729,57932,1,2482.0,91.0,182231.0,23003.0,0.0,0.0,0.0,0.0,...,0,1,0,0,0,0,0,0,1,0
944730,57932,1,2540.0,58.0,197141.0,14910.0,0.0,0.0,0.0,0.0,...,0,0,1,0,0,0,0,0,1,0
944731,57932,1,2572.0,32.0,207009.0,9868.0,0.0,0.0,0.0,0.0,...,0,0,1,0,0,0,0,0,1,0
944732,57932,1,2594.0,22.0,212308.0,5299.0,0.0,0.0,0.0,0.0,...,0,0,1,0,0,0,0,0,1,0


In [34]:
df["hour"] = df["时间"].apply(get_hour)
# df['time_partition'] = df['hour'].apply(get_time_partition)

AttributeError: 'int' object has no attribute 'split'

In [None]:
for col in pd.get_dummies(df['time_partition']).columns:
    df[col] = pd.get_dummies(df['time_partition'])[col]

#### 3.8 “hour”进行one-hot编码

In [None]:
for col in pd.get_dummies(df['hour']).columns:
    df[col] = pd.get_dummies(df['hour'])[col]

#### 3.9 以6为时间窗口，获取每6条“当前热度值”的平均值

In [None]:
def MaxMinNormalization(x):
    '''
        "当前热度值"归一化处理，0为该列min,6304为该列max
    '''

    x = (float(x) - 0) / (6304 - 0)
    return x

In [None]:
df['当前热度值'] = df['当前热度值'].apply(MaxMinNormalization)

In [None]:
def get_heat_values_windows_size_6(data):
    new_now_heat_value = []
    for i in range(data.shape[0]):
        new_now_heat_value.append(data.iloc[i:i + 6, :]['当前热度值'].mean())
    for i in range(5):
        new_now_heat_value.insert(i, 0)
        
    return new_now_heat_value

In [None]:
all_new_now_heat_value = []
for movie_id in df['电影ID'].unique().tolist():
    print("movie id:", movie_id)
    all_new_now_heat_value.extend(get_heat_values_windows_size_6(df[df['电影ID']==movie_id]))
    print("length of new list:",len(all_new_now_heat_value))