### 基于内容的召回

In [2]:
import os
import operator

In [3]:
# Locations of the movies and ratings CSV files of the "ml-latest-small" dataset.
# NOTE(review): these are absolute, machine-specific Windows paths; the notebook
# will silently produce empty results elsewhere because the loaders return empty
# values when a file does not exist. Consider a configurable data directory.
movies_file = "F:\\badou\\tmp\\data\\recommender\\data\\ml-latest-small\\movies.csv"
ratings_file = "F:\\badou\\tmp\\data\\recommender\\data\\ml-latest-small\\ratings.csv"

In [4]:
def get_avg_score(input_file):
    '''
    Compute the average rating of every item from a user-rating CSV file.

    The first line of the file is treated as a header and skipped. Every other
    line is split on "," and must contain at least four fields; the second
    field is used as the item id and the third as the numeric rating. Lines
    with fewer than four fields are ignored.

    Args:
        input_file: path of the ratings CSV file.

    Returns:
        dict mapping item id (str) to its mean rating rounded to 3 decimals.
        An empty dict is returned when the file does not exist.

    Raises:
        ValueError: if a rating field cannot be converted to float.
    '''
    if not os.path.exists(input_file):
        return {}

    # item_id -> [sum of ratings, number of ratings]
    record = {}

    # The context manager closes the file even if a row fails to parse.
    with open(input_file, "r", encoding="utf-8") as fp:
        # Skip the header row (no-op on an empty file).
        next(fp, None)
        for line in fp:
            item = line.strip().split(",")
            if len(item) < 4:
                continue

            item_id, rating = item[1], float(item[2])
            totals = record.setdefault(item_id, [0, 0])
            totals[0] += rating
            totals[1] += 1

    return {
        item_id: round(total / count, 3)
        for item_id, (total, count) in record.items()
    }

In [14]:
def get_item_cate(avg_score, input_file, topn=100):
    '''
    Build the forward and inverted indexes between items and categories.

    The first line of the file is treated as a header and skipped. Every other
    line is split on "," and must contain at least three fields; the first
    field is the item id and the LAST field is the "|"-separated category
    list (so extra commas in the middle fields do not affect parsing).

    Args:
        avg_score: dict mapping item id to its average rating; items missing
            from it are ranked with a score of 0.
        input_file: path of the item/category CSV file.
        topn: maximum number of items kept per category in the inverted
            index (defaults to 100, the value previously hard-coded).

    Returns:
        (item_cate, cate_item_sort) where
        item_cate maps item id -> {category: weight}, each category of an
        item receiving an equal share 1/len(categories) rounded to 3 decimals;
        cate_item_sort maps category -> list of at most `topn` item ids sorted
        by average rating, highest first.
        ({}, {}) is returned when the file does not exist.
    '''
    if not os.path.exists(input_file):
        return {}, {}

    # item id -> {category: weight}
    item_cate = {}
    # The context manager closes the file even if a row fails to parse.
    with open(input_file, "r", encoding="utf-8") as fp:
        # Skip the header row (no-op on an empty file).
        next(fp, None)
        for line in fp:
            item = line.strip().split(",")
            if len(item) < 3:
                continue

            item_id = item[0]
            # Split the multi-category field into individual categories.
            cate_list = item[-1].strip().split("|")
            # Spread the weight evenly across the item's categories.
            # Possible refinement: give a higher weight to the category the
            # user entered through, if such statistics are available.
            ratio = round(1 / len(cate_list), 3)

            weights = item_cate.setdefault(item_id, {})
            for fix_cate in cate_list:
                weights[fix_cate] = ratio

    # category -> {item id: average rating}
    record = {}
    for item_id, weights in item_cate.items():
        # Items nobody rated fall back to a score of 0.
        item_id_rating_score = avg_score.get(item_id, 0)
        for cate in weights:
            record.setdefault(cate, {})[item_id] = item_id_rating_score

    # category -> item ids with the highest average ratings
    cate_item_sort = {}
    for cate, scores in record.items():
        ranked = sorted(scores.items(), key=operator.itemgetter(1), reverse=True)
        cate_item_sort[cate] = [item_id for item_id, _ in ranked[:topn]]

    return item_cate, cate_item_sort

In [23]:
def get_time_max(input_file):
    '''
    Return the largest timestamp found in a ratings CSV file.

    The value is meant to be used as the reference point for time decay: the
    data set is far older than the current time, so the newest timestamp in
    the data is used instead of "now".

    The first line is treated as a header and skipped. Every other line is
    split on "," and must contain at least four fields, the fourth being an
    integer timestamp; shorter lines are ignored.

    Args:
        input_file: path of the ratings CSV file.

    Returns:
        The maximum timestamp as int, or 0 when the file does not exist or
        contains no usable row.

    Raises:
        ValueError: if a timestamp field cannot be converted to int.
    '''
    if not os.path.exists(input_file):
        return 0

    latest = 0
    # The context manager closes the file even if a row fails to parse.
    with open(input_file, "r", encoding="utf-8") as fp:
        # Skip the header row (no-op on an empty file).
        next(fp, None)
        for line in fp:
            item = line.strip().split(",")
            if len(item) < 4:
                continue

            latest = max(latest, int(item[3]))

    return latest

In [5]:
# Compute the average rating of every item, then display the resulting dict.
avg_score = get_avg_score(ratings_file)
avg_score

{'1': 3.921,
 '3': 3.26,
 '6': 3.946,
 '47': 3.975,
 '50': 4.238,
 '70': 3.509,
 '101': 3.783,
 '110': 4.032,
 '151': 3.545,
 '157': 2.864,
 '163': 3.561,
 '216': 3.327,
 '223': 3.856,
 '231': 3.06,
 '235': 3.679,
 '260': 4.231,
 '296': 4.197,
 '316': 3.375,
 '333': 3.78,
 '349': 3.605,
 '356': 4.164,
 '362': 3.529,
 '367': 3.185,
 '423': 2.85,
 '441': 3.929,
 '457': 3.992,
 '480': 3.75,
 '500': 3.389,
 '527': 4.225,
 '543': 3.317,
 '552': 3.262,
 '553': 3.815,
 '590': 3.835,
 '592': 3.429,
 '593': 4.161,
 '596': 3.45,
 '608': 4.116,
 '648': 3.537,
 '661': 3.449,
 '673': 2.708,
 '733': 3.64,
 '736': 3.321,
 '780': 3.446,
 '804': 3.25,
 '919': 3.88,
 '923': 4.043,
 '940': 4.0,
 '943': 3.929,
 '954': 4.083,
 '1009': 3.056,
 '1023': 3.615,
 '1024': 3.667,
 '1025': 3.58,
 '1029': 3.386,
 '1030': 2.767,
 '1031': 3.479,
 '1032': 3.375,
 '1042': 3.274,
 '1049': 3.476,
 '1060': 3.694,
 '1073': 3.874,
 '1080': 3.927,
 '1089': 4.202,
 '1090': 3.984,
 '1092': 3.191,
 '1097': 3.766,
 '1127': 3.476

In [15]:
# Build both indexes: item -> category weights, and category -> top-rated item ids.
item_cate, cate_item_sort = get_item_cate(avg_score, movies_file)

In [16]:
# item id -> {category: weight}
# NOTE(review): this dumps the whole dict; the saved output is truncated.
item_cate

{'1': {'Adventure': 0.2,
  'Animation': 0.2,
  'Children': 0.2,
  'Comedy': 0.2,
  'Fantasy': 0.2},
 '2': {'Adventure': 0.333, 'Children': 0.333, 'Fantasy': 0.333},
 '3': {'Comedy': 0.5, 'Romance': 0.5},
 '4': {'Comedy': 0.333, 'Drama': 0.333, 'Romance': 0.333},
 '5': {'Comedy': 1.0},
 '6': {'Action': 0.333, 'Crime': 0.333, 'Thriller': 0.333},
 '7': {'Comedy': 0.5, 'Romance': 0.5},
 '8': {'Adventure': 0.5, 'Children': 0.5},
 '9': {'Action': 1.0},
 '10': {'Action': 0.333, 'Adventure': 0.333, 'Thriller': 0.333},
 '11': {'Comedy': 0.333, 'Drama': 0.333, 'Romance': 0.333},
 '12': {'Comedy': 0.5, 'Horror': 0.5},
 '13': {'Adventure': 0.333, 'Animation': 0.333, 'Children': 0.333},
 '14': {'Drama': 1.0},
 '15': {'Action': 0.333, 'Adventure': 0.333, 'Romance': 0.333},
 '16': {'Crime': 0.5, 'Drama': 0.5},
 '17': {'Drama': 0.5, 'Romance': 0.5},
 '18': {'Comedy': 1.0},
 '19': {'Comedy': 1.0},
 '20': {'Action': 0.2,
  'Comedy': 0.2,
  'Crime': 0.2,
  'Drama': 0.2,
  'Thriller': 0.2},
 '21': {'Comed

In [17]:
# category -> item ids ordered by average rating, highest first
# NOTE(review): this dumps the whole dict; the saved output is truncated.
cate_item_sort

{'(no genres listed)': ['147250',
  '171749',
  '176601',
  '166024',
  '172591',
  '156605',
  '159161',
  '169034',
  '171495',
  '173535',
  '181413',
  '142456',
  '149330',
  '159779',
  '167570',
  '172497',
  '122896',
  '171891',
  '181719',
  '141866',
  '114335',
  '155589',
  '132084',
  '134861',
  '141131',
  '161008',
  '165489',
  '174403',
  '143410',
  '152037',
  '182727',
  '171631',
  '122888',
  '129250'],
 'Action': ['876',
  '1631',
  '2196',
  '4180',
  '5244',
  '5490',
  '5746',
  '6835',
  '26169',
  '26401',
  '26840',
  '27704',
  '70451',
  '72142',
  '80124',
  '82744',
  '95149',
  '100906',
  '102084',
  '108795',
  '109241',
  '115727',
  '138632',
  '172637',
  '5915',
  '126430',
  '5833',
  '55167',
  '284',
  '1112',
  '3384',
  '4956',
  '5155',
  '5181',
  '5657',
  '5786',
  '6300',
  '7899',
  '8795',
  '26736',
  '26985',
  '27328',
  '27480',
  '27869',
  '32511',
  '34450',
  '46855',
  '57843',
  '58842',
  '71999',
  '72554',
  '86892',
  

In [21]:
# Spot check: category weights of item "1".
item_cate["1"]

{'Adventure': 0.2,
 'Animation': 0.2,
 'Children': 0.2,
 'Comedy': 0.2,
 'Fantasy': 0.2}

In [22]:
# Spot check: ranked item ids recalled for the "Children" category.
cate_item_sort["Children"]

['3086',
 '85295',
 '91355',
 '91386',
 '95311',
 '118894',
 '121781',
 '124404',
 '126088',
 '126921',
 '131098',
 '136341',
 '136353',
 '136355',
 '136359',
 '136503',
 '136556',
 '146684',
 '150554',
 '156025',
 '163072',
 '166183',
 '170777',
 '172577',
 '172585',
 '172793',
 '173351',
 '175293',
 '175387',
 '175397',
 '74282',
 '59141',
 '3673',
 '26528',
 '27619',
 '42761',
 '76301',
 '85736',
 '92643',
 '95313',
 '105540',
 '108540',
 '111146',
 '135777',
 '172825',
 '182731',
 '72356',
 '110130',
 '26183',
 '178827',
 '95654',
 '3213',
 '78499',
 '83803',
 '1223',
 '72226',
 '6350',
 '26662',
 '60069',
 '2761',
 '2138',
 '1148',
 '953',
 '68954',
 '80',
 '238',
 '241',
 '1739',
 '2037',
 '3674',
 '4154',
 '4294',
 '5109',
 '5601',
 '6232',
 '6427',
 '7345',
 '26133',
 '27186',
 '31030',
 '36289',
 '56915',
 '65261',
 '73804',
 '80748',
 '85259',
 '92348',
 '95170',
 '107999',
 '117887',
 '121007',
 '126142',
 '128968',
 '130073',
 '130444',
 '134849',
 '140359',
 '148775',
 '16

In [24]:
# Latest timestamp at which a rating was produced in the data set.
get_time_max(ratings_file)

1537799250