## 整理数据，清理部分无用数据比如‘zip’,'timestamp'...

In [41]:
import pandas as pd
import numpy as np
#用户信息

unames = ['user_id', 'gender', 'age' , 'occupation' , 'zip']
users = pd.read_table('users.dat',sep= '::', header = None, names = unames,engine = 'python')
users=users.drop(columns=['zip'])
#评分
rnames = ['user_id', 'movie_id', 'rating','timestamp']
ratings = pd.read_table('ratings.dat',sep= '::', header = None, names = rnames,engine = 'python')
ratings=ratings.drop(columns=['timestamp'])
#电影信息
mnames = ['movie_id', 'title' , 'genres']
movies =  pd.read_table('movies.dat',sep= '::', header = None, names = mnames,engine = 'python')

data=pd.merge(users,ratings,on='user_id')
data=pd.merge(data,movies,on='movie_id')
data=data.sort_values(by=['user_id'],na_position='first')

# 去掉电影title信息用id表示
title=set(data['title'])
data=data.drop(columns=['title'])

In [34]:
#检查数据的输出
# print(users[:5])
# print(ratings[:5])
# print(movies[:5])
print(data[:5])

       user_id gender  age  occupation  movie_id  rating  \
0            1      F    1          10      1193       5   
28501        1      F    1          10        48       5   
13819        1      F    1          10       938       4   
51327        1      F    1          10      1207       4   
31152        1      F    1          10      1721       4   

                                     genres  
0                                     Drama  
28501  Animation|Children's|Musical|Romance  
13819                               Musical  
51327                                 Drama  
31152                         Drama|Romance  


## 处理每部电影的流派信息，生成one-hot编码

In [164]:
genre=['Action','Adventure','Animation','Children\'s','Comedy','Crime','Documentary','Drama','Fantasy','Film-Noir','Horror','Musical','Mystery','Romance','Sci-Fi','Thriller','War','Western']
def oneHotgenre(genre,genreList):
    res=np.zeros(len(genre))
    for gen in genreList:
        res[genre.index(gen)]=1
    return res

movie_genre={}
for sample in np.array(data):
    genreList=sample[6].split('|')
    if not sample[4] in movie_genre:
        movie_genre[sample[4]]=oneHotgenre(genre,genreList)

## 用户特征向量：['gender','age','occupation']

In [83]:
def userInfo(data):
    userInfo={}
    for sample in data:
        if sample[0] not in userInfo:
            userInfo[sample[0]]=[sample[1],sample[2],sample[3]]
    return userInfo
user_info=userInfo(np.array(data))

## 生成正样本

In [171]:
posiSample=np.array(data[['gender','age','occupation','movie_id']])
posiS=[]
for sample in posiSample:
    oneHot=np.array(movie_genre[sample[3]])
    posiS.append(np.append(sample[:3],oneHot))
posiSample=np.append(posiS,np.ones((len(posiSample),1)),axis=1)
np.random.shuffle(posiSample)
posiSample[:5]

array([['M', 18, 4, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0,
        0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0],
       ['F', 35, 17, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0,
        0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0],
       ['M', 25, 2, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 1.0,
        0.0, 0.0, 1.0, 0.0, 0.0, 1.0, 0.0, 0.0, 1.0],
       ['M', 18, 4, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0,
        0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0],
       ['M', 25, 0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0,
        0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 1.0]], dtype=object)

## 生成负样本
### record: 用户和有行为的电影的dict

In [172]:
record={}
for sample in np.array(data):
    if not sample[0] in record:
        record[sample[0]]=[sample[4]]
    else:
        record[sample[0]].append(sample[4])

### 为每位用户采集n个负样本，流行度高的电影抽取几率大

In [197]:
from random import choice
movie_pool=np.array(data['movie_id'])
def nSample(record,movie_pool,n):
    res={}
    for user in record:
        res[user]=[]
        while len(res[user])<n:
            mov=choice(movie_pool)
            if mov not in record[user]:
                res[user].append(mov)
    return res

userNomov=nSample(record,movie_pool,50)

In [207]:
negaSample=[]
for user in userNomov:
    for mov in userNomov[user]:
        negaSample.append(np.append(np.array(user_info[user],dtype=object),np.array(movie_genre[mov])))
negaSample=np.append(negaSample,-1*np.ones((len(negaSample),1)),axis=1)

negaSample[:5]

array([['F', 1, 10, 0.0, 1.0, 0.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0,
        0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, -1.0],
       ['F', 1, 10, 1.0, 1.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0,
        0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, -1.0],
       ['F', 1, 10, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0,
        0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, -1.0],
       ['F', 1, 10, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 1.0, 0.0, 0.0,
        0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, -1.0],
       ['F', 1, 10, 1.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 1.0, 0.0, 0.0,
        0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, -1.0]], dtype=object)

## 正负样本合并

In [208]:
# 使正负样平衡
np.random.shuffle(posiSample)
posiSample=posiSample[:6000*50]
sample=np.append(np.array(posiSample),np.array(negaSample),axis=0)
np.random.shuffle(sample)
sample[:5]

array([['M', 56, 13, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,
        1.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, -1.0],
       ['M', 45, 12, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,
        0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, -1.0],
       ['M', 18, 4, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0,
        0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0],
       ['M', 50, 14, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0,
        1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, -1.0],
       ['M', 35, 16, 1.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0,
        0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 1.0]], dtype=object)

## 划分数据集

In [209]:
train,test=sample[:600000],sample[600000:]
len(sample)

602000

## 将训练集放入dict，key=label

In [210]:
train_data={}
for sample in train:
    if not sample[-1] in train_data:
        train_data[sample[-1]]=[sample]
    else:
        train_data[sample[-1]].append(sample)
test_data={}
for sample in test:
    if not sample[-1] in test_data:
        test_data[sample[-1]]=[sample]
    else:
        test_data[sample[-1]].append(sample)

In [211]:
def Ent(data):
    """
    
    :param data: dict
    :return: 信息熵
    """
    num=sum(len(data[i]) for i in data)
    res=0
    for i in data:
        pi=len(data[i])/num
        res+=pi*np.log(pi)
    return -res

## 初始训练集的信息熵

In [212]:
E0=Ent(train_data)
E0

0.6931418231948226

## 根据某feature分割数据集

In [213]:
def split_dataframe(data, col):
    """
    根据第col列属性分割后的数据集
    :param data: dict
    :param col:
    :return: dict
    """
    res={}
    for types in data:
        for sample in data[types]:
            if not sample[col] in res:
                res[sample[col]]={}
                res[sample[col]][sample[-1]]=[sample]
            else:
                if not sample[-1] in res[sample[col]]:
                    res[sample[col]][sample[-1]]=[sample]
                else:
                    res[sample[col]][sample[-1]].append(sample)
    return res

## 分割前后的信息增益

In [214]:
def Gain(Class,subClass):
    """
    根据某属性分支后的信息增益
    :param Class: dict of lables 
    :param subClass: dict with multi-2d arrays
    :return:
    """
    e=Ent(Class)
    num=sum(len(Class[i]) for i in Class)
    temp=0
    for node in subClass:
        cur_data=subClass[node]
        e_sub=Ent(cur_data)
        num_node=sum(len(cur_data[i]) for i in cur_data)
        temp+=(num_node/num)*e_sub
    return e-temp

### e.g. 依据gender分割数据集的信息增益

In [215]:
split_gender=split_dataframe(train_data,0)
Gain_gender=Gain(train_data,split_gender)
Gain_gender

0.0008647831233985492

In [216]:
def choose_best_col(data, label):
    """

    :param data: dict with key: labels
    :param label: label list
    :return: the col of best feature and maxGain
    """
    best_feature, maxGain = -1, -1
    # 遍历每个feature col
    for feature in range(len(data[label[0]][0]) - 1):
        split_feature = split_dataframe(data, feature)
        Gain_feature = Gain(data, split_feature)
        if Gain_feature > maxGain:
            best_feature = feature
            maxGain = Gain_feature

    return best_feature, maxGain

In [217]:
label=[1,-1]
best_feature,maxGain=choose_best_col(train_data,label)
print('best feature is:',best_feature)
maxGain

best feature is: 1


0.0028413474837291064

In [239]:
class ID3Tree:

    def majorityVote(self,data):
        maxLabel,maxCount=-1,-1
        for i in data:
            if len(data[i])>maxCount:
                maxLabel=i
                maxCount=len(data[i])
        return maxLabel


    def buildTree(self,data,label,depth):
        # 如果当前组只有一种label，返回当前label
        if len(data)==1 or depth>3:
            return self.majorityVote(data)
        best_feature,maxGain=choose_best_col(data,label)
        split_data=split_dataframe(data,best_feature)
        Tree = {best_feature:{}}
        for sub in split_data:
            Tree[best_feature][sub]=self.buildTree(split_data[sub],label,depth+1)
        return Tree

In [240]:
os=ID3Tree()
Tree=os.buildTree(train_data,label,0)

In [241]:
import random
def predict(Tree,data_test,label):
    res=[]
    for sample in data_test:
        temp=Tree
        while True:
            feature=list(temp.keys())
            feature=feature[0]
            temp=temp[feature]
            if sample[feature] in temp:
                pred=temp[sample[feature]]
            else:
                # 如果预测sample某feature在树里缺失，则随机选一个分支
                keys=list(temp.keys())
                pred=temp[random.choice(keys)]
            if pred in label:
                res.append(pred)
                break
            else:
                temp=pred
    return res

In [242]:
# record: user:watced moives
# movie_pool
# movie_genre: movie_id:one-hot
# user_info: user_id:info
user=list(record.keys())
def recommend(userid,movie_pool,topN):
    res=[]
    while len(res)<topN:
        mov=choice(movie_pool)
        if not mov in res:
            vec=[np.append(np.array(user_info[userid],dtype=object),movie_genre[mov])]
            pred=predict(Tree,vec,label)
            if pred[0]==1:
                res.append(mov)
    return res

recommend_mov=recommend(1,movie_pool,20)
count=0
for mov in recommend_mov:
    if mov in record[1]:
        count+=1
print(100*count/20,'%')

20.0 %
