In [2]:
!ls

avazu_sample.txt   data_glancer.ipynb jaychou_lyrics.txt timemachine.txt
criteo_sample.txt  [1m[36miris[m[m               [1m[36mml-100k[m[m            [1m[36mtitanic[m[m


In [3]:
!cat ml-100k/README

SUMMARY & USAGE LICENSE

MovieLens data sets were collected by the GroupLens Research Project
at the University of Minnesota.
 
This data set consists of:
	* 100,000 ratings (1-5) from 943 users on 1682 movies. 
	* Each user has rated at least 20 movies. 
        * Simple demographic info for the users (age, gender, occupation, zip)

The data was collected through the MovieLens web site
(movielens.umn.edu) during the seven-month period from September 19th, 
1997 through April 22nd, 1998. This data has been cleaned up - users
who had less than 20 ratings or did not have complete demographic
information were removed from this data set. Detailed descriptions of
the data file can be found at the end of this file.

Neither the University of Minnesota nor any of the researchers
involved can guarantee the correctness of the data, its suitability
for any particular purpose, or the validity of results based on the
use of the data set.  The data set may be used for any research
purposes under th

In [128]:
import pandas as pd
import numpy as np

In [129]:
data_df = pd.read_csv('./ml-100k/u.data', sep='\t', names=['user_id', 'item_id', 'rating', 'timestamp'])
data_df.head()

Unnamed: 0,user_id,item_id,rating,timestamp
0,196,242,3,881250949
1,186,302,3,891717742
2,22,377,1,878887116
3,244,51,2,880606923
4,166,346,1,886397596


In [130]:
data_df.describe()

Unnamed: 0,user_id,item_id,rating,timestamp
count,100000.0,100000.0,100000.0,100000.0
mean,462.48475,425.53013,3.52986,883528900.0
std,266.61442,330.798356,1.125674,5343856.0
min,1.0,1.0,1.0,874724700.0
25%,254.0,175.0,3.0,879448700.0
50%,447.0,322.0,4.0,882826900.0
75%,682.0,631.0,4.0,888260000.0
max,943.0,1682.0,5.0,893286600.0


In [131]:
user_num = data_df.user_id.unique().shape[0]
item_num = data_df.item_id.unique().shape[0]
print(user_num, item_num)
print(f'sparsity: {1 - len(data_df)/(user_num*item_num)}')


943 1682
sparsity: 0.9369533063577546


In [32]:
def load_ml100k_data(f_path='./ml-100k/u.data'):
    data = pd.read_csv(f_path, sep='\t', names=['user_id', 'item_id', 'rating', 'timestamp'])
    user_num = data_df.user_id.unique().shape[0]
    item_num = data_df.item_id.unique().shape[0]
    user_idx = []
    item_idx = []
    inter = np.zeros((user_num, item_num))
    for _, user_id, item_id, rating, timestamp in data_df.itertuples():
        user_idx.append(int(user_id-1))
        item_idx.append(int(item_id-1))
        inter[int(user_id-1), int(item_id-1)] = rating
    return user_idx, item_idx, inter
        

In [68]:
user_idx, item_idx, inter = load_ml100k_data()
inter.shape

(943, 1682)

In [132]:
col_names = ['user_id', 'age', 'gender', 'occupation', 'zipcode']
user_df = pd.read_csv('./ml-100k/u.user', sep='|', names=col_names)
user_df.head()

Unnamed: 0,user_id,age,gender,occupation,zipcode
0,1,24,M,technician,85711
1,2,53,F,other,94043
2,3,23,M,writer,32067
3,4,24,M,technician,43537
4,5,33,F,other,15213


In [134]:
genres = ['unknown', 'Action', 'Adventure', 'Animation', 'Children\'s', 
          'Comedy', 'Crime', 'Documentary', 'Drama', 'Fantasy',
          'Film-Noir', 'Horror', 'Musical', 'Mystery', 'Romance', 
          'Sci-Fi', 'Thriller', 'War', 'Western']
col_names = ['item_id', 'movie_title', 'release date', 'video_release_date', 'IMDb_URL', 
             'unknown', 'Action', 'Adventure', 'Animation', 'Children\'s', 
             'Comedy', 'Crime', 'Documentary', 'Drama', 'Fantasy',
             'Film-Noir', 'Horror', 'Musical', 'Mystery', 'Romance', 
             'Sci-Fi', 'Thriller', 'War', 'Western']
item_df = pd.read_csv('./ml-100k/u.item', sep='|', encoding = 'ISO-8859-1', names=col_names)
item_df.head()

Unnamed: 0,item_id,movie_title,release date,video_release_date,IMDb_URL,unknown,Action,Adventure,Animation,Children's,...,Fantasy,Film-Noir,Horror,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
0,1,Toy Story (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Toy%20Story%2...,0,0,0,1,1,...,0,0,0,0,0,0,0,0,0,0
1,2,GoldenEye (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?GoldenEye%20(...,0,1,1,0,0,...,0,0,0,0,0,0,0,1,0,0
2,3,Four Rooms (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Four%20Rooms%...,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
3,4,Get Shorty (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Get%20Shorty%...,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,5,Copycat (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Copycat%20(1995),0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0


In [50]:
# find encoding problems
f = open("./ml-100k/u.item","rb")#二进制格式读文件
i = 0
while True:
    i += 1 
    #print(i)
    line = f.readline()
    if not line:
        break
    else:
        try:
#             print(line)
#             print(line.decode('utf8'))
            line.decode('utf8')
            #为了暴露出错误，最好此处不print
        except:
            print(i, str(line))


543 b'543|Mis\xe9rables, Les (1995)|01-Jan-1995||http://us.imdb.com/M/title-exact?Mis%E9rables%2C%20Les%20%281995%29|0|0|0|0|0|0|0|0|1|0|0|0|1|0|0|0|0|0|0\n'
1005 b'1005|Double vie de V\xe9ronique, La (Double Life of Veronique, The) (1991)|01-Jan-1991||http://us.imdb.com/M/title-exact?Podwojne%20zycie%20Weroniki%20(1991)|0|0|0|0|0|0|0|0|1|0|0|0|0|0|0|0|0|0|0\n'
1104 b"1104|C'est arriv\xe9 pr\xe8s de chez vous (1992)|01-Jan-1992||http://us.imdb.com/M/title-exact?C%27est%20arriv%E9%20pr%E8s%20de%20chez%20vous%20%281992%29|0|0|0|0|0|1|1|0|1|0|0|0|0|0|0|0|0|0|0\n"
1233 b'1233|N\xe9nette et Boni (1996)|01-Jan-1996||http://us.imdb.com/Title?N%E9nette+et+Boni+(1996)|0|0|0|0|0|0|0|0|1|0|0|0|0|0|0|0|0|0|0\n'
1252 b'1252|Contempt (M\xe9pris, Le) (1963)|27-Jun-1997||http://us.imdb.com/M/title-exact?M%E9pris%2C+Le+(1963)|0|0|0|0|0|0|0|0|1|0|0|0|0|0|0|0|0|0|0\n'
1322 b'1322|Metisse (Caf\xe9 au Lait) (1993)|01-Jan-1993||http://us.imdb.com/Title?M%E9tisse+(1993)|0|0|0|0|0|1|0|0|0|0|0|0|0|0|0|0|0|0|0\

In [None]:
def load_ml100k_data_extra(f_path='./ml-100k/u.data'):
    col_names = ['user_id', 'item_id', 'rating', 'timestamp']
    data = pd.read_csv(f_path, sep='\t', names=col_names)
    
    col_names = ['user_id', 'age', 'gender', 'occupation', 'zipcode']
    user_df = pd.read_csv('./ml-100k/u.user', sep='|', names=col_names)
    
    col_names = ['movie_id', 'movie_title', 'release date', 'video_release_date', 'IMDb_URL', 
             'unknown', 'Action', 'Adventure', 'Animation', 'Children\'s', 
             'Comedy', 'Crime', 'Documentary', 'Drama', 'Fantasy',
             'Film-Noir', 'Horror', 'Musical', 'Mystery', 'Romance', 
             'Sci-Fi', 'Thriller', 'War', 'Western']
    item_df = pd.read_csv('./ml-100k/u.item', sep='|', encoding = 'ISO-8859-1', names=col_names)

    user_num = data_df.user_id.unique().shape[0]
    item_num = data_df.item_id.unique().shape[0]
    user_idx = []
    item_idx = []
    inter = np.zeros((user_num, item_num))
    for _, user_id, item_id, rating, timestamp in data_df.itertuples():
        user_idx.append(int(user_id-1))
        item_idx.append(int(item_id-1))
        inter[int(user_id-1), int(item_id-1)] = rating
    return user_idx, item_idx, inter
        

### Make joinned data

In [159]:
join_df = data_df.merge(user_df, on='user_id', how='left').merge(item_df, on='item_id', how='left')
join_df.head()

Unnamed: 0,user_id,item_id,rating,timestamp,age,gender,occupation,zipcode,movie_title,release date,...,Fantasy,Film-Noir,Horror,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
0,196,242,3,881250949,49,M,writer,55105,Kolya (1996),24-Jan-1997,...,0,0,0,0,0,0,0,0,0,0
1,186,302,3,891717742,39,F,executive,0,L.A. Confidential (1997),01-Jan-1997,...,0,1,0,0,1,0,0,1,0,0
2,22,377,1,878887116,25,M,writer,40206,Heavyweights (1994),01-Jan-1994,...,0,0,0,0,0,0,0,0,0,0
3,244,51,2,880606923,28,M,technician,80525,Legends of the Fall (1994),01-Jan-1994,...,0,0,0,0,0,1,0,0,1,1
4,166,346,1,886397596,47,M,educator,55113,Jackie Brown (1997),01-Jan-1997,...,0,0,0,0,0,0,0,0,0,0


In [166]:
join_df.to_csv('ml-100k-joined.csv', index=None, encoding='utf8')

In [162]:
!head ../ml-100k-joined.csv

,user_id,item_id,rating,timestamp,age,gender,occupation,zipcode,movie_title,release date,video_release_date,IMDb_URL,unknown,Action,Adventure,Animation,Children's,Comedy,Crime,Documentary,Drama,Fantasy,Film-Noir,Horror,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
0,196,242,3,881250949,49,M,writer,55105,Kolya (1996),24-Jan-1997,,http://us.imdb.com/M/title-exact?Kolya%20(1996),0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0
1,186,302,3,891717742,39,F,executive,00000,L.A. Confidential (1997),01-Jan-1997,,http://us.imdb.com/M/title-exact?L%2EA%2E+Confidential+(1997),0,0,0,0,0,0,1,0,0,0,1,0,0,1,0,0,1,0,0
2,22,377,1,878887116,25,M,writer,40206,Heavyweights (1994),01-Jan-1994,,http://us.imdb.com/M/title-exact?Heavyweights%20(1994),0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0
3,244,51,2,880606923,28,M,technician,80525,Legends of the Fall (1994),01-Jan-1994,,http://us.imdb.com/M/title-exact?Legends%20of%20the%20Fall%20(1994),0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,1,1
4,166,346,1,886397596,47,M,educator,55113,

In [165]:
!ls

avazu_sample.txt   data_glancer.ipynb jaychou_lyrics.txt timemachine.txt
criteo_sample.txt  [1m[36miris[m[m               [1m[36mml-100k[m[m            [1m[36mtitanic[m[m
