In [1]:
import pandas as pd
import numpy as np

In [2]:
# Column labels applied to the ratings file, which is read without a header row.
rnames = ['user_id', 'item_id', 'rating', 'timestamp']

# Read the ratings, discard the timestamp, and index rows by (user_id, item_id).
# drop=False keeps both id columns as regular columns too, so later cells can
# still refer to them by name.
rating = (
    pd.read_table(r'ml-100k/u.data', names=rnames, engine='python')
    .drop(columns='timestamp')
    .set_index(['user_id', 'item_id'], drop=False)
)

In [3]:
# Preview the indexed ratings frame (the notebook shows a truncated view).
rating

Unnamed: 0_level_0,Unnamed: 1_level_0,user_id,item_id,rating
user_id,item_id,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
196,242,196,242,3
186,302,186,302,3
22,377,22,377,1
244,51,244,51,2
166,346,166,346,1
...,...,...,...,...
880,476,880,476,3
716,204,716,204,5
276,1090,276,1090,1
13,225,13,225,2


## Matrix of Observed Ratings

In [4]:
# Reshape the long ratings table into a wide user x item grid: one row per
# user_id, one column per item_id, each cell holding that user's rating.
# (user, item) pairs without a rating appear as NaN.
rating_matrix = rating.pivot(
    index='user_id',
    columns='item_id',
    values='rating',
)
rating_matrix

item_id,1,2,3,4,5,6,7,8,9,10,...,1673,1674,1675,1676,1677,1678,1679,1680,1681,1682
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,5.0,3.0,4.0,3.0,3.0,5.0,4.0,1.0,5.0,3.0,...,,,,,,,,,,
2,4.0,,,,,,,,,2.0,...,,,,,,,,,,
3,,,,,,,,,,,...,,,,,,,,,,
4,,,,,,,,,,,...,,,,,,,,,,
5,4.0,3.0,,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
939,,,,,,,,,5.0,,...,,,,,,,,,,
940,,,,2.0,,,4.0,5.0,3.0,,...,,,,,,,,,,
941,5.0,,,,,,4.0,,,,...,,,,,,,,,,
942,,,,,,,,,,,...,,,,,,,,,,


## Matrix of User Features

In [5]:
# Column labels for the pipe-delimited user table.
unames = ['userid', 'age', 'gender', 'occupation', 'zip code']

# The separator is written as a raw string: '\|' inside a normal string literal
# is an invalid escape sequence (recent Python versions warn about it), whereas
# r'\|' spells the same regex -- a literal pipe -- without the warning.
# The zip code column is dropped right after loading.
users = (
    pd.read_table(r'ml-100k/u.user', sep=r'\|', names=unames, engine='python')
    .drop(columns='zip code')
)
users.head()

Unnamed: 0,userid,age,gender,occupation
0,1,24,M,technician
1,2,53,F,other
2,3,23,M,writer
3,4,24,M,technician
4,5,33,F,other


In [6]:
from sklearn.preprocessing import FunctionTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import OrdinalEncoder
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

In [7]:
# Per-column encoders for the user table, combined by the ColumnTransformer below.

# age: kept numeric, with any missing value replaced by the sentinel -1.
num_feat_1 = ['age']
num_transformer_1 = Pipeline([
    ('imp', SimpleImputer(strategy='constant', fill_value=-1)),
])

# gender: each category mapped to an ordinal code in a single column.
num_feat_2 = ['gender']
num_transformer_2 = Pipeline([
    ('binarize', OrdinalEncoder()),
])

# occupation: one indicator column per category, returned as a dense array;
# categories unseen during fit are ignored at transform time.
# NOTE(review): the `sparse` keyword was renamed in newer scikit-learn
# releases -- confirm against the installed version.
cat_feat = ['occupation']
cat_transformer = Pipeline([
    ('onehot', OneHotEncoder(handle_unknown='ignore', sparse=False)),
])

# Output columns follow the order of this list: age, gender, then occupation.
preproc = ColumnTransformer([
    ('num_1', num_transformer_1, num_feat_1),
    ('num_2', num_transformer_2, num_feat_2),
    ('cat', cat_transformer, cat_feat),
])

pl = Pipeline([('preprocessor', preproc)])

In [8]:
# Fit the preprocessing pipeline on `users` and encode every row.
# Column layout: column 0 is age, column 1 is the ordinal gender code, and the
# remaining columns are the one-hot occupation indicators.
user_features_matrix = pl.fit_transform(users)
user_features_matrix

array([[24.,  1.,  0., ...,  0.,  1.,  0.],
       [53.,  0.,  0., ...,  0.,  0.,  0.],
       [23.,  1.,  0., ...,  0.,  0.,  1.],
       ...,
       [20.,  1.,  0., ...,  1.,  0.,  0.],
       [48.,  0.,  0., ...,  0.,  0.,  0.],
       [22.,  1.,  0., ...,  1.,  0.,  0.]])

In [9]:
# Skip the first two columns (age and gender) so that only the one-hot
# occupation indicators remain.
user_occupation = user_features_matrix[:,2:]
user_occupation

array([[0., 0., 0., ..., 0., 1., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 1.],
       ...,
       [0., 0., 0., ..., 1., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 1., 0., 0.]])

In [25]:
from tensorflow.keras.models import Model, Sequential
from tensorflow.keras.layers import Dense,Flatten,Reshape
from tensorflow.keras.optimizers import SGD

# Train an autoencoder that compresses each user's one-hot occupation vector
# down to 2 dimensions and then reconstructs it.
X_train = user_occupation

# Width of one input row. Layer shapes are derived from the data instead of
# being hard-coded to 28x28: X_train is a 2-D (users x occupation columns)
# array, so a fixed [28, 28] input/output shape cannot match it.
n_features = X_train.shape[1]

## Encoder: n_features -> 400 -> 200 -> 100 -> 50 -> 2
encoder = Sequential()
encoder.add(Dense(400, activation="relu", input_shape=[n_features]))
encoder.add(Dense(200, activation="relu"))
encoder.add(Dense(100, activation="relu"))
encoder.add(Dense(50, activation="relu"))
encoder.add(Dense(2, activation="relu"))

## Decoder: mirror of the encoder, 2 -> 50 -> 100 -> 200 -> 400 -> n_features
decoder = Sequential()
decoder.add(Dense(50, input_shape=[2], activation='relu'))
decoder.add(Dense(100, activation='relu'))
decoder.add(Dense(200, activation='relu'))
decoder.add(Dense(400, activation='relu'))
decoder.add(Dense(n_features, activation="relu"))

## Autoencoder: encoder followed by decoder, trained to reproduce its own input.
autoencoder = Sequential([encoder, decoder])
# Only the loss is specified here, so Keras falls back to its default optimizer.
autoencoder.compile(loss="mse")
autoencoder.fit(X_train, X_train, epochs=50)

# 2-D code for every user, taken from the trained encoder half.
encoded_2dim = encoder.predict(X_train)

# Optional 2-D scatter plot of the codes, left disabled.
# NOTE(review): `y_train` and `sns` are not defined in the visible cells.
# AE = pd.DataFrame(encoded_2dim, columns = ['X1', 'X2'])
# AE['target'] = y_train
# sns.lmplot(x='X1', y='X2', data=AE, hue='target', fit_reg=False, size=10)

AttributeError: module 'tensorflow.python.training.experimental.mixed_precision' has no attribute 'register_loss_scale_wrapper'

In [11]:
# Inspect the 2-D codes produced by the encoder in the autoencoder cell.
encoded_2dim

NameError: name 'encoded_2dim' is not defined

## Matrix of Item Features

In [8]:
# Column labels for the pipe-delimited item table: five descriptive fields
# followed by one 0/1 indicator per genre. The labels are kept exactly as
# spelled here because a later cell selects columns by these names.
inames = [
    'itemid', 'movie title', 'release date', 'video release date', 'IMDB URL',
    'unknown', 'Action', 'Adventure', 'Animation', "Children's", 'Comedy',
    'Crime', 'Documentrary', 'Drama', 'Fatasy', 'Film-Noir', 'Horror',
    'Musical', 'Mystery', 'Romance', 'Sci-Fi', 'Thriller', 'War', 'Western',
]

# The separator is written as a raw string: '\|' inside a normal string literal
# is an invalid escape sequence (recent Python versions warn about it), whereas
# r'\|' spells the same regex -- a literal pipe -- without the warning.
items = pd.read_table(
    r'ml-100k/u.item',
    sep=r'\|',
    names=inames,
    engine='python',
    encoding='latin-1',
)
items.head()

Unnamed: 0,itemid,movie title,release date,video release date,IMDB URL,unknown,Action,Adventure,Animation,Children's,...,Fatasy,Film-Noir,Horror,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
0,1,Toy Story (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Toy%20Story%2...,0,0,0,1,1,...,0,0,0,0,0,0,0,0,0,0
1,2,GoldenEye (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?GoldenEye%20(...,0,1,1,0,0,...,0,0,0,0,0,0,0,1,0,0
2,3,Four Rooms (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Four%20Rooms%...,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
3,4,Get Shorty (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Get%20Shorty%...,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,5,Copycat (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Copycat%20(1995),0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0


In [9]:
# Keep the item id plus the genre indicator columns; the title, date and URL
# columns are left out.
items_matrix = items[[
    'itemid',
    'unknown', 'Action', 'Adventure', 'Animation', "Children's", 'Comedy',
    'Crime', 'Documentrary', 'Drama', 'Fatasy', 'Film-Noir', 'Horror',
    'Musical', 'Mystery', 'Romance', 'Sci-Fi', 'Thriller', 'War', 'Western',
]]
items_matrix.head()

Unnamed: 0,itemid,unknown,Action,Adventure,Animation,Children's,Comedy,Crime,Documentrary,Drama,Fatasy,Film-Noir,Horror,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
0,1,0,0,0,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0
1,2,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0
2,3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0
3,4,0,1,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0
4,5,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,1,0,0
