In [1]:
from surprise import SVD
from surprise import Dataset
from surprise import accuracy
from surprise.model_selection import train_test_split

In [2]:
data = Dataset.load_builtin('ml-100k')
trainset, testset = train_test_split(data, test_size = .25, random_state = 0)

Dataset ml-100k could not be found. Do you want to download it? [Y/n] Y
Trying to download dataset from http://files.grouplens.org/datasets/movielens/ml-100k.zip...
Done! Dataset ml-100k has been saved to /home/seojinji/.surprise_data/ml-100k


In [3]:
algo = SVD()
algo.fit(trainset)

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x7fcb1db60fd0>

In [4]:
predictions = algo.test(testset)
print('prediction type : ', type(predictions), ' size : ', len(predictions))
print('prediction 최초 결과 5개 추출')
predictions[:5]

prediction type :  <class 'list'>  size :  25000
prediction 최초 결과 5개 추출


[Prediction(uid='120', iid='282', r_ui=4.0, est=3.4955257039779215, details={'was_impossible': False}),
 Prediction(uid='882', iid='291', r_ui=4.0, est=3.7352751677381817, details={'was_impossible': False}),
 Prediction(uid='535', iid='507', r_ui=5.0, est=4.083299500338758, details={'was_impossible': False}),
 Prediction(uid='697', iid='244', r_ui=5.0, est=3.6319928700265334, details={'was_impossible': False}),
 Prediction(uid='751', iid='385', r_ui=4.0, est=3.383887390461067, details={'was_impossible': False})]

In [6]:
[ (pred.uid, pred.iid, pred.est) for pred in predictions[:5]]

[('120', '282', 3.4955257039779215),
 ('882', '291', 3.7352751677381817),
 ('535', '507', 4.083299500338758),
 ('697', '244', 3.6319928700265334),
 ('751', '385', 3.383887390461067)]

In [7]:
uid = str(196)
iid = str(302)
pred = algo.predict(uid, iid)
print(pred)

user: 196        item: 302        r_ui = None   est = 4.15   {'was_impossible': False}


In [8]:
uid = str(120)
iid = str(282)
pred = algo.predict(uid, iid)
print(pred)

user: 120        item: 282        r_ui = None   est = 3.50   {'was_impossible': False}


In [9]:
accuracy.rmse(predictions)

RMSE: 0.9479


0.9478732596137074

In [11]:
import pandas as pd

ratings = pd.read_csv('./ml-latest-small/ratings.csv')
ratings.to_csv('./ml-latest-small/ratings_noh.csv', index=False, header = False)

In [12]:
from surprise import Reader

reader = Reader(line_format='user item rating timestamp', sep = ',', rating_scale = (0.5, 5))
data = Dataset.load_from_file('./ml-latest-small/ratings_noh.csv', reader = reader)

In [13]:
trainset, testset = train_test_split(data, test_size = .25, random_state = 0)
algo = SVD(n_factors = 50, random_state = 0)
algo.fit(trainset)
predictions = algo.test(testset)
accuracy.rmse(predictions)

RMSE: 0.8682


0.8681952927143516

In [15]:
import pandas as pd
from surprise import Reader, Dataset

ratings = pd.read_csv('./ml-latest-small/ratings.csv')
reader = Reader(rating_scale = (0.5, 5.0))

data = Dataset.load_from_df(ratings[['userId', 'movieId','rating']], reader)
trainset, testset = train_test_split(data, test_size = .25, random_state =0)

algo = SVD(n_factors = 50, random_state = 0)
algo.fit(trainset)
predictions = algo.test(testset)
accuracy.rmse(predictions)

RMSE: 0.8682


0.8681952927143516

In [16]:
from surprise.model_selection import cross_validate

ratings = pd.read_csv('./ml-latest-small/ratings.csv')
reader = Reader(rating_scale=(0.5,5.0))
data = Dataset.load_from_df(ratings[['userId', 'movieId','rating']], reader)

algo = SVD(random_state = 0)
cross_validate(algo, data, measures = ['RMSE', 'MSE'], cv = 5, verbose = True)

Evaluating RMSE, MSE of algorithm SVD on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    0.8769  0.8757  0.8700  0.8712  0.8776  0.8743  0.0031  
MSE (testset)     0.7689  0.7669  0.7568  0.7590  0.7702  0.7644  0.0054  
Fit time          5.43    5.44    5.01    5.37    4.52    5.15    0.35    
Test time         0.21    0.25    0.41    0.42    0.15    0.29    0.11    


{'test_rmse': array([0.87689573, 0.87572299, 0.86996225, 0.87121738, 0.87761194]),
 'test_mse': array([0.76894612, 0.76689075, 0.75683432, 0.75901973, 0.77020271]),
 'fit_time': (5.42722749710083,
  5.440629959106445,
  5.005561113357544,
  5.372784852981567,
  4.520937204360962),
 'test_time': (0.2140960693359375,
  0.25002408027648926,
  0.40561938285827637,
  0.4234180450439453,
  0.14764618873596191)}

In [17]:
from surprise.model_selection import GridSearchCV

param_grid = {'n_epochs' : [20, 40, 60], 'n_factors' : [50, 100, 200]}
gs = GridSearchCV(SVD, param_grid, measures=['rmse', 'mse'], cv = 3)
gs.fit(data)

print(gs.best_score['rmse'])
print(gs.best_params['rmse'])

0.877173041497454
{'n_epochs': 20, 'n_factors': 50}


In [18]:
from surprise.dataset import DatasetAutoFolds

reader = Reader(line_format='user item rating timestamp', sep = ',', rating_scale = (0.5, 5))
data_folds = DatasetAutoFolds('./ml-latest-small/ratings_noh.csv', reader = reader)

trainset = data_folds.build_full_trainset()

In [19]:
algo = SVD(n_epochs = 20, n_factors = 50, random_state = 0)
algo.fit(trainset)

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x7fcb140546a0>

In [20]:
movies = pd.read_csv('./ml-latest-small/movies.csv')

movieIds = ratings[ratings['userId'] == 9]['movieId']
if movieIds[movieIds == 42].count() == 0:
    print('사용자 아이디 9는 영화 아이디 42의 평점 없음')
    
print(movies[movies['movieId'] == 42])

사용자 아이디 9는 영화 아이디 42의 평점 없음
    movieId                   title              genres
38       42  Dead Presidents (1995)  Action|Crime|Drama


In [21]:
uid = str(9)
iid = str(42)
pred = algo.predict(uid, iid, verbose = True)

user: 9          item: 42         r_ui = None   est = 3.13   {'was_impossible': False}


In [24]:
def get_unseen_surprise(ratings, movies, userId):
    seen_movies = ratings[ratings['userId'] == userId]['movieId'].tolist()
    total_movies = movies['movieId'].tolist()
    
    unseen_movies = [movie for movie in total_movies if movie not in seen_movies]
    print('평점 매긴 영화 수 :', len(seen_movies), '추천 대상 영화 수 :', len(unseen_movies), '전체 영화 수 :', len(total_movies))
    
    return unseen_movies

unseen_movies = get_unseen_surprise(ratings, movies, 9)

평점 매긴 영화 수 : 46 추천 대상 영화 수 : 9696 전체 영화 수 : 9742


In [26]:
ratings

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931
...,...,...,...,...
100831,610,166534,4.0,1493848402
100832,610,168248,5.0,1493850091
100833,610,168250,5.0,1494273047
100834,610,168252,5.0,1493846352


In [29]:
import numpy as np

def recomm_movie_by_surprise(algo, userId, unseen_movies, top_n = 10):
    predictions = [algo.predict(str(userId), str(movieId)) for movieId in unseen_movies]
    
    def sortkey_est(pred):
        return pred.est
    
    predictions.sort(key=sortkey_est, reverse = True)
    top_predictions = predictions[:top_n]
    
    top_movie_ids = [int(pred.iid) for pred in top_predictions]
    top_movie_rating = [pred.est for pred in top_predictions]
    top_movie_titles = movies[movies.movieId.isin(top_movie_ids)]['title']
    top_movie_preds = [(ids, title, rating) for ids, title, rating in zip(top_movie_ids, top_movie_titles, top_movie_rating)]
    return top_movie_preds

unseen_movies = get_unseen_surprise(ratings, movies, 9)
top_movie_preds = recomm_movie_by_surprise(algo, 9, unseen_movies, top_n = 10)

print('### Top -10 추천 영화 리스트 ###')
for top_movie in top_movie_preds :
    print(top_movie[1], " : ", np.round(top_movie[2], 3))

평점 매긴 영화 수 : 46 추천 대상 영화 수 : 9696 전체 영화 수 : 9742
### Top -10 추천 영화 리스트 ###
Usual Suspects, The (1995)  :  4.306
Star Wars: Episode IV - A New Hope (1977)  :  4.282
Pulp Fiction (1994)  :  4.278
Silence of the Lambs, The (1991)  :  4.226
Godfather, The (1972)  :  4.192
Streetcar Named Desire, A (1951)  :  4.155
Star Wars: Episode V - The Empire Strikes Back (1980)  :  4.122
Star Wars: Episode VI - Return of the Jedi (1983)  :  4.108
Goodfellas (1990)  :  4.083
Glory (1989)  :  4.079


In [30]:
from nltk.corpus import wordnet as wn
import nltk
nltk.download('all')

term = 'present'

synsets = wn.synsets(term)
print('synsets() 반환 type :', type(synsets))
print('synsets() 반환 값 개수 :', len(synsets))
print('synsets() 반환 값 :', synsets)

[nltk_data] Downloading collection 'all'
[nltk_data]    | 
[nltk_data]    | Downloading package abc to
[nltk_data]    |     /home/seojinji/nltk_data...
[nltk_data]    |   Package abc is already up-to-date!
[nltk_data]    | Downloading package alpino to
[nltk_data]    |     /home/seojinji/nltk_data...
[nltk_data]    |   Package alpino is already up-to-date!
[nltk_data]    | Downloading package biocreative_ppi to
[nltk_data]    |     /home/seojinji/nltk_data...
[nltk_data]    |   Package biocreative_ppi is already up-to-date!
[nltk_data]    | Downloading package brown to
[nltk_data]    |     /home/seojinji/nltk_data...
[nltk_data]    |   Package brown is already up-to-date!
[nltk_data]    | Downloading package brown_tei to
[nltk_data]    |     /home/seojinji/nltk_data...
[nltk_data]    |   Package brown_tei is already up-to-date!
[nltk_data]    | Downloading package cess_cat to
[nltk_data]    |     /home/seojinji/nltk_data...
[nltk_data]    |   Package cess_cat is already up-to-date!
[nl

[nltk_data]    |   Package timit is already up-to-date!
[nltk_data]    | Downloading package toolbox to
[nltk_data]    |     /home/seojinji/nltk_data...
[nltk_data]    |   Package toolbox is already up-to-date!
[nltk_data]    | Downloading package treebank to
[nltk_data]    |     /home/seojinji/nltk_data...
[nltk_data]    |   Package treebank is already up-to-date!
[nltk_data]    | Downloading package twitter_samples to
[nltk_data]    |     /home/seojinji/nltk_data...
[nltk_data]    |   Package twitter_samples is already up-to-date!
[nltk_data]    | Downloading package udhr to
[nltk_data]    |     /home/seojinji/nltk_data...
[nltk_data]    |   Package udhr is already up-to-date!
[nltk_data]    | Downloading package udhr2 to
[nltk_data]    |     /home/seojinji/nltk_data...
[nltk_data]    |   Package udhr2 is already up-to-date!
[nltk_data]    | Downloading package unicode_samples to
[nltk_data]    |     /home/seojinji/nltk_data...
[nltk_data]    |   Package unicode_samples is already up

synsets() 반환 type : <class 'list'>
synsets() 반환 값 개수 : 18
synsets() 반환 값 : [Synset('present.n.01'), Synset('present.n.02'), Synset('present.n.03'), Synset('show.v.01'), Synset('present.v.02'), Synset('stage.v.01'), Synset('present.v.04'), Synset('present.v.05'), Synset('award.v.01'), Synset('give.v.08'), Synset('deliver.v.01'), Synset('introduce.v.01'), Synset('portray.v.04'), Synset('confront.v.03'), Synset('present.v.12'), Synset('salute.v.06'), Synset('present.a.01'), Synset('present.a.02')]
