In [48]:
import pandas as pd
import numpy as np
import re
import warnings
import random
warnings.filterwarnings("ignore")
from sklearn.metrics.pairwise import cosine_similarity
import surprise
from surprise import Reader, Dataset, SVD
from surprise.model_selection import cross_validate

### 데이터 전처리 과정

In [2]:
book_df=pd.read_csv('Books.csv',low_memory=False)
rating_df=pd.read_csv('Ratings.csv')
user_df=pd.read_csv('Users.csv')

In [61]:
book_df.head()

Unnamed: 0,ISBN,Book-Title,Book-Author,Year-Of-Publication,Publisher,Image-URL-S,Image-URL-M,Image-URL-L
0,195153448,Classical Mythology,Mark P. O. Morford,2002,Oxford University Press,http://images.amazon.com/images/P/0195153448.0...,http://images.amazon.com/images/P/0195153448.0...,http://images.amazon.com/images/P/0195153448.0...
1,2005018,Clara Callan,Richard Bruce Wright,2001,HarperFlamingo Canada,http://images.amazon.com/images/P/0002005018.0...,http://images.amazon.com/images/P/0002005018.0...,http://images.amazon.com/images/P/0002005018.0...
2,60973129,Decision in Normandy,Carlo D'Este,1991,HarperPerennial,http://images.amazon.com/images/P/0060973129.0...,http://images.amazon.com/images/P/0060973129.0...,http://images.amazon.com/images/P/0060973129.0...
3,374157065,Flu: The Story of the Great Influenza Pandemic...,Gina Bari Kolata,1999,Farrar Straus Giroux,http://images.amazon.com/images/P/0374157065.0...,http://images.amazon.com/images/P/0374157065.0...,http://images.amazon.com/images/P/0374157065.0...
4,393045218,The Mummies of Urumchi,E. J. W. Barber,1999,W. W. Norton &amp; Company,http://images.amazon.com/images/P/0393045218.0...,http://images.amazon.com/images/P/0393045218.0...,http://images.amazon.com/images/P/0393045218.0...


In [3]:
rating_df.head()

Unnamed: 0,User-ID,ISBN,Book-Rating
0,276725,034545104X,0
1,276726,0155061224,5
2,276727,0446520802,0
3,276729,052165615X,3
4,276729,0521795028,6


In [4]:
user_df.head()

Unnamed: 0,User-ID,Location,Age
0,1,"nyc, new york, usa",
1,2,"stockton, california, usa",18.0
2,3,"moscow, yukon territory, russia",
3,4,"porto, v.n.gaia, portugal",17.0
4,5,"farnborough, hants, united kingdom",


In [5]:
user_df['User-ID'].equals(rating_df['User-ID'])

False

In [6]:
book_df['ISBN'].equals(rating_df['ISBN'])

False

In [7]:
book_df.isnull().sum() 

ISBN                   0
Book-Title             0
Book-Author            2
Year-Of-Publication    0
Publisher              2
Image-URL-S            0
Image-URL-M            0
Image-URL-L            3
dtype: int64

In [8]:
book_df['Book-Author']= book_df['Book-Author'].fillna('Other')
book_df['Publisher']= book_df['Publisher'].fillna('Other')
book_df['Image-URL-L']= book_df['Image-URL-L'].fillna('no_image.jpg')

In [9]:
## 출판연도 항목 처리
book_df['Year-Of-Publication'].unique()

array(['2002', '2001', '1991', '1999', '2000', '1993', '1996', '1988',
       '2004', '1998', '1994', '2003', '1997', '1983', '1979', '1995',
       '1982', '1985', '1992', '1986', '1978', '1980', '1952', '1987',
       '1990', '1981', '1989', '1984', '0', '1968', '1961', '1958',
       '1974', '1976', '1971', '1977', '1975', '1965', '1941', '1970',
       '1962', '1973', '1972', '1960', '1966', '1920', '1956', '1959',
       '1953', '1951', '1942', '1963', '1964', '1969', '1954', '1950',
       '1967', '2005', '1957', '1940', '1937', '1955', '1946', '1936',
       '1930', '2011', '1925', '1948', '1943', '1947', '1945', '1923',
       '2020', '1939', '1926', '1938', '2030', '1911', '1904', '1949',
       '1932', '1928', '1929', '1927', '1931', '1914', '2050', '1934',
       '1910', '1933', '1902', '1924', '1921', '1900', '2038', '2026',
       '1944', '1917', '1901', '2010', '1908', '1906', '1935', '1806',
       '2021', '2012', '2006', 'DK Publishing Inc', 'Gallimard', '1909',
       

In [10]:
book_df.at[209538 ,'Publisher'] = 'DK Publishing Inc'
book_df.at[209538 ,'Year-Of-Publication'] = 2000
book_df.at[209538 ,'Book-Title'] = 'DK Readers: Creating the X-Men, How It All Began (Level 4: Proficient Readers)'
book_df.at[209538 ,'Book-Author'] = 'Michael Teitelbaum'

book_df.at[221678 ,'Publisher'] = 'DK Publishing Inc'
book_df.at[221678 ,'Year-Of-Publication'] = 2000
book_df.at[209538 ,'Book-Title'] = 'DK Readers: Creating the X-Men, How Comic book_df Come to Life (Level 4: Proficient Readers)'
book_df.at[209538 ,'Book-Author'] = 'James Buckley'

book_df.at[220731 ,'Publisher'] = 'Gallimard'
book_df.at[220731 ,'Year-Of-Publication'] = '2003'
book_df.at[209538 ,'Book-Title'] = 'Peuple du ciel - Suivi de Les bergers '
book_df.at[209538 ,'Book-Author'] = 'Jean-Marie Gustave Le ClÃ?Â©zio'

In [11]:
rating_df.isnull().sum()

User-ID        0
ISBN           0
Book-Rating    0
dtype: int64

In [12]:
dataset = pd.merge(book_df, rating_df, on='ISBN', how='inner')
dataset = pd.merge(dataset, user_df, on='User-ID', how='inner')
dataset.head()

Unnamed: 0,ISBN,Book-Title,Book-Author,Year-Of-Publication,Publisher,Image-URL-S,Image-URL-M,Image-URL-L,User-ID,Book-Rating,Location,Age
0,195153448,Classical Mythology,Mark P. O. Morford,2002,Oxford University Press,http://images.amazon.com/images/P/0195153448.0...,http://images.amazon.com/images/P/0195153448.0...,http://images.amazon.com/images/P/0195153448.0...,2,0,"stockton, california, usa",18.0
1,2005018,Clara Callan,Richard Bruce Wright,2001,HarperFlamingo Canada,http://images.amazon.com/images/P/0002005018.0...,http://images.amazon.com/images/P/0002005018.0...,http://images.amazon.com/images/P/0002005018.0...,8,5,"timmins, ontario, canada",
2,60973129,Decision in Normandy,Carlo D'Este,1991,HarperPerennial,http://images.amazon.com/images/P/0060973129.0...,http://images.amazon.com/images/P/0060973129.0...,http://images.amazon.com/images/P/0060973129.0...,8,0,"timmins, ontario, canada",
3,374157065,Flu: The Story of the Great Influenza Pandemic...,Gina Bari Kolata,1999,Farrar Straus Giroux,http://images.amazon.com/images/P/0374157065.0...,http://images.amazon.com/images/P/0374157065.0...,http://images.amazon.com/images/P/0374157065.0...,8,0,"timmins, ontario, canada",
4,393045218,The Mummies of Urumchi,E. J. W. Barber,1999,W. W. Norton &amp; Company,http://images.amazon.com/images/P/0393045218.0...,http://images.amazon.com/images/P/0393045218.0...,http://images.amazon.com/images/P/0393045218.0...,8,0,"timmins, ontario, canada",


In [13]:
# 전처리된 데이터 항목들
real_rating_dataset = dataset[dataset['Book-Rating'] != 0]
real_rating_dataset = real_rating_dataset.reset_index(drop = True)

In [14]:
real_rating_dataset.head()

Unnamed: 0,ISBN,Book-Title,Book-Author,Year-Of-Publication,Publisher,Image-URL-S,Image-URL-M,Image-URL-L,User-ID,Book-Rating,Location,Age
0,0002005018,Clara Callan,Richard Bruce Wright,2001,HarperFlamingo Canada,http://images.amazon.com/images/P/0002005018.0...,http://images.amazon.com/images/P/0002005018.0...,http://images.amazon.com/images/P/0002005018.0...,8,5,"timmins, ontario, canada",
1,074322678X,Where You'll Find Me: And Other Stories,Ann Beattie,2002,Scribner,http://images.amazon.com/images/P/074322678X.0...,http://images.amazon.com/images/P/074322678X.0...,http://images.amazon.com/images/P/074322678X.0...,8,5,"timmins, ontario, canada",
2,0887841740,The Middle Stories,Sheila Heti,2004,House of Anansi Press,http://images.amazon.com/images/P/0887841740.0...,http://images.amazon.com/images/P/0887841740.0...,http://images.amazon.com/images/P/0887841740.0...,8,5,"timmins, ontario, canada",
3,1552041778,Jane Doe,R. J. Kaiser,1999,Mira Books,http://images.amazon.com/images/P/1552041778.0...,http://images.amazon.com/images/P/1552041778.0...,http://images.amazon.com/images/P/1552041778.0...,8,5,"timmins, ontario, canada",
4,1567407781,The Witchfinder (Amos Walker Mystery Series),Loren D. Estleman,1998,Brilliance Audio - Trade,http://images.amazon.com/images/P/1567407781.0...,http://images.amazon.com/images/P/1567407781.0...,http://images.amazon.com/images/P/1567407781.0...,8,6,"timmins, ontario, canada",


In [15]:
df = real_rating_dataset.copy()
df.dropna(inplace=True)
df.reset_index(drop=True,inplace=True)
df.drop(columns=["Year-Of-Publication","Image-URL-S","Image-URL-M"],axis=1,inplace=True)
df.drop(index=df[df["Book-Rating"]==0].index,inplace=True)
df["Book-Title"]=df["Book-Title"].apply(lambda x: re.sub("[\W_]+"," ",x).strip())
df.head()

Unnamed: 0,ISBN,Book-Title,Book-Author,Publisher,Image-URL-L,User-ID,Book-Rating,Location,Age
0,786868716,The Five People You Meet in Heaven,Mitch Albom,Hyperion,http://images.amazon.com/images/P/0786868716.0...,11400,9,"ottawa, ontario, canada",49.0
1,151008116,Life of Pi,Yann Martel,Harcourt,http://images.amazon.com/images/P/0151008116.0...,11400,6,"ottawa, ontario, canada",49.0
2,312195516,The Red Tent Bestselling Backlist,Anita Diamant,Picador USA,http://images.amazon.com/images/P/0312195516.0...,11400,7,"ottawa, ontario, canada",49.0
3,316789089,The Pilot s Wife A Novel Tag Author of the Wei...,Anita Shreve,"Little, Brown",http://images.amazon.com/images/P/0316789089.0...,11400,7,"ottawa, ontario, canada",49.0
4,743418174,Good in Bed,Jennifer Weiner,Washington Square Press,http://images.amazon.com/images/P/0743418174.0...,11400,8,"ottawa, ontario, canada",49.0


### svd을 이용한 점수 예측 프로그램 설정

In [24]:
reader=Reader(rating_scale=(1,10))

In [25]:
data=Dataset.load_from_df(df[['User-ID', 'ISBN', 'Book-Rating']], reader=reader)
data

<surprise.dataset.DatasetAutoFolds at 0x1b13f6ee990>

In [26]:
svd = SVD(random_state=0)

In [27]:
cross_validate(svd, data, measures=['RMSE','MAE'], cv=5, verbose=True)

Evaluating RMSE, MAE of algorithm SVD on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    1.6054  1.6070  1.6100  1.6097  1.6181  1.6101  0.0044  
MAE (testset)     1.2350  1.2355  1.2389  1.2408  1.2439  1.2388  0.0033  
Fit time          3.08    3.61    3.60    3.62    3.59    3.50    0.21    
Test time         0.39    0.50    0.39    0.52    0.51    0.46    0.06    


{'test_rmse': array([1.60540728, 1.60703836, 1.61004916, 1.60969462, 1.6180806 ]),
 'test_mae': array([1.2349519 , 1.23552141, 1.23892389, 1.24079895, 1.24394659]),
 'fit_time': (3.07885479927063,
  3.60905122756958,
  3.603370428085327,
  3.6196742057800293,
  3.5933680534362793),
 'test_time': (0.39058494567871094,
  0.49994540214538574,
  0.39058566093444824,
  0.5155622959136963,
  0.5111532211303711)}

In [28]:
# 전체 데이터 학습 시키기
trainset = data.build_full_trainset()
svd.fit(trainset)

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x1b13f6efc50>

In [29]:
# 원하는 유저의 평가 항목 검색
df[df['User-ID']==11400]

Unnamed: 0,ISBN,Book-Title,Book-Author,Publisher,Image-URL-L,User-ID,Book-Rating,Location,Age
0,786868716,The Five People You Meet in Heaven,Mitch Albom,Hyperion,http://images.amazon.com/images/P/0786868716.0...,11400,9,"ottawa, ontario, canada",49.0
1,151008116,Life of Pi,Yann Martel,Harcourt,http://images.amazon.com/images/P/0151008116.0...,11400,6,"ottawa, ontario, canada",49.0
2,312195516,The Red Tent Bestselling Backlist,Anita Diamant,Picador USA,http://images.amazon.com/images/P/0312195516.0...,11400,7,"ottawa, ontario, canada",49.0
3,316789089,The Pilot s Wife A Novel Tag Author of the Wei...,Anita Shreve,"Little, Brown",http://images.amazon.com/images/P/0316789089.0...,11400,7,"ottawa, ontario, canada",49.0
4,743418174,Good in Bed,Jennifer Weiner,Washington Square Press,http://images.amazon.com/images/P/0743418174.0...,11400,8,"ottawa, ontario, canada",49.0
5,60958022,Five Quarters of the Orange,Joanne Harris,Perennial,http://images.amazon.com/images/P/0060958022.0...,11400,10,"ottawa, ontario, canada",49.0
6,446606324,The Winner,David Baldacci,Warner Vision,http://images.amazon.com/images/P/0446606324.0...,11400,8,"ottawa, ontario, canada",49.0
7,449003981,A Patchwork Planet Ballantine Reader s Circle,Anne Tyler,Ballantine Books,http://images.amazon.com/images/P/0449003981.0...,11400,7,"ottawa, ontario, canada",49.0
8,440222656,The Horse Whisperer,Nicholas Evans,Dell,http://images.amazon.com/images/P/0440222656.0...,11400,6,"ottawa, ontario, canada",49.0
9,375506039,Open House,Elizabeth Berg,Random House Trade,http://images.amazon.com/images/P/0375506039.0...,11400,8,"ottawa, ontario, canada",49.0


In [62]:
svd.predict(11400,'0786868716')

TypeError: AlgoBase.predict() missing 1 required positional argument: 'iid'

In [31]:
df.loc[0, 'Book-Title']

'The Five People You Meet in Heaven'

### 다른 방식의 머신러닝 기법을 이용한 유저 기반 평가 프로그램


In [64]:
# 다른 방식을 위한 데이터 전처리
df2=real_rating_dataset.copy()
df2.dropna(inplace=True)
df2.reset_index(drop=True,inplace=True)
df2.drop(columns=["ISBN","Year-Of-Publication","Image-URL-S","Image-URL-M"],axis=1,inplace=True)
df2.drop(index=df2[df2["Book-Rating"]==0].index,inplace=True)
df2["Book-Title"]=df2["Book-Title"].apply(lambda x: re.sub("[\W_]+"," ",x).strip())
df2.head()

Unnamed: 0,Book-Title,Book-Author,Publisher,Image-URL-L,User-ID,Book-Rating,Location,Age
0,The Five People You Meet in Heaven,Mitch Albom,Hyperion,http://images.amazon.com/images/P/0786868716.0...,11400,9,"ottawa, ontario, canada",49.0
1,Life of Pi,Yann Martel,Harcourt,http://images.amazon.com/images/P/0151008116.0...,11400,6,"ottawa, ontario, canada",49.0
2,The Red Tent Bestselling Backlist,Anita Diamant,Picador USA,http://images.amazon.com/images/P/0312195516.0...,11400,7,"ottawa, ontario, canada",49.0
3,The Pilot s Wife A Novel Tag Author of the Wei...,Anita Shreve,"Little, Brown",http://images.amazon.com/images/P/0316789089.0...,11400,7,"ottawa, ontario, canada",49.0
4,Good in Bed,Jennifer Weiner,Washington Square Press,http://images.amazon.com/images/P/0743418174.0...,11400,8,"ottawa, ontario, canada",49.0


In [55]:
new_df=df2[df2['User-ID'].map(df2['User-ID'].value_counts()) > 200]  # Drop users who vote less than 200 times.
users_pivot=new_df.pivot_table(index=["User-ID"],columns=["Book-Title"],values="Book-Rating")
users_pivot.fillna(0,inplace=True)
new_df["User-ID"].values
l=new_df["User-ID"].values
idxa=random.randint(1,len(l))

In [56]:
def users_choice(id):
    
    users_fav=new_df[new_df["User-ID"]==id].sort_values(["Book-Rating"],ascending=False)[0:5]
    return users_fav

In [57]:
def user_based(new_df,id):
    if id not in new_df["User-ID"].values:
        print("❌ User NOT FOUND ❌")
        
        
    else:
        index=np.where(users_pivot.index==id)[0][0]
        similarity=cosine_similarity(users_pivot)
        similar_users=list(enumerate(similarity[index]))
        similar_users = sorted(similar_users,key = lambda x:x[1],reverse=True)[0:5]
    
        user_rec=[]
    
        for i in similar_users:
                data=df[df["User-ID"]==users_pivot.index[i[0]]]
                user_rec.extend(list(data.drop_duplicates("User-ID")["User-ID"].values))
        
    return user_rec

In [58]:
def common(new_df,user,user_id):
    x=new_df[new_df["User-ID"]==user_id]
    recommend_books=[]
    user=list(user)
    for i in user:
        y=new_df[(new_df["User-ID"]==i)]
        books=y.loc[~y["Book-Title"].isin(x["Book-Title"]),:]
        books=books.sort_values(["Book-Rating"],ascending=False)[0:5]
        recommend_books.extend(books["Book-Title"].values)
        
    return recommend_books[0:5]

In [71]:
user_id=l[idxa]
user_choice_df=pd.DataFrame(users_choice(user_id))
user_favorite=users_choice(user_id)
n=len(user_choice_df["Book-Title"].values)
print("USER: {} ".format(user_id))
print()
    
print("랜덤 선택된 유저가 선호하는 책")
print()

for i in range(n):
    b_name=new_df.loc[new_df["Book-Title"]==user_choice_df["Book-Title"].tolist()[i],"Book-Title"][:1].values[0]
    b_url=new_df.loc[new_df["Book-Title"]==user_choice_df["Book-Title"].tolist()[i],"Image-URL-L"][:1].values[0]
    print(f'제목 {b_name}')
    print(f'표지: {b_url}\n')
    
print('\n\n')
user_based_rec=user_based(new_df,user_id)
books_for_user=common(new_df,user_based_rec,user_id)
books_for_userDF=pd.DataFrame(books_for_user,columns=["Book-Title"])

print("선호도 기반한 추천 책")
print()

for i in range(5):
    b_name=new_df.loc[new_df["Book-Title"]==books_for_userDF["Book-Title"].tolist()[i],"Book-Title"][:1].values[0]
    b_url=new_df.loc[new_df["Book-Title"]==books_for_userDF["Book-Title"].tolist()[i],"Image-URL-L"][:1].values[0]
    print(f'제목 {b_name}')
    print(f'표지: {b_url}\n')


USER: 16634 

랜덤 선택된 유저가 선호하는 책

제목 VideoHound s Golden Movie Retriever 2001
표지: http://images.amazon.com/images/P/1578591201.01.LZZZZZZZ.jpg

제목 The Haunting of Hill House
표지: http://images.amazon.com/images/P/0140071083.01.LZZZZZZZ.jpg

제목 Shattered Glass
표지: http://images.amazon.com/images/P/0515100552.01.LZZZZZZZ.jpg

제목 The Crook Factory
표지: http://images.amazon.com/images/P/0380789175.01.LZZZZZZZ.jpg

제목 The Silmarillion
표지: http://images.amazon.com/images/P/0345325818.01.LZZZZZZZ.jpg




선호도 기반한 추천 책

제목 Horror The 100 Best Books
표지: http://images.amazon.com/images/P/0786705523.01.LZZZZZZZ.jpg

제목 The Year s Best Fantasy and Horror Year s Best Fantasy amp Horror Paperback
표지: http://images.amazon.com/images/P/0312111029.01.LZZZZZZZ.jpg

제목 Night Screams Twenty Two Stories of Terror
표지: http://images.amazon.com/images/P/0451455126.01.LZZZZZZZ.jpg

제목 The Shining Signet Book
표지: http://images.amazon.com/images/P/0451092163.01.LZZZZZZZ.jpg

제목 The Other Boleyn Girl
표지: http://image