# Objective
- 게임설명이 비슷한 게임들을 추천하는 시스템 제작

### Data Load

In [2]:
import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
from sklearn.tree import DecisionTreeClassifier


import tensorflow as tf
from sklearn.metrics.pairwise import linear_kernel
from sentence_transformers import SentenceTransformer




- Sentence-BERT 모델은 BERT를 기반으로 sentence embedding vector를 생성하는 모델.
	BERT 모델은 문장 내 모든 단어들을 참고한 문맥을 가진 임베딩 벡터를 만드는 모델이며 문장의 의미를 추출하는데 강점임.

In [3]:
# 라이브러리
'''

SentenceTransformer는 HuggingFace 라이브러리, 모델을 다운받아 사용하기 위한 라이브러리임.
linear_kernel는 여러 문장과 문장 사이의 코사인 유사도를 측정할 때 사용하는 라이브러리임.

'''



'\n\nSentenceTransformer는 HuggingFace 라이브러리, 모델을 다운받아 사용하기 위한 라이브러리임.\nlinear_kernel는 여러 문장과 문장 사이의 코사인 유사도를 측정할 때 사용하는 라이브러리임.\n\n'

In [4]:
file_path = 'D:\Code\DataSets/games.csv'

df = pd.read_csv(file_path)

print(df)

         AppID                                    Name  Release date  \
0        20200                        Galactic Bowling  Oct 21, 2008   
1       655370                            Train Bandit  Oct 12, 2017   
2      1732930                            Jolt Project  Nov 17, 2021   
3      1355720                                Henosis™  Jul 23, 2020   
4      1139950                   Two Weeks in Painland   Feb 3, 2020   
...        ...                                     ...           ...   
71711  2159120                                    SUR5   Apr 1, 2023   
71712  2125160                           Prison Life 2  Mar 30, 2023   
71713  2318860                  Architecture Zeitgeist  Mar 27, 2023   
71714  2180330  Girl's Way to Survive 2: Bloody Museum   Apr 2, 2023   
71715  2352890                            Hentai Ariel  Mar 30, 2023   

      Estimated owners  Peak CCU  Required age  Price  DLC count  \
0            0 - 20000         0             0  19.99          0   

### Pre-processing

In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 71716 entries, 0 to 71715
Data columns (total 39 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   AppID                       71716 non-null  int64  
 1   Name                        71715 non-null  object 
 2   Release date                71716 non-null  object 
 3   Estimated owners            71716 non-null  object 
 4   Peak CCU                    71716 non-null  int64  
 5   Required age                71716 non-null  int64  
 6   Price                       71716 non-null  float64
 7   DLC count                   71716 non-null  int64  
 8   About the game              69280 non-null  object 
 9   Supported languages         71716 non-null  object 
 10  Full audio languages        71716 non-null  object 
 11  Reviews                     9167 non-null   object 
 12  Header image                71716 non-null  object 
 13  Website                     350

In [6]:
data = df.loc[:,['Name','About the game']]

print(data)

                                         Name  \
0                            Galactic Bowling   
1                                Train Bandit   
2                                Jolt Project   
3                                    Henosis™   
4                       Two Weeks in Painland   
...                                       ...   
71711                                    SUR5   
71712                           Prison Life 2   
71713                  Architecture Zeitgeist   
71714  Girl's Way to Survive 2: Bloody Museum   
71715                            Hentai Ariel   

                                          About the game  
0      Galactic Bowling is an exaggerated and stylize...  
1      THE LAW!! Looks to be a showdown atop a train....  
2      Jolt Project: The army now has a new robotics ...  
3      HENOSIS™ is a mysterious 2D Platform Puzzler w...  
4      ABOUT THE GAME Play as a hacker who has arrang...  
...                                                  ... 

### 결측치 확인 및 제거

In [7]:
print(f'각 컬럼별 누락 삭제 전 데이터의 수 : \n{data.isnull().sum()}')

# 누락 데이터 존재.

print('*'*80)

cleaned_df = data.dropna()
# 제거할 게 많을 땐 컬럼지정해서 dropna(subset)

print(f'각 컬럼별 누락 삭제 후 데이터의 수 : \n{cleaned_df.isnull().sum()}')


각 컬럼별 누락 삭제 전 데이터의 수 : 
Name                 1
About the game    2436
dtype: int64
********************************************************************************
각 컬럼별 누락 삭제 후 데이터의 수 : 
Name              0
About the game    0
dtype: int64


### 결측치 제거 후 인덱스 재설정

In [8]:

print(cleaned_df)

cleaned_df.reset_index(drop=True, inplace=True)

print('*'*80)

print(cleaned_df)

                                         Name  \
0                            Galactic Bowling   
1                                Train Bandit   
2                                Jolt Project   
3                                    Henosis™   
4                       Two Weeks in Painland   
...                                       ...   
71711                                    SUR5   
71712                           Prison Life 2   
71713                  Architecture Zeitgeist   
71714  Girl's Way to Survive 2: Bloody Museum   
71715                            Hentai Ariel   

                                          About the game  
0      Galactic Bowling is an exaggerated and stylize...  
1      THE LAW!! Looks to be a showdown atop a train....  
2      Jolt Project: The army now has a new robotics ...  
3      HENOSIS™ is a mysterious 2D Platform Puzzler w...  
4      ABOUT THE GAME Play as a hacker who has arrang...  
...                                                  ... 

### 데이터 간소화 및 컬럼이름 정리
- 코사인 유사도 측정을 하는데에 있어서 문장들의 데이터가 너무 많으면
- 메모리 부족으로 오류가 발생함
- 컬럼의 띄어쓰기 된 공백을 _(underbar)로 처리

In [9]:
# 데이터 간소화
summary_df = cleaned_df.iloc[0:25000, :]

print(summary_df)

                                  Name  \
0                     Galactic Bowling   
1                         Train Bandit   
2                         Jolt Project   
3                             Henosis™   
4                Two Weeks in Painland   
...                                ...   
24995                   Hexen Hegemony   
24996  Bo's Adventure: Merging Islands   
24997                           5Rings   
24998     Disciples III: Reincarnation   
24999  Don't Be Afraid - The First Toy   

                                          About the game  
0      Galactic Bowling is an exaggerated and stylize...  
1      THE LAW!! Looks to be a showdown atop a train....  
2      Jolt Project: The army now has a new robotics ...  
3      HENOSIS™ is a mysterious 2D Platform Puzzler w...  
4      ABOUT THE GAME Play as a hacker who has arrang...  
...                                                  ...  
24995  The witches, who date back to the dark times o...  
24996  Bo's Adventure: 

In [10]:
# 컬럼 이름 정리

summary_df.rename(columns = {'About the game' : 'About_the_game'}, inplace = True)

print(summary_df)

                                  Name  \
0                     Galactic Bowling   
1                         Train Bandit   
2                         Jolt Project   
3                             Henosis™   
4                Two Weeks in Painland   
...                                ...   
24995                   Hexen Hegemony   
24996  Bo's Adventure: Merging Islands   
24997                           5Rings   
24998     Disciples III: Reincarnation   
24999  Don't Be Afraid - The First Toy   

                                          About_the_game  
0      Galactic Bowling is an exaggerated and stylize...  
1      THE LAW!! Looks to be a showdown atop a train....  
2      Jolt Project: The army now has a new robotics ...  
3      HENOSIS™ is a mysterious 2D Platform Puzzler w...  
4      ABOUT THE GAME Play as a hacker who has arrang...  
...                                                  ...  
24995  The witches, who date back to the dark times o...  
24996  Bo's Adventure: 

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  summary_df.rename(columns = {'About the game' : 'About_the_game'}, inplace = True)


## 영어 문장 유사도 모델

### Model Generation

In [11]:
# 영어 문장 SentenceBERT 모델 생성
model_name = 'sentence-transformers/all-MiniLM-L12-v2'
en_model = SentenceTransformer(model_name)

### Sentence Embedding Vector Generation
- sentences 변수에 summary_df.About_the_game.values 게임설명을 series가 아닌 values값으로 저장.
- 저장한 값을 영어 문장 모델을 이용해 문장 임베딩 벡터를 생성
- 

In [12]:
# 문장 임베딩 벡터 생성

sentences = summary_df.About_the_game.values

print(f'About_the_game 컬럼에서 추출한 값 확인 : \n{sentences}')

print('*'*80)

# model_encoder() 함수 사용
embeddings = en_model.encode(sentences)

# 결과 확인하기
print(f'생성된 문장 임베딩 행렬의 모양 : {embeddings.shape}')

print('*'*80)

print(f'문장 1개의 임베딩 벡터의 모양 : {embeddings[0].shape}')

About_the_game 컬럼에서 추출한 값 확인 : 
['Galactic Bowling is an exaggerated and stylized bowling game with an intergalactic twist. Players will engage in fast-paced single and multi-player competition while being submerged in a unique new universe filled with over-the-top humor, wild characters, unique levels, and addictive game play. The title is aimed at players of all ages and skill sets. Through accessible and intuitive controls and game-play, Galactic Bowling allows you to jump right into the action. A single-player campaign and online play allow you to work your way up the ranks of the Galactic Bowling League! Whether you have hours to play or only a few minutes, Galactic Bowling is a fast paced and entertaining experience that will leave you wanting more! Full Single-player story campaign including 11 Characters and Environments. 2 Single-player play modes including Regular and Battle Modes. Head to Head Online Multiplayer play Modes. Super Powers, Special Balls, and Whammies. Unlockab

In [13]:
# Saving Sentences embedding vector 

np.save('D:\Code\스팀게임 추천 시스템/Sentences_Embedding_Vectors.npy', embeddings)

##### 
- 문장 임베딩 벡터를 생성할 때 소요시간이 있어 재실행 했을 때 다시 반복하는 
- 시간을 방지하기 위해 np.save('저장경로.npy', embeddings) 문장 임베딩 벡터를 npy 확장명으로 저장

In [14]:
# 저장된 numpy 배열(임베딩 행렬) 불러오기

file_path = 'D:\Code\스팀게임 추천 시스템/Sentences_Embedding_Vectors.npy'

embeddings = np.load(file_path)

print(f'문장 임베딩 행렬의 모양 : {embeddings.shape}')

문장 임베딩 행렬의 모양 : (25000, 384)


## 코사인 유사도 측정

In [17]:
# linear_kernel() 함수 --> 게임 설명 각각에 대해서 코사인 유사도 측정
# 게임 설명 간의 코사인 유사도 계산 --> 유사도 행렬(자료형 : DataFrame) 생성
sim_matrix = linear_kernel(embeddings, embeddings)

print(f'유사도 측정의 결과 : \n{sim_matrix}')

print('*'*80)

print(f'유사도 측정의 결과 값의 모양 : \n{sim_matrix.shape}')

print('*'*80)

print(f'index[0]에 대한 유사도 측정의 결과 : \n{sim_matrix[:, 0]}')

MemoryError: Unable to allocate 2.33 GiB for an array with shape (25000, 25000) and data type float32