In [1]:
# 데이터 전처리를 수행하는 모듈
import pandas as pd
import os
from stopwords import stopwords_dict

# Work from the project directory. A raw string is used because '\s', '\P'
# and '\R' are not valid escape sequences in a normal string literal; the
# resulting path is unchanged.
os.chdir(r'D:\study\Python_JupyterNotebook\RecommendGame')


class DataPreprocessor:
    """Load the game CSV into ``self.df`` and clean it.

    Cleaning consists of filling missing titles, dropping rows that lack any
    required column, and substituting stopwords in the ``review`` column using
    the module-level ``stopwords_dict``.
    """

    # Rows with a missing value in any of these columns are dropped.
    REQUIRED_COLUMNS = ('price', 'req_min', 'req_rec', 'features', 'review')

    def __init__(self, csv_file_path):
        """Read ``csv_file_path`` with ``pandas.read_csv`` into ``self.df``."""
        self.df = pd.read_csv(csv_file_path)

    def preprocess_data(self):
        """Clean ``self.df`` in place.

        Missing titles become empty strings, rows missing any column listed in
        ``REQUIRED_COLUMNS`` are removed, and stopwords in ``review`` are
        replaced.
        """
        self.df['title'] = self.df['title'].fillna('')
        # One call is equivalent to dropping column by column.
        self.df.dropna(subset=list(self.REQUIRED_COLUMNS), inplace=True)

        self._replace_stopwords()

    def _replace_stopwords(self):
        """Replace each ``stopwords_dict`` key found in ``review`` by its value."""
        # Longest keys first: otherwise a short stopword that is a substring of
        # a longer one is replaced first and the longer entry can never match.
        # ``sorted`` is stable, so equal-length keys keep their dict order.
        ordered_pairs = sorted(
            stopwords_dict.items(), key=lambda pair: len(pair[0]), reverse=True
        )

        def replace_specific_stopwords(text):
            for stopword, replacement in ordered_pairs:
                if stopword in text:
                    text = text.replace(stopword, replacement)
            return text

        # astype(str) guards the substring test against non-string cells.
        self.df['review'] = self.df['review'].astype(str).apply(
            replace_specific_stopwords
        )

In [2]:
from sklearn.feature_extraction.text import TfidfVectorizer

class TextDataVectorizer:
    """Turn raw review text into TF-IDF feature matrices."""

    def vectorize_text_data(self, X_train, X_test):
        """Fit a ``TfidfVectorizer`` on ``X_train`` and transform both inputs.

        Returns:
            A tuple ``(train_matrix, test_matrix, fitted_vectorizer)``. The
            vectorizer is fitted on the training texts only and then reused to
            transform the test texts.
        """
        tfidf = TfidfVectorizer()
        train_matrix = tfidf.fit_transform(X_train)
        return train_matrix, tfidf.transform(X_test), tfidf

In [3]:
# Maps words found in reviews to the neutral text that replaces them.
# Each key appears exactly once (the original literal repeated three keys,
# which Python silently collapses); insertion order is unchanged.
stopwords_dict = {
    # Profanity -> '욕설입니다'
    '씨발': '욕설입니다', '좆': '욕설입니다', '시발': '욕설입니다', '좃': '욕설입니다',
    '새ㄲㅣ': '욕설입니다', '병신': '욕설입니다', '지x': '욕설입니다', '개새끼': '욕설입니다',
    '좆병신': '욕설입니다', '새끼': '욕설입니다', '씨2발': '욕설입니다', '족같은': '욕설입니다',
    '애미뒤진': '욕설입니다', '엄마없음': '욕설입니다', '애비': '욕설입니다',
    'ㅅ ㅣ ㅂ ㅏ': '욕설입니다', '개좆같고': '욕설입니다', '좆같다': '욕설입니다',
    'ㅅㅂ': '욕설입니다', 'ㅄ': '욕설입니다', 'ㅂㅅ': '욕설입니다', '개 애미': '욕설입니다',
    '개시발': '욕설입니다', '씹새끼들아': '욕설입니다', '새끼들': '욕설입니다',
    '개좆같은게임': '욕설입니다', '씨팔': '욕설입니다', '개 씨팔': '욕설입니다',
    '씹창': '욕설입니다', '아가리': '욕설입니다', '미친년': '욕설입니다',
    '미친년들아': '욕설입니다',
    # Sexual terms -> neutral descriptions
    '야스': '성행위', '자지': '남성의 생식기', '섹스': '성행위', '야스씬': '성행위',
    '보지': '여성의 생식기', '자위': '자기위로', '딸딸이': '자기위로', '고추': '남성의 생식기',
    '꼭지': '신체부위', '강간': '성범죄', '알몸': '맨몸', '누드': '맨몸',
    '뷰지': '여성의 생식기', '꼭띠': '신체부위',
    # Slurs about nationality -> neutral descriptions
    '짱개': '중국인유저', '짱깨': '중국인유저', '쪽바리': '일본인유저',
}

In [5]:
!pip install lightgbm

Collecting lightgbm
  Obtaining dependency information for lightgbm from https://files.pythonhosted.org/packages/e1/4c/4685ccfae9806f561de716e32549190c1f533dde5bcadaf83bdf23972cf0/lightgbm-4.3.0-py3-none-win_amd64.whl.metadata
  Downloading lightgbm-4.3.0-py3-none-win_amd64.whl.metadata (19 kB)
Downloading lightgbm-4.3.0-py3-none-win_amd64.whl (1.3 MB)
   ---------------------------------------- 0.0/1.3 MB ? eta -:--:--
   ---- ----------------------------------- 0.1/1.3 MB 4.2 MB/s eta 0:00:01
   ----------------------- ---------------- 0.8/1.3 MB 10.0 MB/s eta 0:00:01
   ---------------------------------------  1.3/1.3 MB 10.6 MB/s eta 0:00:01
   ---------------------------------------- 1.3/1.3 MB 10.6 MB/s eta 0:00:00
Installing collected packages: lightgbm
Successfully installed lightgbm-4.3.0


In [6]:
#%% 게임 추천 로직을 담당하는 모듈
import pandas as pd
from sklearn.model_selection import train_test_split
from lightgbm import LGBMClassifier
from text_vectorizer import TextDataVectorizer
from data_preprocessor import DataPreprocessor
from stopwords import stopwords_dict
class RecommendationLogic:
    """Recommend games by scoring their reviews with a trained classifier.

    ``self.preprocessor`` is a ``DataPreprocessor`` whose ``df`` attribute is
    the frame recommendations are drawn from. ``recommend_games`` reads the
    columns ``tag``, ``review``, ``title``, ``appID`` and ``price`` from it.
    """

    def __init__(self, csv_file_path):
        """Create the ``DataPreprocessor`` for ``csv_file_path``."""
        self.preprocessor = DataPreprocessor(csv_file_path)

    def train_lgbm_model(self, X_train_vectorized, y_train):
        """Fit an ``LGBMClassifier`` with default settings and return it."""
        lgbm_model = LGBMClassifier()
        lgbm_model.fit(X_train_vectorized, y_train)
        return lgbm_model

    def _replace_stopwords(self, text, replacement_dict):
        """Return ``text`` with every key of ``replacement_dict`` replaced by its value.

        Keys are applied longest first so that a short key which is a
        substring of a longer key cannot pre-empt it. Keys of equal length
        keep their dict order (``sorted`` is stable).
        """
        ordered_pairs = sorted(
            replacement_dict.items(), key=lambda pair: len(pair[0]), reverse=True
        )
        for stopword, replacement in ordered_pairs:
            if stopword in text:
                text = text.replace(stopword, replacement)
        return text

    @staticmethod
    def _as_list(value):
        """Normalise ``None`` / a scalar / an iterable to a list.

        A lone string is wrapped rather than iterated character by character.
        """
        if value is None:
            return []
        if isinstance(value, (str, bytes)) or not hasattr(value, '__iter__'):
            return [value]
        return list(value)

    @staticmethod
    def _extract_original_price(price):
        """Return the pre-discount price text, or ``'N/A'`` for non-string input.

        For multi-line price text the second-to-last line is used; for a
        single line, that line. The result is stripped of whitespace.
        """
        if not isinstance(price, str):
            return 'N/A'
        parts = price.split('\n')
        chosen = parts[-2] if len(parts) > 1 else parts[0]
        return chosen.strip()

    @staticmethod
    def _price_within_limit(original_price, max_price):
        """Tell whether ``original_price`` passes the ``max_price`` filter.

        ``'Free'``, ``'N/A'`` and ``max_price is None`` always pass. Text that
        cannot be parsed as a number after removing '₩' and ',' also passes.
        """
        if max_price is None or original_price in ('Free', 'N/A'):
            return True
        try:
            amount = float(original_price.replace('₩', '').replace(',', ''))
        except ValueError:
            # NOTE(review): non-numeric price text used to abort the whole
            # request with ValueError. It is now treated like an unknown
            # price ('N/A') and accepted -- confirm this is the wanted policy.
            return True
        return amount <= max_price

    def recommend_games(self, liked_genres, disliked_genres, model, vectorizer, num_recommendations, max_price, min_review_length, tried_games):
        """Return a one-column ``appID`` DataFrame of recommended games.

        Args:
            liked_genres: Genre strings; a game's ``tag`` must contain all of them.
            disliked_genres: Genre strings; a game's ``tag`` must contain none.
            model: Object with ``predict_proba``; column 1 of its output is
                used as the recommendation score.
            vectorizer: Object with ``transform``, applied to the review text.
            num_recommendations: Maximum number of games returned
                (``None`` means no limit).
            max_price: Upper price bound, or ``None`` to skip the price filter.
            min_review_length: Minimum review length in characters.
            tried_games: appIDs to exclude; compared as strings.

        Returns:
            DataFrame with the single column ``appID``, ordered by descending
            score. It is empty (but still has the column) when nothing matches.
        """
        no_results = pd.DataFrame(columns=['appID'])

        liked = self._as_list(liked_genres)
        disliked = self._as_list(disliked_genres)
        tried = [str(game_id) for game_id in self._as_list(tried_games)]

        genre_games = self.preprocessor.df.copy()

        # Genres are matched literally (regex=False) and rows with a missing
        # tag count as "does not contain" (na=False).
        for liked_genre in liked:
            mask = genre_games['tag'].str.contains(liked_genre, regex=False, na=False)
            genre_games = genre_games[mask]

        for disliked_genre in disliked:
            mask = genre_games['tag'].str.contains(disliked_genre, regex=False, na=False)
            genre_games = genre_games[~mask]

        if genre_games.empty:
            return no_results

        # Keep only games whose review reaches the minimum length.
        genre_games = genre_games[genre_games['review'].str.len() >= min_review_length]
        if genre_games.empty:
            return no_results

        # Own the filtered frame so the column assignments below are not made
        # on a slice of another frame.
        genre_games = genre_games.copy()

        # Score every remaining review.
        genre_games_vectorized = vectorizer.transform(genre_games['review'])
        genre_games['predicted_probability'] = model.predict_proba(genre_games_vectorized)[:, 1]

        # Substitute stopwords in the review text.
        genre_games['review'] = genre_games['review'].apply(
            lambda review: str(self._replace_stopwords(review, stopwords_dict))
        )

        # Best score first, scores above 0.5 only, one row per title.
        recommended_games = (
            genre_games[genre_games['predicted_probability'] > 0.5]
            .sort_values(by='predicted_probability', ascending=False)
            .drop_duplicates(subset=['title'])
        )

        # Remove games the user has already tried.
        if tried:
            already_tried = recommended_games['appID'].astype(str).isin(tried)
            recommended_games = recommended_games[~already_tried]

        selected_games = []
        for _, game in recommended_games.iterrows():
            # Checked before picking so a limit of 0 yields nothing.
            if num_recommendations is not None and len(selected_games) >= num_recommendations:
                break

            original_price = self._extract_original_price(game['price'])
            # The review length is re-checked because stopword substitution
            # may have changed it.
            if (self._price_within_limit(original_price, max_price)
                    and len(game['review']) >= min_review_length):
                game['price'] = original_price
                selected_games.append(game)

        if not selected_games:
            return no_results
        return pd.DataFrame(selected_games)[['appID']]


In [7]:
#%% main
from flask import Flask, request, jsonify
from sklearn.model_selection import train_test_split
from recommendation_logic import RecommendationLogic
from text_vectorizer import TextDataVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
import os


# Switch to the directory that holds the crawled data file. A raw string is
# used because '\s', '\P' and '\R' are not valid escape sequences in a normal
# string literal; the resulting path is unchanged.
os.chdir(r'D:\study\Python_JupyterNotebook\RecommendGame')



app = Flask(__name__)

# Model and vectorizer are initialised once, at module level.
csv_file_path = '스팀 크롤링 데이터(2024-01-13)전체.csv'
recommendation_logic = RecommendationLogic(csv_file_path)
text_vectorizer = TextDataVectorizer()

# Preprocessing:
# rows without price, req_min, req_rec, features or review are dropped, and
# stopwords in the review text are replaced by other words.
recommendation_logic.preprocessor.preprocess_data()

# Split the data into a training set and a test set (80% / 20%, fixed seed).
X_train, X_test, y_train, y_test = train_test_split(
    recommendation_logic.preprocessor.df[['title', 'appID', 'price', 'tag', 'review']].astype(str),
    recommendation_logic.preprocessor.df['label'],
    test_size=0.2,
    random_state=42
)

# Vectorize the review text of both sets so it can be used as numeric
# features; only the 'review' column is fed to the model.
X_train_vectorized, X_test_vectorized, vectorizer = text_vectorizer.vectorize_text_data(X_train['review'], X_test['review'])

# Train the LightGBM model on the vectorized training reviews.
lgbm_model = recommendation_logic.train_lgbm_model(X_train_vectorized, y_train)

# 스프링부트 에서 json데이터 받아와서 처리하는 부분
# Fixed parameters passed to recommend_games for every request.
NUM_RECOMMENDATIONS = 10      # number of games to recommend
MAX_PRICE = 100000            # maximum price (won)
MIN_REVIEW_LENGTH = 1000      # minimum review length in characters


# Receives JSON from the Spring Boot side and answers with recommendations.
@app.route('/', methods=['POST'])
def get_recommendations():
    """Handle ``POST /``: return recommended appIDs as a comma-separated string.

    The JSON body may contain the keys ``'Want Category'`` (liked genres),
    ``'Hate Category'`` (disliked genres) and ``'Try Game'`` (already tried
    games). On any failure a JSON object ``{"error": <message>}`` is returned.
    """
    def as_list(value):
        # null -> [], lone scalar -> [scalar]; a lone string is not split
        # into characters.
        if value is None:
            return []
        if isinstance(value, (list, tuple, set)):
            return list(value)
        return [value]

    try:
        # get_json() is called without force=True, so the request must carry
        # a JSON Content-Type.
        request_data = request.get_json()
        if not isinstance(request_data, dict):
            raise ValueError('Request body must be a JSON object')

        liked_genres = as_list(request_data.get('Want Category'))     # liked genres
        disliked_genres = as_list(request_data.get('Hate Category'))  # disliked genres
        # recommend_games compares appIDs as strings, so numeric ids from
        # the JSON body are stringified here.
        tried_games = [str(game_id) for game_id in as_list(request_data.get('Try Game'))]

        # Run the recommendation.
        recommendations = recommendation_logic.recommend_games(
            liked_genres,
            disliked_genres,
            lgbm_model,
            vectorizer,
            NUM_RECOMMENDATIONS, MAX_PRICE, MIN_REVIEW_LENGTH,
            tried_games
        )

        # Respond with the appIDs joined by commas.
        result_text = ','.join(recommendations['appID'].astype(str))
        print("Recommendations:", result_text)
        return result_text

    except Exception as e:
        # NOTE(review): errors are still answered with HTTP 200 so existing
        # clients keep working -- consider a 4xx/5xx status once the client
        # is known to handle it.
        error_message = {'error': str(e)}
        print("Error:", error_message)
        return jsonify(error_message)


[LightGBM] [Info] Number of positive: 7973, number of negative: 6555
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.070634 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 88790
[LightGBM] [Info] Number of data points in the train set: 14528, number of used features: 3755
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.548802 -> initscore=0.195833
[LightGBM] [Info] Start training from score 0.195833


In [8]:
if __name__ == '__main__':
    # Start the Flask server on port 5000, listening on all interfaces
    # (0.0.0.0) rather than only on localhost.
    app.run(port=5000, host='0.0.0.0')

 * Serving Flask app '__main__'
 * Debug mode: off


 * Running on all addresses (0.0.0.0)
 * Running on http://127.0.0.1:5000
 * Running on http://211.208.84.145:5000
Press CTRL+C to quit
127.0.0.1 - - [11/Mar/2024 18:45:54] "POST / HTTP/1.1" 200 -


Recommendations: 1372810,518030,1551360,564230,518790,678950,495570,359550,2373390,1222680
