In [None]:
# %pip installs into the environment of the running kernel (a bare !pip may hit a different interpreter).
# The version is pinned to the release recorded in this cell's output so re-runs stay reproducible.
%pip install sentence-transformers==2.7.0

Collecting sentence-transformers
  Downloading sentence_transformers-2.7.0-py3-none-any.whl (171 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/171.5 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[90m╺[0m[90m━[0m [32m163.8/171.5 kB[0m [31m4.9 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m171.5/171.5 kB[0m [31m4.2 MB/s[0m eta [36m0:00:00[0m
Collecting nvidia-cuda-nvrtc-cu12==12.1.105 (from torch>=1.11.0->sentence-transformers)
  Using cached nvidia_cuda_nvrtc_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (23.7 MB)
Collecting nvidia-cuda-runtime-cu12==12.1.105 (from torch>=1.11.0->sentence-transformers)
  Using cached nvidia_cuda_runtime_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (823 kB)
Collecting nvidia-cuda-cupti-cu12==12.1.105 (from torch>=1.11.0->sentence-transformers)
  Using cached nvidia_cuda_cupti_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (

In [None]:
# %pip installs into the environment of the running kernel (a bare !pip may hit a different interpreter).
# sentence-transformers is pinned to the release recorded in the install output earlier in this notebook.
%pip install numpy sentence-transformers==2.7.0



In [None]:
from sentence_transformers import SentenceTransformer
import pandas as pd
import numpy as np

def load_stopwords(filepath):
    """Read a stopword list from a UTF-8 text file.

    Every line of the file contributes one entry, with surrounding
    whitespace (including the newline) stripped.

    Args:
        filepath: Path of the stopword file.

    Returns:
        A set holding one stripped string per line of the file.
    """
    with open(filepath, mode='r', encoding='utf-8') as handle:
        return {entry.strip() for entry in handle}

def preprocess_text(text, stopwords):
    """Drop stopwords from a whitespace-delimited string.

    Args:
        text: String to filter; it is tokenised with ``str.split()``.
        stopwords: Container of tokens to remove (membership is tested with ``in``).

    Returns:
        The surviving tokens, in their original order, joined by single spaces.
    """
    kept_tokens = []
    for token in text.split():
        if token in stopwords:
            continue
        kept_tokens.append(token)
    return ' '.join(kept_tokens)

def load_data(stopwords):
    """Load the three webtoon CSV files, merge them and build a cleaned ``content`` column.

    The CSVs are read from the current working directory, concatenated in a
    fixed order and de-duplicated on ``title`` (first occurrence wins). The
    surviving rows keep their index labels from the concatenated frame, so the
    index may contain gaps.

    ``content`` is ``genre``, ``keywords`` and ``synopsis`` joined by spaces,
    with stopwords removed by ``preprocess_text``. A missing value in any of
    the three fields is treated as an empty string.

    Args:
        stopwords: Container of tokens to strip from ``content``.

    Returns:
        The merged DataFrame with the extra ``content`` column.
    """
    csv_paths = (
        '네이버웹툰_매일플러스_최종.csv',
        '네이버웹툰_요일연재_최종.csv',
        '네이버웹툰_완결_최종.csv',
    )
    combined_df = pd.concat([pd.read_csv(path) for path in csv_paths], ignore_index=True)
    # .copy() detaches the de-duplicated rows so adding a column below cannot
    # raise SettingWithCopyWarning.
    combined_df = combined_df.drop_duplicates(subset=['title']).copy()
    # Without this, one missing field turns the whole concatenation into NaN and
    # preprocess_text then fails on a float instead of a string.
    text_parts = combined_df[['genre', 'keywords', 'synopsis']].fillna('').astype(str)
    raw_content = text_parts['genre'] + ' ' + text_parts['keywords'] + ' ' + text_parts['synopsis']
    combined_df['content'] = raw_content.apply(lambda text: preprocess_text(text, stopwords))
    return combined_df

def generate_embeddings(df, model_name='sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2',
                        output_path='embeddings.npy'):
    """Encode the ``content`` column into sentence embeddings and save them to disk.

    The texts are passed to the model in the positional row order of ``df``;
    the saved array carries no titles or index labels, so anything that loads
    it later must rebuild the DataFrame with the same rows in the same order.

    Args:
        df: DataFrame with a ``content`` column of strings.
        model_name: Model identifier handed to ``SentenceTransformer``.
        output_path: Destination passed to ``numpy.save`` (numpy appends
            ``.npy`` when the name lacks that extension).

    Returns:
        The value returned by ``model.encode`` for the content list.
    """
    model = SentenceTransformer(model_name)
    embeddings = model.encode(df['content'].tolist(), show_progress_bar=True)
    np.save(output_path, embeddings)
    return embeddings

def main_embedding():
    """Entry point: load stopwords and data, then generate and save the embeddings."""
    corpus_df = load_data(load_stopwords('stopword.txt'))
    generate_embeddings(corpus_df)
    print("Embeddings have been generated and saved.")

if __name__ == "__main__":
    main_embedding()


Batches:   0%|          | 0/79 [00:00<?, ?it/s]

Embeddings have been generated and saved.


In [None]:
import numpy as np

# Read back the embedding matrix that was written to disk
embeddings_path = 'embeddings.npy'
embeddings = np.load(file=embeddings_path)

# Cell output: the matrix dimensions together with its first five rows
(embeddings.shape, embeddings[0:5])

((2501, 384),
 array([[ 0.03432035,  0.18982615,  0.05060954, ...,  0.10471968,
          0.07469204, -0.01833572],
        [ 0.13395727,  0.16760576, -0.14585848, ...,  0.07601216,
          0.07312716,  0.17817248],
        [ 0.14279455,  0.06038475,  0.04519264, ...,  0.07557794,
          0.05012708, -0.04446648],
        [ 0.04463439,  0.0131699 , -0.00488237, ..., -0.16130285,
         -0.04650046, -0.13270542],
        [-0.00096571,  0.05482993,  0.00159284, ..., -0.21356633,
          0.02937155, -0.10569926]], dtype=float32))

In [None]:
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

def load_data_recommendation():
    """Load the three webtoon CSV files and merge them for the recommendation step.

    The CSVs are read from the current working directory, concatenated in a
    fixed order and de-duplicated on ``title`` (first occurrence wins). The
    surviving rows keep their index labels from the concatenated frame, so the
    index may contain gaps. The row order must match the order used when
    ``embeddings.npy`` was produced, because embeddings are matched to rows by
    position.

    ``content`` is ``genre``, ``keywords`` and ``synopsis`` joined by single
    spaces, with no stopword removal. A missing field is treated as an empty
    string, so its separator space is still present.

    Returns:
        The merged DataFrame with the extra ``content`` column.
    """
    csv_paths = (
        '네이버웹툰_매일플러스_최종.csv',
        '네이버웹툰_요일연재_최종.csv',
        '네이버웹툰_완결_최종.csv',
    )
    combined_df = pd.concat([pd.read_csv(path) for path in csv_paths], ignore_index=True)
    # .copy() detaches the de-duplicated rows so adding a column below cannot
    # raise SettingWithCopyWarning.
    combined_df = combined_df.drop_duplicates(subset=['title']).copy()
    # Without this, one missing field would make the whole content value NaN.
    text_parts = combined_df[['genre', 'keywords', 'synopsis']].fillna('').astype(str)
    combined_df['content'] = text_parts['genre'] + ' ' + text_parts['keywords'] + ' ' + text_parts['synopsis']
    return combined_df

def load_embeddings():
    """Load the embedding matrix stored in ``embeddings.npy``.

    Returns:
        The array read by ``numpy.load`` from the current working directory.
    """
    stored_embeddings = np.load('embeddings.npy')
    return stored_embeddings


def recommend_webtoons(df, embeddings, selected_indices, top_n=3):
    """Recommend the most similar webtoons for each selected webtoon.

    Similarity is the cosine similarity between the selected row's embedding
    and every row of ``embeddings``. Embeddings are matched to ``df`` by
    position (row i of ``embeddings`` belongs to the i-th row of ``df``),
    while ``selected_indices`` and the index of the result are ``df`` index
    labels. The two differ whenever the index has gaps.

    Args:
        df: DataFrame with a ``title`` column and a unique index.
        embeddings: 2-D array with one row per row of ``df``.
        selected_indices: Iterable of ``df`` index labels the user picked.
        top_n: Maximum number of recommendations per selected webtoon.

    Returns:
        DataFrame with columns ``recommended_for``, ``title`` and
        ``similarity_score``, indexed by the recommended rows' ``df`` labels.
        For each selection it holds up to ``top_n`` rows, highest score first.
        It is empty when nothing was selected.

    Raises:
        ValueError: If ``embeddings`` and ``df`` have different row counts.
        KeyError: If a selected label is not in ``df.index``.
    """
    # Materialise once: accepts any iterable (tuple, Series, ndarray) and is reused below.
    selected_indices = list(selected_indices)
    result_columns = ['recommended_for', 'title', 'similarity_score']
    if not selected_indices:
        return pd.DataFrame(columns=result_columns)

    if len(embeddings) != len(df):
        raise ValueError(
            f"embeddings has {len(embeddings)} rows but df has {len(df)}; "
            "they must come from the same data in the same order."
        )

    per_selection = []
    for index in selected_indices:
        # Translate the index label into the row position used by the embedding matrix.
        position = df.index.get_loc(index)
        selected_embedding = embeddings[position].reshape(1, -1)
        cosine_similarities = cosine_similarity(selected_embedding, embeddings).flatten()
        selected_title = df.loc[index, 'title']

        recommendations_df = pd.DataFrame({
            'recommended_for': selected_title,
            'title': df['title'],  # carries df's index labels into the result
            'similarity_score': cosine_similarities
        })
        ranked = recommendations_df.sort_values(by='similarity_score', ascending=False)

        # Drop every selected webtoon (which includes this one) by label, and as a
        # safeguard anything with the same title. Only labels are compared here:
        # a positional value would remove an unrelated row. Filtering happens
        # before head(), so the title check cannot shrink the result below top_n.
        keep_mask = ~ranked.index.isin(selected_indices) & (ranked['title'] != selected_title).to_numpy()
        per_selection.append(ranked[keep_mask].head(top_n))

    # One concat at the end avoids repeatedly copying a growing frame.
    return pd.concat(per_selection)


def display_webtoons(df, num_to_display=30, random_state=25):
    """Print a sample of webtoons as a numbered menu and return the sampled rows.

    Each printed line shows a 1-based menu number, the title, the genre and
    the row's original ``df`` index label.

    Args:
        df: DataFrame with ``title`` and ``genre`` columns.
        num_to_display: Number of rows to sample. It is capped at ``len(df)``
            so a small DataFrame no longer makes ``sample`` raise.
        random_state: Seed passed to ``DataFrame.sample``. The default of 25
            gives the same menu on every run; pass ``None`` for a fresh sample.

    Returns:
        The sampled rows with a fresh 0-based index. The original index labels
        are kept in the column created by ``reset_index`` (named ``index``
        when the source index is unnamed).
    """
    sample_size = min(num_to_display, len(df))
    subset = df.sample(n=sample_size, random_state=random_state).reset_index(drop=False)
    print("Randomly selected webtoons:\n")
    for idx, row in subset.iterrows():
        print(f"{idx + 1}: {row['title']} - {row['genre']} (Index: {row['index']})")
    return subset


# Turn the raw menu input into validated, de-duplicated 0-based positions.
def _parse_selection(raw_text, max_choice):
    """Parse a comma-separated string of 1-based menu numbers.

    Args:
        raw_text: Text typed by the user, e.g. ``"1, 2, 5"``.
        max_choice: Largest valid menu number.

    Returns:
        List of unique 0-based positions in the order they were entered.

    Raises:
        ValueError: If a token is not an integer, is outside
            ``1..max_choice``, or no number was entered at all.
    """
    positions = []
    for token in raw_text.split(','):
        token = token.strip()
        if not token:
            continue  # tolerate trailing or doubled commas
        try:
            number = int(token)
        except ValueError:
            raise ValueError(f"'{token}' is not a whole number.") from None
        # Reject 0 and negatives explicitly: iloc would otherwise wrap them
        # around to the end of the menu.
        if not 1 <= number <= max_choice:
            raise ValueError(f"{number} is outside the valid range 1-{max_choice}.")
        if number - 1 not in positions:
            positions.append(number - 1)
    if not positions:
        raise ValueError("No webtoon numbers were entered.")
    return positions


# Run the recommender using the original DataFrame index labels
def main_recommendation():
    """Interactive entry point: show a menu, read the user's picks and print recommendations.

    Loads the data and the saved embeddings, prints a sample of webtoons,
    reads a comma-separated list of menu numbers from standard input, and
    prints the selected webtoons followed by the recommendations for each.
    An invalid selection is reported and ends the run without a traceback.
    """
    df = load_data_recommendation()
    embeddings = load_embeddings()
    subset = display_webtoons(df)

    print("\nEnter the numbers of your favorite webtoons (e.g., 1, 2, 5):")
    try:
        # Menu numbers are 1-based; positions into `subset` are 0-based.
        input_indices = _parse_selection(input(), len(subset))
    except ValueError as error:
        print(f"Invalid selection: {error}")
        return
    # Map menu positions back to the index labels of the full DataFrame.
    selected_indices = subset.iloc[input_indices]['index'].tolist()

    print("\nSelected Webtoons:")
    selected_df = df.loc[selected_indices]
    for idx, row in selected_df.iterrows():
        print(f"- {row['title']} ({row['genre']})")

    recommendations = recommend_webtoons(df, embeddings, selected_indices)

    print("\nTop Recommendations:")
    for idx, row in recommendations.iterrows():
        print(f"Recommended for '{row['recommended_for']}': {row['title']} (Score: {row['similarity_score']:.2f})")

if __name__ == "__main__":
    main_recommendation()



Randomly selected webtoons:

1: 진검승부 - 드라마 (Index: 1147)
2: 장난감 - 드라마 (Index: 1521)
3: 떨림 - 로맨스 (Index: 1876)
4: 내가 사랑한 물고기 - 로맨스 (Index: 208)
5: 군인RPG - 개그 (Index: 2212)
6: 북적북적 패밀리 - 일상 (Index: 2356)
7: 병의 맛 - 드라마 (Index: 2107)
8: 중매쟁이 아가 황녀님 - 로맨스 (Index: 1096)
9: 낙향문사전 - 무협/사극 (Index: 607)
10: 반드시 해피엔딩 - 로맨스 (Index: 1102)
11: 별을 품은 소드마스터 - 판타지 (Index: 527)
12: 미친 악마 - 로맨스 (Index: 1315)
13: 2015 우주특집 단편 - 판타지 (Index: 2425)
14: 홀리데이 - 드라마 (Index: 1091)
15: 미쳐 날뛰는 생활툰 - 일상 (Index: 2443)
16: 찬란하지 않아도 괜찮아 - 드라마 (Index: 2095)
17: 스트러글 - 액션 (Index: 1745)
18: 모기전쟁 - 판타지 (Index: 1577)
19: 로딩 - 드라마 (Index: 1907)
20: 공동급식구역 - 액션 (Index: 641)
21: 하우스키퍼 - 판타지 (Index: 727)
22: 히어로 더 맥시멈 - 액션 (Index: 590)
23: 버티면 10억 - 드라마 (Index: 103)
24: 외모지상주의 - 드라마 (Index: 597)
25: 피부과 만렙남 - 일상 (Index: 187)
26: 창궐 - 판타지 (Index: 2069)
27: [드라마원작] 아일랜드 2부 - 스릴러 (Index: 2182)
28: VS - 액션 (Index: 172)
29: 두 번째 딸로 태어났습니다 - 로맨스 (Index: 182)
30: 흔한 빙의물인 줄 알았다 - 로맨스 (Index: 526)

Enter the numbers of your favorite we