In [4]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import itertools

from wordcloud import WordCloud
from collections import Counter

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sentence_transformers import SentenceTransformer

In [5]:
# Directory that holds the per-platform "<name>_titles.csv" files.
file_path = './'

# Streaming platforms covered by the analysis; each entry is the CSV name prefix.
names = [
    'apple',
    'disney',
    'HBO',
    'netflix',
    'paramount',
    'prime',
]

In [6]:
# Read every platform's "<name>_titles.csv" and expose the resulting frame as
# a module-level variable named "<name>_df" (apple_df, disney_df, ...).
for name in names:
    csv_path = f"{file_path}{name}_titles.csv"
    with open(csv_path, 'rb') as f:
        df = pd.read_csv(f)
    # Dynamic global assignment: the variable name is derived from the platform name.
    globals()[f"{name}_df"] = df

In [7]:
# Text-vectorisation settings: bigrams only, English stop-word list.
# NOTE(review): presumably passed to CountVectorizer in a later cell - confirm.
stop_words = "english"
n_gram_range = (2, 2)

# Pretrained sentence-embedding model, looked up by name. The first run
# downloads the model files (see the progress output below this cell).
model = SentenceTransformer('distilbert-base-nli-mean-tokens')

Downloading (…)925a9/.gitattributes:   0%|          | 0.00/690 [00:00<?, ?B/s]

Downloading (…)_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Downloading (…)1a515925a9/README.md:   0%|          | 0.00/3.99k [00:00<?, ?B/s]

Downloading (…)515925a9/config.json:   0%|          | 0.00/550 [00:00<?, ?B/s]

Downloading (…)ce_transformers.json:   0%|          | 0.00/122 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/265M [00:00<?, ?B/s]

Downloading (…)nce_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

Downloading (…)925a9/tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/450 [00:00<?, ?B/s]

Downloading (…)1a515925a9/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)15925a9/modules.json:   0%|          | 0.00/229 [00:00<?, ?B/s]

In [14]:
import string
from nltk import pos_tag
from nltk.tokenize import word_tokenize
from nltk.probability import FreqDist
from nltk.corpus import stopwords
from wordcloud import WordCloud
import matplotlib.pyplot as plt
import pandas as pd
import os

# Directory where the rendered word-cloud images are saved.
output_directory = './wordclouds'

# Title CSVs, one per streaming platform, read relative to the working directory.
file_names = [
    'apple_titles.csv',
    'HBO_titles.csv',
    'disney_titles.csv',
    'netflix_titles.csv',
    'paramount_titles.csv',
    'prime_titles.csv',
]

# Create the output directory up front; this is a no-op when it already exists.
os.makedirs(output_directory, exist_ok=True)

# Generate a word cloud from the description text of one genre, for each of the six platforms
def generate_word_cloud(dataset_name, df, genre='horror'):
    """Render and save a word cloud of one genre's descriptions for a dataset.

    Rows whose ``genres`` value contains ``genre`` (case-insensitive, literal
    substring match) are selected. Their ``description`` texts are tokenized,
    lower-cased, stripped of punctuation tokens and POS-tagged; only nouns and
    adjectives that are not English stop words are kept. The 40 most frequent
    remaining words are drawn as a word cloud and saved to
    ``<output_directory>/<dataset_name>_wordcloud_<genre>.png``.

    The same ``genre`` value drives the filter, the console message, the
    figure title and the file name, so the saved image is always labelled
    with the genre that was actually analysed.

    Args:
        dataset_name: Label used in the figure title, the console messages
            and the output file name.
        df: DataFrame with a ``genres`` column and a ``description`` column.
        genre: Genre substring to filter on. Defaults to ``'horror'``.

    Returns:
        None. When no row matches the genre, or the matching descriptions
        yield no usable words, a message is printed and no file is written.
    """
    # regex=False: the genre is matched as plain text, not as a pattern.
    genre_df = df[df['genres'].str.contains(genre, case=False, na=False, regex=False)]

    # Not every platform carries every genre, and a misspelled genre matches
    # nothing; report it instead of failing further down.
    if genre_df.empty:
        print(f"No descriptions found for the {genre} genre in the {dataset_name} dataset.")
        return

    stop_words = set(stopwords.words('english'))
    punctuation = set(string.punctuation)
    # 'new' and 'young' showed up in every description set, so they are
    # excluded explicitly on top of the stop-word list.
    overused_words = {'new', 'young'}

    noun_adj_list = []
    for description in genre_df['description'].dropna():
        tokens = [token.lower() for token in word_tokenize(description)]
        tokens = [token for token in tokens if token not in punctuation]
        for word, tag in pos_tag(tokens):
            # Penn Treebank tags: NN* are nouns, JJ* are adjectives.
            if not tag.startswith(('NN', 'JJ')):
                continue
            if word in stop_words or word in overused_words:
                continue
            noun_adj_list.append(word)

    # Keep only the 40 most frequent words.
    tags = FreqDist(noun_adj_list).most_common(40)

    # WordCloud.generate_from_frequencies raises ValueError on an empty
    # mapping (e.g. every matching description was missing).
    if not tags:
        print(f"No usable words found for the {genre} genre in the {dataset_name} dataset.")
        return

    wc = WordCloud(font_path='./NanumGothicBold.otf', background_color="white", max_font_size=60)
    cloud = wc.generate_from_frequencies(dict(tags))

    # Many figures are produced in a row, so each one gets an explicit title
    # and is closed after saving, even if saving fails.
    fig = plt.figure(figsize=(12, 10))
    try:
        plt.title(f"Word Cloud for {dataset_name} - {genre.capitalize()} Genre")
        plt.axis('off')
        plt.imshow(cloud)

        output_file = os.path.join(output_directory, f"{dataset_name}_wordcloud_{genre.lower()}.png")
        plt.savefig(output_file)
    finally:
        plt.close(fig)

# Build all six word clouds, one per dataset file
for file_name in file_names:
    # Everything before the first '.' is the dataset label, e.g. "apple_titles".
    dataset_name, _, _ = file_name.partition('.')
    df = pd.read_csv(file_name)
    generate_word_cloud(dataset_name, df)

No descriptions found for the western genre in the apple_titles dataset.
