In [1]:
import urllib.request
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import requests
import re
import time
import nltk

from PIL import Image
from io import BytesIO
from nltk.tokenize import RegexpTokenizer
from gensim.models import Word2Vec
from gensim.models import KeyedVectors
from gensim.models.word2vec import Word2Vec

from nltk.corpus import stopwords
from sklearn.metrics.pairwise import cosine_similarity

In [2]:
# Download stopwords
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\multicampus\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [3]:
# Load data
df = pd.read_excel("test_to_excel.xlsx")
print('Total number of documents:', len(df))


Total number of documents: 9877


In [4]:
# Preview the first 5 rows
df.head()

Unnamed: 0,appid,name,short_description,price,categories,genres,recommendations,release_date,developers,metacritic,image
0,1780820,Chill Corner - Piano & Guitar (Music Album),This DLC added 21 new soundtracks with the pri...,110000.0,"['싱글 플레이어', '다운로드 가능한 콘텐츠', 'Steam 도전 과제', 'St...","['캐주얼', '무료', '인디', '시뮬레이션']",,2021년 12월 17일,['Low-Hi Tech'],,https://cdn.akamai.steamstatic.com/steam/apps/...
1,1780830,Hundred Days - Napa Valley,Explore the napa valley region and its grape v...,950000.0,"['싱글 플레이어', '다운로드 가능한 콘텐츠', 'Steam 도전 과제', '컨트...","['인디', '시뮬레이션', '전략']",,2021년 12월 21일,['Broken Arms Games'],,https://cdn.akamai.steamstatic.com/steam/apps/...
2,1780840,Loch Ness,Loch Ness is a 1-4 Player Co-Op Horror Hunter....,1450000.0,"['싱글 플레이어', '멀티플레이어', '협동', '온라인 협동', 'Steam 도...","['인디', '앞서 해보기']",,2021년 12월 3일,['JFi Games'],,https://cdn.akamai.steamstatic.com/steam/apps/...
3,1780850,Cthulhu Dungeon,《不可名状的地牢》是一款以克苏鲁的呼唤跑团为背景的 战棋+DBG+Roguelike游戏，通...,750000.0,['싱글 플레이어'],"['인디', '전략', '앞서 해보기']",,2022년 2월 18일,['SanYeGame'],,https://cdn.akamai.steamstatic.com/steam/apps/...
4,1780880,Queue Simulator,Queue simulator brings you your favorite part ...,110000.0,"['싱글 플레이어', 'Steam 도전 과제']","['캐주얼', '인디', '대규모 멀티플레이어', 'RPG', '시뮬레이션']",,2021년 12월 15일,['Just Making Games'],,https://cdn.akamai.steamstatic.com/steam/apps/...


In [5]:
# Data cleaning functions
def _removeNonAscii(s):
    return "".join(i for i in str(s) if  ord(i) < 128)

def make_lower_case(text):
    return text.lower()

def remove_stop_words(text):
    text = text.split()
    stops = set(stopwords.words("english"))
    text = [w for w in text if not w in stops]
    text = " ".join(text)
    return text

def remove_html(text):
    html_pattern = re.compile('<.*?>')
    return html_pattern.sub(r'', text)

def remove_punctuation(text):
    tokenizer = RegexpTokenizer(r'[a-zA-Z]+')
    text = tokenizer.tokenize(text)
    text = " ".join(text)
    return text


# Data cleaning
df['cleaned'] = df['short_description'].apply(_removeNonAscii)
df['cleaned'] = df['cleaned'].apply(make_lower_case)
df['cleaned'] = df['cleaned'].apply(remove_stop_words)
df['cleaned'] = df['cleaned'].apply(remove_punctuation)
df['cleaned'] = df['cleaned'].apply(remove_html)

In [6]:
# Remove empty rows
df['cleaned'].replace('', np.nan, inplace=True)
df = df[df['cleaned'].notna()]
print('Total number of documents after cleaning:', len(df))

Total number of documents after cleaning: 9395


In [7]:
# Build Word2Vec model
corpus = [words.split() for words in df['cleaned']]
word2vec_model = Word2Vec(size=300, window=5, min_count=2, workers=-1) # make model
word2vec_model.build_vocab(corpus) # 
word2vec_model.intersect_word2vec_format('GoogleNews-vectors-negative300.bin.gz', lockf=1.0, binary=True) # pre-trained data
word2vec_model.train(corpus, total_examples=word2vec_model.corpus_count, epochs=15) 

(0, 0)

In [None]:
# Get document vectors
def get_document_vectors(document_list, model):
    document_embedding_list = []
    
    for line in document_list:
        doc2vec = None
        count = 0
        for word in line.split():
            if word in model.wv.vocab:
                count += 1
                if doc2vec is None:
                    doc2vec = model[word]
                else:
                    doc2vec = doc2vec + model[word]
        if doc2vec is not None:
            doc2vec = doc2vec / count
            document_embedding_list.append(doc2vec)
    
    return document_embedding_list

In [None]:
document_embedding_list = get_document_vectors(df['cleaned'], word2vec_model)
print('Number of document vectors:', len(document_embedding_list))
print(document_embedding_list)

In [None]:
# Calculate cosine similarity matrix
cosine_similarities = cosine_similarity(document_embedding_list, document_embedding_list)
print('Size of cosine similarity matrix:', cosine_similarities.shape)

In [None]:
def get_movie_indices(name, df, cosine_similarities):
    # Get the index of the movie based on its name
    indices = pd.Series(df.index, index=df['name']).drop_duplicates()
    idx = indices[name]

    # Get the indices of the most similar movies based on the cosine similarities
    sim_scores = list(enumerate(cosine_similarities[idx]))
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    sim_scores = sim_scores[1:6]
    movie_indices = [i[0] for i in sim_scores]

    return movie_indices

def show_recommendations(name, df, cosine_similarities):
    movies = df[['name', 'image']]

    # Get the indices of the recommended movies
    movie_indices = get_movie_indices(name, df, cosine_similarities)

    # Get the recommended movies based on their indices
    recommend = movies.iloc[movie_indices].reset_index(drop=True)

    fig, axs = plt.subplots(1, 5, figsize=(20, 30))

    # Display the images of the recommended movies
    for index, row in recommend.iterrows():
        try:
            response = requests.get(row['image'])
            img = Image.open(BytesIO(response.content))
            axs[index].imshow(img)
            axs[index].set_title(row['name'])
        except:
            continue
    plt.show()

In [None]:
show_recommendations("Loch Ness", df, cosine_similarities)
