In [1]:
import pandas as pd
import re
import nltk
from nltk.corpus import stopwords
from nltk.corpus import wordnet as wn
from nltk.tokenize import word_tokenize
from gensim.models import Word2Vec
from gensim.models import KeyedVectors
import multiprocessing

In [2]:

from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [3]:
# sets stopwords
nltk.download("stopwords")
stopwords = set(stopwords.words("english"))

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [4]:
nltk.download('punkt')

def tokenize_words(words):
  # tokenizes the cleaned word
  tokens = word_tokenize(words)

  # filters out stopwords
  return [token for token in tokens if token not in stopwords]

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [5]:
import gensim
from gensim.test.utils import common_texts
from gensim.models import Word2Vec
import gensim.downloader

glove_vectors = gensim.downloader.load('glove-twitter-200')

In [6]:
categories = ["christmas", "halloween", "valentine", "celebration", "relaxing",
              "nature", "industrial", "sunshine", "sad", "happy", "summer",
              "winter", "sports", "playful", "energetic", "scary",
              "anger", "optimistic", "adventurous",
              "learning", "artistic", "science", "cozy", "colorful", "space"]

In [None]:
# finds words similar to each category
similar_words = [[word for word, _ in glove_vectors.most_similar(category, topn=50)] for category in categories]
similar_words_str = [", ".join(words) for words in similar_words]

similar_words_dict = {"category": categories,
                      "words": similar_words_str}

# creates a dataframe with categories and similar words
category_df = pd.DataFrame.from_dict(similar_words_dict)
pd.options.display.max_colwidth = 1000
category_df

Unnamed: 0,category,words
0,christmas,"xmas, merry, holiday, easter, holidays, valentines, day, year, valentine, thanksgiving, gift, gifts, happy, days, halloween, x-mas, birthday, sunday, everyone, friday, presents, festive, this, special, hope, family, great, present, coming, summer, wait, night, saturday, tomorrow, wonderful, tree, all, celebrate, weekend, my, wish, next, lovely, celebration, monday, going, good, our, winter, week"
1,halloween,"costume, christmas, party, costumes, easter, valentine, xmas, thanksgiving, valentines, holiday, hallowen, themed, claus, friday, weekend, holloween, outfit, saturday, horror, fun, scary, prom, spooky, zombie, homecoming, kids, coming, haunted, candy, disney, holidays, spring, winter, santa, outfits, cupcakes, ghost, haloween, tomorrow, apocalypse, grinch, movies, night, summer, surprise, decorations, wonderland, movie, day, pumpkin"
2,valentine,"valentines, christmas, vday, gift, special, day, happy, holiday, merry, easter, xmas, v-day, valentinesday, surprise, halloween, birthday, present, boyfriend, gifts, cake, thanksgiving, anniversary, weekend, wedding, holidays, celebrate, wish, date, sunday, bday, girlfriend, single, year, love, perfect, friends, friday, month, lucky, fun, sweet, wonderful, girl, lovely, flowers, monday, mom, friend, first, every"
3,celebration,"celebrations, celebrate, celebrating, anniversary, christmas, year, parade, sunday, event, party, celebrates, saturday, holiday, bday, xmas, ceremony, memorial, easter, th, thanksgiving, july, friday, victory, day, feast, annual, tonight, march, great, birthday, thursday, presents, extravaganza, celebrated, graduation, opening, june, evening, reception, coming, month, b-day, special, st, our, epic, tradition, first, festivities, last"
4,relaxing,"enjoying, relaxed, chilled, soothing, chilling, peaceful, relaxation, bath, laying, relax, afternoon, shower, productive, resting, chillin, calming, relaxin, listening, enjoyable, poolside, enjoy, rainy, sitting, vacation, refreshing, warm, staying, evening, comfy, bed, while, cosy, lush, nap, beach, wonderful, studying, sleeping, whilst, tub, lounging, fun, loveit, lovely, tanning, chill, nice, asleep, lazy, quiet"
5,nature,"photography, earth, landscape, image, art, culture, photo, wildlife, environment, unique, beautiful, loving, force, beauty, animals, sun, kind, view, trees, imagination, science, wild, but, simply, sometimes, society, natural, lake, moon, world, which, concept, place, often, ocean, outdoors, passion, and, garden, mind, change, gods, tree, essence, forest, beyond, images, does, flowers, creation"
6,industrial,"naval, sector, manufacturing, engineering, construction, civil, production, engineer, retail, solar, electrical, agriculture, systems, global, rural, regional, development, cultural, experimental, electronic, federal, hardware, technology, vertical, digital, agricultural, industry, central, architecture, general, design, market, producción, r&d, infrastructure, fleet, building, environmental, plant, aviation, equipment, software, oil, company, sales, mining, local, materials, metal, housing"
7,sunshine,"sun, rain, ☀, lovely, shining, sunny, luke, beautiful, shine, bright, lots, hope, skies, afternoon, much, summer, wonderful, weather, smile, rainy, darling, summertime, love, morning, enjoying, evening, warmth, sky, ilysm, hello, snow, sunset, nyou, ❄, happiness, hemmings, day, joy, ⛄, calum, babe, nplease, there, perfect, ☼, mean, beach, world, here, breeze"
8,sad,"depressing, really, depressed, sick, feel, cry, upset, bad, but, disappointed, kinda, how, tired, seriously, mad, so, why, much, when, crying, very, because, reason, sigh, weird, confused, not, think, too, especially, thing, feeling, anymore, know, hate, same, makes, n't, remember, people, mean, lonely, stupid, such, actually, that, this, being, though, ugh"
9,happy,"birthday, day, bday, wish, merry, love, year, you, hope, thank, enjoy, good, thanks, christmas, too, wishes, happybirthday, hbd, all, everyone, always, much, b-day, my, today, bless, morning, friend, celebrate, great, best, welcome, proud, years, and, have, well, dear, miss, beautiful, wonderful, very, lovely, th, night, days, valentine, sweet, brithday, thankyou"


In [7]:
antonyms = {
    "christmas": ["easter", "halloween", "valentines", "valentine", "thanksgiving", "summer"],
    "halloween": ["christmas", "easter", "valentine", "xmas", "valentinesday", "santa", "claus", "dissmas", "valentines", "thanksgiving", "claus", "grinch", "summer"],
    "valentine": ["christmas", "easter", "xmas", "halloween", "birthday", "thanksgiving", "bday"],
    "celebration" : [],
    "relaxing" : ["productive", "studying"],
    "nature" : ["science", "society"],
    "industrial" : ["rural", "cultural", "agricultural", "environmental", "plant"],
    "sunshine" : ["rain", "rainy", "snow", "mean"],
    "sad": ["happy", "glad", "laugh", "smiles", "laughs", "smile", "laughter", "joy"],
    "happy": ["sad", "unhappy"],
    "summer": ["spring", "winter", "december", "wintertime", "winters", "february", "snow", "january", "fall", "autumn"],
    "winter": ["spring", "summer", "summers", "summertime", "autumn", "warm", "fall", "july", "june"],
    "sports" : [],
    "playful" : ["aggressive", "freaky", "stubborn", "fussy", "shy"],
    "energetic" : ["uncomfortable", "sluggish", "edgy"],
    "scary" : ["funny"],
    "anger" : ["sadness", "fear", "confusion", "laughter"],
    "optimistic" : ["pessimistic", "anxious", "cynical", "skeptical", "negative", "concerned", "aggressive", "fearful", "realistic", "disappointed", "indifferent", "sluggish", "uneasy", "unhappy"],
    "adventurous" : [],
    "learning" : [],
    "artistic" : [],
    "science" : [],
    "cozy" : ["cold"],
    "colorful" : [],
    "space" : []
}

In [8]:
def clean_text(text):
  # removes contents in brackets and asterisks
  clean_text = re.sub(r"\[.*?\]", "", text)
  clean_text = re.sub(r"\*\*.*?\*\*", "", clean_text)

  # removes whitespaces before and after the string
  clean_text = clean_text.strip("\n")

  # replaced "\n" with a whitespace
  clean_text = clean_text.replace("\n", " ")

  # replace "(" and ")" with a whitespace
  clean_text = clean_text.replace("(", " ")
  clean_text = clean_text.replace(")", " ")

  # removes punctuations
  clean_text = re.sub(r"[^\w\s]", "", clean_text)

  # removes short words (for tokenization)
  clean_text = re.sub(r"\b\w{1,2}\b", "", clean_text)
  clean_text = re.sub(r"\s+", " ", clean_text).strip()

  # lowercase the text
  return clean_text.lower()

In [9]:
def clean_categories(text):
  # removes brackets
  clean_text = text.replace("[", "")
  clean_text = clean_text.replace("]", "")

  # removes whitespaces before and after the string
  clean_text = clean_text.strip("\n")

  # replaced "\n" with a whitespace
  clean_text = clean_text.replace("\n", " ")

  # replace "(" and ")" with a whitespace
  clean_text = clean_text.replace("(", " ")
  clean_text = clean_text.replace(")", " ")

  # removes punctuations
  clean_text = re.sub(r"[^\w\s]", "", clean_text)

  # removes short words (for tokenization)
  clean_text = re.sub(r"\b\w{1,2}\b", "", clean_text)
  clean_text = re.sub(r"\s+", " ", clean_text).strip()

  # lowercase the text
  return clean_text.lower()

In [10]:
def compute_score_cosine_multiprocess(row):
  row_result = [row[0]]
  # counts the number of word that are in filtered similar words
  for idx, category in enumerate(categories):
    tokens = row[idx + 1]
    score = 0
    count = 0
    for next_token in tokens:
      try:
        if next_token in antonyms[category]:
          score -= glove_vectors.similarity(category, next_token)
        else:
          score += glove_vectors.similarity(category, next_token)
        count += 1
      except:
        score = score
    if count != 0:
      row_result.append(score / count)
    else:
      row_result.append(0)

  return tuple(row_result)

In [11]:
def compute_score_cosine_multiprocess_one_col(row):
  row_result = [row[0]]
  # counts the number of word that are in filtered similar words
  for idx, category in enumerate(categories):
    tokens = row[1]
    score = 0
    count = 0

    for next_token in tokens:
      try:
        if next_token in antonyms[category]:
          score -= glove_vectors.similarity(category, next_token)
        else:
          score += glove_vectors.similarity(category, next_token)
        count += 1
      except:
        score = score
    if count != 0:
      row_result.append(score / count)
    else:
      row_result.append(0)

  return tuple(row_result)

In [12]:
def compute_score_cosine_multiprocess_two_col(row):
  row_result = [row[0]]
  # counts the number of word that are in filtered similar words
  for idx, category in enumerate(categories):
    tokens1 = row[1]
    tokens2 = row[2]
    score = 0
    count = 0

    for next_token in tokens1:
      try:
        if next_token in antonyms[category]:
          score -= glove_vectors.similarity(category, next_token)
        else:
          score += glove_vectors.similarity(category, next_token)
        count += 1
      except:
        score = score
    if count != 0:
      val = score / count
    else:
      val = 0

    for next_token in tokens2:
      try:
        if next_token in antonyms[category]:
          score -= glove_vectors.similarity(category, next_token)
        else:
          score += glove_vectors.similarity(category, next_token)
        count += 1
      except:
        score = score
    if count != 0:
      row_result.append(0.8 *(val) + 0.2*(score/count))
    else:
      row_result.append(0.8 *(val) + 0.2*(0))

  return tuple(row_result)

In [13]:
def compute_score_cosine(tokens, category):
  # counts the number of word that are in filtered similar words
  if len(tokens) == 0:
    return 0

  score = 0
  count = 0
  scores = {}
  for next_token in tokens:
    count += 1
    if next_token in scores:
      score += scores[next_token]
    else:
      try:
        tmp = glove_vectors.similarity(category, next_token)
        if next_token in antonyms[category]:
          tmp = tmp * -1
        score += tmp
        scores[next_token] = tmp
      except:
        score = score

  return score / count

# Part 1: Data Loading & Preprocessing

## Songs_df

We break the songs dataframe into multiple files because it is too big to run on colab by itself.

In [15]:
# loads all english songs
# song_df = pd.read_csv("/content/drive/Shareddrives/5500 Database Project/datasets/songs/songs_english_pt7.csv")

In [17]:
'''
song_df1 = song_df[:100000]
song_df1.to_csv("/content/drive/Shareddrives/5500 Database Project/datasets/songs/songs_english_pt71.csv")

song_df2 = song_df[100000:200000]
song_df2.to_csv("/content/drive/Shareddrives/5500 Database Project/datasets/songs/songs_english_pt72.csv")

song_df3 = song_df[200000:300000]
song_df3.to_csv("/content/drive/Shareddrives/5500 Database Project/datasets/songs/songs_english_pt73.csv")

song_df4 = song_df[300000:]
song_df4.to_csv("/content/drive/Shareddrives/5500 Database Project/datasets/songs/songs_english_pt74.csv")

song_df5 = song_df[400000:]
song_df5.to_csv("/content/drive/Shareddrives/5500 Database Project/datasets/songs/songs_english_pt65.csv")


song_df1 = song_df[:500000]
song_df1.to_csv("/content/drive/Shareddrives/5500 Database Project/datasets/songs/songs_english_pt1.csv")

song_df1 = song_df[500000:1000000]
song_df1.to_csv("/content/drive/Shareddrives/5500 Database Project/datasets/songs/songs_english_pt2.csv")

song_df1 = song_df[1000000:1500000]
song_df1.to_csv("/content/drive/Shareddrives/5500 Database Project/datasets/songs/songs_english_pt3.csv")

song_df1 = song_df[1500000:2000000]
song_df1.to_csv("/content/drive/Shareddrives/5500 Database Project/datasets/songs/songs_english_pt4.csv")

song_df1 = song_df[2000000:2500000]
song_df1.to_csv("/content/drive/Shareddrives/5500 Database Project/datasets/songs/songs_english_pt5.csv")

song_df1 = song_df[2500000:3000000]
song_df1.to_csv("/content/drive/Shareddrives/5500 Database Project/datasets/songs/songs_english_pt6.csv")

song_df1 = song_df[3000000:]
song_df1.to_csv("/content/drive/Shareddrives/5500 Database Project/datasets/songs/songs_english_pt7.csv")
'''

'\nsong_df1 = song_df[:500000]\nsong_df1.to_csv("/content/drive/Shareddrives/5500 Database Project/datasets/songs/songs_english_pt1.csv")\n\nsong_df1 = song_df[500000:1000000]\nsong_df1.to_csv("/content/drive/Shareddrives/5500 Database Project/datasets/songs/songs_english_pt2.csv")\n\nsong_df1 = song_df[1000000:1500000]\nsong_df1.to_csv("/content/drive/Shareddrives/5500 Database Project/datasets/songs/songs_english_pt3.csv")\n\nsong_df1 = song_df[1500000:2000000]\nsong_df1.to_csv("/content/drive/Shareddrives/5500 Database Project/datasets/songs/songs_english_pt4.csv")\n\nsong_df1 = song_df[2000000:2500000]\nsong_df1.to_csv("/content/drive/Shareddrives/5500 Database Project/datasets/songs/songs_english_pt5.csv")\n\nsong_df1 = song_df[2500000:3000000]\nsong_df1.to_csv("/content/drive/Shareddrives/5500 Database Project/datasets/songs/songs_english_pt6.csv")\n\nsong_df1 = song_df[3000000:]\nsong_df1.to_csv("/content/drive/Shareddrives/5500 Database Project/datasets/songs/songs_english_pt7.

In [14]:
i = 1

for j in range(1,6):
  song_df = pd.read_csv(f"/content/drive/Shareddrives/5500 Database Project/datasets/songs/songs_english_pt{i}{j}.csv")

  # cleans lyrics
  song_clean_df = song_df[['id', 'lyrics']].copy()
  song_clean_df["lyrics"] = song_clean_df["lyrics"].apply(clean_text)
  print("Lyrics Cleaned")

  # creates tokenized description column
  song_clean_df["lyrics"] = song_clean_df["lyrics"].apply(tokenize_words)
  pd.reset_option("^display.", silent=True)
  print("Lyrics Tokenized")

  # Set up multiprocessing
  pool = multiprocessing.Pool()

  # Apply the function to each row in parallel
  result = pool.map(compute_score_cosine_multiprocess_one_col, song_clean_df.itertuples(name=None, index=False))

  # Convert the result back to a DataFrame
  result_df = pd.DataFrame(result, columns= ['id'] + categories)

  # Close the multiprocessing pool
  pool.close()
  print("Scores assigned")

  for idx, category in enumerate(categories):
    max_value = result_df[category].max()
    min_value = result_df[category].min()
    result_df[category] = round(((result_df[category] - min_value) / (max_value - min_value))*100, 2)

  print("min-max scaling done")

  result_df.to_csv(f"/content/drive/Shareddrives/5500 Database Project/datasets/songs/songs_english_scores_pt{i}{j}.csv")
  print(f"Exported File: songs_english_scores_pt{i}.csv")

Lyrics Cleaned
Lyrics Tokenized
Scores assigned
min-max scaling done
Exported File: songs_english_scores_pt1.csv
Lyrics Cleaned
Lyrics Tokenized


KeyboardInterrupt: 

## TV_df

In [None]:
# loads all tv series
tv_df = pd.read_csv("/content/drive/Shareddrives/5500 Database Project/datasets/tv_series.csv")

In [None]:
tv_df.head()

Unnamed: 0,Series Title,Release Year,Runtime,Genre,Rating,Cast,Synopsis
0,Wednesday,(2022– ),45 min,"Comedy, Crime, Fantasy",8.2,"Jenna Ortega, Hunter Doohan, Percy Hynes White...","Follows Wednesday Addams' years as a student, ..."
1,Yellowstone,(2018– ),60 min,"Drama, Western",8.7,"Kevin Costner, Luke Grimes, Kelly Reilly, Wes ...",A ranching family in Montana faces off against...
2,The White Lotus,(2021–2023),60 min,"Comedy, Drama",7.9,"Jennifer Coolidge, Jon Gries, F. Murray Abraha...","Set in a tropical resort, it follows the explo..."
3,1923,(2022–2023),60 min,"Drama, Western",8.6,"Harrison Ford, Helen Mirren, Brandon Sklenar, ...",The Duttons face a new set of challenges in th...
4,Jack Ryan,(2018– ),60 min,"Action, Drama, Thriller",8.0,"John Krasinski, Wendell Pierce, Michael Kelly,...","Up-and-coming CIA analyst, Jack Ryan, is thrus..."


In [None]:
tv_df.shape

(50000, 7)

In [None]:
# cleans synopsis
tv_clean_df = tv_df[['Series Title', 'Genre', 'Synopsis']].copy()
tv_clean_df["Synopsis"] = tv_clean_df["Synopsis"].apply(clean_text)
tv_clean_df["Genre"] = tv_clean_df["Genre"].apply(clean_text)

tv_clean_df

Unnamed: 0,Series Title,Genre,Synopsis
0,Wednesday,comedy crime fantasy,follows wednesday addams years student when sh...
1,Yellowstone,drama western,ranching family montana faces off against othe...
2,The White Lotus,comedy drama,set tropical resort follows the exploits vario...
3,1923,drama western,the duttons face new set challenges the early ...
4,Jack Ryan,action drama thriller,upandcoming cia analyst jack ryan thrust into ...
...,...,...,...
49995,Law & Order: Special Victims Unit,crime drama mystery,this series follows the special victims unit s...
49996,Doctor Who,adventure drama scifi,the further adventures time and space the alie...
49997,The Lord of the Rings: The Rings of Power,action adventure drama,epic drama set thousands years before the even...
49998,The Bear,comedy drama,young chef from the fine dining world returns ...


In [None]:
# creates tokenized columns
tv_clean_df["tokenized Synopsis"] = tv_clean_df["Synopsis"].apply(tokenize_words)
tv_clean_df["tokenized Genre"] = tv_clean_df["Genre"].apply(tokenize_words)

pd.reset_option("^display.", silent=True)
tv_clean_df.head()

Unnamed: 0,Series Title,Genre,Synopsis,tokenized Synopsis,tokenized Genre
0,Wednesday,comedy crime fantasy,follows wednesday addams years student when sh...,"[follows, wednesday, addams, years, student, a...","[comedy, crime, fantasy]"
1,Yellowstone,drama western,ranching family montana faces off against othe...,"[ranching, family, montana, faces, others, enc...","[drama, western]"
2,The White Lotus,comedy drama,set tropical resort follows the exploits vario...,"[set, tropical, resort, follows, exploits, var...","[comedy, drama]"
3,1923,drama western,the duttons face new set challenges the early ...,"[duttons, face, new, set, challenges, early, 2...","[drama, western]"
4,Jack Ryan,action drama thriller,upandcoming cia analyst jack ryan thrust into ...,"[upandcoming, cia, analyst, jack, ryan, thrust...","[action, drama, thriller]"


In [None]:
tv_clean_df = tv_clean_df[['Series Title', 'tokenized Synopsis', 'tokenized Genre']]
tv_clean_df

Unnamed: 0,Series Title,tokenized Synopsis,tokenized Genre
0,Wednesday,"[follows, wednesday, addams, years, student, a...","[comedy, crime, fantasy]"
1,Yellowstone,"[ranching, family, montana, faces, others, enc...","[drama, western]"
2,The White Lotus,"[set, tropical, resort, follows, exploits, var...","[comedy, drama]"
3,1923,"[duttons, face, new, set, challenges, early, 2...","[drama, western]"
4,Jack Ryan,"[upandcoming, cia, analyst, jack, ryan, thrust...","[action, drama, thriller]"
...,...,...,...
49995,Law & Order: Special Victims Unit,"[series, follows, special, victims, unit, spec...","[crime, drama, mystery]"
49996,Doctor Who,"[adventures, time, space, alien, adventurer, k...","[adventure, drama, scifi]"
49997,The Lord of the Rings: The Rings of Power,"[epic, drama, set, thousands, years, events, j...","[action, adventure, drama]"
49998,The Bear,"[young, chef, fine, dining, world, returns, ch...","[comedy, drama]"


In [None]:
# Set up multiprocessing
pool = multiprocessing.Pool()

# Apply the function to each row in parallel
result = pool.map(compute_score_cosine_multiprocess_two_col, tv_clean_df.itertuples(name=None, index=False))

# Convert the result back to a DataFrame
result_df = pd.DataFrame(result, columns= ['Series Title'] + categories)

# Close the multiprocessing pool
pool.close()

In [None]:
result_df

Unnamed: 0,Series Title,christmas,halloween,valentine,celebration,relaxing,nature,industrial,sunshine,sad,...,scary,anger,optimistic,adventurous,learning,artistic,science,cozy,colorful,space
0,Wednesday,46.84,43.93,37.08,51.98,35.99,49.59,40.70,40.98,37.36,...,42.83,54.92,60.15,46.68,58.87,67.28,48.72,53.95,43.99,44.54
1,Yellowstone,47.15,51.55,35.60,50.00,43.16,52.94,40.12,51.55,42.49,...,47.02,45.73,51.85,41.75,44.65,48.29,34.76,56.61,44.71,43.41
2,The White Lotus,56.91,54.93,46.09,65.85,61.09,60.40,49.73,49.29,40.62,...,43.58,54.72,55.99,58.04,53.62,64.88,37.11,63.13,61.87,55.35
3,1923,68.64,64.90,55.84,82.98,58.54,71.78,72.21,57.77,54.32,...,56.35,68.82,67.46,48.79,68.11,66.54,55.92,64.18,65.56,62.29
4,Jack Ryan,42.13,36.05,36.00,49.84,36.84,43.95,59.41,38.96,42.20,...,50.73,53.33,55.24,42.55,54.08,54.81,49.37,44.82,31.25,54.84
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
49995,Law & Order: Special Victims Unit,56.43,57.37,45.06,60.58,39.46,54.91,65.32,39.19,50.09,...,56.34,59.35,48.17,49.50,54.99,59.58,50.86,56.87,50.08,58.15
49996,Doctor Who,58.62,65.02,48.97,57.49,58.57,72.00,55.57,58.17,49.07,...,63.50,54.92,63.34,68.36,58.37,67.22,53.35,68.27,58.15,92.37
49997,The Lord of the Rings: The Rings of Power,63.93,71.22,53.09,68.79,46.57,56.11,45.89,51.22,55.40,...,63.95,63.88,55.08,54.60,54.06,62.98,44.81,54.63,61.38,60.72
49998,The Bear,71.09,71.58,59.42,70.45,58.05,61.33,59.77,60.00,52.97,...,56.35,53.37,53.45,45.66,58.68,58.24,47.82,75.38,59.29,62.56


In [None]:
for idx, category in enumerate(categories):

  # creates new columns for each category and computes the score
  max_value = result_df[category].max()
  min_value = result_df[category].min()
  result_df[category] = round(((result_df[category] - min_value) / (max_value - min_value))*100, 2)

In [None]:
result_df

Unnamed: 0,Series Title,christmas,halloween,valentine,celebration,relaxing,nature,industrial,sunshine,sad,...,scary,anger,optimistic,adventurous,learning,artistic,science,cozy,colorful,space
0,Wednesday,46.84,43.93,37.08,51.98,35.99,49.59,40.70,40.98,37.36,...,42.83,54.92,60.15,46.68,58.87,67.28,48.72,53.95,43.99,44.54
1,Yellowstone,47.15,51.55,35.60,50.00,43.16,52.94,40.12,51.55,42.49,...,47.02,45.73,51.85,41.75,44.65,48.29,34.76,56.61,44.71,43.41
2,The White Lotus,56.91,54.93,46.09,65.85,61.09,60.40,49.73,49.29,40.62,...,43.58,54.72,55.99,58.04,53.62,64.88,37.11,63.13,61.87,55.35
3,1923,68.64,64.90,55.84,82.98,58.54,71.78,72.21,57.77,54.32,...,56.35,68.82,67.46,48.79,68.11,66.54,55.92,64.18,65.56,62.29
4,Jack Ryan,42.13,36.05,36.00,49.84,36.84,43.95,59.41,38.96,42.20,...,50.73,53.33,55.24,42.55,54.08,54.81,49.37,44.82,31.25,54.84
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
49995,Law & Order: Special Victims Unit,56.43,57.37,45.06,60.58,39.46,54.91,65.32,39.19,50.09,...,56.34,59.35,48.17,49.50,54.99,59.58,50.86,56.87,50.08,58.15
49996,Doctor Who,58.62,65.02,48.97,57.49,58.57,72.00,55.57,58.17,49.07,...,63.50,54.92,63.34,68.36,58.37,67.22,53.35,68.27,58.15,92.37
49997,The Lord of the Rings: The Rings of Power,63.93,71.22,53.09,68.79,46.57,56.11,45.89,51.22,55.40,...,63.95,63.88,55.08,54.60,54.06,62.98,44.81,54.63,61.38,60.72
49998,The Bear,71.09,71.58,59.42,70.45,58.05,61.33,59.77,60.00,52.97,...,56.35,53.37,53.45,45.66,58.68,58.24,47.82,75.38,59.29,62.56


In [None]:
result_df.describe()

Unnamed: 0,christmas,halloween,valentine,celebration,relaxing,nature,industrial,sunshine,sad,happy,...,scary,anger,optimistic,adventurous,learning,artistic,science,cozy,colorful,space
count,50000.0,50000.0,50000.0,50000.0,50000.0,50000.0,50000.0,50000.0,50000.0,50000.0,...,50000.0,50000.0,50000.0,50000.0,50000.0,50000.0,50000.0,50000.0,50000.0,50000.0
mean,53.197308,51.102375,44.514015,57.902563,46.731957,57.770931,50.984531,48.589636,48.259129,42.88761,...,52.98529,61.003337,57.942546,49.128561,56.596788,63.34521,45.869623,57.395996,51.194594,56.698937
std,10.008273,11.587563,9.097928,8.971472,8.965069,9.79368,10.081151,9.707826,9.508457,8.802101,...,9.120336,9.132006,7.190005,8.436398,7.836317,8.522209,7.044308,9.367733,9.289959,10.788879
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,45.9,43.32,37.05,50.12,39.46,49.96,43.1,40.98,41.72,36.17,...,45.68,54.88,53.45,44.13,51.28,57.57,39.59,52.64,45.25,49.88
50%,52.82,49.98,44.56,58.02,46.57,58.13,51.38,50.02,47.78,43.08,...,52.74,61.345,58.06,48.79,55.97,62.98,46.0,57.46,49.88,57.32
75%,59.83,58.24,51.16,63.94,53.69,63.61,58.45,55.6825,54.91,49.62,...,58.38,66.62,62.1825,54.12,61.4,67.28,50.86,63.13,58.15,63.17
max,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,...,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0


In [None]:
# Saves the new scores in a csv file
result_df.to_csv("/content/drive/Shareddrives/5500 Database Project/datasets/tv_series_scores.csv")

## Movies_df

In [None]:
# loads all movies. We lead movies_metadata and keywords as we are doing NLP on the movies' overviews and keywords
movies_df = pd.read_csv("/content/drive/Shareddrives/5500 Database Project/datasets/movies/movies_metadata.csv")

  movies_df = pd.read_csv("/content/drive/Shareddrives/5500 Database Project/datasets/movies/movies_metadata.csv")


In [None]:
movies_df.head()

Unnamed: 0,adult,belongs_to_collection,budget,genres,homepage,id,imdb_id,original_language,original_title,overview,...,release_date,revenue,runtime,spoken_languages,status,tagline,title,video,vote_average,vote_count
0,False,"{'id': 10194, 'name': 'Toy Story Collection', ...",30000000,"[{'id': 16, 'name': 'Animation'}, {'id': 35, '...",http://toystory.disney.com/toy-story,862,tt0114709,en,Toy Story,"Led by Woody, Andy's toys live happily in his ...",...,1995-10-30,373554033.0,81.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,,Toy Story,False,7.7,5415.0
1,False,,65000000,"[{'id': 12, 'name': 'Adventure'}, {'id': 14, '...",,8844,tt0113497,en,Jumanji,When siblings Judy and Peter discover an encha...,...,1995-12-15,262797249.0,104.0,"[{'iso_639_1': 'en', 'name': 'English'}, {'iso...",Released,Roll the dice and unleash the excitement!,Jumanji,False,6.9,2413.0
2,False,"{'id': 119050, 'name': 'Grumpy Old Men Collect...",0,"[{'id': 10749, 'name': 'Romance'}, {'id': 35, ...",,15602,tt0113228,en,Grumpier Old Men,A family wedding reignites the ancient feud be...,...,1995-12-22,0.0,101.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Still Yelling. Still Fighting. Still Ready for...,Grumpier Old Men,False,6.5,92.0
3,False,,16000000,"[{'id': 35, 'name': 'Comedy'}, {'id': 18, 'nam...",,31357,tt0114885,en,Waiting to Exhale,"Cheated on, mistreated and stepped on, the wom...",...,1995-12-22,81452156.0,127.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Friends are the people who let you be yourself...,Waiting to Exhale,False,6.1,34.0
4,False,"{'id': 96871, 'name': 'Father of the Bride Col...",0,"[{'id': 35, 'name': 'Comedy'}]",,11862,tt0113041,en,Father of the Bride Part II,Just when George Banks has recovered from his ...,...,1995-02-10,76578911.0,106.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Just When His World Is Back To Normal... He's ...,Father of the Bride Part II,False,5.7,173.0


In [None]:
movies_df.dtypes

adult                     object
belongs_to_collection     object
budget                    object
genres                    object
homepage                  object
id                        object
imdb_id                   object
original_language         object
original_title            object
overview                  object
popularity                object
poster_path               object
production_companies      object
production_countries      object
release_date              object
revenue                  float64
runtime                  float64
spoken_languages          object
status                    object
tagline                   object
title                     object
video                     object
vote_average             float64
vote_count               float64
dtype: object

In [None]:
movies_df = movies_df[movies_df['id'].apply(lambda x: "-" not in x)]
movies_df

Unnamed: 0,adult,belongs_to_collection,budget,genres,homepage,id,imdb_id,original_language,original_title,overview,...,release_date,revenue,runtime,spoken_languages,status,tagline,title,video,vote_average,vote_count
0,False,"{'id': 10194, 'name': 'Toy Story Collection', ...",30000000,"[{'id': 16, 'name': 'Animation'}, {'id': 35, '...",http://toystory.disney.com/toy-story,862,tt0114709,en,Toy Story,"Led by Woody, Andy's toys live happily in his ...",...,1995-10-30,373554033.0,81.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,,Toy Story,False,7.7,5415.0
1,False,,65000000,"[{'id': 12, 'name': 'Adventure'}, {'id': 14, '...",,8844,tt0113497,en,Jumanji,When siblings Judy and Peter discover an encha...,...,1995-12-15,262797249.0,104.0,"[{'iso_639_1': 'en', 'name': 'English'}, {'iso...",Released,Roll the dice and unleash the excitement!,Jumanji,False,6.9,2413.0
2,False,"{'id': 119050, 'name': 'Grumpy Old Men Collect...",0,"[{'id': 10749, 'name': 'Romance'}, {'id': 35, ...",,15602,tt0113228,en,Grumpier Old Men,A family wedding reignites the ancient feud be...,...,1995-12-22,0.0,101.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Still Yelling. Still Fighting. Still Ready for...,Grumpier Old Men,False,6.5,92.0
3,False,,16000000,"[{'id': 35, 'name': 'Comedy'}, {'id': 18, 'nam...",,31357,tt0114885,en,Waiting to Exhale,"Cheated on, mistreated and stepped on, the wom...",...,1995-12-22,81452156.0,127.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Friends are the people who let you be yourself...,Waiting to Exhale,False,6.1,34.0
4,False,"{'id': 96871, 'name': 'Father of the Bride Col...",0,"[{'id': 35, 'name': 'Comedy'}]",,11862,tt0113041,en,Father of the Bride Part II,Just when George Banks has recovered from his ...,...,1995-02-10,76578911.0,106.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Just When His World Is Back To Normal... He's ...,Father of the Bride Part II,False,5.7,173.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
45461,False,,0,"[{'id': 18, 'name': 'Drama'}, {'id': 10751, 'n...",http://www.imdb.com/title/tt6209470/,439050,tt6209470,fa,رگ خواب,Rising and falling between a man and woman.,...,,0.0,90.0,"[{'iso_639_1': 'fa', 'name': 'فارسی'}]",Released,Rising and falling between a man and woman,Subdue,False,4.0,1.0
45462,False,,0,"[{'id': 18, 'name': 'Drama'}]",,111109,tt2028550,tl,Siglo ng Pagluluwal,An artist struggles to finish his work while a...,...,2011-11-17,0.0,360.0,"[{'iso_639_1': 'tl', 'name': ''}]",Released,,Century of Birthing,False,9.0,3.0
45463,False,,0,"[{'id': 28, 'name': 'Action'}, {'id': 18, 'nam...",,67758,tt0303758,en,Betrayal,"When one of her hits goes wrong, a professiona...",...,2003-08-01,0.0,90.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,A deadly game of wits.,Betrayal,False,3.8,6.0
45464,False,,0,[],,227506,tt0008536,en,Satana likuyushchiy,"In a small town live two brothers, one a minis...",...,1917-10-21,0.0,87.0,[],Released,,Satan Triumphant,False,0.0,0.0


In [None]:
movies_df = movies_df.astype({"overview" : "string"})

In [None]:
# cleans synopsis
movies_clean_df = movies_df[['id', 'overview']].copy().dropna()
movies_clean_df["overview"] = movies_clean_df["overview"].apply(clean_text)

In [None]:
movies_clean_df.head()

Unnamed: 0,id,overview
0,862,led woody andys toys live happily his room unt...
1,8844,when siblings judy and peter discover enchante...
2,15602,family wedding reignites the ancient feud betw...
3,31357,cheated mistreated and stepped the women are h...
4,11862,just when george banks has recovered from his ...


In [None]:
# creates tokenized overview column
movies_clean_df["tokenized overview"] = movies_clean_df["overview"].apply(tokenize_words)

pd.reset_option("^display.", silent=True)
movies_clean_df.head()

Unnamed: 0,id,overview,tokenized overview
0,862,led woody andys toys live happily his room unt...,"[led, woody, andys, toys, live, happily, room,..."
1,8844,when siblings judy and peter discover enchante...,"[siblings, judy, peter, discover, enchanted, b..."
2,15602,family wedding reignites the ancient feud betw...,"[family, wedding, reignites, ancient, feud, ne..."
3,31357,cheated mistreated and stepped the women are h...,"[cheated, mistreated, stepped, women, holding,..."
4,11862,just when george banks has recovered from his ...,"[george, banks, recovered, daughters, wedding,..."


In [None]:
movies_clean_df = movies_clean_df[['id', 'tokenized overview']]

for idx, category in enumerate(categories):
  movies_clean_df[str(category)] = movies_clean_df['tokenized overview']

movies_clean_df = movies_clean_df.drop(columns = ['tokenized overview'])

movies_clean_df.head()

Unnamed: 0,id,christmas,halloween,valentine,celebration,relaxing,nature,industrial,sunshine,sad,...,scary,anger,optimistic,adventurous,learning,artistic,science,cozy,colorful,space
0,862,"[led, woody, andys, toys, live, happily, room,...","[led, woody, andys, toys, live, happily, room,...","[led, woody, andys, toys, live, happily, room,...","[led, woody, andys, toys, live, happily, room,...","[led, woody, andys, toys, live, happily, room,...","[led, woody, andys, toys, live, happily, room,...","[led, woody, andys, toys, live, happily, room,...","[led, woody, andys, toys, live, happily, room,...","[led, woody, andys, toys, live, happily, room,...",...,"[led, woody, andys, toys, live, happily, room,...","[led, woody, andys, toys, live, happily, room,...","[led, woody, andys, toys, live, happily, room,...","[led, woody, andys, toys, live, happily, room,...","[led, woody, andys, toys, live, happily, room,...","[led, woody, andys, toys, live, happily, room,...","[led, woody, andys, toys, live, happily, room,...","[led, woody, andys, toys, live, happily, room,...","[led, woody, andys, toys, live, happily, room,...","[led, woody, andys, toys, live, happily, room,..."
1,8844,"[siblings, judy, peter, discover, enchanted, b...","[siblings, judy, peter, discover, enchanted, b...","[siblings, judy, peter, discover, enchanted, b...","[siblings, judy, peter, discover, enchanted, b...","[siblings, judy, peter, discover, enchanted, b...","[siblings, judy, peter, discover, enchanted, b...","[siblings, judy, peter, discover, enchanted, b...","[siblings, judy, peter, discover, enchanted, b...","[siblings, judy, peter, discover, enchanted, b...",...,"[siblings, judy, peter, discover, enchanted, b...","[siblings, judy, peter, discover, enchanted, b...","[siblings, judy, peter, discover, enchanted, b...","[siblings, judy, peter, discover, enchanted, b...","[siblings, judy, peter, discover, enchanted, b...","[siblings, judy, peter, discover, enchanted, b...","[siblings, judy, peter, discover, enchanted, b...","[siblings, judy, peter, discover, enchanted, b...","[siblings, judy, peter, discover, enchanted, b...","[siblings, judy, peter, discover, enchanted, b..."
2,15602,"[family, wedding, reignites, ancient, feud, ne...","[family, wedding, reignites, ancient, feud, ne...","[family, wedding, reignites, ancient, feud, ne...","[family, wedding, reignites, ancient, feud, ne...","[family, wedding, reignites, ancient, feud, ne...","[family, wedding, reignites, ancient, feud, ne...","[family, wedding, reignites, ancient, feud, ne...","[family, wedding, reignites, ancient, feud, ne...","[family, wedding, reignites, ancient, feud, ne...",...,"[family, wedding, reignites, ancient, feud, ne...","[family, wedding, reignites, ancient, feud, ne...","[family, wedding, reignites, ancient, feud, ne...","[family, wedding, reignites, ancient, feud, ne...","[family, wedding, reignites, ancient, feud, ne...","[family, wedding, reignites, ancient, feud, ne...","[family, wedding, reignites, ancient, feud, ne...","[family, wedding, reignites, ancient, feud, ne...","[family, wedding, reignites, ancient, feud, ne...","[family, wedding, reignites, ancient, feud, ne..."
3,31357,"[cheated, mistreated, stepped, women, holding,...","[cheated, mistreated, stepped, women, holding,...","[cheated, mistreated, stepped, women, holding,...","[cheated, mistreated, stepped, women, holding,...","[cheated, mistreated, stepped, women, holding,...","[cheated, mistreated, stepped, women, holding,...","[cheated, mistreated, stepped, women, holding,...","[cheated, mistreated, stepped, women, holding,...","[cheated, mistreated, stepped, women, holding,...",...,"[cheated, mistreated, stepped, women, holding,...","[cheated, mistreated, stepped, women, holding,...","[cheated, mistreated, stepped, women, holding,...","[cheated, mistreated, stepped, women, holding,...","[cheated, mistreated, stepped, women, holding,...","[cheated, mistreated, stepped, women, holding,...","[cheated, mistreated, stepped, women, holding,...","[cheated, mistreated, stepped, women, holding,...","[cheated, mistreated, stepped, women, holding,...","[cheated, mistreated, stepped, women, holding,..."
4,11862,"[george, banks, recovered, daughters, wedding,...","[george, banks, recovered, daughters, wedding,...","[george, banks, recovered, daughters, wedding,...","[george, banks, recovered, daughters, wedding,...","[george, banks, recovered, daughters, wedding,...","[george, banks, recovered, daughters, wedding,...","[george, banks, recovered, daughters, wedding,...","[george, banks, recovered, daughters, wedding,...","[george, banks, recovered, daughters, wedding,...",...,"[george, banks, recovered, daughters, wedding,...","[george, banks, recovered, daughters, wedding,...","[george, banks, recovered, daughters, wedding,...","[george, banks, recovered, daughters, wedding,...","[george, banks, recovered, daughters, wedding,...","[george, banks, recovered, daughters, wedding,...","[george, banks, recovered, daughters, wedding,...","[george, banks, recovered, daughters, wedding,...","[george, banks, recovered, daughters, wedding,...","[george, banks, recovered, daughters, wedding,..."


In [None]:
movies_clean_df.head()

Unnamed: 0,id,christmas,halloween,valentine,celebration,relaxing,nature,industrial,sunshine,sad,...,scary,anger,optimistic,adventurous,learning,artistic,science,cozy,colorful,space
0,862,"[led, woody, andys, toys, live, happily, room,...","[led, woody, andys, toys, live, happily, room,...","[led, woody, andys, toys, live, happily, room,...","[led, woody, andys, toys, live, happily, room,...","[led, woody, andys, toys, live, happily, room,...","[led, woody, andys, toys, live, happily, room,...","[led, woody, andys, toys, live, happily, room,...","[led, woody, andys, toys, live, happily, room,...","[led, woody, andys, toys, live, happily, room,...",...,"[led, woody, andys, toys, live, happily, room,...","[led, woody, andys, toys, live, happily, room,...","[led, woody, andys, toys, live, happily, room,...","[led, woody, andys, toys, live, happily, room,...","[led, woody, andys, toys, live, happily, room,...","[led, woody, andys, toys, live, happily, room,...","[led, woody, andys, toys, live, happily, room,...","[led, woody, andys, toys, live, happily, room,...","[led, woody, andys, toys, live, happily, room,...","[led, woody, andys, toys, live, happily, room,..."
1,8844,"[siblings, judy, peter, discover, enchanted, b...","[siblings, judy, peter, discover, enchanted, b...","[siblings, judy, peter, discover, enchanted, b...","[siblings, judy, peter, discover, enchanted, b...","[siblings, judy, peter, discover, enchanted, b...","[siblings, judy, peter, discover, enchanted, b...","[siblings, judy, peter, discover, enchanted, b...","[siblings, judy, peter, discover, enchanted, b...","[siblings, judy, peter, discover, enchanted, b...",...,"[siblings, judy, peter, discover, enchanted, b...","[siblings, judy, peter, discover, enchanted, b...","[siblings, judy, peter, discover, enchanted, b...","[siblings, judy, peter, discover, enchanted, b...","[siblings, judy, peter, discover, enchanted, b...","[siblings, judy, peter, discover, enchanted, b...","[siblings, judy, peter, discover, enchanted, b...","[siblings, judy, peter, discover, enchanted, b...","[siblings, judy, peter, discover, enchanted, b...","[siblings, judy, peter, discover, enchanted, b..."
2,15602,"[family, wedding, reignites, ancient, feud, ne...","[family, wedding, reignites, ancient, feud, ne...","[family, wedding, reignites, ancient, feud, ne...","[family, wedding, reignites, ancient, feud, ne...","[family, wedding, reignites, ancient, feud, ne...","[family, wedding, reignites, ancient, feud, ne...","[family, wedding, reignites, ancient, feud, ne...","[family, wedding, reignites, ancient, feud, ne...","[family, wedding, reignites, ancient, feud, ne...",...,"[family, wedding, reignites, ancient, feud, ne...","[family, wedding, reignites, ancient, feud, ne...","[family, wedding, reignites, ancient, feud, ne...","[family, wedding, reignites, ancient, feud, ne...","[family, wedding, reignites, ancient, feud, ne...","[family, wedding, reignites, ancient, feud, ne...","[family, wedding, reignites, ancient, feud, ne...","[family, wedding, reignites, ancient, feud, ne...","[family, wedding, reignites, ancient, feud, ne...","[family, wedding, reignites, ancient, feud, ne..."
3,31357,"[cheated, mistreated, stepped, women, holding,...","[cheated, mistreated, stepped, women, holding,...","[cheated, mistreated, stepped, women, holding,...","[cheated, mistreated, stepped, women, holding,...","[cheated, mistreated, stepped, women, holding,...","[cheated, mistreated, stepped, women, holding,...","[cheated, mistreated, stepped, women, holding,...","[cheated, mistreated, stepped, women, holding,...","[cheated, mistreated, stepped, women, holding,...",...,"[cheated, mistreated, stepped, women, holding,...","[cheated, mistreated, stepped, women, holding,...","[cheated, mistreated, stepped, women, holding,...","[cheated, mistreated, stepped, women, holding,...","[cheated, mistreated, stepped, women, holding,...","[cheated, mistreated, stepped, women, holding,...","[cheated, mistreated, stepped, women, holding,...","[cheated, mistreated, stepped, women, holding,...","[cheated, mistreated, stepped, women, holding,...","[cheated, mistreated, stepped, women, holding,..."
4,11862,"[george, banks, recovered, daughters, wedding,...","[george, banks, recovered, daughters, wedding,...","[george, banks, recovered, daughters, wedding,...","[george, banks, recovered, daughters, wedding,...","[george, banks, recovered, daughters, wedding,...","[george, banks, recovered, daughters, wedding,...","[george, banks, recovered, daughters, wedding,...","[george, banks, recovered, daughters, wedding,...","[george, banks, recovered, daughters, wedding,...",...,"[george, banks, recovered, daughters, wedding,...","[george, banks, recovered, daughters, wedding,...","[george, banks, recovered, daughters, wedding,...","[george, banks, recovered, daughters, wedding,...","[george, banks, recovered, daughters, wedding,...","[george, banks, recovered, daughters, wedding,...","[george, banks, recovered, daughters, wedding,...","[george, banks, recovered, daughters, wedding,...","[george, banks, recovered, daughters, wedding,...","[george, banks, recovered, daughters, wedding,..."


In [None]:
# Set up multiprocessing
pool = multiprocessing.Pool()

# Apply the function to each row in parallel
result = pool.map(compute_score_cosine_multiprocess, movies_clean_df.itertuples(name=None, index=False))

# Convert the result back to a DataFrame
result_df = pd.DataFrame(result, columns=movies_clean_df.columns)

# Close the multiprocessing pool
pool.close()

In [None]:
result_df

Unnamed: 0,id,christmas,halloween,valentine,celebration,relaxing,nature,industrial,sunshine,sad,...,scary,anger,optimistic,adventurous,learning,artistic,science,cozy,colorful,space
0,862,0.314036,0.243723,0.227234,0.263956,0.188129,0.235729,0.146556,0.209052,0.312449,...,0.274326,0.254786,0.188655,0.174596,0.305087,0.204853,0.244612,0.162607,0.191449,0.297191
1,8844,0.366865,0.281678,0.288238,0.296655,0.242690,0.285336,0.165981,0.256678,0.353044,...,0.357087,0.270303,0.190617,0.186222,0.344942,0.202537,0.307186,0.208504,0.225240,0.356042
2,15602,0.340226,0.262262,0.264154,0.277836,0.242795,0.257954,0.180138,0.238272,0.312298,...,0.310010,0.228834,0.176641,0.176968,0.298552,0.171401,0.269053,0.197423,0.197033,0.313791
3,31357,0.346733,0.237193,0.294634,0.261200,0.243419,0.270585,0.133883,0.266631,0.378205,...,0.334069,0.268935,0.236266,0.179665,0.314708,0.201392,0.274797,0.179321,0.187812,0.315843
4,11862,0.380813,0.288014,0.333921,0.316943,0.225635,0.265315,0.163688,0.265521,0.365237,...,0.326842,0.253788,0.169419,0.156457,0.329302,0.177140,0.276505,0.169968,0.180764,0.303260
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
44504,439050,0.349474,0.269290,0.297749,0.283620,0.242268,0.355242,0.210837,0.307890,0.410556,...,0.397247,0.329009,0.211716,0.196459,0.348085,0.200964,0.322511,0.164526,0.191990,0.375320
44505,111109,0.361379,0.273558,0.257611,0.315513,0.251808,0.289231,0.222746,0.233263,0.381719,...,0.360084,0.291631,0.221734,0.172876,0.364744,0.256059,0.333582,0.165930,0.217646,0.337518
44506,67758,0.383639,0.284131,0.306960,0.319363,0.201701,0.255371,0.196205,0.248353,0.356744,...,0.315153,0.274074,0.182883,0.174385,0.312932,0.186153,0.288673,0.167735,0.212144,0.367535
44507,227506,0.344343,0.231594,0.271956,0.273304,0.218902,0.277860,0.159441,0.259585,0.364387,...,0.316798,0.278251,0.209506,0.183726,0.326024,0.184401,0.290111,0.167003,0.187130,0.312649


In [None]:
for idx, category in enumerate(categories):

  # creates new columns for each category and computes the score
  max_value = result_df[category].max()
  min_value = result_df[category].min()
  result_df[category] = round(((result_df[category] - min_value) / (max_value - min_value))*100, 2)

In [None]:
result_df

Unnamed: 0,id,christmas,halloween,valentine,celebration,relaxing,nature,industrial,sunshine,sad,...,scary,anger,optimistic,adventurous,learning,artistic,science,cozy,colorful,space
0,862,60.09,56.71,58.42,41.55,53.51,62.67,40.56,55.82,58.58,...,48.71,70.99,57.85,56.83,65.10,62.77,55.74,64.78,52.80,61.08
1,8844,67.25,63.13,67.50,45.33,64.43,72.03,44.29,64.44,64.38,...,59.76,73.58,58.29,60.02,72.08,62.23,66.45,76.57,59.41,70.81
2,15602,63.64,59.85,63.91,43.15,64.45,66.87,47.00,61.11,58.56,...,53.47,66.66,55.16,57.48,63.95,55.00,59.92,73.72,53.90,63.83
3,31357,64.52,55.61,68.45,41.23,64.57,69.25,38.13,66.24,67.98,...,56.68,73.35,68.54,58.22,66.78,61.96,60.91,69.07,52.09,64.17
4,11862,69.13,64.20,74.29,47.67,61.02,68.25,43.85,66.04,66.12,...,55.72,70.82,53.53,51.87,69.34,56.33,61.20,66.67,50.72,62.09
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
44504,439050,64.89,61.03,68.91,43.82,64.34,85.22,52.90,73.71,72.60,...,65.12,83.37,63.03,62.82,72.63,61.86,69.08,65.27,52.91,74.00
44505,111109,66.50,61.76,62.94,47.50,66.25,72.76,55.18,60.20,68.48,...,60.16,77.13,65.28,56.36,75.55,74.65,70.98,65.63,57.92,67.75
44506,67758,69.52,63.54,70.28,47.95,56.23,66.38,50.09,62.93,64.91,...,54.16,74.20,56.56,56.78,66.47,58.43,63.28,66.09,56.85,72.71
44507,227506,64.20,54.67,65.07,42.63,59.67,70.62,43.03,64.97,66.00,...,54.38,74.90,62.53,59.33,68.77,58.02,63.53,65.91,51.96,63.64


In [None]:
result_df.describe()

Unnamed: 0,christmas,halloween,valentine,celebration,relaxing,nature,industrial,sunshine,sad,happy,...,scary,anger,optimistic,adventurous,learning,artistic,science,cozy,colorful,space
count,44509.0,44509.0,44509.0,44509.0,44509.0,44509.0,44509.0,44509.0,44509.0,44509.0,...,44509.0,44509.0,44509.0,44509.0,44509.0,44509.0,44509.0,44509.0,44509.0,44509.0
mean,60.719395,55.294028,62.098691,42.28455,56.192618,68.0278,45.468709,58.912371,60.311229,58.72716,...,51.619037,69.536496,56.188447,55.075828,64.852352,59.943088,62.072852,63.258345,51.958566,62.127866
std,7.578712,7.450226,7.497501,4.815974,7.883396,7.545701,5.796663,8.177954,8.247249,8.605806,...,6.766672,6.443406,6.881973,7.075812,8.221635,6.872395,7.607178,7.581145,6.757765,7.866801
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,56.11,50.67,57.48,39.48,51.38,63.68,41.96,53.91,55.26,53.32,...,47.55,65.78,52.17,50.93,60.05,56.03,57.59,58.67,47.98,57.51
50%,60.72,55.32,62.13,42.37,56.31,68.31,45.33,59.05,60.36,58.7,...,51.67,69.77,56.43,55.15,65.02,60.01,62.22,63.24,52.03,62.46
75%,65.51,59.99,66.9,45.26,61.25,72.76,48.91,64.16,65.66,64.23,...,55.92,73.63,60.49,59.39,69.98,64.01,66.84,67.95,56.15,67.14
max,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,...,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0


In [None]:
# Saves the new scores in a csv file
result_df.to_csv("/content/drive/Shareddrives/5500 Database Project/datasets/movies/movies_scores.csv")

## Games_df

In [None]:
# loads all games
games_df = pd.read_csv("/content/drive/Shareddrives/5500 Database Project/datasets/games/games.csv")

In [None]:
games_df.columns

Index(['AppID', 'Name', 'Release date', 'Estimated owners', 'Peak CCU',
       'Required age', 'Price', 'DLC count', 'About the game',
       'Supported languages', 'Full audio languages', 'Reviews',
       'Header image', 'Website', 'Support url', 'Support email', 'Windows',
       'Mac', 'Linux', 'Metacritic score', 'Metacritic url', 'User score',
       'Positive', 'Negative', 'Score rank', 'Achievements', 'Recommendations',
       'Notes', 'Average playtime forever', 'Average playtime two weeks',
       'Median playtime forever', 'Median playtime two weeks', 'Developers',
       'Publishers', 'Categories', 'Genres', 'Tags', 'Screenshots', 'Movies'],
      dtype='object')

In [None]:
games_df.head()

Unnamed: 0,AppID,Name,Release date,Estimated owners,Peak CCU,Required age,Price,DLC count,About the game,Supported languages,...,Average playtime two weeks,Median playtime forever,Median playtime two weeks,Developers,Publishers,Categories,Genres,Tags,Screenshots,Movies
0,20200,Galactic Bowling,"Oct 21, 2008",0 - 20000,0,0,19.99,0,"Galactic Bowling is an exaggerated and stylized bowling game with an intergalactic twist. Players will engage in fast-paced single and multi-player competition while being submerged in a unique new universe filled with over-the-top humor, wild characters, unique levels, and addictive game play. The title is aimed at players of all ages and skill sets. Through accessible and intuitive controls and game-play, Galactic Bowling allows you to jump right into the action. A single-player campaign and online play allow you to work your way up the ranks of the Galactic Bowling League! Whether you have hours to play or only a few minutes, Galactic Bowling is a fast paced and entertaining experience that will leave you wanting more! Full Single-player story campaign including 11 Characters and Environments. 2 Single-player play modes including Regular and Battle Modes. Head to Head Online Multiplayer play Modes. Super Powers, Special Balls, and Whammies. Unlockable Characters, Environments, a...",['English'],...,0,0,0,Perpetual FX Creative,Perpetual FX Creative,"Single-player,Multi-player,Steam Achievements,Partial Controller Support","Casual,Indie,Sports","Indie,Casual,Sports,Bowling","https://cdn.akamai.steamstatic.com/steam/apps/20200/0000005994.1920x1080.jpg?t=1640121033,https://cdn.akamai.steamstatic.com/steam/apps/20200/0000005993.1920x1080.jpg?t=1640121033,https://cdn.akamai.steamstatic.com/steam/apps/20200/0000005992.1920x1080.jpg?t=1640121033,https://cdn.akamai.steamstatic.com/steam/apps/20200/0000006011.1920x1080.jpg?t=1640121033,https://cdn.akamai.steamstatic.com/steam/apps/20200/0000005685.1920x1080.jpg?t=1640121033,https://cdn.akamai.steamstatic.com/steam/apps/20200/0000005686.1920x1080.jpg?t=1640121033,https://cdn.akamai.steamstatic.com/steam/apps/20200/0000005995.1920x1080.jpg?t=1640121033,https://cdn.akamai.steamstatic.com/steam/apps/20200/0000005688.1920x1080.jpg?t=1640121033,https://cdn.akamai.steamstatic.com/steam/apps/20200/0000005689.1920x1080.jpg?t=1640121033,https://cdn.akamai.steamstatic.com/steam/apps/20200/0000005690.1920x1080.jpg?t=1640121033",http://cdn.akamai.steamstatic.com/steam/apps/256863704/movie_max.mp4?t=1638854607
1,655370,Train Bandit,"Oct 12, 2017",0 - 20000,0,0,0.99,0,"THE LAW!! Looks to be a showdown atop a train. This will be your last fight. Good luck, Train Bandit. WHAT IS THIS GAME? Train Bandit is a simple score attack game. The Law will attack you from both sides. Your weapon is your keyboard. You'll use those keys to kick the living shit out of the law. React quickly by attacking the correct direction. React...or you're dead. THE FEATURES Unlock new bandits Earn Achievements Become Steam's Most Wanted ? Battle elite officers Kick the law's ass","['English', 'French', 'Italian', 'German', 'Spanish - Spain', 'Japanese', 'Portuguese - Brazil', 'Russian', 'Simplified Chinese', 'Traditional Chinese']",...,0,0,0,Rusty Moyher,Wild Rooster,"Single-player,Steam Achievements,Full controller support,Steam Leaderboards,Remote Play on Phone,Remote Play on Tablet,Remote Play on TV","Action,Indie","Indie,Action,Pixel Graphics,2D,Retro,Arcade,Score Attack,Minimalist,Comedy,Singleplayer,Fast-Paced,Casual,Funny,Parody,Difficult,Gore,Violent,Western,Controller,Blood","https://cdn.akamai.steamstatic.com/steam/apps/655370/ss_16785eac54b29db688870e832f6997c89cd7804b.1920x1080.jpg?t=1617500526,https://cdn.akamai.steamstatic.com/steam/apps/655370/ss_599ad201ec3bb869449aaab4d0b103e220645762.1920x1080.jpg?t=1617500526,https://cdn.akamai.steamstatic.com/steam/apps/655370/ss_a26cfed1bc3e9ca4225fdc9d5c10681cfeb4f106.1920x1080.jpg?t=1617500526,https://cdn.akamai.steamstatic.com/steam/apps/655370/ss_9b815c0ec4996783088d82d22105caa1c044acad.1920x1080.jpg?t=1617500526,https://cdn.akamai.steamstatic.com/steam/apps/655370/ss_b5aada1bd265ff4b34116794fb937ad62e2882bb.1920x1080.jpg?t=1617500526",http://cdn.akamai.steamstatic.com/steam/apps/256691108/movie_max.mp4?t=1506089586
2,1732930,Jolt Project,"Nov 17, 2021",0 - 20000,0,0,4.99,0,"Jolt Project: The army now has a new robotics project, jolt. It's up to you to control it and ensure the success of the missions! There are 9 stages of taking the breath away with the right difficulty and good gameplay. Plus an insane way of survival! Fire missiles at cars, tanks, helicopters and turrets! The fun is guaranteed! Use your mouse to aim and shoot and take out the enemies! In this game you will have to be aware of the various enemies who will do everything to destroy your charges and prevent the success of your mission! Cartoon-style graphics are optimized and fun and generate an excellent gaming environment!","['English', 'Portuguese - Brazil']",...,0,0,0,Campião Games,Campião Games,Single-player,"Action,Adventure,Indie,Strategy",,"https://cdn.akamai.steamstatic.com/steam/apps/1732930/ss_09d67dec0607be7c4ead80289763033a47c86d89.1920x1080.jpg?t=1637149386,https://cdn.akamai.steamstatic.com/steam/apps/1732930/ss_119713aa971021a5fe24ee241c4be9329d1a37ab.1920x1080.jpg?t=1637149386,https://cdn.akamai.steamstatic.com/steam/apps/1732930/ss_04221851fa82047cee95243edf20e413a35ff410.1920x1080.jpg?t=1637149386,https://cdn.akamai.steamstatic.com/steam/apps/1732930/ss_fb210f530ebe006066bfb5dff35d3c68f858f60e.1920x1080.jpg?t=1637149386,https://cdn.akamai.steamstatic.com/steam/apps/1732930/ss_f43db212224f5b33a0ef530a75c3f852d35e9342.1920x1080.jpg?t=1637149386,https://cdn.akamai.steamstatic.com/steam/apps/1732930/ss_eeb31f8499720fc1b20342f74f0040ca2f190881.1920x1080.jpg?t=1637149386","http://cdn.akamai.steamstatic.com/steam/apps/256847488/movie_max.mp4?t=1635980739,http://cdn.akamai.steamstatic.com/steam/apps/256847487/movie_max.mp4?t=1635980747"
3,1355720,Henosis™,"Jul 23, 2020",0 - 20000,0,0,5.99,0,"HENOSIS™ is a mysterious 2D Platform Puzzler where players are propelled into weird and visceral worlds as they take control of a small, droplet of water while overcoming obstacles and enemies throughout each level. The Player must venture through each world as it collects precious water tokens in order open the exit portal and restore vitality to its drought-ridden home world. Features: Traverse your way through 27 hand-crafted levels Unique player mechanics Battle menacing bosses across 3 distinct worlds* Original artwork &amp; animation Full controller support Localization support * Hidden world included!","['English', 'French', 'Italian', 'German', 'Spanish - Spain', 'Japanese', 'Korean', 'Portuguese', 'Russian', 'Simplified Chinese', 'Traditional Chinese']",...,0,0,0,Odd Critter Games,Odd Critter Games,"Single-player,Full controller support","Adventure,Casual,Indie","2D Platformer,Atmospheric,Surreal,Mystery,Puzzle,Survival,Adventure,Linear,Singleplayer,Experimental,Platformer,Precision Platformer,Puzzle-Platformer,2D,Stylized,Physics,Time Manipulation,Casual,Indie","https://cdn.akamai.steamstatic.com/steam/apps/1355720/ss_20bbae2d9d5aaa2f043f372a551faabc0c47b0b4.1920x1080.jpg?t=1639875115,https://cdn.akamai.steamstatic.com/steam/apps/1355720/ss_d471414fcb5b734a198533f7d068e1931b778546.1920x1080.jpg?t=1639875115,https://cdn.akamai.steamstatic.com/steam/apps/1355720/ss_64739e5a9a48355814505d85203f9fd6521564ca.1920x1080.jpg?t=1639875115,https://cdn.akamai.steamstatic.com/steam/apps/1355720/ss_5ecf89d6ac345779da09f7d42b232c371d500e90.1920x1080.jpg?t=1639875115,https://cdn.akamai.steamstatic.com/steam/apps/1355720/ss_ed65011a9294f333d9ad1dd3378251b0410e3f51.1920x1080.jpg?t=1639875115,https://cdn.akamai.steamstatic.com/steam/apps/1355720/ss_d2ae58e4a8a88446ab4bb3a1eea5f3ab412efd0e.1920x1080.jpg?t=1639875115,https://cdn.akamai.steamstatic.com/steam/apps/1355720/ss_9bfed05ab4ff0ab6e55318af8ebc0ecb10bfb7da.1920x1080.jpg?t=1639875115",http://cdn.akamai.steamstatic.com/steam/apps/256819153/movie_max.mp4?t=1611314333
4,1139950,Two Weeks in Painland,"Feb 3, 2020",0 - 20000,0,0,0.0,0,"ABOUT THE GAME Play as a hacker who has arranged a deal with a gangster. That’s how the protagonist, Jack, is assigned a mission that should be accomplished in a specific timeframe, which he will find out soon enough. THE GAME’S FEATURES Spy on 4 senior managers within an organization to find out about their personalities. Manage the recruitment process in the organization to improve the work climate. Hack the candidates who want to get into the organization to make your job easier. Try to avoid having your physical health impacted negatively in the process. All of this while you enjoy an interesting story full of humor and action that evolves along with the game.","['English', 'Spanish - Spain']",...,0,0,0,Unusual Games,Unusual Games,"Single-player,Steam Achievements","Adventure,Indie","Indie,Adventure,Nudity,Violent,Sexual Content,Story Rich","https://cdn.akamai.steamstatic.com/steam/apps/1139950/ss_cb94604e43f910a3b994f120412bdc5a576222ad.1920x1080.jpg?t=1595003825,https://cdn.akamai.steamstatic.com/steam/apps/1139950/ss_8602d875f27d7966e718f917725ac990f70179d7.1920x1080.jpg?t=1595003825,https://cdn.akamai.steamstatic.com/steam/apps/1139950/ss_750a7f351da65eb0b5ef9724e5af211b2f6a0f5d.1920x1080.jpg?t=1595003825,https://cdn.akamai.steamstatic.com/steam/apps/1139950/ss_2660287cf061716e420e2752d54f7affcac87fce.1920x1080.jpg?t=1595003825,https://cdn.akamai.steamstatic.com/steam/apps/1139950/ss_3ba71b7d24b127ebe71de98c65774428bd5c6f2c.1920x1080.jpg?t=1595003825,https://cdn.akamai.steamstatic.com/steam/apps/1139950/ss_f582e3a210bfc9bff61063a5b145df07b0c030e0.1920x1080.jpg?t=1595003825,https://cdn.akamai.steamstatic.com/steam/apps/1139950/ss_8459843b2a4165863ff49b9cb7d9482069cc3911.1920x1080.jpg?t=1595003825,https://cdn.akamai.steamstatic.com/steam/apps/1139950/ss_4c6feb0ea8e5d1828f2586a67db9876743c06913.1920x1080.jpg?t=1595003...",http://cdn.akamai.steamstatic.com/steam/apps/256764430/movie_max.mp4?t=1580660973


In [None]:
games_df.shape

(85103, 39)

In [None]:
# cleans about the game
games_clean_df = games_df[['AppID', 'About the game', 'Genres']].copy().dropna()
games_clean_df["About the game"] = games_clean_df["About the game"].apply(clean_text)
games_clean_df["Genres"] = games_clean_df["Genres"].apply(lambda x: x.replace(',', ' '))

In [None]:
games_clean_df.head()

Unnamed: 0,AppID,About the game,Genres
0,20200,galactic bowling exaggerated and stylized bowl...,Casual Indie Sports
1,655370,the law looks showdown atop train this will yo...,Action Indie
2,1732930,jolt project the army now has new robotics pro...,Action Adventure Indie Strategy
3,1355720,henosis mysterious platform puzzler where play...,Adventure Casual Indie
4,1139950,about the game play hacker who has arranged de...,Adventure Indie


In [None]:
# creates tokenized description column
games_clean_df["tokenized description"] = games_clean_df["About the game"].apply(tokenize_words)
games_clean_df["tokenized genre"] = games_clean_df["Genres"].apply(tokenize_words)

pd.reset_option("^display.", silent=True)
games_clean_df.head()

Unnamed: 0,AppID,About the game,Genres,tokenized description,tokenized genre
0,20200,galactic bowling exaggerated and stylized bowl...,Casual Indie Sports,"[galactic, bowling, exaggerated, stylized, bow...","[Casual, Indie, Sports]"
1,655370,the law looks showdown atop train this will yo...,Action Indie,"[law, looks, showdown, atop, train, last, figh...","[Action, Indie]"
2,1732930,jolt project the army now has new robotics pro...,Action Adventure Indie Strategy,"[jolt, project, army, new, robotics, project, ...","[Action, Adventure, Indie, Strategy]"
3,1355720,henosis mysterious platform puzzler where play...,Adventure Casual Indie,"[henosis, mysterious, platform, puzzler, playe...","[Adventure, Casual, Indie]"
4,1139950,about the game play hacker who has arranged de...,Adventure Indie,"[game, play, hacker, arranged, deal, gangster,...","[Adventure, Indie]"


In [None]:
for idx, category in enumerate(categories):
  print(f"computing scores for '{category}' category...")

  # creates new columns for each category and computes the score
  games_clean_df[category] = games_clean_df.apply(lambda x: 0.8 * compute_score_cosine(x["tokenized description"], category) + 0.2 * compute_score_cosine(x["tokenized genre"], category), axis=1)
  max_value = games_clean_df[category].max()
  min_value = games_clean_df[category].min()
  games_clean_df[category] = round(((games_clean_df[category] - min_value) / (max_value - min_value))*100, 2)

computing scores for 'christmas' category...
computing scores for 'halloween' category...
computing scores for 'valentine' category...
computing scores for 'celebration' category...
computing scores for 'relaxing' category...
computing scores for 'nature' category...
computing scores for 'industrial' category...
computing scores for 'sunshine' category...
computing scores for 'sad' category...
computing scores for 'happy' category...
computing scores for 'summer' category...
computing scores for 'winter' category...
computing scores for 'sports' category...
computing scores for 'playful' category...
computing scores for 'energetic' category...
computing scores for 'scary' category...
computing scores for 'anger' category...
computing scores for 'optimistic' category...
computing scores for 'adventurous' category...
computing scores for 'learning' category...
computing scores for 'artistic' category...
computing scores for 'science' category...
computing scores for 'cozy' category...
co

In [None]:
games_clean_df.head()

Unnamed: 0,AppID,About the game,Genres,tokenized description,tokenized genre,christmas,halloween,valentine,celebration,relaxing,...,scary,anger,optimistic,adventurous,learning,artistic,science,cozy,colorful,space
0,20200,galactic bowling exaggerated and stylized bowl...,Casual Indie Sports,"[galactic, bowling, exaggerated, stylized, bow...","[Casual, Indie, Sports]",61.13,62.4,60.64,67.54,55.55,...,60.3,65.43,68.02,62.22,58.76,68.39,60.52,57.76,67.29,69.54
1,655370,the law looks showdown atop train this will yo...,Action Indie,"[law, looks, showdown, atop, train, last, figh...","[Action, Indie]",66.52,66.59,66.52,72.01,55.37,...,68.36,69.82,69.08,59.26,60.34,63.41,66.59,59.82,62.33,72.63
2,1732930,jolt project the army now has new robotics pro...,Action Adventure Indie Strategy,"[jolt, project, army, new, robotics, project, ...","[Action, Adventure, Indie, Strategy]",64.17,62.09,63.39,69.23,54.91,...,64.21,71.37,75.21,61.17,63.91,70.09,67.21,59.82,68.61,73.65
3,1355720,henosis mysterious platform puzzler where play...,Adventure Casual Indie,"[henosis, mysterious, platform, puzzler, playe...","[Adventure, Casual, Indie]",58.58,54.21,56.08,64.68,50.9,...,55.71,63.73,72.37,58.75,57.39,71.21,61.47,57.44,69.48,68.64
4,1139950,about the game play hacker who has arranged de...,Adventure Indie,"[game, play, hacker, arranged, deal, gangster,...","[Adventure, Indie]",67.21,64.39,68.47,69.65,58.79,...,67.65,73.68,83.87,64.63,68.48,74.26,72.54,60.75,66.93,73.37


In [None]:
games_clean_df.describe()

Unnamed: 0,AppID,christmas,halloween,valentine,celebration,relaxing,nature,industrial,sunshine,sad,...,scary,anger,optimistic,adventurous,learning,artistic,science,cozy,colorful,space
count,81445.0,81445.0,81445.0,81445.0,81445.0,81445.0,81445.0,81445.0,81445.0,81445.0,...,81445.0,81445.0,81445.0,81445.0,81445.0,81445.0,81445.0,81445.0,81445.0,81445.0
mean,1325016.0,63.661012,62.556362,64.000088,66.608447,55.054611,72.004195,45.486024,64.791177,63.850876,...,62.236703,65.15935,70.137164,58.454042,59.186374,65.887207,63.207591,59.450712,67.737387,69.930187
std,687786.6,8.600137,8.682789,9.297461,8.69514,7.85653,9.692628,4.886846,9.600067,8.641739,...,8.758329,9.260493,9.088676,7.023565,7.922288,8.350104,8.158793,8.009696,8.696731,8.3868
min,10.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,752030.0,60.48,59.03,59.98,64.0,51.87,69.22,43.88,60.76,60.19,...,58.88,62.27,67.21,56.33,56.93,63.65,60.82,56.46,64.85,67.6
50%,1278760.0,64.34,63.2,64.57,67.49,55.49,73.04,45.87,65.45,64.36,...,62.86,66.09,71.11,59.19,60.09,66.71,64.08,59.9,68.28,70.87
75%,1875720.0,68.22,67.39,69.26,70.9,59.21,76.75,47.89,70.13,68.64,...,66.91,69.88,74.87,61.98,63.09,69.76,67.24,63.5,71.88,74.01
max,2749500.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,...,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0


In [None]:
games_final_df = games_clean_df.copy()
games_final_df = games_final_df.drop(columns = ['About the game', 'Genres', 'tokenized description', 'tokenized genre'])

games_final_df.head()

Unnamed: 0,AppID,christmas,halloween,valentine,celebration,relaxing,nature,industrial,sunshine,sad,...,scary,anger,optimistic,adventurous,learning,artistic,science,cozy,colorful,space
0,20200,61.13,62.4,60.64,67.54,55.55,69.39,45.43,56.32,60.24,...,60.3,65.43,68.02,62.22,58.76,68.39,60.52,57.76,67.29,69.54
1,655370,66.52,66.59,66.52,72.01,55.37,72.45,45.16,64.57,67.96,...,68.36,69.82,69.08,59.26,60.34,63.41,66.59,59.82,62.33,72.63
2,1732930,64.17,62.09,63.39,69.23,54.91,73.87,49.42,65.71,63.12,...,64.21,71.37,75.21,61.17,63.91,70.09,67.21,59.82,68.61,73.65
3,1355720,58.58,54.21,56.08,64.68,50.9,71.36,47.47,62.53,59.32,...,55.71,63.73,72.37,58.75,57.39,71.21,61.47,57.44,69.48,68.64
4,1139950,67.21,64.39,68.47,69.65,58.79,75.32,49.56,65.29,69.73,...,67.65,73.68,83.87,64.63,68.48,74.26,72.54,60.75,66.93,73.37


In [None]:
# Saves the new scores in a csv file
games_final_df.to_csv("/content/drive/Shareddrives/5500 Database Project/datasets/games/games_scores.csv")

## Books_df

In [None]:
# loads all games
books_df = pd.read_csv("/content/drive/Shareddrives/5500 Database Project/datasets/books/books_data.csv")
#books_df = books_df.dropna()

In [None]:
books_df

Unnamed: 0,Title,description,authors,image,previewLink,publisher,publishedDate,infoLink,categories,ratingsCount
0,Its Only Art If Its Well Hung!,,['Julie Strain'],http://books.google.com/books/content?id=DykPA...,http://books.google.nl/books?id=DykPAAAACAAJ&d...,,1996,http://books.google.nl/books?id=DykPAAAACAAJ&d...,['Comics & Graphic Novels'],
1,Dr. Seuss: American Icon,Philip Nel takes a fascinating look into the k...,['Philip Nel'],http://books.google.com/books/content?id=IjvHQ...,http://books.google.nl/books?id=IjvHQsCn_pgC&p...,A&C Black,2005-01-01,http://books.google.nl/books?id=IjvHQsCn_pgC&d...,['Biography & Autobiography'],
2,Wonderful Worship in Smaller Churches,This resource includes twelve principles in un...,['David R. Ray'],http://books.google.com/books/content?id=2tsDA...,http://books.google.nl/books?id=2tsDAAAACAAJ&d...,,2000,http://books.google.nl/books?id=2tsDAAAACAAJ&d...,['Religion'],
3,Whispers of the Wicked Saints,Julia Thomas finds her life spinning out of co...,['Veronica Haddon'],http://books.google.com/books/content?id=aRSIg...,http://books.google.nl/books?id=aRSIgJlq6JwC&d...,iUniverse,2005-02,http://books.google.nl/books?id=aRSIgJlq6JwC&d...,['Fiction'],
4,"Nation Dance: Religion, Identity and Cultural ...",,['Edward Long'],,http://books.google.nl/books?id=399SPgAACAAJ&d...,,2003-03-01,http://books.google.nl/books?id=399SPgAACAAJ&d...,,
...,...,...,...,...,...,...,...,...,...,...
212399,The Orphan Of Ellis Island (Time Travel Advent...,"During a school trip to Ellis Island, Dominick...",['Elvira Woodruff'],http://books.google.com/books/content?id=J7M-N...,http://books.google.com/books?id=J7M-NwAACAAJ&...,Scholastic Paperbacks,2000-06-01,http://books.google.com/books?id=J7M-NwAACAAJ&...,['Juvenile Fiction'],2.0
212400,Red Boots for Christmas,Everyone in the village of Friedensdorf is hap...,,http://books.google.com/books/content?id=3n8k6...,http://books.google.com/books?id=3n8k6wl4BbYC&...,,1995,http://books.google.com/books?id=3n8k6wl4BbYC&...,['Juvenile Fiction'],
212401,Mamaw,"Give your Mamaw a useful, beautiful and though...",['Wild Wild Cabbage'],,http://books.google.com/books?id=zytVswEACAAJ&...,,2018-01-17,http://books.google.com/books?id=zytVswEACAAJ&...,,
212402,The Autograph Man,Alex-Li Tandem sells autographs. His business ...,['Zadie Smith'],http://books.google.com/books/content?id=JM6YV...,http://books.google.com/books?id=JM6YVPx_clMC&...,Vintage,2003-08-12,https://play.google.com/store/books/details?id...,['Fiction'],19.0


In [None]:
# cleans synopsis
books_clean_df = books_df[['Title', 'description', 'categories']].copy().dropna()
books_clean_df["description"] = books_clean_df["description"].apply(clean_text)
books_clean_df["categories"] = books_clean_df["categories"].apply(clean_categories)

books_clean_df

Unnamed: 0,Title,description,categories
1,Dr. Seuss: American Icon,philip nel takes fascinating look into the key...,biography autobiography
2,Wonderful Worship in Smaller Churches,this resource includes twelve principles under...,religion
3,Whispers of the Wicked Saints,julia thomas finds her life spinning out contr...,fiction
5,The Church of Christ: A Biblical Ecclesiology ...,the church christ biblical ecclesiology for to...,religion
8,Saint Hyacinth of Poland,the story for children and hyacinth the domini...,biography autobiography
...,...,...,...
212397,The Magic of the Soul: Applying Spiritual Powe...,the magic the soul applying spiritual power da...,body mind spirit
212398,Autodesk Inventor 10 Essentials Plus,autodesk inventor 2017 essentials plus provide...,computers
212399,The Orphan Of Ellis Island (Time Travel Advent...,during school trip ellis island dominick avaro...,juvenile fiction
212400,Red Boots for Christmas,everyone the village friedensdorf happily prep...,juvenile fiction


In [None]:
# creates tokenized columns
books_clean_df["tokenized description"] = books_clean_df["description"].apply(tokenize_words)
books_clean_df["tokenized categories"] = books_clean_df["categories"].apply(tokenize_words)

pd.reset_option("^display.", silent=True)
books_clean_df.head()

Unnamed: 0,Title,description,categories,tokenized description,tokenized categories
1,Dr. Seuss: American Icon,philip nel takes fascinating look into the key...,biography autobiography,"[philip, nel, takes, fascinating, look, key, a...","[biography, autobiography]"
2,Wonderful Worship in Smaller Churches,this resource includes twelve principles under...,religion,"[resource, includes, twelve, principles, under...",[religion]
3,Whispers of the Wicked Saints,julia thomas finds her life spinning out contr...,fiction,"[julia, thomas, finds, life, spinning, control...",[fiction]
5,The Church of Christ: A Biblical Ecclesiology ...,the church christ biblical ecclesiology for to...,religion,"[church, christ, biblical, ecclesiology, today...",[religion]
8,Saint Hyacinth of Poland,the story for children and hyacinth the domini...,biography autobiography,"[story, children, hyacinth, dominican, planted...","[biography, autobiography]"


In [None]:
books_clean_df = books_clean_df[['Title', 'tokenized description', 'tokenized categories']]
books_clean_df

Unnamed: 0,Title,tokenized description,tokenized categories
1,Dr. Seuss: American Icon,"[philip, nel, takes, fascinating, look, key, a...","[biography, autobiography]"
2,Wonderful Worship in Smaller Churches,"[resource, includes, twelve, principles, under...",[religion]
3,Whispers of the Wicked Saints,"[julia, thomas, finds, life, spinning, control...",[fiction]
5,The Church of Christ: A Biblical Ecclesiology ...,"[church, christ, biblical, ecclesiology, today...",[religion]
8,Saint Hyacinth of Poland,"[story, children, hyacinth, dominican, planted...","[biography, autobiography]"
...,...,...,...
212397,The Magic of the Soul: Applying Spiritual Powe...,"[magic, soul, applying, spiritual, power, dail...","[body, mind, spirit]"
212398,Autodesk Inventor 10 Essentials Plus,"[autodesk, inventor, 2017, essentials, plus, p...",[computers]
212399,The Orphan Of Ellis Island (Time Travel Advent...,"[school, trip, ellis, island, dominick, avaro,...","[juvenile, fiction]"
212400,Red Boots for Christmas,"[everyone, village, friedensdorf, happily, pre...","[juvenile, fiction]"


In [None]:
# Set up multiprocessing
pool = multiprocessing.Pool()

# Apply the function to each row in parallel
result = pool.map(compute_score_cosine_multiprocess_two_col, books_clean_df.itertuples(name=None, index=False))

# Convert the result back to a DataFrame
result_df = pd.DataFrame(result, columns= ['Title'] + categories)

# Close the multiprocessing pool
pool.close()

In [None]:
result_df

Unnamed: 0,Title,christmas,halloween,valentine,celebration,relaxing,nature,industrial,sunshine,sad,...,scary,anger,optimistic,adventurous,learning,artistic,science,cozy,colorful,space
0,Dr. Seuss: American Icon,0.291404,0.217958,0.228603,0.253020,0.190739,0.270884,0.177613,0.204041,0.314247,...,0.268341,0.215016,0.191885,0.170489,0.318669,0.245709,0.312313,0.142570,0.221067,0.293717
1,Wonderful Worship in Smaller Churches,0.336489,0.215621,0.242078,0.314812,0.244904,0.284240,0.192283,0.202668,0.282636,...,0.259469,0.250688,0.197706,0.184916,0.418788,0.205183,0.340373,0.158482,0.208416,0.290690
2,Whispers of the Wicked Saints,0.348229,0.251117,0.290175,0.294510,0.233746,0.298543,0.158418,0.273149,0.372322,...,0.325371,0.297882,0.189298,0.154828,0.325900,0.187986,0.300513,0.169498,0.183914,0.319195
3,The Church of Christ: A Biblical Ecclesiology ...,0.292150,0.170954,0.211067,0.273790,0.195737,0.259580,0.177237,0.181398,0.283821,...,0.233510,0.225550,0.223509,0.147225,0.347746,0.220425,0.314263,0.131937,0.186620,0.253926
4,Saint Hyacinth of Poland,0.317763,0.206846,0.240648,0.258704,0.180263,0.252660,0.165674,0.200972,0.303892,...,0.261850,0.239411,0.181055,0.162399,0.294596,0.182795,0.264051,0.137728,0.184102,0.256331
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
137883,The Magic of the Soul: Applying Spiritual Powe...,0.309113,0.189667,0.240463,0.278003,0.225096,0.319730,0.177066,0.239414,0.305709,...,0.269672,0.271353,0.216843,0.179154,0.385954,0.262358,0.339990,0.147749,0.223786,0.313935
137884,Autodesk Inventor 10 Essentials Plus,0.275103,0.192833,0.202967,0.237472,0.202178,0.262415,0.226699,0.178820,0.246634,...,0.229295,0.214877,0.162332,0.151384,0.385144,0.245152,0.334622,0.154362,0.226138,0.303941
137885,The Orphan Of Ellis Island (Time Travel Advent...,0.316078,0.223510,0.237648,0.273552,0.202691,0.247362,0.155761,0.216683,0.292383,...,0.253757,0.191675,0.153839,0.160623,0.275005,0.130021,0.243258,0.160970,0.151742,0.271342
137886,Red Boots for Christmas,0.421379,0.230307,0.233502,0.300786,0.269557,0.289144,0.155601,0.268420,0.341008,...,0.302812,0.225464,0.194702,0.163711,0.316193,0.188863,0.276779,0.231319,0.222915,0.291453


In [None]:
for idx, category in enumerate(categories):

  # creates new columns for each category and computes the score
  max_value = result_df[category].max()
  min_value = result_df[category].min()
  result_df[category] = round(((result_df[category] - min_value) / (max_value - min_value))*100, 2)

In [None]:
result_df

Unnamed: 0,Title,christmas,halloween,valentine,celebration,relaxing,nature,industrial,sunshine,sad,...,scary,anger,optimistic,adventurous,learning,artistic,science,cozy,colorful,space
0,Dr. Seuss: American Icon,46.06,53.43,49.12,50.22,52.51,71.56,43.06,58.57,60.54,...,55.12,53.12,46.47,55.91,48.09,58.81,37.82,46.87,54.65,63.39
1,Wonderful Worship in Smaller Churches,51.41,53.06,50.98,59.33,63.43,72.95,45.83,58.35,56.25,...,53.82,59.12,47.37,59.38,60.27,51.42,40.54,49.78,52.57,62.91
2,Whispers of the Wicked Saints,52.81,58.72,57.63,56.34,61.18,74.45,39.44,69.51,68.42,...,63.42,67.05,46.06,52.14,48.97,48.29,36.68,51.80,48.54,67.44
3,The Church of Christ: A Biblical Ecclesiology ...,46.15,45.93,46.69,53.28,53.52,70.37,42.99,54.98,56.41,...,50.05,54.89,51.40,50.31,51.63,54.20,38.01,44.92,48.98,57.07
4,Saint Hyacinth of Poland,49.19,51.66,50.79,51.06,50.40,69.65,40.81,58.08,59.13,...,54.17,57.22,44.78,53.96,45.16,47.34,33.15,45.98,48.57,57.45
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
137883,The Magic of the Soul: Applying Spiritual Powe...,48.16,48.92,50.76,53.90,59.43,76.67,42.96,64.17,59.38,...,55.31,62.59,50.36,57.99,56.28,61.85,40.50,47.82,55.10,66.60
137884,Autodesk Inventor 10 Essentials Plus,44.13,49.43,45.57,47.93,54.82,70.67,52.32,54.57,51.37,...,49.43,53.10,41.86,51.31,56.18,58.71,39.98,49.03,55.49,65.02
137885,The Orphan Of Ellis Island (Time Travel Advent...,48.99,54.32,50.37,53.25,54.92,69.09,38.94,60.57,57.57,...,52.99,49.20,40.53,53.53,42.77,37.72,31.13,50.23,43.24,59.83
137886,Red Boots for Christmas,61.49,55.40,49.80,57.26,68.39,73.47,38.91,68.76,64.17,...,60.13,54.88,46.90,54.28,47.78,48.45,34.38,63.11,54.96,63.03


In [None]:
result_df.describe()

Unnamed: 0,christmas,halloween,valentine,celebration,relaxing,nature,industrial,sunshine,sad,happy,...,scary,anger,optimistic,adventurous,learning,artistic,science,cozy,colorful,space
count,137888.0,137888.0,137888.0,137888.0,137888.0,137888.0,137888.0,137888.0,137888.0,137888.0,...,137888.0,137888.0,137888.0,137888.0,137888.0,137888.0,137888.0,137888.0,137888.0,137888.0
mean,47.120068,52.3683,49.43807,51.861723,54.330613,71.477897,45.171699,59.374569,57.531102,53.688417,...,54.925756,56.665054,45.164884,54.167864,49.526644,52.712829,37.344561,48.608871,51.782261,63.898523
std,6.752109,7.200275,7.079512,6.363065,7.761711,4.36602,6.839756,7.469133,8.033507,8.382409,...,7.497385,7.019307,5.40789,6.618005,6.728204,6.683084,4.972809,5.812259,6.386588,7.301646
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,43.38,47.91,45.24,48.86,50.32,69.44,40.99,55.02,52.88,48.58,...,50.67,53.15,42.58,51.07,45.93,49.44,34.7,45.47,48.75,60.35
50%,47.13,52.25,49.37,52.19,54.42,71.7,44.83,59.32,57.47,53.47,...,54.865,56.97,45.54,54.57,49.63,53.01,37.45,48.5,51.78,64.37
75%,51.09,56.84,53.78,55.42,58.89,73.97,49.29,63.97,62.45,58.87,...,59.47,60.79,48.37,58.01,53.65,56.64,40.27,51.83,55.06,68.26
max,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,...,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0


In [None]:
# Saves the new scores in a csv file
result_df.to_csv("/content/drive/Shareddrives/5500 Database Project/datasets/books/books_scores.csv")