In [1]:
import ast
import itertools
import numpy as np
import pandas as pd
import re
import spacy
import torch
from sklearn.feature_extraction.text import TfidfVectorizer


In [3]:
df = pd.read_csv('dataset/movies_metadata.csv', encoding='utf-8')
df_cleaned = df.dropna()
df.shape

  df = pd.read_csv('dataset/movies_metadata.csv', encoding='utf-8')


(45466, 24)

In [4]:
# df['budget'] = df['budget'].astype(int)
df['genres'] = df['genres'].apply(ast.literal_eval)
df['genre_names'] = df['genres'].apply(lambda genre_list: [genre['name'] for genre in genre_list])
# df['production_companies'] = df['production_companies'].apply(lambda x: ast.literal_eval(x) if pd.notna(x) else [])
# df['production_companies_names'] = df['production_companies'].apply(lambda genre_list: [genre['name'] for genre in genre_list])
df['production_companies_names'] = df['production_companies'].apply(
    lambda genre_list: [genre['name'] for genre in genre_list] if isinstance(genre_list, list) else []
)
# df['production_companies_names'] = df['production_companies'].apply(lambda genre_list: [genre['name'] if pd.notna(genre_list) else [] for genre in genre_list])
# df['production_countries'] = df['production_countries'].apply(ast.literal_eval)
# df['production_countries_names'] = df['production_countries'].apply(lambda genre_list: [genre['name'] for genre in genre_list])
df['production_countries_names'] = df['production_countries'].apply(
    lambda genre_list: [genre['name'] for genre in genre_list] if isinstance(genre_list, list) else []
)
# df['release_date'] = pd.to_datetime(df['release_date'])
df['release_date'] = pd.to_datetime(df['release_date'], format='%Y-%m-%d', errors='coerce')
df_cleaned = df.dropna(subset=['release_date'])
df['release_year'] = df['release_date'].dt.year
df = df.drop(columns=['belongs_to_collection', 'homepage', 'status', 'video', 'imdb_id', 'budget', 'poster_path', 'revenue', 'spoken_languages', 'genres', 'production_companies', 'production_countries', 'release_date' ])
df.rename(columns={'genre_names': 'genres', 'production_companies_names': 'production_companies', 'production_countries_names' : 'production_countries'}, inplace=True)
df.dtypes
df.to_csv('dataset/movies_metadata_preprocessed_big.csv', index=False) 
df

Unnamed: 0,adult,id,original_language,original_title,overview,popularity,runtime,tagline,title,vote_average,vote_count,genres,production_companies,production_countries,release_year
0,False,862,en,Toy Story,"Led by Woody, Andy's toys live happily in his ...",21.946943,81.0,,Toy Story,7.7,5415.0,"[Animation, Comedy, Family]",[],[],1995.0
1,False,8844,en,Jumanji,When siblings Judy and Peter discover an encha...,17.015539,104.0,Roll the dice and unleash the excitement!,Jumanji,6.9,2413.0,"[Adventure, Fantasy, Family]",[],[],1995.0
2,False,15602,en,Grumpier Old Men,A family wedding reignites the ancient feud be...,11.7129,101.0,Still Yelling. Still Fighting. Still Ready for...,Grumpier Old Men,6.5,92.0,"[Romance, Comedy]",[],[],1995.0
3,False,31357,en,Waiting to Exhale,"Cheated on, mistreated and stepped on, the wom...",3.859495,127.0,Friends are the people who let you be yourself...,Waiting to Exhale,6.1,34.0,"[Comedy, Drama, Romance]",[],[],1995.0
4,False,11862,en,Father of the Bride Part II,Just when George Banks has recovered from his ...,8.387519,106.0,Just When His World Is Back To Normal... He's ...,Father of the Bride Part II,5.7,173.0,[Comedy],[],[],1995.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
45461,False,439050,fa,رگ خواب,Rising and falling between a man and woman.,0.072051,90.0,Rising and falling between a man and woman,Subdue,4.0,1.0,"[Drama, Family]",[],[],
45462,False,111109,tl,Siglo ng Pagluluwal,An artist struggles to finish his work while a...,0.178241,360.0,,Century of Birthing,9.0,3.0,[Drama],[],[],2011.0
45463,False,67758,en,Betrayal,"When one of her hits goes wrong, a professiona...",0.903007,90.0,A deadly game of wits.,Betrayal,3.8,6.0,"[Action, Drama, Thriller]",[],[],2003.0
45464,False,227506,en,Satana likuyushchiy,"In a small town live two brothers, one a minis...",0.003503,87.0,,Satan Triumphant,0.0,0.0,[],[],[],1917.0


In [5]:

# Load the spaCy English model
nlp = spacy.load("en_core_web_sm")

# # Example words
# words = ["goes", "roads", "leaves"]

# # Process the words and get lemmas
# lemmas = [token.lemma_ for token in nlp(" ".join(words))]
# print(lemmas)  # Output: ['go', 'road', 'leaf']

In [6]:
def to_base_forms(s):
    try:
        # s = s.split()
        # print(s, "!!!")
        pattern = r"[^a-zA-Z']"
        s = " ".join(map(lambda x: (re.sub(pattern, "", x)).lower(), s.split()))
        # print(s)
        return (" ".join([token.lemma_ for token in nlp(s)]))
    except:
        # print(s)
        return None

vectorized_function = np.vectorize(to_base_forms)
mapped_arr = vectorized_function(df["overview"].values)

In [7]:
mapped_arr


array(["lead by woody andy 's toy live happily in his room until andy 's birthday bring buzz lightyear onto the scene afraid of lose his place in andy 's heart woody plot against buzz but when circumstance separate buzz and woody from their owner the duo eventually learn to put aside their difference",
       "when sibling judy and peter discover an enchanted board game that open the door to a magical world they unwittingly invite alan   an adult who be be trap inside the game for   year   into their living room alan 's only hope for freedom be to finish the game which prove risky as all three find themselves run from giant rhinoceros evil monkey and other terrifying creature",
       'a family wedding reignite the ancient feud between nextdoor neighbor and fishing buddy john and max meanwhile a sultry italian divorce open a restaurant at the local bait shop alarm the local who worry she will scare the fish away but she be less interested in seafood than she be in cook up a hot time wi

In [8]:
vectorizer = TfidfVectorizer(stop_words='english')

# Fit and transform the corpus
tfidf_matrix = vectorizer.fit_transform([" ".join(mapped_arr)])

# Get feature names (words)
feature_names = vectorizer.get_feature_names_out()

# Convert the TF-IDF matrix to a dense format and print
dense_matrix = tfidf_matrix.todense()

feature_names = vectorizer.get_feature_names_out()
tfidf_array = np.array(dense_matrix)
word_index = feature_names.tolist().index('friend')
word_tfidf = tfidf_array[:, word_index][0]
print("TF-IDF value for 'love':" + str(word_tfidf))
word_tfidf.shape
# print("TF-IDF Matrix:")
# print(dense_matrix)

TF-IDF value for 'love':0.13840590234633166


()

In [9]:
"to" in feature_names

False

In [10]:
tf_idf = {}
c = 0
banned_words = ['life', 'new', "film", "come", "make", "movie"]
for w in set(" ".join(mapped_arr).split()):
    try:
        if w in banned_words:
            tf_idf[w] = 0
        elif w not in feature_names:
            tf_idf[w] = 0
        else:
            word_index = feature_names.tolist().index(w)
            word_tfidf = tfidf_array[:, word_index][0].item()
            tf_idf[w] = word_tfidf
    except:
        c += 1
        print(w)
        tfidf_array[w] = 0
print(c)

0


In [11]:
tf_idf

{'tsarist': 0.00011553080329409987,
 'indomitable': 0.00031770970905877464,
 'interrogator': 0.00020217890576467479,
 'torsillo': 5.7765401647049936e-05,
 'assbreaker': 2.8882700823524968e-05,
 'semashko': 2.8882700823524968e-05,
 'parsifal': 5.7765401647049936e-05,
 'qualifying': 2.8882700823524968e-05,
 'altruistic': 8.66481024705749e-05,
 'casse': 5.7765401647049936e-05,
 'arlo': 0.0002888270082352497,
 'iibirkenau': 2.8882700823524968e-05,
 'plymouth': 8.66481024705749e-05,
 'unflappable': 8.66481024705749e-05,
 'oprah': 8.66481024705749e-05,
 'nolte': 0.0001732962049411498,
 'feuerstein': 2.8882700823524968e-05,
 'terajima': 2.8882700823524968e-05,
 'mammut': 2.8882700823524968e-05,
 'farocki': 0.0001732962049411498,
 'pasupati': 2.8882700823524968e-05,
 'soho': 0.00025994430741172473,
 'lewly': 2.8882700823524968e-05,
 'edifis': 2.8882700823524968e-05,
 'missionprotect': 2.8882700823524968e-05,
 'ultrarealistic': 2.8882700823524968e-05,
 'versaille': 0.0002888270082352497,
 'indu

In [12]:
sorted(tf_idf.items(), key=lambda x: x[1], reverse=False)


[("o'grimm", 0),
 ("d'hebergement", 0),
 ("qu'une", 0),
 ("bennigan'slike", 0),
 ('get', 0),
 ("o'casey", 0),
 ("vaudeville'artist", 0),
 ("itcouldn'thappentome", 0),
 ("un'autostrada", 0),
 ('due', 0),
 ('however', 0),
 ('bill', 0),
 ('thence', 0),
 ("a'la", 0),
 ('a', 0),
 ('be', 0),
 ("n'arrive", 0),
 ("d'homme", 0),
 ('n', 0),
 ('between', 0),
 ('nevertheless', 0),
 ("don'twannabe", 0),
 ("l'isleadam", 0),
 ("o'kelly", 0),
 ('could', 0),
 ('k', 0),
 ("lover'in", 0),
 ('name', 0),
 ("o'byrne", 0),
 ('done', 0),
 ('within', 0),
 ("o'shea", 0),
 ('via', 0),
 ("d'abo", 0),
 ("d'italia", 0),
 ("l'espace", 0),
 ("c'est", 0),
 ('indeed', 0),
 ('whether', 0),
 ('its', 0),
 ("d'agostino", 0),
 ('all', 0),
 ('ourselves', 0),
 ("z'ev", 0),
 ('this', 0),
 ('around', 0),
 ('below', 0),
 ('both', 0),
 ("louie'shis", 0),
 ('mostly', 0),
 ('thus', 0),
 ("o'hare", 0),
 ("d'immeubles", 0),
 ("l'age", 0),
 ("o'riley", 0),
 ("saboteurwho'll", 0),
 ('once', 0),
 ('p', 0),
 ("o'sullivan", 0),
 ("d'tre",

In [13]:
overview_keywords = list(map(lambda r: list(filter(lambda w: tf_idf[w] > 0,r.split())),mapped_arr))
df["overview_keywords"] = overview_keywords

In [14]:
"overview" in df, "overview_keywords" in df
df[['overview', 'overview_keywords']]

Unnamed: 0,overview,overview_keywords
0,"Led by Woody, Andy's toys live happily in his ...","[lead, woody, andy, toy, live, happily, room, ..."
1,When siblings Judy and Peter discover an encha...,"[sibling, judy, peter, discover, enchanted, bo..."
2,A family wedding reignites the ancient feud be...,"[family, wedding, reignite, ancient, feud, nex..."
3,"Cheated on, mistreated and stepped on, the wom...","[cheat, mistreat, step, woman, hold, breath, w..."
4,Just when George Banks has recovered from his ...,"[just, george, bank, recover, daughter, weddin..."
...,...,...
45461,Rising and falling between a man and woman.,"[rise, fall, man, woman]"
45462,An artist struggles to finish his work while a...,"[artist, struggle, finish, work, storyline, cu..."
45463,"When one of her hits goes wrong, a professiona...","[hit, wrong, professional, assassin, end, suit..."
45464,"In a small town live two brothers, one a minis...","[small, town, live, brother, minister, hunchba..."


In [None]:
df.tail(n = 50)

Unnamed: 0,adult,id,original_language,original_title,overview,popularity,runtime,tagline,title,vote_average,vote_count,genres,production_companies,production_countries,release_year,overview_keywords
404,False,117026,en,Daughter of Dr. Jekyll,A young woman discovers she is the daughter of...,0.144075,71.0,Blood-hungry spawn of the world's most bestial...,Daughter of Dr. Jekyll,5.5,2,"[Fantasy, Horror]","[Allied Artists Pictures, Film Venturers]",[United States of America],1957,"[young, woman, discover, daughter, infamous, d..."
405,False,4952,en,Pillow Talk,A man and woman share a telephone line and des...,2.208919,102.0,Footloose bachelor...beautiful career girl...a...,Pillow Talk,6.8,49,"[Comedy, Romance]","[Arwin Productions, Universal International Pi...",[United States of America],1959,"[man, woman, share, telephone, line, despise, ..."
406,False,284154,sr,Ubistvo s predumišljajem,"Men, women, and war. Jelena Panic is a young w...",0.152705,94.0,,Premeditated Murder,8.3,3,[Drama],[Cinema Design],[Serbia],1995,"[man, woman, war, jelena, panic, young, woman,..."
407,False,42467,en,Tüzoltó utca 25.,"On one hot summer night, the residents of a Hu...",0.00022,97.0,,25 Fireman's Street,5.0,1,"[Foreign, Drama, Romance]",[],[Hungary],1973,"[hot, summer, night, resident, hungarian, apar..."
408,False,56481,sv,Gränsen,December 1942. Two young soldiers leave their ...,0.380827,122.0,The Mission is a Man,Beyond the Border,5.8,5,"[Action, Drama, War]","[Cinematic Vision, Searock Films]",[Sweden],2011,"[december, young, soldier, leave, post, checkp..."
409,False,55603,en,The Golden Eye,"A gold mine in Arizona, that was formerly losi...",0.255532,69.0,Death lurks at every step...as your oriental s...,The Golden Eye,5.8,6,"[Crime, Mystery, Thriller]",[Monogram Pictures],[United States of America],1948,"[gold, arizona, lose, lot, money, suddenly, tu..."
410,False,43700,en,Big Money Rustlas,The Insane Clown Posse heads back to the Wild ...,0.252094,95.0,"The good, the bad and the outrages!",Big Money Rustlas,3.8,3,"[Action, Comedy, Western]",[],[],2010,"[insane, clown, posse, head, wild, west, prequ..."
411,False,64044,ru,Mushketyory 20 Let Spustya,The new adventures of four musketeer friends.....,0.716771,300.0,,Musketeers 20 Years Later,5.0,2,[Adventure],[],[Russia],1992,"[adventure, musketeer, friend, base, alexandre..."
412,False,93669,fr,Les Rendez-vous d'Anna,Anna is a film director whose job takes her al...,0.00195,120.0,,The Meetings of Anna,7.3,4,[Drama],"[Zweites Deutsches Fernsehen (ZDF), Paradise F...","[Belgium, France, Germany]",1978,"[anna, director, job, western, europe, place, ..."
413,False,21882,en,Gator,"After his release from prison, notorious ex-co...",3.078583,115.0,Come and get him.,Gator,4.9,16,"[Drama, Crime, Action]","[United Artists, Levy-Gardner-Laven]",[United States of America],1976,"[release, prison, notorious, excon, moonshine,..."


In [14]:
def weighed_diff(v1, v2, w):
    return abs(v1 - v2) * w

def calculate_heuristic_similarity(df):
    pairs = list(itertools.combinations(df['id'], 2))
    c = 0
    n = len(pairs)
    differences = []
    for id1, id2 in pairs:
        row1 = df.loc[df['id'] == id1]
        row2 = df.loc[df['id'] == id2]
        print(row1["popularity"].values[0], row2["popularity"].values[0])
        revenue1 = df.loc[df['id'] == id1, 'popularity'].values[0]
        revenue2 = df.loc[df['id'] == id2, 'popularity'].values[0]
        difference = abs(revenue1 - revenue2)
        differences.append((id1, id2, difference))
        if c  == 3:
            print(f'Iter: {c}/{n}, Pair ({id1}, {id2}): revenue difference = {difference}')
            break
        c += 1
    return pd.DataFrame(differences, columns=['id1', 'id2', 'difference'])

In [None]:
calculate_heuristic_similarity(df)

0.118581 10.03959
0.118581 2.517507
0.118581 1.27024
0.118581 1.916933
Iter: 3/102831, Pair (291907, 35172): revenue difference = 1.798352


Unnamed: 0,id1,id2,difference
0,291907,27958,9.921009
1,291907,217471,2.398926
2,291907,39243,1.151659
3,291907,35172,1.798352


In [18]:
df.to_csv('dataset/movies_metadata_ready_big.csv', index=False) 