In [169]:
import ast
import itertools
import numpy as np
import pandas as pd
import spacy
import torch
from sklearn.feature_extraction.text import TfidfVectorizer
import re


In [117]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
device

device(type='cuda')

In [124]:
df = pd.read_csv('dataset/movies_metadata_small.csv', encoding='utf-8')

# df['budget'] = df['budget'].astype(int)
df['genres'] = df['genres'].apply(ast.literal_eval)
df['genre_names'] = df['genres'].apply(lambda genre_list: [genre['name'] for genre in genre_list])
df['production_companies'] = df['production_companies'].apply(ast.literal_eval)
df['production_companies_names'] = df['production_companies'].apply(lambda genre_list: [genre['name'] for genre in genre_list])
df['production_countries'] = df['production_countries'].apply(ast.literal_eval)
df['production_countries_names'] = df['production_countries'].apply(lambda genre_list: [genre['name'] for genre in genre_list])
df['release_date'] = pd.to_datetime(df['release_date'])
df['release_year'] = df['release_date'].dt.year
df = df.drop(columns=['belongs_to_collection', 'homepage', 'status', 'video', 'imdb_id', 'budget', 'poster_path', 'revenue', 'spoken_languages', 'genres', 'production_companies', 'production_countries', 'release_date' ])
df.rename(columns={'genre_names': 'genres', 'production_companies_names': 'production_companies', 'production_countries_names' : 'production_countries'}, inplace=True)
df.dtypes
df.to_csv('dataset/movies_metadata_preprocessed.csv', index=False) 
df

Unnamed: 0,adult,id,original_language,original_title,overview,popularity,runtime,tagline,title,vote_average,vote_count,genres,production_companies,production_countries,release_year
0,False,291907,en,Mustasukkaisuus,Love triangle near roaring rapids. Two sisters...,0.118581,90.0,,Jealousy,8.0,1,"[Romance, Drama]",[],[],1953
1,False,27958,en,Cruising,A serial killer brutally slays and dismembers ...,10.039590,102.0,Al Pacino is Cruising for a killer.,Cruising,6.0,89,"[Horror, Action, Drama, Mystery, Thriller]","[Lorimar Film Entertainment, CiP - Europaische...","[Germany, United States of America]",1980
2,False,217471,en,Medeas,The film is a lyrical exploration of a particu...,2.517507,98.0,,Medeas,6.7,6,[],[],[],2013
3,False,39243,en,Off Beat,Joe Gower's job is skating through library she...,1.270240,92.0,The Real Life Adventures Of A Make-Believe Cop.,Off Beat,4.3,3,[Comedy],[Touchstone Pictures],[United States of America],1986
4,False,35172,fr,Le fils,A joinery instructor at a rehab center refuses...,1.916933,103.0,,The Son,6.5,34,"[Mystery, Drama]","[Les Films Du Fleuve, Archipel 35, Radio Télév...","[Belgium, France]",2002
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
449,False,21325,ja,Onmyoji,"During a dark time in the Heian period, when e...",0.450855,112.0,,Onmyoji: The Yin Yang Master,7.3,3,"[Drama, Fantasy, Horror]",[Toho Company],[Japan],2001
450,False,62851,en,軍旗はためく下に,A war widow determined to clear the name of he...,0.069476,96.0,,Under the Flag of the Rising Sun,6.6,6,"[Drama, Foreign, History, Mystery]",[Toho Company],[Japan],1972
451,False,135678,en,Leave,Henry Harper is a successful novelist who has ...,1.778183,84.0,,Leave,6.5,8,"[Thriller, Mystery]",[],[United States of America],2011
452,False,1487,en,Hellboy,"In the final days of World War II, the Nazis a...",13.206854,122.0,From the Dark Side to Our Side.,Hellboy,6.5,2278,"[Fantasy, Action, Science Fiction]","[Columbia Pictures, Revolution Studios, Dark H...",[United States of America],2004


In [170]:

# Load the spaCy English model
nlp = spacy.load("en_core_web_sm")

# # Example words
# words = ["goes", "roads", "leaves"]

# # Process the words and get lemmas
# lemmas = [token.lemma_ for token in nlp(" ".join(words))]
# print(lemmas)  # Output: ['go', 'road', 'leaf']

In [199]:
def to_base_forms(s):
    try:
        # s = s.split()
        print(s, "!!!")
        pattern = r"[^a-zA-Z']"
        s = " ".join(map(lambda x: (re.sub(pattern, "", x)).lower(), s.split()))
        print(s)
        return (" ".join([token.lemma_ for token in nlp(s)]))
    except:
        # print(s)
        return None

vectorized_function = np.vectorize(to_base_forms)
mapped_arr = vectorized_function(df["overview"].values)

Love triangle near roaring rapids. Two sisters fight for the attention of a handsome forester. !!!
love triangle near roaring rapids two sisters fight for the attention of a handsome forester
Love triangle near roaring rapids. Two sisters fight for the attention of a handsome forester. !!!
love triangle near roaring rapids two sisters fight for the attention of a handsome forester
A serial killer brutally slays and dismembers several gay men in New York's S/M and leather districts. The young police officer Steve Burns is sent undercover onto the streets as decoy for the murderer. Working almost completely isolated from his department, he has to learn and practice the complex rules and signals of this little society. !!!
a serial killer brutally slays and dismembers several gay men in new york's sm and leather districts the young police officer steve burns is sent undercover onto the streets as decoy for the murderer working almost completely isolated from his department he has to learn

In [200]:
mapped_arr


array(['love triangle near roar rapid two sister fight for the attention of a handsome forester',
       "a serial killer brutally slay and dismember several gay man in new york 's sm and leather district the young police officer steve burn be send undercover onto the street as decoy for the murderer work almost completely isolated from his department he have to learn and practice the complex rule and signal of this little society",
       'the film be a lyrical exploration of a particular family situation and the human relationship within it devoid of any moral judgment the eye of the director study the boundary of human behaviour and explore how far an individual can go drive by love and the spirit of survival andrea pallaoro be bear in trento but at the age of   move to california to study filmmake his short film wunderkammer be present at the sundance film festival this year he bring his first featurelength film to the orizzonti section',
       "joe gower 's job be skate through l

In [201]:
vectorizer = TfidfVectorizer(stop_words='english')

# Fit and transform the corpus
tfidf_matrix = vectorizer.fit_transform([" ".join(mapped_arr)])

# Get feature names (words)
feature_names = vectorizer.get_feature_names_out()

# Convert the TF-IDF matrix to a dense format and print
dense_matrix = tfidf_matrix.todense()

feature_names = vectorizer.get_feature_names_out()
tfidf_array = np.array(dense_matrix)
word_index = feature_names.tolist().index('friend')
word_tfidf = tfidf_array[:, word_index][0]
print("TF-IDF value for 'love':" + str(word_tfidf))
word_tfidf.shape
# print("TF-IDF Matrix:")
# print(dense_matrix)

TF-IDF value for 'love':0.1539769484892359


()

In [202]:
"to" in feature_names

False

In [209]:
tf_idf = {}
c = 0
banned_words = ['life', 'new', "film", "come", "make", "movie"]
for w in set(" ".join(mapped_arr).split()):
    try:
        if w in banned_words:
            tf_idf[w] = 0
        elif w not in feature_names:
            tf_idf[w] = 0
        else:
            word_index = feature_names.tolist().index(w)
            word_tfidf = tfidf_array[:, word_index][0].item()
            tf_idf[w] = word_tfidf
    except:
        c += 1
        print(w)
        tfidf_array[w] = 0
print(c)

0


In [210]:
tf_idf

{'elf': 0.0027495883658792126,
 'flyer': 0.0027495883658792126,
 'policji': 0.0027495883658792126,
 'examination': 0.00824876509763764,
 'mixedrace': 0.0027495883658792126,
 'cassandra': 0.0027495883658792126,
 'evolve': 0.00824876509763764,
 'truant': 0.0027495883658792126,
 'honest': 0.0027495883658792126,
 'eddie': 0.005499176731758425,
 'bat': 0.01099835346351685,
 'indeed': 0,
 'convince': 0.01649753019527528,
 'dude': 0.005499176731758425,
 'awaken': 0.005499176731758425,
 'vineyard': 0.0027495883658792126,
 'maverick': 0.005499176731758425,
 'lip': 0.005499176731758425,
 'celebratory': 0.0027495883658792126,
 'selfdefense': 0.0027495883658792126,
 'urge': 0.0027495883658792126,
 'christmas': 0.019247118561154488,
 'instantly': 0.0027495883658792126,
 'farmer': 0.00824876509763764,
 'near': 0.01099835346351685,
 'suppose': 0.0027495883658792126,
 'until': 0,
 'employ': 0.0027495883658792126,
 'later': 0.0219967069270337,
 'libertador': 0.0027495883658792126,
 'kippur': 0.00274958

In [212]:
sorted(tf_idf.items(), key=lambda x: x[1], reverse=False)


[('indeed', 0),
 ('until', 0),
 ('too', 0),
 ('those', 0),
 ('before', 0),
 ('whereby', 0),
 ('s', 0),
 ('thus', 0),
 ('this', 0),
 ('do', 0),
 ('still', 0),
 ('nine', 0),
 ('latterly', 0),
 ('anything', 0),
 ('serious', 0),
 ('system', 0),
 ('beyond', 0),
 ('nevertheless', 0),
 ('formerly', 0),
 ('have', 0),
 ('but', 0),
 ('whom', 0),
 ('behind', 0),
 ('about', 0),
 ('very', 0),
 ('although', 0),
 ('it', 0),
 ('within', 0),
 ('he', 0),
 ('by', 0),
 ('out', 0),
 ('h', 0),
 ('not', 0),
 ('because', 0),
 ('elsewhere', 0),
 ('detail', 0),
 ('last', 0),
 ('take', 0),
 ('once', 0),
 ('via', 0),
 ('as', 0),
 ('bottom', 0),
 ('on', 0),
 ('during', 0),
 ('all', 0),
 ('that', 0),
 ('eight', 0),
 ('own', 0),
 ("d'angelo", 0),
 ('make', 0),
 ('top', 0),
 ('many', 0),
 ('first', 0),
 ('how', 0),
 ('nor', 0),
 ('must', 0),
 ('five', 0),
 ('most', 0),
 ('often', 0),
 ('why', 0),
 ('their', 0),
 ('nobody', 0),
 ('sixty', 0),
 ('sometimes', 0),
 ('if', 0),
 ('where', 0),
 ('several', 0),
 ('next', 0),

In [226]:
overview_keywords = list(map(lambda r: list(filter(lambda w: tf_idf[w] > 0,r.split())),mapped_arr))
df["overview_keywords"] = overview_keywords

In [231]:
"overview" in df, "overview_keywords" in df
df[['overview', 'overview_keywords']]

Unnamed: 0,overview,overview_keywords
0,Love triangle near roaring rapids. Two sisters...,"[love, triangle, near, roar, rapid, sister, fi..."
1,A serial killer brutally slays and dismembers ...,"[serial, killer, brutally, slay, dismember, ga..."
2,The film is a lyrical exploration of a particu...,"[lyrical, exploration, particular, family, sit..."
3,Joe Gower's job is skating through library she...,"[joe, gower, job, skate, library, shelf, fetch..."
4,A joinery instructor at a rehab center refuses...,"[joinery, instructor, rehab, center, refuse, t..."
...,...,...
449,"During a dark time in the Heian period, when e...","[dark, time, heian, period, evil, force, threa..."
450,A war widow determined to clear the name of he...,"[war, widow, determine, clear, disgrace, husba..."
451,Henry Harper is a successful novelist who has ...,"[henry, harper, successful, novelist, survive,..."
452,"In the final days of World War II, the Nazis a...","[final, day, world, war, ii, nazi, attempt, us..."


In [92]:
def weighed_diff(v1, v2, w):
    return abs(v1 - v2) * w

def calculate_heuristic_similarity(df):
    pairs = list(itertools.combinations(df['id'], 2))
    c = 0
    n = len(pairs)
    differences = []
    for id1, id2 in pairs:
        row1 = df.loc[df['id'] == id1]
        row2 = df.loc[df['id'] == id2]
        print(row1["popularity"].values[0], row2["popularity"].values[0])
        revenue1 = df.loc[df['id'] == id1, 'popularity'].values[0]
        revenue2 = df.loc[df['id'] == id2, 'popularity'].values[0]
        difference = abs(revenue1 - revenue2)
        differences.append((id1, id2, difference))
        if c  == 3:
            print(f'Iter: {c}/{n}, Pair ({id1}, {id2}): revenue difference = {difference}')
            break
        c += 1
    return pd.DataFrame(differences, columns=['id1', 'id2', 'difference'])

In [93]:
calculate_heuristic_similarity(df)

0.473876 6.278291
0.473876 1.395757
0.473876 1.225217
0.473876 0.850828
Iter: 3/10330785, Pair (364088, 367538): revenue difference = 0.376952


Unnamed: 0,id1,id2,difference
0,364088,81332,5.804415
1,364088,39246,0.921881
2,364088,36139,0.751341
3,364088,367538,0.376952
