In [303]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel
from ast import literal_eval
from sklearn.feature_extraction.text import CountVectorizer 
from sklearn.metrics.pairwise import cosine_similarity

In [None]:
movies_data = pd.read_csv("./data/movies.csv")

In [305]:
movies_data.columns

Index(['_id', 'genres', 'image_url', 'imdb_id', 'imdb_link', 'movie_id',
       'movie_title', 'original_language', 'overview', 'popularity',
       'production_countries', 'release_date', 'runtime', 'spoken_languages',
       'tmdb_id', 'tmdb_link', 'vote_average', 'vote_count', 'year_released'],
      dtype='object')

In [306]:
movies_data.shape

(30000, 19)

In [307]:
movies_data["movie_id"].nunique()

7651

In [308]:
dataset = movies_data.drop_duplicates(subset='movie_id', keep='first')

In [309]:
dataset.shape

(7651, 19)

In [310]:
dataset.isna().sum()

_id                       0
genres                  238
image_url               433
imdb_id                 906
imdb_link               906
movie_id                  0
movie_title              49
original_language       238
overview                485
popularity              239
production_countries    239
release_date            295
runtime                 322
spoken_languages        239
tmdb_id                 115
tmdb_link               115
vote_average            239
vote_count              239
year_released           130
dtype: int64

In [311]:
dataset["movie_id"].nunique()

7651

In [312]:
dataset.shape

(7651, 19)

In [313]:
dataset.isna().sum()

_id                       0
genres                  238
image_url               433
imdb_id                 906
imdb_link               906
movie_id                  0
movie_title              49
original_language       238
overview                485
popularity              239
production_countries    239
release_date            295
runtime                 322
spoken_languages        239
tmdb_id                 115
tmdb_link               115
vote_average            239
vote_count              239
year_released           130
dtype: int64

In [314]:
dataset.head()

Unnamed: 0,_id,genres,image_url,imdb_id,imdb_link,movie_id,movie_title,original_language,overview,popularity,production_countries,release_date,runtime,spoken_languages,tmdb_id,tmdb_link,vote_average,vote_count,year_released
0,5fc85f606758f69634496fd3,"[""Music"",""Animation""]",film-poster/4/6/4/4/4/0/464440-football-freaks...,,,football-freaks,Football Freaks,en,"Football crazy, football mad. Don’t watch this...",0.6,"[""United Kingdom""]",1971-12-05,0.0,[],535272.0,https://www.themoviedb.org/movie/535272/,0.0,0.0,1971.0
1,5fc85ff26758f696344ace0c,[],film-poster/2/4/5/5/0/0/245500-aftermath-0-230...,tt0586129,http://www.imdb.com/title/tt0586129/maindetails,aftermath-1960,Aftermath,en,Aftermath was the pilot for an unsold TV serie...,0.6,[],1960-04-17,22.0,[],318331.0,https://www.themoviedb.org/movie/318331/,8.0,1.0,1960.0
2,5fc85f606758f69634496fcd,"[""Drama""]",film-poster/9/3/3/1/8/93318-where-chimneys-are...,tt0045731,http://www.imdb.com/title/tt0045731/maindetails,where-chimneys-are-seen,Where Chimneys Are Seen,ja,Gosho’s most celebrated film both in Japan and...,1.568,"[""Japan""]",1953-03-05,108.0,"[""日本語""]",117779.0,https://www.themoviedb.org/movie/117779/,6.6,10.0,1953.0
3,5fc85f606758f69634496fd1,"[""Drama""]",,tt0187327,http://www.imdb.com/title/tt0187327/maindetails,the-musicians-daughter,The Musician's Daughter,en,Carl Wagner's good wife was dying. His heart b...,0.6,"[""United States of America""]",1911-12-12,15.0,[],560377.0,https://www.themoviedb.org/movie/560377/,0.0,0.0,1911.0
4,5fc85f606758f69634496fd4,"[""Documentary""]",film-poster/4/5/4/6/0/3/454603-50-years-of-fab...,tt4769914,http://www.imdb.com/title/tt4769914/maindetails,50-years-of-fabulous,50 Years of Fabulous,en,50 Years of Fabulous recounts the rich history...,0.6,[],2018-05-17,75.0,[],525187.0,https://www.themoviedb.org/movie/525187/,0.0,0.0,2018.0


In [315]:
dataset.dtypes

_id                      object
genres                   object
image_url                object
imdb_id                  object
imdb_link                object
movie_id                 object
movie_title              object
original_language        object
overview                 object
popularity              float64
production_countries     object
release_date             object
runtime                 float64
spoken_languages         object
tmdb_id                 float64
tmdb_link                object
vote_average            float64
vote_count              float64
year_released           float64
dtype: object

In [None]:
def safe_literal_eval(x):
    try:
        return literal_eval(x)
    except (ValueError, SyntaxError):
        return []  # Return an empty list for malformed entries

dataset = dataset.dropna(subset=['genres']).copy()

In [None]:
def create_soup(x):
    movie_title = str(x["movie_title"]) if pd.notna(x["movie_title"]) else ""
    genres = ' '.join(str(genre) for genre in x["genres"]) if isinstance(x["genres"], list) else ""
    overview = str(x["overview"]) if pd.notna(x["overview"]) else ""
    
    return " " + movie_title + " " + genres + " " + overview

dataset["soup"] = dataset.apply(create_soup, axis=1)
print(dataset[["movie_title", "soup"]].head())

               movie_title                                               soup
0          Football Freaks   Football Freaks ["Music","Animation"] Footbal...
1                Aftermath   Aftermath [] Aftermath was the pilot for an u...
2  Where Chimneys Are Seen   Where Chimneys Are Seen ["Drama"] Gosho’s mos...
3  The Musician's Daughter   The Musician's Daughter ["Drama"] Carl Wagner...
4     50 Years of Fabulous   50 Years of Fabulous ["Documentary"] 50 Years...


In [318]:
dataset.shape

(7413, 20)

In [None]:
vectorizer = TfidfVectorizer(stop_words='english')
matrix = vectorizer.fit_transform(dataset["soup"])

In [320]:
cosine_sim = cosine_similarity(matrix, matrix)

In [321]:
def recommend_movies(title, cosine_sim=cosine_sim, df=dataset):
    idx = df.index[df['movie_title'] == title].tolist()[0]
    sim_scores = list(enumerate(cosine_sim[idx]))
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    sim_scores = sim_scores[1:]
    movie_indices = [i[0] for i in sim_scores]
    recommendations = df.iloc[movie_indices][:25][['movie_title', 'overview']]
    recommendations['similarity'] = [score[1] for score in sim_scores][:25]  # Add similarity score
    return recommendations

In [322]:
recommended_movies = recommend_movies('The War Speeds Up')
recommended_movies.reset_index()
recommended_movies

Unnamed: 0,movie_title,overview,similarity
297,Fat,Ken is a man that won't change his ways. Addic...,0.174713
3683,Our Daily Bread,Welcome to the world of industrial food produc...,0.170858
6965,10 Min,,0.170105
2766,Mixed Doubles,To get revenge on her ex-boyfriend and to help...,0.158116
3055,I Pay for Your Story,"Lech Kowalski returns to Utica (New York), whe...",0.154484
2193,Suffer Little Children,This short documentary is part of the Canada C...,0.151112
4549,I'll Be Around,A 30-something single mother must find adoptiv...,0.150154
6970,The Fighting 69½th,Battalions of red and black ants go to war ove...,0.145151
2435,Now You Know,An uninhibited 10 year old girl discovers hars...,0.141774
1838,Table for Three,A suddenly single guy invites what he thinks i...,0.14177
