In [32]:
import os

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [33]:
from scipy.spatial.distance import cosine
from sklearn.preprocessing import RobustScaler, QuantileTransformer, PowerTransformer
from sklearn.neighbors import NearestNeighbors

In [34]:
from fuzzywuzzy import process

In [35]:
# Path of the movie list CSV used for content-based recommendations.
# Set the MOVIES_CSV environment variable to point at your own copy of the
# file; when it is unset, the original local path is used unchanged.
MOVIES_CSV = os.environ.get(
    "MOVIES_CSV",
    r"C:\Users\ditri\Desktop\Important\Data\Movie Lens Data\ml-latest-small\movie_list_content_based.csv",
)
movies_final = pd.read_csv(MOVIES_CSV)

In [36]:
# Preview the first rows to check which columns were loaded.
movies_final.head()

Unnamed: 0,movieId,title,genres,release_year,weighted_rating
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,1995,3.90408
1,2,Jumanji (1995),Adventure|Children|Fantasy,1995,3.437093
2,3,Grumpier Old Men (1995),Comedy|Romance,1995,3.295312
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance,1995,3.000876
4,5,Father of the Bride Part II (1995),Comedy,1995,3.138173


In [37]:
# Join every movie's pipe-delimited genre string into one long string, then
# split it back apart so each genre name can be de-duplicated and sorted.
all_genres_joined = movies_final['genres'].str.cat(sep="|")
unique_genre = sorted(set(all_genres_joined.split('|')))
print(unique_genre)

['Action', 'Adventure', 'Animation', 'Children', 'Comedy', 'Crime', 'Documentary', 'Drama', 'Fantasy', 'Film-Noir', 'Horror', 'IMAX', 'Musical', 'Mystery', 'Romance', 'Sci-Fi', 'Thriller', 'War', 'Western']


In [38]:
# Turn the pipe-delimited genre string into a list of genre names.
# The isinstance guard keeps this cell idempotent: on a re-run the column
# already holds lists, and `.str.split` would replace every one of them with NaN.
movies_final['genres'] = movies_final['genres'].apply(
    lambda g: g.split('|') if isinstance(g, str) else g
)

# Position of each genre inside `unique_genre`; these positions are used later
# as column indices when building the multi-label genre vectors.
# A dict lookup avoids a linear `list.index` scan for every genre of every movie.
genre_position = {genre: pos for pos, genre in enumerate(unique_genre)}
movies_final['genre_index'] = movies_final['genres'].apply(
    lambda names: [genre_position[name] for name in names]
)

movies_final.head()

Unnamed: 0,movieId,title,genres,release_year,weighted_rating,genre_index
0,1,Toy Story (1995),"[Adventure, Animation, Children, Comedy, Fantasy]",1995,3.90408,"[1, 2, 3, 4, 8]"
1,2,Jumanji (1995),"[Adventure, Children, Fantasy]",1995,3.437093,"[1, 3, 8]"
2,3,Grumpier Old Men (1995),"[Comedy, Romance]",1995,3.295312,"[4, 14]"
3,4,Waiting to Exhale (1995),"[Comedy, Drama, Romance]",1995,3.000876,"[4, 7, 14]"
4,5,Father of the Bride Part II (1995),[Comedy],1995,3.138173,[4]


In [39]:
# One row per movie and one column per known genre, all zeros to start with.
genre_arr = np.zeros(shape=(len(movies_final), len(unique_genre)))

In [40]:
# Multi-hot encode the genres: for each movie, set the columns listed in its
# `genre_index` entry to 1.
for row, genre_cols in enumerate(movies_final['genre_index']):
    genre_arr[row, genre_cols] = 1

In [41]:
scaler1 = RobustScaler()
scaler2 = PowerTransformer(method='box-cox')
# Earlier variant, kept for reference:
# scaled_years = np.round(scaler2.fit_transform(movies_final[['release_year']]), decimals = 6)

# Express each release year as a fraction of the latest year in the data,
# then apply the Box-Cox power transform to that single column.
scaled_years = scaler2.fit_transform(
    movies_final[['release_year']].div(movies_final['release_year'].max())
)

In [42]:
# Range of the transformed year feature (largest value first, then smallest).
print(scaled_years.max(), scaled_years.min())

1.935903569764593 -2.2614194669381518


In [43]:
# plt.hist(PowerTransformer(method = 'box-cox').fit_transform(movies_final[['release_year']] / np.max(movies_final['release_year'])))
# plt.show()

In [44]:
# Confirm the container type and the (rows, columns) shape of the year feature.
print(type(scaled_years), scaled_years.shape, sep="\n")

<class 'numpy.ndarray'>
(9704, 1)


In [45]:
# Movie ids as a single-column 2-D array so they can be stacked next to the
# other feature columns.
movie_id_arr = np.array(movies_final['movieId'])[:, np.newaxis]

In [46]:
# Column layout of each feature row: [movie id | genre flags ... | scaled year].
features = np.concatenate((movie_id_arr, genre_arr, scaled_years), axis=1)
features.shape

(9704, 21)

In [47]:
def hybrid_distance(x, y, genre_weight=None):
    """Blend genre dissimilarity and release-year gap into a single distance.

    Both inputs are 1-D feature rows laid out as
    ``[movie_id, genre flags ..., scaled_year]``: element 0 is ignored, the
    last element is the year feature, and everything in between is the genre
    vector.

    Parameters
    ----------
    x, y : array-like
        Feature rows to compare.
    genre_weight : float, optional
        Importance of the genre term. When omitted, the module-level
        ``weight`` variable is read at call time, so that variable must be
        defined before the first call.

    Returns
    -------
    float
        ``2 * w * genre_distance + (1 - w) * abs(year_x - year_y)`` where
        ``genre_distance`` is the cosine distance between the genre vectors.
    """
    if genre_weight is None:
        # Backward-compatible fallback to the notebook-level setting.
        genre_weight = weight

    x_genre, x_year = x[1:-1], x[-1]
    y_genre, y_year = y[1:-1], y[-1]

    # Cosine distance is undefined for an all-zero vector (division by a zero
    # norm), so those cases are given explicit values instead of NaN.
    x_has_genre = np.any(x_genre)
    y_has_genre = np.any(y_genre)
    if x_has_genre and y_has_genre:
        genre_d = cosine(x_genre, y_genre)
    elif x_has_genre or y_has_genre:
        # Only one side has genres: nothing in common.
        genre_d = 1.0
    else:
        # Neither side has genres: identical (empty) genre sets.
        genre_d = 0.0

    year_d = abs(x_year - y_year)

    return (2 * genre_weight * genre_d) + (1 - genre_weight) * year_d

In [48]:
def find_similar_movies(query):
    """Interactively pick a movie matching `query` and return similar movies.

    The function fuzzy-matches `query` against cleaned, lower-cased titles,
    prints the three best candidates, asks the user (via ``input``) which one
    they meant, and then looks up that movie's nearest neighbours with the
    module-level ``knn`` model fitted on ``features``.

    Parameters
    ----------
    query : str
        Free-text movie name; it does not need to match a title exactly.

    Returns
    -------
    pandas.DataFrame or None
        A frame with ``title`` and ``weighted_rating`` columns. Which rows are
        returned depends on how well the neighbours are rated (see the inline
        comments). ``None`` is returned, after printing a message, when
        between one and five neighbours are rated above 3.

    Raises
    ------
    ValueError
        If the typed movie number is not an integer.
    IndexError
        If the typed movie number is outside the list of printed matches.
    """
    series_cleaned = (
        movies_final['title'].str.replace(r'\(\d{4}\)', '', regex=True)   # Remove brackets and 4-digit numbers using regex
                             .str.replace(r'\b(a|an|the)\b', '', regex=True, case=False)  # Remove articles
                             .str.replace(r'[^\w\s\'-]', '', regex=True)  # Remove punctuations except apostrophes and dashes
                             .str.replace(r'\s+', ' ', regex=True)  # Remove extra spaces
                             .str.strip()  # Trim leading/trailing spaces
    )

    # Each match is indexed below as match[2] and used as a row position, so
    # this assumes `movies_final` still has its default 0..n-1 index.
    top_matches = process.extract(query, series_cleaned.str.lower(), limit=3)
    print("Top 3 matches:")

    for match in top_matches:
        print(movies_final.iloc[match[2]]['title'])

    choice = int(input('Enter the preferred movie number. For example if you want the second movie press 2 on the keyboard and Enter:'))
    # Without this check, 0 or a negative number would silently select a
    # different match through Python's negative indexing.
    if not 1 <= choice <= len(top_matches):
        raise IndexError(
            f"Movie number must be between 1 and {len(top_matches)}, got {choice}"
        )

    movie_idx = top_matches[choice - 1][2]
    movie = movies_final.iloc[movie_idx]['title']
    print(f"Following are the movies similar to \033[1m{movie}\033[0m :")

    # Take the id from the selected row itself. Looking it up again by title
    # would return the first row with that title, which is the wrong movie
    # whenever two movies share the same title string.
    Id = movies_final['movieId'].iloc[movie_idx]

    idx = np.where(features[:, 0] == Id)[0][0]

    _, indices = knn.kneighbors(features[idx].reshape(1, -1))

    df = (
        movies_final.iloc[indices.reshape(-1,)][['title', 'weighted_rating']]
        .reset_index(drop=True)
    )
    ratings = df['weighted_rating']

    # At least five neighbours rated above 3.5 (this also covers the case of
    # five rated above 4): return the ten closest neighbours in distance order.
    if (ratings > 3.5).sum() >= 5:
        return df.iloc[:10]

    # More than five neighbours rated above 3: return the ten best rated.
    if (ratings > 3).sum() > 5:
        return df.sort_values(by='weighted_rating', ascending=False).iloc[:10]

    # No neighbour rated above 3: fall back to those rated above 2.5.
    if (ratings > 3).sum() < 1:
        return df[ratings > 2.5]

    # Remaining case: between one and five neighbours rated above 3.
    print('Do Not Search for Shit Movies')
    return None
    

In [59]:
# Genre weight read by `hybrid_distance` at call time. The prompt promises a
# value between 0 and 1, so reject anything else (including NaN) before it is
# stored and silently produces negative or inflated distances.
_weight_input = float(input('Input the weight to assign to the genre. The value of weight is between 0 and 1. Higher weight means MORE Importance :'))
if not 0 <= _weight_input <= 1:
    raise ValueError(f"weight must be between 0 and 1, got {_weight_input}")
weight = _weight_input

# 13 neighbours are requested; find_similar_movies returns at most 10 of them
# in most branches, and the first neighbour is typically the query movie itself.
knn = NearestNeighbors(n_neighbors=13, metric=hybrid_distance)
knn.fit(features)

In [67]:
# Ask for a (possibly approximate) movie name; the call below then prompts for
# which of the top matches was meant and displays the recommendations it returns.
movie_name = input('Enter name of Movie :')
find_similar_movies(movie_name)

Top 3 matches:
Dark Knight, The (2008)
Dark Knight Rises, The (2012)
Batman: The Dark Knight Returns, Part 1 (2012)
Following are the movies similar to [1mDark Knight Rises, The (2012)[0m :


Unnamed: 0,title,weighted_rating
0,"Dark Knight Rises, The (2012)",3.941341
1,"Man with the Iron Fists, The (2012)",3.401401
2,Spectre (2015),3.212077
3,Wrath of the Titans (2012),3.231847
4,"Avengers, The (2012)",3.827103
5,Skyfall (2012),3.762949
6,"Amazing Spider-Man, The (2012)",3.308052
7,John Carter (2012),3.275701
8,Pirates of the Caribbean: On Stranger Tides (2...,3.303455
9,Mission: Impossible - Ghost Protocol (2011),3.613355
