### Importing Libraries

In [None]:
import warnings
# Hide FutureWarning messages for the rest of the session; set here so it is
# already active while the libraries below are imported.
warnings.filterwarnings("ignore",category=FutureWarning)
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from nltk.stem import PorterStemmer
from annoy import AnnoyIndex
import pickle
# Display every column of a DataFrame instead of truncating wide frames.
pd.set_option('display.max_columns', None)

### Loading and creating the training datasets

In [None]:
# Source dataset: https://www.kaggle.com/datasets/akshaypawar7/millions-of-movies
data_df = pd.read_csv('../data/movies.csv', encoding='utf-8', low_memory=False)

# Each mask is one requirement a row has to meet to be kept.
has_core_fields = data_df['title'].notna() & data_df['genres'].notna() & data_df['credits'].notna()
is_released_english = (data_df['original_language'] == 'en') & (data_df['status'] == 'Released')
runtime_over_30 = data_df['runtime'] > 30
has_artwork = data_df['backdrop_path'].notna() | data_df['poster_path'].notna()

data_df = data_df[has_core_fields & is_released_english & runtime_over_30 & has_artwork]
data_df.info()
data_df.head()

In [None]:
# Columns carried forward from the raw data; 'recommendations' is kept on
# data_df but left out of the training frame.
kept_columns = ['id', 'title', 'genres', 'overview', 'tagline', 'credits', 'keywords', 'poster_path', 'backdrop_path', 'recommendations']
data_df = data_df[kept_columns]
train_df = data_df.drop(columns='recommendations')

### Filling the missing values in the dataset

In [None]:
# Empty text fields get the literal placeholder 'missing'; rows that are exact
# duplicates of an earlier row are then removed.
placeholder_by_column = dict.fromkeys(['overview', 'keywords', 'tagline'], 'missing')
train_df = train_df.fillna(value=placeholder_by_column)
train_df = train_df.drop_duplicates()
# Rows without a poster fall back to their backdrop image path.
train_df['poster_path'] = train_df['poster_path'].fillna(train_df['backdrop_path'])

### Modifying the column data for the dataset

In [None]:
# Summary of train_df after the fill step (columns, non-null counts, dtypes).
train_df.info()

In [None]:
# Remove characters before the columns are split into tokens and re-joined
# with spaces: stripping the spaces inside genres / keywords / credits keeps
# each multi-word entry as one token once the column is split on '-'.
#
# regex=False makes every pattern a literal string. The default of `regex`
# differs between pandas versions, and '.' read as a regular expression would
# match every character and blank out the whole tagline.
train_df['genres'] = train_df['genres'].str.replace(' ', '', regex=False)
train_df['tagline'] = train_df['tagline'].str.replace('.', '', regex=False)
train_df['keywords'] = train_df['keywords'].str.replace(' ', '', regex=False)
train_df['credits'] = train_df['credits'].str.replace(' ', '', regex=False)
train_df['title'] = train_df['title'].str.replace(':', '', regex=False)
train_df.head()

In [None]:
# Turn each text column into a list of tokens: '-' separates the entries of
# genres / credits / keywords, a single space separates the words of the
# free-text columns.
for column, separator in (
    ('genres', '-'),
    ('credits', '-'),
    ('keywords', '-'),
    ('overview', ' '),
    ('tagline', ' '),
):
    train_df[column] = train_df[column].str.split(separator)
train_df.head()

In [None]:
# First five genre lists, to check the result of the split.
train_df['genres'].head(5)

In [None]:
# Adding two Series of lists concatenates the lists row by row, so this builds
# one token list per movie (title words first), then flattens it into a single
# space-separated string.
tag_tokens = train_df['title'].str.split(' ')
for column in ('genres', 'keywords', 'tagline', 'credits', 'overview'):
    tag_tokens = tag_tokens + train_df[column]
train_df['tags'] = tag_tokens.apply(" ".join)

In [None]:
# Put the combined 'tags' column right after the title.
ordered_columns = ['id', 'title', 'tags', 'genres', 'overview', 'tagline', 'credits', 'keywords', 'poster_path', 'backdrop_path']
train_df = train_df.loc[:, ordered_columns]
train_df.head()

In [None]:
# Model input: one row per title (the first occurrence wins), re-indexed from
# 0 so that row positions are contiguous.
model_columns = ['id', 'title', 'tags', 'credits', 'overview', 'poster_path', 'backdrop_path']
train_x = (
    train_df[model_columns]
    .drop_duplicates(['title'], keep='first')
    .reset_index(drop=True)
)
# Lower-case the tags and drop any remaining colons.
lowercase_tags = train_x['tags'].str.lower()
train_x['tags'] = lowercase_tags.str.replace(':','')
train_x.head()

### Vectorization of the data and predicting the recommendations

In [None]:
ps = PorterStemmer()


def _stem_text(text):
    """Stem each whitespace-separated word of ``text`` and re-join the stems with single spaces."""
    return " ".join(map(ps.stem, text.split()))


train_x['tags'] = train_x['tags'].apply(_stem_text)
train_x.head()

In [None]:
# Bag-of-words counts over the stemmed tags, limited to 5000 vocabulary terms
# with English stop words removed.
cv = CountVectorizer(
    encoding='utf-8',
    decode_error='ignore',
    lowercase=True,
    max_features=5000,
    stop_words='english',
)
tag_counts = cv.fit_transform(train_x['tags'])
# Dense copy: rows of this array are what gets added to the Annoy index.
vectors = tag_counts.toarray()
print(vectors.shape)
cv.get_feature_names_out().tolist()

In [None]:
# One index dimension per vocabulary term; the 'angular' metric is used for
# cosine-style similarity.
n_features = vectors.shape[1]
annoy_index = AnnoyIndex(n_features, 'angular')

# Item ids are the row positions of `vectors`.
for item_id in range(len(vectors)):
    annoy_index.add_item(item_id, vectors[item_id])

# Build the forest of trees used for the nearest-neighbour queries.
num_trees = 10
annoy_index.build(num_trees)

In [None]:
def recommend(df, movie_title, number_of_recommendations=10, verbose=True):
    """Return the ids of the movies nearest to ``movie_title`` in the Annoy index.

    Uses the notebook-level ``annoy_index`` and ``vectors``: the position of a
    title in ``df`` is used as the row of ``vectors`` to query with, and the
    item numbers returned by the index are used as positions into ``df['id']``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with 'title' and 'id' columns.
    movie_title : str
        Exact title to look up; the first matching row is used.
    number_of_recommendations : int, default 10
        Number of nearest neighbours to request from the index.
    verbose : bool, default True
        When true, print the distances of the returned neighbours.

    Returns
    -------
    list
        Values of ``df['id']`` for the neighbours, in the order the index
        returned them; an empty list when ``movie_title`` is not in ``df``.

    NOTE(review): the queried movie is itself an item of the index, so it is
    presumably returned among its own neighbours — confirm callers expect that.
    """
    # Single pass over the titles: .index() both finds the position and
    # signals a missing title via ValueError.
    titles = df['title'].tolist()
    try:
        movie_index = titles.index(movie_title)
    except ValueError:
        return []

    similar_movie_indices, distances = annoy_index.get_nns_by_vector(
        vectors[movie_index].flatten(), number_of_recommendations, include_distances=True
    )

    # The printed values are the distances reported by the index, not
    # normalised similarity scores. Skip the print when nothing came back,
    # since the message indexes the first neighbour.
    if verbose and similar_movie_indices:
        print("Similarity scores of [{}]:".format(similar_movie_indices[0]), distances)
    return df['id'].iloc[similar_movie_indices].tolist()

In [None]:
# Spot check: neighbours of one title, shown as full rows.
movie_ids = recommend(df=train_x, movie_title='Iron Man')
is_recommended = train_x['id'].isin(movie_ids)
train_x.loc[is_recommended]

In [None]:
# Size and column names of the frame the recommendations are computed for.
print(train_x.shape)
train_x.columns.to_list()

In [None]:
# One recommend() call per row; each result list is stored next to its movie.
train_x['recommendations'] = train_x['title'].map(lambda title: recommend(df=train_x, movie_title=str(title)))

In [None]:
# Check the new column on one title, then show the rows for a fixed set of ids.
print(train_x.shape)
iron_man_rows = train_x.loc[train_x['title'] == 'Iron Man']
print(iron_man_rows)
spot_check_ids = [1726, 474227, 270768, 408648, 204240, 299537]
train_x.loc[train_x['id'].isin(spot_check_ids)]

### Creating the output data

In [None]:
# Persist the frame, including the 'recommendations' column, as a pickle.
output_file_path = '../data/recommendations_data.pkl'
# The with-block closes the file even if pickling raises part-way through.
with open(file=output_file_path, mode='wb') as pickle_file:
    pickle.dump(obj=train_x, file=pickle_file)