In [60]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [70]:
# Google Drive location (used when the notebook runs on Google Colab).
root_path = 'drive/MyDrive/Colab Notebooks/'
file_path = root_path + 'moviedata.csv'

# Local file system alternative: uncomment to read from ./dataset instead.
# root_path = 'dataset/'
# file_path = root_path + 'moviedata.csv'

In [62]:
# Load the catalogue, keep only the four text columns used below, drop rows
# with any missing value, and lower-case every cell (after casting to str).
wanted_columns = ['Title','Genre','Director','Actors']
df = (
    pd.read_csv(file_path)[wanted_columns]
    .dropna()
    .apply(lambda column: column.astype(str).str.lower())
)
df.tail()

Unnamed: 0,Title,Genre,Director,Actors
995,secret in their eyes,"crime,drama,mystery",billy ray,"chiwetel ejiofor, nicole kidman, julia roberts..."
996,hostel: part ii,horror,eli roth,"lauren german, heather matarazzo, bijou philli..."
997,step up 2: the streets,"drama,music,romance",jon m. chu,"robert hoffman, briana evigan, cassie ventura,..."
998,search party,"adventure,comedy",scot armstrong,"adam pally, t.j. miller, thomas middleditch,sh..."
999,nine lives,"comedy,family,fantasy",barry sonnenfeld,"kevin spacey, jennifer garner, robbie amell,ch..."


In [63]:
def _unique_values(column):
  """Return the set of distinct, whitespace-trimmed values found in a
  comma-separated column of `df`."""
  return {part.strip() for cell in df[column] for part in cell.split(',')}

# Distinct vocabulary of each multi-valued column.
genres = _unique_values('Genre')
directors = _unique_values('Director')
actors = _unique_values('Actors')

# List copies of the sets (order is whatever set iteration yields).
genres_list = list(genres)
directors_list = list(directors)
actors_list = list(actors)

# Report how many distinct values each column contains.
print("Found %d genres" % len(genres_list))
print("Found %d directors" % len(directors_list))
print("Found %d actors" % len(actors_list))

Found 20 genres
Found 644 directors
Found 1985 actors


In [64]:
def _split_rows(frame, column, sep=','):
  """Return a new frame with one row per `sep`-separated value of `column`.

  Each cell of `column` is split on `sep`; the row is repeated once per
  resulting value (all other columns copied unchanged), the values are
  stripped of surrounding whitespace, and the index is renumbered from 0.
  Row order is preserved: original row order first, then split order.
  """
  exploded = (
      frame.assign(**{column: frame[column].str.split(sep)})
      .explode(column)
      .reset_index(drop=True)
  )
  # Values such as "a, b" split into ['a', ' b']; trim them so the stored
  # values match the stripped vocabulary collected from df.
  exploded[column] = exploded[column].str.strip()
  return exploded

# Expand every movie into one row per (genre, director, actor) combination.
data = df.copy()
for column in ('Genre', 'Director', 'Actors'):
  data = _split_rows(data, column)

# Free-text description of each row: "<genre> <director> <actor>".
data['Query'] = data['Genre'].str.cat(data[['Director','Actors']].astype(str), sep=" ")
data.tail()

Unnamed: 0,Title,Genre,Director,Actors,Query
10213,nine lives,family,barry sonnenfeld,cheryl hines,family barry sonnenfeld cheryl hines
10214,nine lives,fantasy,barry sonnenfeld,kevin spacey,fantasy barry sonnenfeld kevin spacey
10215,nine lives,fantasy,barry sonnenfeld,jennifer garner,fantasy barry sonnenfeld jennifer garner
10216,nine lives,fantasy,barry sonnenfeld,robbie amell,fantasy barry sonnenfeld robbie amell
10217,nine lives,fantasy,barry sonnenfeld,cheryl hines,fantasy barry sonnenfeld cheryl hines


In [65]:
# Persist the expanded table (including its index column) under root_path.
split_csv_path = root_path + "movie_data_split.csv"
data.to_csv(split_csv_path)

In [66]:
# Free-text query that is compared against the per-row 'Query' strings of `data`.
# NOTE(review): the catalogue text was lower-cased on load while this query is
# mixed-case; this presumably relies on the vectorizer normalising case -- confirm.
user_query = "Action James Gunn  Vin Diesel"
# Echo the query as the cell output.
user_query

'Action James Gunn  Vin Diesel'

In [67]:
# Build one corpus: every catalogue 'Query' string followed by the user's
# query, so the user's query becomes the last row of the count matrix.
queries = [*data['Query'], user_query]

# Bag-of-words counts for the whole corpus.
count_vector = CountVectorizer()
count_matrix = count_vector.fit_transform(queries)

# Pairwise cosine similarity between all rows of the count matrix.
# NOTE(review): the visible downstream code only reads the last row; if no
# other cell needs the full matrix, comparing just the query row against the
# rest would avoid the all-pairs computation -- confirm before changing.
similarity_scores = cosine_similarity(count_matrix)

In [68]:
def get_title(index, data):
  """Return the 'Title' of the first row of `data` whose index label equals `index`.

  Raises IndexError when no row carries that label.
  """
  row_mask = data.index == index
  matching_titles = data.loc[row_mask, 'Title']
  return matching_titles.values[0]

def get_index(title, data):
  """Return the index label of the first row of `data` whose Title equals `title`.

  Raises IndexError when no row has that title.
  """
  title_mask = (data.Title == title).values
  return data.index[title_mask][0]

In [69]:
# Minimum cosine similarity for a catalogue row to count as a match.
SIMILARITY_THRESHOLD = 0.7

# The user query was appended last, so the final row of the matrix holds its
# similarity to every string; [:-1] drops the query's similarity with itself.
movie_index = len(similarity_scores)-1
similar_movies = list(enumerate(similarity_scores[movie_index][:-1]))
sorted_similar_movies = sorted(similar_movies, key=lambda x:x[1],reverse=True)

# enumerate() yields row *positions*, so titles are looked up positionally
# rather than by index label. dict.fromkeys removes repeated titles while
# keeping the best-match-first order of the sorted scores.
titles = data['Title']
ranked_movies = list(dict.fromkeys(
    titles.iloc[position]
    for position, score in sorted_similar_movies
    if score > SIMILARITY_THRESHOLD
))

# Unordered view of the matches; use ranked_movies when ranking matters.
movies = set(ranked_movies)
print(movies)

{'guardians of the galaxy', 'furious seven'}
