In [1]:
import os

# Point the Transformers, Torch and Sentence-Transformers cache variables at
# the same project-local directory. This cell runs before the cell that
# imports `sentence_transformers`.
os.environ.update({
    'TRANSFORMERS_CACHE': './hugging_face_models/cache/',
    'TORCH_HOME': './hugging_face_models/cache/',
    'SENTENCE_TRANSFORMERS_HOME': './hugging_face_models/cache/',
})

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sentence_transformers import SentenceTransformer
import warnings

warnings.filterwarnings('ignore')
import ast
import re
import math
from scipy.spatial.distance import cosine

In [3]:
# Load the pretrained `all-mpnet-base-v2` sentence-embedding model; it is used
# further down to embed the movie overviews.
sentence_transformer = SentenceTransformer('all-mpnet-base-v2')

Downloading (…)a8e1d/.gitattributes:   0%|          | 0.00/1.18k [00:00<?, ?B/s]

Downloading (…)_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Downloading (…)b20bca8e1d/README.md:   0%|          | 0.00/10.6k [00:00<?, ?B/s]

Downloading (…)0bca8e1d/config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

Downloading (…)ce_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

Downloading (…)e1d/data_config.json:   0%|          | 0.00/39.3k [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/438M [00:00<?, ?B/s]

Downloading (…)nce_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

Downloading (…)a8e1d/tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

Downloading (…)8e1d/train_script.py:   0%|          | 0.00/13.1k [00:00<?, ?B/s]

Downloading (…)b20bca8e1d/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)bca8e1d/modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

## Reading main data

In [160]:
# Load the core movie metadata. Malformed CSV rows are skipped with a warning
# instead of aborting the read. `on_bad_lines='warn'` is the replacement for
# `error_bad_lines=False`, which was deprecated in pandas 1.3 and removed in 2.0.
df_main = pd.read_csv('../data/movies_metadata.csv', on_bad_lines='warn')

In [161]:
# Missing-value count per column of the raw metadata.
df_main.isna().sum()

adult                        0
belongs_to_collection    40972
budget                       0
genres                       0
homepage                 37684
id                           0
imdb_id                     17
original_language           11
original_title               0
overview                   954
popularity                   5
poster_path                386
production_companies         3
production_countries         3
release_date                87
revenue                      6
runtime                    263
spoken_languages             6
status                      87
tagline                  25049
title                       10
video                        6
vote_average                 6
vote_count                   6
Unnamed: 24              45466
Unnamed: 25              45465
Unnamed: 26              45465
Unnamed: 27              45465
Unnamed: 28              45465
dtype: int64

In [162]:
# Compare the row count with the number of distinct ids to spot duplicated movies.
print(f'No of records in the dataset are {len(df_main)}')
print(f"Unique movies in the dataset are {df_main['id'].nunique()}")

No of records in the dataset are 45466
Unique movies in the dataset are 45436


In [163]:
# Keep one row per movie id and discard the almost entirely null
# "Unnamed: 24".."Unnamed: 28" columns created by the CSV parse.
df_main = (
    df_main
    .drop_duplicates(subset=['id'])
    .drop(columns=['Unnamed: 24', 'Unnamed: 25', 'Unnamed: 26', 'Unnamed: 27', 'Unnamed: 28'])
)

print(f'No of records and unique movies are {df_main.shape[0], df_main.id.nunique()}')

No of records and unique movies are (45436, 45436)


In [164]:
## remove bad ids (some dates were present in the ids)

# Rows whose `id` contains a '-' hold a date instead of a numeric identifier.
# A boolean mask is used instead of a -999 sentinel so that the count printed
# below cannot raise KeyError when the data contains no such rows.
bad_id_mask = df_main['id'].astype(str).str.contains('-', regex=False)

print(int(bad_id_mask.sum()))

# Drop the malformed rows first, then cast the remaining ids to integers.
df_main = df_main[~bad_id_mask].reset_index(drop=True)
df_main['id'] = df_main['id'].astype(str).astype(int)

df_main.shape

3


(45433, 24)

## Reading metadata about movies

In [165]:
# Load the supporting tables that are joined onto / mapped into the main frame
# below: credits (cast and crew), plot keywords, and user ratings.
credits = pd.read_csv('../data/credits.csv')
keywords = pd.read_csv('../data/keywords.csv')
ratings = pd.read_csv('../data/ratings.csv')

# Quick size check of the three frames.
print(credits.shape,keywords.shape,ratings.shape)

(45476, 3) (46419, 2) (26024289, 4)


In [166]:
# The ratings dataframe contains many user ratings per movie, so collapse them
# into a {movieId: median rating} dictionary for later lookup with `.map`.

movie_agg_rating = ratings.groupby('movieId')['rating'].median().to_dict()

In [167]:
# Combine metadata, credits and keywords into a single dataframe.
# Left joins keep every movie from `df_main`; the printed shapes show that the
# joins add rows, which the next cell removes again by de-duplicating on `id`.

print(df_main.shape)

df_total = (
    df_main
    .merge(credits, how='left', on='id')
    .merge(keywords, how='left', on='id')
)

print(df_total.shape)

(45433, 24)
(46497, 27)


In [169]:
# Collapse the rows duplicated by the joins back to one row per movie id.
df_total = df_total.drop_duplicates(subset='id')
print(len(df_total), df_total['id'].nunique())

45433 45433


In [187]:
# Missing credits/keywords become the text '[]' so that `ast.literal_eval` in
# the extraction helpers yields an empty list; missing overviews become ''.
# The filled frame is assigned back instead of calling
# `df_total[col].fillna(..., inplace=True)`: that form is chained assignment,
# which newer pandas versions (Copy-on-Write) do not guarantee to write back.
df_total = df_total.fillna({
    'cast': '[]',
    'crew': '[]',
    'keywords': '[]',
    'overview': '',
})

df_total.isnull().sum()

adult                        0
belongs_to_collection    40945
budget                       0
genres                       0
homepage                 37659
id                           0
imdb_id                     17
original_language           11
original_title               0
overview                     0
popularity                   3
poster_path                386
production_companies         3
production_countries         3
release_date                87
revenue                      3
runtime                    260
spoken_languages             3
status                      84
tagline                  25027
title                        7
video                        3
vote_average                 3
vote_count                   3
cast                         0
crew                         0
keywords                     0
cast_members                 0
creative_members             0
agg_movie_rating         37868
plot_keywords                0
dtype: int64

## Feature Engineering

In [171]:
# getting the cast and creative team information from the credits dataframe

def return_film_cast(cast_json):
    """Return the cast names stored in one movie's serialized cast list.

    Parameters
    ----------
    cast_json : str
        Python-literal text of a list of dicts; parsed with ``ast.literal_eval``.

    Returns
    -------
    list
        The ``'name'`` value of every entry that has one, in original order.
    """
    parsed_cast = ast.literal_eval(cast_json)
    return [member['name'] for member in parsed_cast if 'name' in member.keys()]

def return_film_creative_team(crew_json):
    """Return the names of crew members in the creative departments.

    Parameters
    ----------
    crew_json : str
        Python-literal text of a list of dicts; parsed with ``ast.literal_eval``.

    Returns
    -------
    list
        Names of entries whose ``'department'`` is Directing, Writing or
        Editing. Entries without a ``'department'`` or ``'name'`` key are
        skipped.
    """
    creative_departments = ['Directing', 'Writing', 'Editing']
    parsed_crew = ast.literal_eval(crew_json)
    return [
        member['name']
        for member in parsed_crew
        if 'department' in member.keys()
        and member['department'] in creative_departments
        and 'name' in member.keys()
    ]

In [172]:
# Parse the serialized cast/crew columns into plain lists of names.
df_total['cast_members'] = df_total['cast'].apply(return_film_cast)
df_total['creative_members'] = df_total['crew'].apply(return_film_creative_team)
# Look up each movie's median user rating; ids absent from the dictionary become NaN.
# NOTE(review): `movie_agg_rating` is keyed by the `movieId` column of
# ratings.csv, while `df_total['id']` comes from movies_metadata.csv. Confirm
# that both use the same identifier scheme; the 37,868 nulls reported for
# `agg_movie_rating` suggest that many ids do not match.
df_total['agg_movie_rating'] = df_total['id'].map(movie_agg_rating)

In [173]:
# get all plot keywords for the movie
def return_film_plot_keywords(plot_json):
    """Return the keyword names stored in one movie's serialized keyword list.

    Parameters
    ----------
    plot_json : str
        Python-literal text of a list of dicts; parsed with ``ast.literal_eval``.

    Returns
    -------
    list
        The ``'name'`` value of every entry that has one, in original order.
    """
    parsed_keywords = ast.literal_eval(plot_json)
    return [entry['name'] for entry in parsed_keywords if 'name' in entry.keys()]

# Parse the serialized `keywords` column into a list of keyword names per movie.
df_total['plot_keywords'] = df_total['keywords'].apply(return_film_plot_keywords)

In [3]:
# Getting sentence embeddings from sentence transformers

def return_preprocessed_text(text):
    """Normalise free text to lowercase ASCII alphanumerics separated by single spaces.

    Parameters
    ----------
    text : str
        Raw text such as an overview or a title.

    Returns
    -------
    str
        Lower-cased text in which every character outside ``a-z0-9`` has been
        replaced by a space, runs of whitespace are collapsed to one space and
        leading/trailing whitespace is removed. Non-ASCII letters are therefore
        dropped as well.
    """
    text = text.lower()
    # Raw-string patterns: "\s" inside a normal string literal is an invalid
    # escape sequence and triggers a warning on current Python versions.
    text = re.sub(r"[^a-z0-9]", " ", text)
    text = re.sub(r"\s+", " ", text)
    return text.strip()


def return_sentence_embeddings(text):
    """Return the result of ``sentence_transformer.encode(text)``.

    Uses the module-level ``sentence_transformer`` model.
    """
    return sentence_transformer.encode(text)

# Clean each overview, then embed the cleaned text with the sentence-transformer model.
# `return_sentence_embeddings` runs through `.apply`, i.e. one `encode` call per movie.
# NOTE(review): passing all texts to a single `encode` call would presumably be
# much faster. Confirm against the sentence-transformers documentation.
df_total['overview_cleaned'] = df_total['overview'].apply(return_preprocessed_text)
df_total['overview_embeddings'] = df_total['overview_cleaned'].apply(return_sentence_embeddings)
df_total.head()

In [192]:
# List the columns available after feature extraction.
df_total.columns

Index(['adult', 'belongs_to_collection', 'budget', 'genres', 'homepage', 'id',
       'imdb_id', 'original_language', 'original_title', 'overview',
       'popularity', 'poster_path', 'production_companies',
       'production_countries', 'release_date', 'revenue', 'runtime',
       'spoken_languages', 'status', 'tagline', 'title', 'video',
       'vote_average', 'vote_count', 'cast', 'crew', 'keywords',
       'cast_members', 'creative_members', 'agg_movie_rating', 'plot_keywords',
       'overview_cleaned', 'overview_embeddings'],
      dtype='object')

## Continue

In [4]:
import joblib

# joblib.dump(df_total,open('../data/df_intermediate_features.pkl','wb'))

# Resume from the cached intermediate features instead of recomputing the
# embeddings. The file is opened in a `with` block so the handle is closed
# after loading.
# NOTE: this is a pickle-based format; only load files you created yourself.
with open('../data/df_intermediate_features.pkl', 'rb') as features_file:
    df_total = joblib.load(features_file)
df_total.head()

Unnamed: 0,adult,belongs_to_collection,budget,genres,homepage,id,imdb_id,original_language,original_title,overview,...,vote_count,cast,crew,keywords,cast_members,creative_members,agg_movie_rating,plot_keywords,overview_cleaned,overview_embeddings
0,False,"{'id': 10194, 'name': 'Toy Story Collection', ...",30000000,"[{'id': 16, 'name': 'Animation'}, {'id': 35, '...",http://toystory.disney.com/toy-story,862,tt0114709,en,Toy Story,"Led by Woody, Andy's toys live happily in his ...",...,5415,"[{'cast_id': 14, 'character': 'Woody (voice)',...","[{'credit_id': '52fe4284c3a36847f8024f49', 'de...","[{'id': 931, 'name': 'jealousy'}, {'id': 4290,...","[Tom Hanks, Tim Allen, Don Rickles, Jim Varney...","[John Lasseter, Joss Whedon, Andrew Stanton, J...",4.0,"[jealousy, toy, boy, friendship, friends, riva...",led by woody andy s toys live happily in his r...,"[0.044583935, 0.047805555, -0.0029498255, -0.0..."
1,False,,65000000,"[{'id': 12, 'name': 'Adventure'}, {'id': 14, '...",,8844,tt0113497,en,Jumanji,When siblings Judy and Peter discover an encha...,...,2413,"[{'cast_id': 1, 'character': 'Alan Parrish', '...","[{'credit_id': '52fe44bfc3a36847f80a7cd1', 'de...","[{'id': 10090, 'name': 'board game'}, {'id': 1...","[Robin Williams, Jonathan Hyde, Kirsten Dunst,...","[Jonathan Hensleigh, Joe Johnston, Robert Dalv...",4.0,"[board game, disappearance, based on children'...",when siblings judy and peter discover an encha...,"[0.044435084, -0.021271866, -0.041723467, 0.04..."
2,False,"{'id': 119050, 'name': 'Grumpy Old Men Collect...",0,"[{'id': 10749, 'name': 'Romance'}, {'id': 35, ...",,15602,tt0113228,en,Grumpier Old Men,A family wedding reignites the ancient feud be...,...,92,"[{'cast_id': 2, 'character': 'Max Goldman', 'c...","[{'credit_id': '52fe466a9251416c75077a89', 'de...","[{'id': 1495, 'name': 'fishing'}, {'id': 12392...","[Walter Matthau, Jack Lemmon, Ann-Margret, Sop...","[Howard Deutch, Mark Steven Johnson, Mark Stev...",,"[fishing, best friend, duringcreditsstinger, o...",a family wedding reignites the ancient feud be...,"[-0.02628604, 0.02050111, -0.008671549, 0.0072..."
3,False,,16000000,"[{'id': 35, 'name': 'Comedy'}, {'id': 18, 'nam...",,31357,tt0114885,en,Waiting to Exhale,"Cheated on, mistreated and stepped on, the wom...",...,34,"[{'cast_id': 1, 'character': ""Savannah 'Vannah...","[{'credit_id': '52fe44779251416c91011acb', 'de...","[{'id': 818, 'name': 'based on novel'}, {'id':...","[Whitney Houston, Angela Bassett, Loretta Devi...","[Forest Whitaker, Ronald Bass, Terry McMillan,...",,"[based on novel, interracial relationship, sin...",cheated on mistreated and stepped on the women...,"[0.047403567, 0.05893014, -0.008000421, -0.029..."
4,False,"{'id': 96871, 'name': 'Father of the Bride Col...",0,"[{'id': 35, 'name': 'Comedy'}]",,11862,tt0113041,en,Father of the Bride Part II,Just when George Banks has recovered from his ...,...,173,"[{'cast_id': 1, 'character': 'George Banks', '...","[{'credit_id': '52fe44959251416c75039ed7', 'de...","[{'id': 1009, 'name': 'baby'}, {'id': 1599, 'n...","[Steve Martin, Diane Keaton, Martin Short, Kim...","[Nancy Meyers, Albert Hackett, Charles Shyer, ...",,"[baby, midlife crisis, confidence, aging, daug...",just when george banks has recovered from his ...,"[-0.018180072, -0.017848901, -0.013282082, -0...."


In [5]:
def return_preprocessed_text_lists(l_text):
    """Normalise every string of a list to lowercase ASCII alphanumerics.

    Parameters
    ----------
    l_text : iterable of str
        For example cast names, genres or plot keywords.

    Returns
    -------
    list of str
        One cleaned string per input item, in the same order: lower-cased,
        characters outside ``a-z0-9`` replaced by spaces, whitespace runs
        collapsed and surrounding whitespace stripped.
    """
    # Raw-string patterns avoid the invalid "\s" escape sequence of a normal
    # string literal.
    return [
        re.sub(r"\s+", " ", re.sub(r"[^a-z0-9]", " ", text.lower())).strip()
        for text in l_text
    ]

def extract_names_from_list(generic_list):
    """Best-effort extraction of ``'name'`` values from a serialized list of dicts.

    Parameters
    ----------
    generic_list : str
        Python-literal text of a list of dicts (parsed with
        ``ast.literal_eval``). Anything that cannot be parsed or iterated this
        way is tolerated.

    Returns
    -------
    list
        The names collected before any failure occurred; an empty list when the
        value could not be parsed at all.
    """
    result = []
    try:
        for response in ast.literal_eval(generic_list):
            if 'name' in response.keys():
                result.append(response['name'])
    except Exception:
        # Deliberately tolerant of malformed cells, but no longer a bare
        # `except:`, which would also swallow KeyboardInterrupt/SystemExit.
        pass
    return result
    

def extract_released_year(value):
    """Return the calendar year of a release date, or -1 when it is unavailable.

    Parameters
    ----------
    value : object
        Anything ``pd.to_datetime`` accepts, typically a date string.

    Returns
    -------
    int
        The year, or ``-1`` for missing or unparseable values. ``-1`` is the
        missing-value marker that ``compute_closeness_score`` checks for, and
        the value the calling cell fills in with ``fillna(-1)``; returning 0
        here would bypass both.
    """
    try:
        # NaN parses to NaT, whose `.year` is NaN, so `int` raises ValueError;
        # None passes through unchanged and fails with AttributeError.
        return int(pd.to_datetime(value).year)
    except (ValueError, TypeError, AttributeError, OverflowError):
        return -1
    
def fix_runtime(value):
    """Convert a runtime value to ``int``, returning -1 when that is impossible.

    NaN, infinities, ``None`` and non-numeric text all map to ``-1``.
    """
    try:
        return int(value)
    except (ValueError, TypeError, OverflowError):
        # Only conversion failures are handled; a bare `except:` would also
        # hide interrupts and unrelated errors.
        return -1
    
def fix_vote_average(value):
    """Convert a vote average to ``float``, returning -1 when that is impossible.

    ``None`` and non-numeric text map to ``-1.0``. NaN is a valid float and is
    returned unchanged, so fill missing values before calling.
    """
    try:
        return float(value)
    except (ValueError, TypeError, OverflowError):
        # Only conversion failures are handled; a bare `except:` would also
        # hide interrupts and unrelated errors.
        return -1.0

In [6]:
# Normalise the text of the list-valued features extracted earlier.
df_total['cast_members'] = df_total['cast_members'].apply(return_preprocessed_text_lists)
df_total['creative_members'] = df_total['creative_members'].apply(return_preprocessed_text_lists)
df_total['plot_keywords'] = df_total['plot_keywords'].apply(return_preprocessed_text_lists)

# Pull the names out of the serialized genre / company lists, then normalise them.
# NOTE: not idempotent. On a second run the columns already hold lists, so
# `extract_names_from_list` takes its error path and returns [] for every row.
df_total['genres'] = df_total['genres'].apply(extract_names_from_list)
df_total['genres'] = df_total['genres'].apply(return_preprocessed_text_lists)

df_total['production_companies'] = df_total['production_companies'].apply(extract_names_from_list)
df_total['production_companies'] = df_total['production_companies'].apply(return_preprocessed_text_lists)

df_total['released_year'] = df_total['release_date'].apply(extract_released_year).fillna(-1).astype(int)

# Fill missing numerics with the -1 marker and coerce the types. The filled
# series is assigned back instead of using `df_total[col].fillna(..., inplace=True)`,
# which is chained assignment and is not guaranteed to write back in newer pandas.
df_total['runtime'] = df_total['runtime'].fillna(-1).apply(fix_runtime).astype(int)
df_total['vote_average'] = df_total['vote_average'].fillna(-1).apply(fix_vote_average)

# Drop movies without a title; the two printed shapes show how many rows were removed.
print(df_total.shape)
df_total['title'] = df_total['title'].fillna('')
df_total = df_total[df_total['title'] != ''].reset_index(drop=True)

df_total['title_clean'] = df_total['title'].apply(return_preprocessed_text)
print(df_total.shape)

(45433, 34)
(45426, 35)


In [7]:
# List the columns after cleaning; `released_year` and `title_clean` are new.
df_total.columns

Index(['adult', 'belongs_to_collection', 'budget', 'genres', 'homepage', 'id',
       'imdb_id', 'original_language', 'original_title', 'overview',
       'popularity', 'poster_path', 'production_companies',
       'production_countries', 'release_date', 'revenue', 'runtime',
       'spoken_languages', 'status', 'tagline', 'title', 'video',
       'vote_average', 'vote_count', 'cast', 'crew', 'keywords',
       'cast_members', 'creative_members', 'agg_movie_rating', 'plot_keywords',
       'overview_cleaned', 'overview_embeddings', 'released_year',
       'title_clean'],
      dtype='object')

In [8]:
# Preview the columns that feed the similarity features.
df_total[['title','title_clean','adult','genres','production_companies','released_year','runtime','vote_average','cast_members','creative_members','plot_keywords','overview_embeddings']].head()

Unnamed: 0,title,title_clean,adult,genres,production_companies,released_year,runtime,vote_average,cast_members,creative_members,plot_keywords,overview_embeddings
0,Toy Story,toy story,False,"[animation, comedy, family]",[pixar animation studios],1995,81,7.7,"[tom hanks, tim allen, don rickles, jim varney...","[john lasseter, joss whedon, andrew stanton, j...","[jealousy, toy, boy, friendship, friends, riva...","[0.044583935, 0.047805555, -0.0029498255, -0.0..."
1,Jumanji,jumanji,False,"[adventure, fantasy, family]","[tristar pictures, teitler film, interscope co...",1995,104,6.9,"[robin williams, jonathan hyde, kirsten dunst,...","[jonathan hensleigh, joe johnston, robert dalv...","[board game, disappearance, based on children ...","[0.044435084, -0.021271866, -0.041723467, 0.04..."
2,Grumpier Old Men,grumpier old men,False,"[romance, comedy]","[warner bros, lancaster gate]",1995,101,6.5,"[walter matthau, jack lemmon, ann margret, sop...","[howard deutch, mark steven johnson, mark stev...","[fishing, best friend, duringcreditsstinger, o...","[-0.02628604, 0.02050111, -0.008671549, 0.0072..."
3,Waiting to Exhale,waiting to exhale,False,"[comedy, drama, romance]",[twentieth century fox film corporation],1995,127,6.1,"[whitney houston, angela bassett, loretta devi...","[forest whitaker, ronald bass, terry mcmillan,...","[based on novel, interracial relationship, sin...","[0.047403567, 0.05893014, -0.008000421, -0.029..."
4,Father of the Bride Part II,father of the bride part ii,False,[comedy],"[sandollar productions, touchstone pictures]",1995,106,5.7,"[steve martin, diane keaton, martin short, kim...","[nancy meyers, albert hackett, charles shyer, ...","[baby, midlife crisis, confidence, aging, daug...","[-0.018180072, -0.017848901, -0.013282082, -0...."


In [9]:
# Verify that none of the feature columns contain missing values.
df_total[['title','title_clean','adult','genres','production_companies','released_year','runtime','vote_average','cast_members','creative_members','plot_keywords','overview_embeddings']].isnull().sum()

title                   0
title_clean             0
adult                   0
genres                  0
production_companies    0
released_year           0
runtime                 0
vote_average            0
cast_members            0
creative_members        0
plot_keywords           0
overview_embeddings     0
dtype: int64

In [10]:
# Keep only the columns used by the similarity computation.
df_features = df_total[[
    'title',
    'title_clean',
    'adult',
    'genres',
    'production_companies',
    'released_year',
    'runtime',
    'vote_average',
    'cast_members',
    'creative_members',
    'plot_keywords',
    'overview_embeddings',
]]
df_features.shape

(45426, 12)

In [11]:
def compute_jaccard_similarity(list1, list2):
    """Return the Jaccard similarity of two collections as a percentage.

    Parameters
    ----------
    list1, list2 : list or str
        Collections of hashable items. A string is split on single spaces and
        its words are compared.

    Returns
    -------
    float or int
        ``100 * |intersection| / |union|`` over the distinct items, or ``0``
        when both inputs are empty.
    """
    if isinstance(list1, str):
        list1 = list1.split(' ')
    if isinstance(list2, str):
        list2 = list2.split(' ')

    # Built-in sets give the same distinct-item counts as NumPy's set routines
    # without building arrays for every pair of movies.
    first, second = set(list1), set(list2)
    union = first | second
    if not union:
        # Explicit empty check instead of catching ZeroDivisionError with a bare except.
        return 0
    score = len(first & second) / len(union)
    return score * 100

def compute_film_rating_similarity(rating1,rating2):
    """Return 100 when the two values compare equal, otherwise 0."""
    return 100 if rating1 == rating2 else 0
    

def compute_closeness_score(value1,value2):
    """Score how close two numbers are as ``smaller / larger * 100``.

    Parameters
    ----------
    value1, value2 : number
        Values to compare; ``-1`` marks a missing value.

    Returns
    -------
    float or int
        ``0`` when either value is ``-1``; ``100`` when the larger value is 0
        (for non-negative inputs this means both are 0); otherwise the ratio of
        the smaller to the larger value, times 100.
    """
    if value1 == -1 or value2 == -1:
        return 0

    smaller, larger = (value1, value2) if value1 < value2 else (value2, value1)
    if larger == 0:
        # Explicit guard instead of relying on ZeroDivisionError: NumPy scalars
        # do not raise on division by zero, they return nan/inf.
        return 100

    ratio = smaller / larger
    return ratio * 100


def return_normalized_values(series):
    """Min-max scale a numeric Series to the range 0-100.

    Parameters
    ----------
    series : pandas.Series
        Numeric values.

    Returns
    -------
    pandas.Series
        ``(x - min) * 100 / (max - min)`` for every element, on the same index.
        When all values are equal the range is zero, and a Series of ``0.0`` is
        returned instead of dividing by zero.
    """
    minimum = series.min()
    maximum = series.max()
    diff = maximum - minimum
    if diff == 0:
        return pd.Series(0.0, index=series.index, name=series.name)
    # Vectorised arithmetic; same per-element formula as an element-wise apply.
    return (series - minimum) * 100 / diff


**Features**
1. Film Rating (the `adult` flag) - Same or not
2. Genres - Jaccard Score
3. Production Companies - Jaccard Score
4. Release Date - Closer release years get a higher score
5. Runtime - Closer runtimes get a higher score
6. Vote Average - Closer averages get a higher score
7. Cast Members - Jaccard Score
8. Creative Members - Jaccard Score
9. Movie Plot keywords - Jaccard Score
10. Overview Embeddings - Cosine Similarity
11. Title - Jaccard Score

**Next steps**<br>
Write the code and test it on a sample of the data. Run the pipeline on the whole dataset after a successful run on the sample.

**Checks**<br>
Check for missing values in columns such as runtime, vote average and released year.

In [23]:
# sample = df_features.loc[:50]
# sample.shape

# Restrict the pairwise comparison to movies released after 2012 and reset the
# index so that labels run from 0 again.
print(f'Total shape is {df_features.shape}')
df_features_2013 = df_features.loc[df_features['released_year'].gt(2012)].reset_index(drop=True)
print(f'Total shape after taking movies post 2012 {df_features_2013.shape}')

Total shape is (45426, 12)
Total shape after taking movies post 2012 (7904, 12)


In [24]:
from tqdm import tqdm

In [None]:
# Module-level stores filled by the scoring loop in this cell:
#   movies_to_movies_score_mapper[movie1][movie2] -> dict of per-feature scores
#   recommendatation_model[movie]                 -> top-ranked (title, score) pairs
movies_to_movies_score_mapper = dict()
recommendatation_model = dict()


def initialize_dictionary(movie1,movie2,title_score,genre_score,production_companies_score,
                          cast_score,creative_score,plot_keyword_score,film_rating_score,
                          released_year_score,runtime_score,vote_avg_score,plot_similarity_score,final_score):
    """Record all scores of the ordered pair (movie1, movie2).

    Creates the nested entries of ``movies_to_movies_score_mapper`` on demand
    and overwrites any scores already stored for the pair. Returns ``None``.
    """
    pair_scores = movies_to_movies_score_mapper.setdefault(movie1, {}).setdefault(movie2, {})
    pair_scores.update({
        'title_score': title_score,
        'genre_score': genre_score,
        'production_companies_score': production_companies_score,
        'cast_score': cast_score,
        'creative_score': creative_score,
        'plot_keyword_score': plot_keyword_score,
        'film_rating_score': film_rating_score,
        'released_year_score': released_year_score,
        'runtime_score': runtime_score,
        'vote_avg_score': vote_avg_score,
        'plot_similarity_score': plot_similarity_score,
        'final_score': final_score,
    })
    

# Score every unordered pair of movies exactly once (each row against the rows
# after it) and store the per-feature scores under both orderings.
# Relies on `df_features_2013` having a default 0..n-1 index, so the label
# yielded by `iterrows` can be used as a position in `.iloc[index + 1:]`.
# NOTE(review): twelve scores are stored for every ordered pair, so memory use
# grows quadratically with the number of movies.
for index, row in tqdm(df_features_2013.iterrows(), total=len(df_features_2013)):
    movie1 = row['title']
    for _, row2 in df_features_2013.iloc[index + 1:].iterrows():

        title_score = compute_jaccard_similarity(row['title_clean'], row2['title_clean'])
        genre_score = compute_jaccard_similarity(row['genres'], row2['genres'])
        production_companies_score = compute_jaccard_similarity(row['production_companies'], row2['production_companies'])
        cast_score = compute_jaccard_similarity(row['cast_members'], row2['cast_members'])
        creative_score = compute_jaccard_similarity(row['creative_members'], row2['creative_members'])
        plot_keyword_score = compute_jaccard_similarity(row['plot_keywords'], row2['plot_keywords'])

        film_rating_score = compute_film_rating_similarity(row['adult'], row2['adult'])
        released_year_score = compute_closeness_score(row['released_year'], row2['released_year'])
        runtime_score = compute_closeness_score(row['runtime'], row2['runtime'])
        vote_avg_score = compute_closeness_score(row['vote_average'], row2['vote_average'])

        plot_similarity_score = (1 - cosine(row['overview_embeddings'], row2['overview_embeddings'])) * 100

        movie2 = row2['title']

        # The weighted sum must be a single parenthesised expression. Without
        # the parentheses, each continuation line starting with "+" is a
        # separate no-op statement and only the first four terms are summed.
        # The weights add up to 1.0.
        final_score = (
            plot_similarity_score*0.3 + 0.1*genre_score + 0.1*cast_score + 0.1*creative_score
            + 0.1*plot_keyword_score + 0.2*title_score + 0.02*vote_avg_score + 0.02*production_companies_score
            + 0.02*film_rating_score + 0.02*released_year_score + 0.02*runtime_score
        )

        initialize_dictionary(movie1, movie2, title_score, genre_score, production_companies_score,
                              cast_score, creative_score, plot_keyword_score, film_rating_score,
                              released_year_score, runtime_score, vote_avg_score, plot_similarity_score, final_score)

        initialize_dictionary(movie2, movie1, title_score, genre_score, production_companies_score,
                              cast_score, creative_score, plot_keyword_score, film_rating_score,
                              released_year_score, runtime_score, vote_avg_score, plot_similarity_score, final_score)

# Rank the neighbours of every movie from the symmetric score table. Ranking
# inside the loop above would only see the movies that come later in the frame,
# leaving the last movies with few or no candidates.
for movie1 in df_features_2013['title']:
    scored_neighbours = movies_to_movies_score_mapper.get(movie1, {})
    temporary_dictionary = {
        movie2: scores['final_score']
        for movie2, scores in scored_neighbours.items()
        if movie2 != movie1  # rows sharing a title must not recommend themselves
    }
    recommendatation_model[movie1] = sorted(temporary_dictionary.items(), key=lambda x: x[1], reverse=True)[:3]
    

2519it [2:17:53,  2.60s/it]

In [None]:
# Display the full title -> top-ranked (title, score) mapping.
recommendatation_model

In [None]:
# List the available columns of the full dataframe.
df_total.columns

In [None]:
# Same column listing as the cell directly above.
df_total.columns

In [None]:
def depth_limited_search(recommendation_model, source_movie, limit=2):
    """Collect the movies reachable from ``source_movie`` within ``limit`` hops.

    Parameters
    ----------
    recommendation_model : dict
        Maps a title to a sequence whose items carry the neighbouring title at
        position 0 (e.g. ``(title, score)`` tuples).
    source_movie : hashable
        Title at which the traversal starts; must be a key of the model.
    limit : int, default 2
        Maximum depth of a visited movie; the source is at depth 0.

    Returns
    -------
    list
        Visited titles in depth-first order (the most recently pushed
        neighbour is explored first), without the source movie itself.
    """
    # One stack of (title, depth) pairs.
    pending = [(source_movie, 0)]
    recommendations = []

    while pending:
        current_movie, current_depth = pending.pop()

        if current_depth > limit or current_movie in recommendations:
            continue

        next_depth = current_depth + 1
        pending.extend(
            (neighbour[0], next_depth)
            for neighbour in recommendation_model[current_movie]
        )
        recommendations.append(current_movie)

    recommendations.remove(source_movie)
    return recommendations

In [None]:
# Example: recommendations reachable from 'Airlift' with the default depth limit.
depth_limited_search(recommendatation_model,'Airlift')

In [None]:
# dump the recommendation model
# (this line needs its leading "#"; as plain text it makes the cell a SyntaxError)

# The `with` block closes the file handle once the dump has finished.
with open('../data/recommendation_model.pkl', 'wb') as model_file:
    joblib.dump(recommendatation_model, model_file)