In [None]:
#######################################################
#######################################################
############    COPYRIGHT - DATA SOCIETY   ############
#######################################################
#######################################################

## SENTIMENT ANALYSIS AND RECOMMENDER SYSTEMS PART 3/SENTIMENT ANALYSIS AND RECOMMENDER SYSTEMS PART 3 ##

## NOTE: To run individual pieces of code, select the line of code and
##       press ctrl + enter for PCs or command + enter for Macs




In [None]:
#=================================================-
#### Slide 9: Loading the packages  ####

import os
import pickle
import warnings
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import wordcloud
from wordcloud import WordCloud, STOPWORDS
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel
from sklearn.model_selection import train_test_split
from sklearn.metrics.pairwise import pairwise_distances
from sklearn.metrics import mean_squared_error




In [None]:
#=================================================-
#### Slide 10: Loading the packages  ####

from math import sqrt
from scipy.sparse.linalg import svds
from surprise import Reader
from surprise import Dataset
from surprise import SVD
from surprise.model_selection import cross_validate





In [None]:
#=================================================-
#### Slide 11: Directory settings  ####

# Set 'main_dir' to location of the project folder
from pathlib import Path
# Resolve the current working directory; the project folder is taken to be its parent.
home_dir = Path(".").resolve()
main_dir = home_dir.parents[0]
# data_dir is kept as a plain string because later cells concatenate file names onto it.
data_dir = f"{main_dir}/data"




In [None]:
#=================================================-
#### Slide 12: Load the dataset and check the structure  ####

# Read the ratings file (the files are tab-separated despite the .csv extension).
ratings = pd.read_csv(
    data_dir + '/ratings.csv',
    sep='\t',
    encoding='latin-1',
    usecols=['user_id', 'movie_id', 'rating'],
)
# Read the users file.
users = pd.read_csv(
    data_dir + '/users.csv',
    sep='\t',
    encoding='latin-1',
    usecols=['user_id', 'gender', 'zipcode', 'age_desc', 'occ_desc'],
)
# Read the movies file.
movies = pd.read_csv(
    data_dir + '/movies.csv',
    sep='\t',
    encoding='latin-1',
    usecols=['movie_id', 'title', 'genres'],
)
# DataFrame.info() writes its summary to stdout and returns None, so it is
# called directly; wrapping it in print() would append a stray "None" line.
ratings.info()




In [None]:
#=================================================-
#### Slide 13: Load the dataset and check the structure  ####

# DataFrame.info() prints the summary itself and returns None; calling it
# directly avoids the extra "None" line that print(df.info()) would emit.
users.info()
movies.info()




In [None]:
#=================================================-
#### Slide 14: View the head of the dataset  ####

# Preview the first three rows of each frame, one after the other.
print(ratings.head(3), users.head(3), movies.head(3), sep='\n')




In [None]:
#=================================================-
#### Slide 15: Movies - data exploration  ####

# Build one long string out of all movie titles (missing titles become "").
movies['title'] = movies['title'].fillna("").astype('str')
title_corpus = ' '.join(movies['title'].tolist())

# Generate the word cloud on a black 4000x2000 canvas, excluding STOPWORDS.
title_wordcloud = WordCloud(
    width=4000,
    height=2000,
    background_color='black',
    stopwords=STOPWORDS,
).generate(title_corpus)

# Show the rendered cloud with the axes hidden.
plt.figure(figsize=(16, 8))
plt.imshow(title_wordcloud)
plt.axis('off')
plt.show()




In [None]:
#=================================================-
#### Slide 16: Ratings - data exploration  ####

# Get summary statistics of ratings.
print(ratings['rating'].describe())
# sns.set() re-applies its own default style, so a separate set_style() call
# made before it is discarded; style and font scale are set together instead.
sns.set(style='whitegrid', font_scale=1.5)

# Display distribution of ratings. The column is passed as `x` explicitly:
# recent seaborn versions treat the first positional argument as `data`.
sns.countplot(x='rating', data=ratings)
plt.show()




In [None]:
#=================================================-
#### Slide 17: Combining dataframes  ####

# Join all 3 files into one dataframe. The join keys are spelled out so the
# result does not depend on which column names the frames happen to share.
dataset = pd.merge(pd.merge(movies, ratings, on='movie_id'), users, on='user_id')

# Display 5 movies with highest ratings.
print(dataset[['title', 'genres', 'rating']].sort_values('rating', ascending=False).head(5))

# Make a census of the genre keywords. Rows with a missing genre are dropped
# first because a NaN cannot be split or iterated, and the set is updated in
# place instead of being rebuilt on every iteration.
genre_labels = set()
for genre_list in movies['genres'].dropna().str.split('|'):
    genre_labels.update(genre_list)




In [None]:
#=================================================-
#### Slide 18: Function to count the genres  ####

# Create a function that counts the number of times each genre keyword appears.
def count_word(dataset, ref_col, census):
    """Count how often each keyword of ``census`` occurs in a '|'-delimited column.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Frame that contains the column to scan.
    ref_col : str
        Name of the column whose string values are split on '|'.
    census : iterable
        Keywords to count. Every keyword starts at zero, so keywords that
        never occur are still present in the results.

    Returns
    -------
    keyword_occurrences : list
        ``[keyword, count]`` pairs sorted by decreasing count; keywords with
        equal counts keep the order in which ``census`` yielded them.
    keyword_count : dict
        Mapping of keyword to its count.
    """
    keyword_count = {keyword: 0 for keyword in census}
    for row_keywords in dataset[ref_col].str.split('|'):
        # A missing entry comes back from .str.split as NaN, None or pd.NA
        # depending on the column dtype. None of these can be iterated, so
        # anything that is not list-like is skipped.
        if not pd.api.types.is_list_like(row_keywords):
            continue
        for keyword in row_keywords:
            # The dict holds exactly the census keywords, so this membership
            # test is equivalent to checking the census itself.
            if keyword in keyword_count and pd.notnull(keyword):
                keyword_count[keyword] += 1
    # Convert the dictionary into a list to sort the keywords by frequency.
    keyword_occurrences = sorted(
        ([keyword, count] for keyword, count in keyword_count.items()),
        key=lambda pair: pair[1],
        reverse=True,
    )
    return keyword_occurrences, keyword_count




In [None]:
#=================================================-
#### Slide 19: Function to count the genres  ####

# count_word returns the genre keywords sorted by decreasing frequency together
# with the raw count dictionary; only the sorted list is used here.
keyword_occurrences, dum = count_word(dataset=movies, ref_col='genres', census=genre_labels)
print(keyword_occurrences[0:5])




In [None]:
#=================================================-
#### Slide 21: Exercise 1  ####






In [None]:
#=================================================-
#### Slide 29: Content-based recommender implementation  ####

# Split each '|'-delimited genre string into a list, replace missing entries
# with "", and store the string form of the result back on the frame.
movies['genres'] = (
    movies['genres']
    .str.split('|')
    .fillna("")
    .astype('str')
)
print(movies['genres'].head())




In [None]:
#=================================================-
#### Slide 30: Content-based recommender implementation - cont'd  ####

# Build TF-IDF features over the genre strings using unigrams and bigrams.
# An integer min_df is an absolute document count and must be at least 1:
# recent scikit-learn releases reject an integer 0, and 1 keeps every term
# anyway because each vocabulary term occurs in at least one document.
tf = TfidfVectorizer(
    analyzer='word',
    ngram_range=(1, 2),
    min_df=1,
    stop_words='english',
)

tfidf_matrix = tf.fit_transform(movies['genres'])
print(tfidf_matrix.shape)




In [None]:
#=================================================-
#### Slide 31: Content-based recommender implementation - cont'd  ####

# Similarity scores between every pair of movies (used as cosine similarity);
# show the top-left 4x4 corner followed by the full matrix shape.
cosine_sim = linear_kernel(tfidf_matrix, tfidf_matrix)
print(cosine_sim[:4, :4], cosine_sim.shape, sep='\n')

# Keep the titles as a Series and build a reverse lookup from title to row label.
titles = movies['title']
indices = pd.Series(movies.index, index=titles)
print(titles.head(5))




In [None]:
#=================================================-
#### Slide 32: Content-based recommender implementation - cont'd  ####

# Function that gets movie recommendations based on the cosine similarity score of movie genres.
def genre_recommendations(title, top_n=20):
    """Return the titles of the movies whose genres are most similar to ``title``.

    Parameters
    ----------
    title : str
        Movie title to look up in the module-level ``indices`` Series.
    top_n : int, optional
        Maximum number of recommendations to return (default 20).

    Returns
    -------
    pandas.Series
        Entries of the module-level ``titles`` Series, ordered by decreasing
        similarity score; the queried movie itself is never included.

    Raises
    ------
    KeyError
        If ``title`` is not present in ``indices``.
    """
    idx = indices[title]
    # A title that occurs more than once makes the lookup return a Series;
    # use the first matching movie so that a single row of scores is selected.
    if isinstance(idx, pd.Series):
        idx = idx.iloc[0]
    ranked = sorted(enumerate(cosine_sim[idx]), key=lambda pair: pair[1], reverse=True)
    # Exclude the queried movie by its index rather than by position: other
    # movies can tie with it on score, and the stable sort then places
    # lower-indexed ties ahead of it.
    movie_indices = [i for i, _ in ranked if i != idx][:top_n]
    return titles.iloc[movie_indices]




In [None]:
#=================================================-
#### Slide 33: Content based recommender implementation - cont'd  ####

# Show up to 20 genre-based recommendations for Toy Story.
print(genre_recommendations(title='Toy Story (1995)').head(n=20))




In [None]:
#=================================================-
#### Slide 35: Generate content-based recommendation  ####

# Show up to 20 genre-based recommendations for Assassins.
print(genre_recommendations(title='Assassins (1995)').head(n=20))




In [None]:
#=================================================-
#### Slide 36: Generate content-based recommendation  ####

# Show up to 20 genre-based recommendations for Sense and Sensibility.
print(genre_recommendations(title='Sense and Sensibility (1995)').head(n=20))


