<h1 style="color: #492c68;">01 | BASICS</h1>

<h2 style="color: #327a81;">Libraries</h2>

In [1]:
## Basic libraries

import pandas as pd
import ast #
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.dates as mdates
import seaborn as sns

## Settings

import warnings

warnings.filterwarnings('ignore')  # silence every warning for the rest of the session
pd.set_option('display.max_columns', None)  # never hide columns when a DataFrame is displayed

In [2]:
# ML settings

from transformers import AutoTokenizer
from transformers import AutoModelForSequenceClassification
from scipy.special import softmax

<h2 style="color: #327a81;">Data Read</h2>

In [3]:
# Read the source CSV into a DataFrame
data = pd.read_csv(filepath_or_buffer="Netflx_mood_analysis.csv")

In [4]:
# Preview the first row of the dataset
data.head(n=1)

Unnamed: 0,show_id,type,title,director,cast,country,date_added,rating,listed_in,n_seasons,movie_lenght,description
0,s1,Movie,Dick Johnson Is Dead,Kirsten Johnson,Not Provided,United States,2021-09-25,PG-13,Documentaries,,90.0,"As her father nears the end of his life, filmm..."


In [5]:
# Print each column's dtype and non-null count, plus the frame's memory usage
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8807 entries, 0 to 8806
Data columns (total 12 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   show_id       8807 non-null   object 
 1   type          8807 non-null   object 
 2   title         8807 non-null   object 
 3   director      8807 non-null   object 
 4   cast          8807 non-null   object 
 5   country       8807 non-null   object 
 6   date_added    8709 non-null   object 
 7   rating        8807 non-null   object 
 8   listed_in     8807 non-null   object 
 9   n_seasons     2676 non-null   float64
 10  movie_lenght  6128 non-null   float64
 11  description   8807 non-null   object 
dtypes: float64(2), object(10)
memory usage: 825.8+ KB


In [6]:
# Work on an independent copy so that `data` keeps the values as they were read
df = data.copy(deep=True)

<h1 style="color: #492c68;">02 | SENTIMENT ANALYSIS ML</h1>

- We will use the pretrained model "Emotion English DistilRoBERTa-base" from Hugging Face, a fine-tuned version of DistilRoBERTa (a distilled variant of RoBERTa, a well-known NLP model that learns the relationship between words and their context).
- This pipeline will classify each synopsis in the catalogue into emotions.

In [7]:
# Load the sentiment analysis model and its matching tokenizer.
# from_pretrained() resolves the identifier below (Hugging Face Hub id or local cache).

MODEL = "j-hartmann/emotion-english-distilroberta-base"  # plain string: there is nothing to interpolate
tokenizer = AutoTokenizer.from_pretrained(MODEL)
model = AutoModelForSequenceClassification.from_pretrained(MODEL)

In [37]:
# Define a function that tokenizes a synopsis, applies the pretrained model and returns the mood labels.
# We will save two moods per title by default. The "neutral" label is omitted.

def mooder(text, top_n=2):
    """Return the highest-scoring non-neutral emotion labels for one synopsis.

    Parameters
    ----------
    text : str
        Synopsis to classify.
    top_n : int, default 2
        Maximum number of mood labels to return.

    Returns
    -------
    list of str
        Up to ``top_n`` labels, ordered from highest to lowest model score.
        An empty list when ``text`` is not a string (e.g. a missing value),
        so that a later ``", ".join(...)`` produces an empty string.
    """
    # A missing description (NaN/None) cannot be tokenized; report "no mood" instead of crashing.
    if not isinstance(text, str):
        return []

    import torch  # local import: torch is not imported at the top of the notebook

    # truncation=True caps the input at the model's maximum sequence length;
    # without it an over-long text makes the forward pass fail.
    encoded_text = tokenizer(text, return_tensors="pt", truncation=True)

    # Inference only: skip building the autograd graph for every row.
    with torch.no_grad():
        output = model(**encoded_text)
    scores = output[0][0].detach().numpy()

    # Index -> label order used by this notebook.
    # NOTE(review): presumably matches model.config.id2label - confirm.
    labels = ("anger", "disgust", "fear", "joy", "neutral", "sadness", "surprise")
    sc_label = dict(zip(labels, scores))

    # Raw scores are enough for ranking; no normalisation is needed to pick the top labels.
    sorted_scores = sorted(sc_label.items(), key=lambda item: item[1], reverse=True)
    top_moods = [mood for mood, _ in sorted_scores if mood != "neutral"]

    return top_moods[:top_n]

In [38]:
# Add a "mood" column holding the mooder() result for every description.
# Each row triggers one model forward pass, so this cell is the slow step of the notebook.

df["mood"] = df["description"].map(mooder)

In [39]:
# The "mood" column contains a list of labels for each entry; turn each list into a comma-separated string for easier processing.
# Values that are already strings are kept as they are, so re-running this cell does not split them into single characters.

df["mood"] = df["mood"].apply(lambda x: x if isinstance(x, str) else ", ".join(x))

In [40]:
# Spot-check five random rows of the updated frame (title, description and the new mood column)

df.loc[:, ["title", "description", "mood"]].sample(n=5)

Unnamed: 0,title,description,mood
4451,Mumbai Meri Jaan,A moving portrayal of the 2006 train bombings ...,"fear, disgust"
6149,Amy,Rare home videos and interviews with Amy Wineh...,"sadness, fear"
5778,Umrika,An ambitious man leaves his small Indian villa...,"anger, sadness"
8044,Smash: Motorized Mayhem,Buckle up for a look inside a biannual school ...,"fear, sadness"
739,Wish Dragon,Determined teen Din is longing to reconnect wi...,"sadness, surprise"


<h1 style="color: #492c68;">03 | EXPORT DATASET FOR MOOD RECOMMENDER</h1>

In [41]:
# Export the frame, now including the mood column, as the dataset for the Netflix Mood Recommender.
# index=False keeps the DataFrame index out of the file.

df.to_csv(path_or_buf="netflix_mood_recommender.csv", index=False)