In [None]:
import os
import pickle
import spacy
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import plotly.express as px
import plotly.offline as offline
import plotly.graph_objs as go
import plotly.io as pio
import seaborn as sns
from ast import literal_eval
from Cluster_analysis import *

# Disable pandas' chained-assignment (SettingWithCopy) warning for the whole notebook;
# later cells assign new column values on frames obtained by boolean filtering.
pd.options.mode.chained_assignment = None

# Plotly hover template: the hover text in bold, then one line per customdata field.
# customdata[5] is not displayed.
HOVER_TEMPLATE = (
    "<b>%{hovertext}</b><br><br>"
    "Movie: %{customdata[0]}<br>"
    "Release date: %{customdata[1]}<br>"
    "Actor name: %{customdata[2]}<br>"
    "Actor age at release: %{customdata[3]}<br>"
    "Gender: %{customdata[4]}<br>"
    "Character archetype: %{customdata[6]}<br>"
    "Description: %{customdata[7]}<br>"
    "Genres: %{customdata[8]}<br>"
    "Box office revenue: %{customdata[9]}<br>"
)

# Human-readable archetype names; COLOR_PALETTE below holds one colour per entry.
TITLES = [
    'Decision-makers',
    'Heroes & anti-heroes',
    'Femme fatale',
    'Cunning evil',
    'Clumsy',
    'Virtuous',
    'Righteous warrior',
    'Benevolent leader',
    'Wise mentor',
    'Captain',
    'Ingenuous',
    'Tycoon',
    'Ruthless commander',
    'Arrogant leader',
    'Love interest',
    'Reconciliator',
    'Adventurous woman',
    'Apprentice',
    'Young lover',
    'Logistician',
    'Lawyer',
    'Stubborn fool',
    'Eccentric villain',
    'Marksman',
    'Goofy friend',
    'Hardworking learner',
    'Benevolent',
    'Sophisticated psycopath',
    'Nemesis',
    'Corrupt',
    'Good cop',
    'Musician',
    'Protector',
    'Family-oriented',
]
# Plotly's cyclical HSV colour scale.
COLOR_SCALE = px.colors.cyclical.HSV

# One colour per entry of TITLES, sampled from seaborn's "hls" palette;
# each channel is scaled by 255 and rounded.
COLOR_PALETTE = [
    [round(255 * channel) for channel in rgb]
    for rgb in sns.color_palette("hls", len(TITLES))
]

### 1. Pre-processing data 

In [None]:
final_path = 'Data/final_df.csv'


def _parse_partner_ids(raw):
    """Turn a stringified partner list into a de-duplicated list of IDs.

    The 'No partner' sentinel is returned unchanged. Otherwise the first and
    last characters (the square brackets) are dropped, the string is split on
    commas, and surrounding spaces and single quotes are stripped from every
    entry. Duplicates are removed with dict.fromkeys so that the first-seen
    order is kept and the result is identical on every run; a set would
    reorder the IDs differently between kernel sessions.
    """
    if raw == 'No partner':
        return raw
    ids = [token.strip(' ').strip('\'') for token in raw[1:-1].split(',')]
    return list(dict.fromkeys(ids))


def _parse_literal(raw):
    """Evaluate a stringified Python literal, leaving 'Not Available' untouched."""
    return literal_eval(raw) if raw != 'Not Available' else raw


# Load final_df from the tab-separated file; every missing value becomes 'Not Available'
df = pd.read_csv(final_path, sep='\t')
df = df.fillna('Not Available')

# Rows without partners get the 'No partner' sentinel, all others a list of partner IDs
df['partner'] = df['partner'].replace('Not Available', 'No partner').apply(_parse_partner_ids)

# Convert the stringified description columns back into Python objects
for column in ('descriptions', 'filtered_descriptions'):
    df[column] = df[column].apply(_parse_literal)

# Number of distinct cluster labels
num_clusters = df['labels'].nunique()

# Add colors: the label is used as an index into COLOR_PALETTE
df['colors'] = df['labels'].apply(lambda label: COLOR_PALETTE[label])

TODO: **Group characters appearing in multiple movies**

### 2. Full graph

In [None]:
# Build the full character cloud from the pre-processed frame and display it.
# NOTE(review): create_cloud is not defined in this notebook (presumably it comes from the
# Cluster_analysis star import); save=True presumably also writes the figure to disk — confirm.
fig = create_cloud(df, save=True)
fig.show()

### 3. Filtered cloud

In [None]:
# Build the filtered (per-category) cloud and display it.
# NOTE(review): create_cat_clouds is not defined in this notebook (presumably from
# Cluster_analysis); which categories it filters on must be checked in that module.
fig_romgen = create_cat_clouds(df, save=True)
fig_romgen.show()

### 4. Relations

In [None]:
# Build the relations cloud and display it.
# NOTE(review): create_rel_cloud is not defined in this notebook (presumably from
# Cluster_analysis); it presumably relies on the parsed 'partner' column — confirm.
rel_fig = create_rel_cloud(df, save=True)
rel_fig.show()

### 5. Time cloud

In [None]:
# Build the time cloud and display it.
# NOTE(review): create_time_cloud is not defined in this notebook (presumably from
# Cluster_analysis); save=True presumably also writes the figure to disk — confirm.
fig_time = create_time_cloud(df, save=True)
fig_time.show()


### 6. Word relations

In [None]:
# Map every Freebase character ID to the descriptions of its first row, built once
# so each partner lookup is a dictionary access instead of a scan of the whole frame.
descriptions_by_id = {}
for char_id, char_descriptions in zip(df['Freebase character ID'], df['descriptions']):
    descriptions_by_id.setdefault(char_id, char_descriptions)


def _partner_words(partners):
    """Concatenate the descriptions of all partners of one character.

    The 'No partner' sentinel is returned unchanged. Partners whose
    descriptions are 'Not Available' contribute nothing; a partner ID that does
    not occur in the frame is treated the same way instead of raising.
    """
    if partners == 'No partner':
        return partners
    words = []
    for partner_id in partners:
        partner_descriptions = descriptions_by_id.get(partner_id, 'Not Available')
        if partner_descriptions != 'Not Available':
            words.extend(partner_descriptions)
    return words


# Concatenate the descriptions of the partners
df['partner_description'] = df['partner'].apply(_partner_words)

# For each cluster, concatenate the descriptions of the partners
# (rows holding the 'No partner' sentinel are skipped as a whole)
grouped = df.groupby('labels').aggregate({
    'partner_description': lambda column: [
        word
        for words in column if words != 'No partner'
        for word in words
    ]
})

# Save to csv
grouped.to_csv('Data/partner_descriptions.csv', sep='\t')

In [None]:
# Reload the per-cluster partner descriptions from the tab-separated file.
# Note: list-valued cells come back from the CSV as plain strings.
partner_des = pd.read_csv('Data/partner_descriptions.csv', sep='\t')

In [None]:
# Parse the stringified list in the 'title' column into a list of words.
# The isinstance guard keeps already-parsed rows untouched, so the cell can be re-run.
df['title'] = df['title'].apply(
    lambda raw: raw.replace('[', '').replace(']', '').replace("'", '').split(', ')
    if isinstance(raw, str) else raw)

# Map every Freebase character ID to the title words of its first row, built once
# instead of filtering the whole frame inside the innermost loop.
titles_by_id = {}
for char_id, char_words in zip(df['Freebase character ID'], df['title']):
    titles_by_id.setdefault(char_id, char_words)

# For each word of a character that has partners, collect the word lists of those partners.
# Rows whose title was 'Not Available' are kept here (their parsed title is the list
# ['Not Available'], so comparing it with the string never excluded them).
dict_description = dict()
for words, partners in zip(df['title'], df['partner']):
    if partners == 'No partner':
        continue
    # loop over the partners of the character (Freebase character IDs)
    for partner_id in partners:
        if partner_id not in titles_by_id:
            # partner does not appear in the frame: nothing to associate
            continue
        partner_words = titles_by_id[partner_id]
        # Register the partner's word list under every word of the character
        for word in words:
            dict_description.setdefault(word, []).append(partner_words)



In [None]:
# flatten the lists of values for each key
# Flatten the list of partner word lists stored under each key.
# Entries that are not lists are kept as they are, so running this cell a second
# time leaves the already-flat lists unchanged instead of splitting words into characters.
for word, groups in dict_description.items():
    flat_words = []
    for group in groups:
        if isinstance(group, list):
            flat_words.extend(group)
        else:
            flat_words.append(group)
    dict_description[word] = flat_words

In [None]:
#import counter
from collections import Counter

# For each key, keep only its 5 most frequent partner words, each listed once.
# most_common already returns distinct items, ordered from most to least frequent,
# so the result contains the same words as filtering and de-duplicating through a set
# but in a deterministic order.
for word, partner_words in dict_description.items():
    dict_description[word] = [
        partner_word for partner_word, _ in Counter(partner_words).most_common(5)
    ]



In [None]:
# Create a dataframe, with a column "Descriptive word" and a column "Partner descriptions"
# One row per descriptive word, with the list of partner words collected for it
df_partner_descriptions = pd.DataFrame({
    'Descriptive word': list(dict_description.keys()),
    'Partner descriptions': list(dict_description.values()),
})

# remove the rows where the descriptive word is Not Available
# (.copy() so the column assignment below works on an independent frame, not a slice)
is_real_word = df_partner_descriptions['Descriptive word'] != 'Not Available'
df_partner_descriptions = df_partner_descriptions[is_real_word].copy()

# remove the items "Not Available" from the partner descriptions
df_partner_descriptions['Partner descriptions'] = df_partner_descriptions['Partner descriptions'].apply(
    lambda words: [word for word in words if word != 'Not Available'])

# remove the rows where the partner descriptions are empty
has_partner_words = df_partner_descriptions['Partner descriptions'].apply(len) > 0
df_partner_descriptions = df_partner_descriptions[has_partner_words]
df_partner_descriptions