In [1]:
import re
import pandas as pd
import transformers
from transformers import pipeline
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import json
from collections import defaultdict

In [2]:
def get_classifier(model_name="nlptown/bert-base-multilingual-uncased-sentiment"):
    """Build a Hugging Face 'sentiment-analysis' pipeline.

    Parameters
    ----------
    model_name : str, optional
        Model identifier handed to both ``AutoTokenizer.from_pretrained`` and
        ``pipeline``. Defaults to the model this notebook was written for.

    Returns
    -------
    The object returned by ``transformers.pipeline``.
    """
    # The tokenizer is loaded explicitly from the same identifier so that the
    # pipeline uses the tokenizer matching the model.
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    return pipeline('sentiment-analysis', model=model_name, tokenizer=tokenizer)

In [4]:
# Build the pipeline once; get_dic_classifier_stars_genre reads this
# module-level `classifier` name when it scores the review texts.
classifier = get_classifier()

In [5]:
def get_dic_classifier_stars_genre(data):
    """Average the classifier's star rating of the reviews in each column.

    Every column of ``data`` is treated as one film and every cell as one
    review text. A cell equal to the string 'None', or any non-string cell
    (for example NaN), counts as a missing review and is left out of the
    average. Every other cell is cut to its first 512 characters and passed to
    the module-level ``classifier``; the first character of the top
    prediction's label is parsed as the star count. A parsed star count of 0
    is treated like a missing review.

    Parameters
    ----------
    data : pandas.DataFrame
        One column per film; cells hold review texts or 'None'.

    Returns
    -------
    dict
        Column label -> mean star count rounded to one decimal, or 0 when the
        column holds no rated review.
    """
    def _stars_for(text):
        # Missing reviews never reach the model; they are encoded as 0 stars.
        if not isinstance(text, str) or text == 'None':
            return 0
        prediction = classifier(text[:512])
        # assumes labels start with the star digit (e.g. "4 stars") - TODO confirm
        return int(prediction[0]['label'][:1])

    dic = {}
    for film_name in data.columns.to_list():
        stars = pd.Series([_stars_for(text) for text in data[film_name]], dtype='int64')
        number_rated = int((stars != 0).sum())
        if number_rated == 0:
            dic[film_name] = 0
            continue
        # Zeros add nothing to the sum, so dividing by the rated count gives
        # the mean over rated reviews only.
        mean = stars.sum() / number_rated
        dic[film_name] = mean.round(1) if mean != 0 else 0
    return dic

In [6]:
def get_number_comments(df):
    """Count, per column, the cells that actually hold a comment.

    A cell is considered empty when it equals the string 'None' or is a
    missing value (NaN/None).

    Parameters
    ----------
    df : pandas.DataFrame
        One column per film; cells hold comment texts or 'None'.

    Returns
    -------
    dict
        Column label -> number of non-empty cells in that column (int).
    """
    dic = {}
    for film_name in df.columns.to_list():
        column = df[film_name]
        is_empty = column.isna() | (column == 'None')
        # Subtract from the real row count instead of a fixed total so frames
        # of any length are counted correctly.
        dic[film_name] = len(column) - int(is_empty.sum())
    return dic

In [7]:
def get_star_evaluation_user_dic(df_genre_stars, dic_number_comments):
    """Average the user star ratings of each film.

    For every film in ``dic_number_comments`` the cells of the matching column
    of ``df_genre_stars`` are stripped of '[' and ']', split on ',' and
    flattened into one list. Only the first ``dic_number_comments[film]``
    entries are kept; empty entries count as a rating of 0.

    Parameters
    ----------
    df_genre_stars : pandas.DataFrame
        One column per film; cells are strings such as '[4.0, 3.5]' or '[]'.
    dic_number_comments : dict
        Film name -> number of rating entries to keep for that film.

    Returns
    -------
    dict
        Film name -> mean rating rounded to one decimal, or 0 when the mean is
        0 or no entries are kept.
    """
    dic = {}
    for film, number_comments in dic_number_comments.items():
        entries = []
        for cell in df_genre_stars[film].to_list():
            # Plain replace removes the list brackets without a regex (and
            # without the invalid '\[' escape a non-raw pattern would need).
            entries.extend(cell.replace('[', '').replace(']', '').split(","))
        kept = entries[0:number_comments]
        # Whitespace-only entries are treated like empty ones.
        ratings = [float(entry) if entry.strip() != '' else 0 for entry in kept]
        if not ratings:
            # A film without comments has no ratings to average.
            dic[film] = 0
            continue
        mean = sum(ratings) / len(ratings)
        dic[film] = round(mean, 1) if mean != 0 else 0
    return dic

In [8]:
def create_df_classifier_user_number(dic_genre_critics, dic_star_evaluation_user_genre, dic_genre_number_comments):
    """Combine three per-film dictionaries into one table.

    Each dictionary becomes a two-column frame (key, value); the frames are
    placed side by side by row position and the key columns of the second and
    third dictionary are discarded. Rows therefore line up only when the three
    dictionaries list their films in the same order.

    Returns a DataFrame with the columns 'film_name', 'stars_classifier',
    'stars_user' and 'number_of_comments'.
    """
    sources = (
        (dic_genre_critics, ['film_name', 'stars_classifier']),
        (dic_star_evaluation_user_genre, ['film_name1', 'stars_user']),
        (dic_genre_number_comments, ['film_name2', 'number_of_comments']),
    )
    frames = [
        pd.DataFrame(list(mapping.items()), columns=column_names)
        for mapping, column_names in sources
    ]
    side_by_side = pd.concat(frames, axis=1)
    return side_by_side.drop(columns=['film_name1', 'film_name2'])

In [9]:
def get_lists_same_film_names_genre(dic_film_number_genre_ge, dic_film_number_genre_sp):
    """Pair up the values stored under keys that occur in both dictionaries.

    Returns two lists of equal length: the values from the first dictionary
    and the values from the second dictionary for every shared key, in the key
    order of the first dictionary.
    """
    list_ge, list_sp = [], []
    for key, value_ge in dic_film_number_genre_ge.items():
        if key in dic_film_number_genre_sp:
            list_ge.append(value_ge)
            list_sp.append(dic_film_number_genre_sp[key])
    return list_ge, list_sp

In [56]:
# The film-number dictionaries are read right here so that this cell does not
# rely on a cell further down the notebook having been executed first.
with open("./dic_film_number/dic_film_number_drama_ge", "r") as fp:
    dic_film_number_drama_ge = json.load(fp)
with open("./dic_film_number/dic_film_number_drama_sp", "r") as fp:
    dic_film_number_drama_sp = json.load(fp)
list_drama_ge, list_drama_sp = get_lists_same_film_names_genre(dic_film_number_drama_ge, dic_film_number_drama_sp)

In [127]:
# The film-number dictionaries are read right here so that this cell does not
# rely on a cell further down the notebook having been executed first.
with open("./dic_film_number/dic_film_number_comedy_ge", "r") as fp:
    dic_film_number_comedy_ge = json.load(fp)
with open("./dic_film_number/dic_film_number_comedy_sp", "r") as fp:
    dic_film_number_comedy_sp = json.load(fp)
list_comedy_ge, list_comedy_sp = get_lists_same_film_names_genre(dic_film_number_comedy_ge, dic_film_number_comedy_sp)

In [47]:
# The film-number dictionaries are read right here so that this cell does not
# rely on a cell further down the notebook having been executed first.
with open("./dic_film_number/dic_film_number_action_ge", "r") as fp:
    dic_film_number_action_ge = json.load(fp)
with open("./dic_film_number/dic_film_number_action_sp", "r") as fp:
    dic_film_number_action_sp = json.load(fp)
list_action_ge, list_action_sp = get_lists_same_film_names_genre(dic_film_number_action_ge, dic_film_number_action_sp)

In [10]:
def get_df_same_film_names_genre(list_genre, dic_genre_critics, dic_star_evaluation_user_genre, dic_genre_number_comments):
    """Build a per-film table restricted to the films named in ``list_genre``.

    Parameters
    ----------
    list_genre : list
        Film names to keep; they are looked up as keys in the three
        dictionaries. Repeated names are kept once, at their first position.
    dic_genre_critics : dict
        Film name -> value for the 'stars_classifier' column.
    dic_star_evaluation_user_genre : dict
        Film name -> value for the 'stars_user' column.
    dic_genre_number_comments : dict
        Film name -> value for the 'number_comments' column.

    Returns
    -------
    pandas.DataFrame
        Columns 'film_name', 'stars_classifier', 'stars_user' and
        'number_comments', one row per film in ``list_genre`` order. A film
        that is absent from a dictionary gets a missing value in that column.
    """
    # dict.fromkeys drops repeated names while keeping first-seen order.
    film_names = list(dict.fromkeys(list_genre))
    # Each value is looked up by film name so that a film missing from one
    # dictionary cannot shift the remaining values into the wrong column.
    rows = [
        (
            film_name,
            dic_genre_critics.get(film_name),
            dic_star_evaluation_user_genre.get(film_name),
            dic_genre_number_comments.get(film_name),
        )
        for film_name in film_names
    ]
    return pd.DataFrame(rows, columns=['film_name', 'stars_classifier', 'stars_user', 'number_comments'])

In [32]:
# Drama / 'ge' table that is later fed to the classifier and comment-count helpers.
df_drama_ge = pd.read_csv(filepath_or_buffer='./df_evaluations_genre/drama_ge')

In [33]:
# Comedy / 'ge' table that is later fed to the classifier and comment-count helpers.
df_comedy_ge = pd.read_csv(filepath_or_buffer='./df_evaluations_genre/comedy_ge')

In [34]:
# Action / 'ge' table that is later fed to the classifier and comment-count helpers.
df_action_ge = pd.read_csv(filepath_or_buffer='./df_evaluations_genre/action_ge')

In [36]:
# Drama / 'sp' table that is later fed to the classifier and comment-count helpers.
df_drama_sp = pd.read_csv(filepath_or_buffer='./df_evaluations_genre/drama_sp')

In [37]:
# Comedy / 'sp' table that is later fed to the classifier and comment-count helpers.
df_comedy_sp = pd.read_csv(filepath_or_buffer='./df_evaluations_genre/comedy_sp')

In [38]:
# Action / 'sp' table that is later fed to the classifier and comment-count helpers.
df_action_sp = pd.read_csv(filepath_or_buffer='./df_evaluations_genre/action_sp')

In [114]:
# Score every column of df_drama_ge with the classifier and average the stars per film.
dic_drama_ge_critics = get_dic_classifier_stars_genre(data=df_drama_ge)

In [116]:
# Persist the classifier averages as JSON; the context manager closes the
# file even if json.dump raises.
with open("./df_evaluations_genre/dic_drama_ge_critics", "w") as a_file:
    json.dump(dic_drama_ge_critics, a_file)

In [17]:
 with open("./df_evaluations_genre/dic_drama_ge_critics", "r") as fp:
        dic_drama_ge_critics = json.load(fp)

In [37]:
# Score every column of df_comedy_ge with the classifier and average the stars per film.
dic_comedy_ge_critics = get_dic_classifier_stars_genre(data=df_comedy_ge)

In [None]:
# Persist the classifier averages as JSON; the context manager closes the
# file even if json.dump raises.
with open("./df_evaluations_genre/dic_comedy_ge_critics", "w") as a_file:
    json.dump(dic_comedy_ge_critics, a_file)

In [18]:
# Reload the stored comedy / 'ge' classifier averages from their JSON file.
with open("./df_evaluations_genre/dic_comedy_ge_critics", "r") as fp:
    dic_comedy_ge_critics = json.loads(fp.read())

In [25]:
# Score every column of df_action_ge with the classifier and average the stars per film.
dic_action_ge_critics = get_dic_classifier_stars_genre(data=df_action_ge)

In [26]:
# Persist the classifier averages as JSON; the context manager closes the
# file even if json.dump raises.
with open("./df_evaluations_genre/dic_action_ge_critics", "w") as a_file:
    json.dump(dic_action_ge_critics, a_file)

In [19]:
# Reload the stored action / 'ge' classifier averages from their JSON file.
with open("./df_evaluations_genre/dic_action_ge_critics", "r") as fp:
    dic_action_ge_critics = json.loads(fp.read())

In [48]:
# Score every column of df_drama_sp with the classifier and average the stars per film.
dic_drama_sp_critics = get_dic_classifier_stars_genre(data=df_drama_sp)

In [50]:
# Persist the classifier averages as JSON; the context manager closes the
# file even if json.dump raises.
with open("./df_evaluations_genre/dic_drama_sp_critics", "w") as a_file:
    json.dump(dic_drama_sp_critics, a_file)

In [62]:
# Reload the stored drama / 'sp' classifier averages from their JSON file.
with open("./df_evaluations_genre/dic_drama_sp_critics", "r") as fp:
    dic_drama_sp_critics = json.loads(fp.read())

In [25]:
# Score every column of df_comedy_sp with the classifier and average the stars per film.
dic_comedy_sp_critics = get_dic_classifier_stars_genre(data=df_comedy_sp)

In [26]:
# Persist the classifier averages as JSON; the context manager closes the
# file even if json.dump raises.
with open("./df_evaluations_genre/dic_comedy_sp_critics", "w") as a_file:
    json.dump(dic_comedy_sp_critics, a_file)

In [51]:
# Reload the stored comedy / 'sp' classifier averages into the comedy
# variable; binding them to the drama name would overwrite the drama results
# and leave dic_comedy_sp_critics undefined after a reload-only run.
with open("./df_evaluations_genre/dic_comedy_sp_critics", "r") as fp:
    dic_comedy_sp_critics = json.load(fp)

In [32]:
# Score every column of df_action_sp with the classifier and average the stars per film.
dic_action_sp_critics = get_dic_classifier_stars_genre(data=df_action_sp)

In [10]:
# Persist the classifier averages as JSON; the context manager closes the
# file even if json.dump raises.
with open("./df_evaluations_genre/dic_action_sp_critics", "w") as a_file:
    json.dump(dic_action_sp_critics, a_file)

In [20]:
# Reload the stored action / 'sp' classifier averages from their JSON file.
with open("./df_evaluations_genre/dic_action_sp_critics", "r") as fp:
    dic_action_sp_critics = json.loads(fp.read())

In [29]:
# Number of comments per film column of df_drama_ge.
dic_drama_number_comments_ge = get_number_comments(df=df_drama_ge)

In [30]:
# Number of comments per film column of df_comedy_ge.
dic_comedy_number_comments_ge = get_number_comments(df=df_comedy_ge)

In [39]:
# Number of comments per film column of df_action_ge.
dic_action_number_comments_ge = get_number_comments(df=df_action_ge)

In [40]:
# Number of comments per film column of df_drama_sp.
dic_drama_number_comments_sp = get_number_comments(df=df_drama_sp)

In [41]:
# Number of comments per film column of df_comedy_sp.
dic_comedy_number_comments_sp = get_number_comments(df=df_comedy_sp)

In [42]:
# Number of comments per film column of df_action_sp.
dic_action_number_comments_sp = get_number_comments(df=df_action_sp)

In [43]:
# The user-star table is read right here: the cell that loads it sits further
# down the notebook, so a top-to-bottom run would otherwise raise NameError.
df_drama_user_stars_ge = pd.read_csv('./df_evaluations_genre/drama_stars_ge')
dic_star_evaluation_user_drama_ge = get_star_evaluation_user_dic(df_drama_user_stars_ge, dic_drama_number_comments_ge)

In [44]:
# The user-star table is read right here: the cell that loads it sits further
# down the notebook, so a top-to-bottom run would otherwise raise NameError.
df_drama_user_stars_sp = pd.read_csv('./df_evaluations_genre/drama_stars_sp')
dic_star_evaluation_user_drama_sp = get_star_evaluation_user_dic(df_drama_user_stars_sp, dic_drama_number_comments_sp)

In [45]:
# The user-star table is read right here: the cell that loads it sits further
# down the notebook, so a top-to-bottom run would otherwise raise NameError.
df_comedy_user_stars_ge = pd.read_csv('./df_evaluations_genre/comedy_stars_ge')
dic_star_evaluation_user_comedy_ge = get_star_evaluation_user_dic(df_comedy_user_stars_ge, dic_comedy_number_comments_ge)

In [46]:
# The user-star table is read right here: the cell that loads it sits further
# down the notebook, so a top-to-bottom run would otherwise raise NameError.
df_comedy_user_stars_sp = pd.read_csv('./df_evaluations_genre/comedy_stars_sp')
dic_star_evaluation_user_comedy_sp = get_star_evaluation_user_dic(df_comedy_user_stars_sp, dic_comedy_number_comments_sp)

In [47]:
# The user-star table is read right here: the cell that loads it sits further
# down the notebook, so a top-to-bottom run would otherwise raise NameError.
df_action_user_stars_ge = pd.read_csv('./df_evaluations_genre/action_stars_ge')
dic_star_evaluation_user_action_ge = get_star_evaluation_user_dic(df_action_user_stars_ge, dic_action_number_comments_ge)

In [48]:
# The user-star table is read right here: the cell that loads it sits further
# down the notebook, so a top-to-bottom run would otherwise raise NameError.
df_action_user_stars_sp = pd.read_csv('./df_evaluations_genre/action_stars_sp')
dic_star_evaluation_user_action_sp = get_star_evaluation_user_dic(df_action_user_stars_sp, dic_action_number_comments_sp)

In [22]:
# User star ratings for the drama / 'ge' films (input of get_star_evaluation_user_dic).
df_drama_user_stars_ge = pd.read_csv(filepath_or_buffer='./df_evaluations_genre/drama_stars_ge')

In [23]:
# User star ratings for the drama / 'sp' films (input of get_star_evaluation_user_dic).
df_drama_user_stars_sp = pd.read_csv(filepath_or_buffer='./df_evaluations_genre/drama_stars_sp')

In [24]:
# User star ratings for the comedy / 'ge' films (input of get_star_evaluation_user_dic).
df_comedy_user_stars_ge = pd.read_csv(filepath_or_buffer='./df_evaluations_genre/comedy_stars_ge')

In [25]:
# User star ratings for the comedy / 'sp' films (input of get_star_evaluation_user_dic).
df_comedy_user_stars_sp = pd.read_csv(filepath_or_buffer='./df_evaluations_genre/comedy_stars_sp')

In [26]:
# User star ratings for the action / 'ge' films (input of get_star_evaluation_user_dic).
df_action_user_stars_ge = pd.read_csv(filepath_or_buffer='./df_evaluations_genre/action_stars_ge')

In [27]:
# User star ratings for the action / 'sp' films (input of get_star_evaluation_user_dic).
df_action_user_stars_sp = pd.read_csv(filepath_or_buffer='./df_evaluations_genre/action_stars_sp')

In [49]:
# One table per film: classifier stars, user stars and comment count (drama / 'ge').
df_drama_ge_classifier_user_number = create_df_classifier_user_number(
    dic_genre_critics=dic_drama_ge_critics,
    dic_star_evaluation_user_genre=dic_star_evaluation_user_drama_ge,
    dic_genre_number_comments=dic_drama_number_comments_ge,
)

In [24]:
# Write the drama / 'ge' table to CSV without the index column.
f = 'df_drama_ge_classifier_user_number'
df_drama_ge_classifier_user_number.to_csv(
    path_or_buf=f'./df_evaluations_genre/{f}',
    index=False,
)

In [52]:
# One table per film: classifier stars, user stars and comment count (drama / 'sp').
df_drama_sp_classifier_user_number = create_df_classifier_user_number(
    dic_genre_critics=dic_drama_sp_critics,
    dic_star_evaluation_user_genre=dic_star_evaluation_user_drama_sp,
    dic_genre_number_comments=dic_drama_number_comments_sp,
)

In [55]:
# Write the drama / 'sp' table to CSV without the index column.
f = 'df_drama_sp_classifier_user_number'
df_drama_sp_classifier_user_number.to_csv(
    path_or_buf=f'./df_evaluations_genre/{f}',
    index=False,
)

In [84]:
# One table per film: classifier stars, user stars and comment count (comedy / 'ge').
df_comedy_ge_classifier_user_number = create_df_classifier_user_number(
    dic_genre_critics=dic_comedy_ge_critics,
    dic_star_evaluation_user_genre=dic_star_evaluation_user_comedy_ge,
    dic_genre_number_comments=dic_comedy_number_comments_ge,
)

In [124]:
# Write the comedy / 'ge' table to CSV without the index column.
f = 'df_comedy_ge_classifier_user_number'
df_comedy_ge_classifier_user_number.to_csv(
    path_or_buf=f'./df_evaluations_genre/{f}',
    index=False,
)

In [88]:
# One table per film: classifier stars, user stars and comment count (comedy / 'sp').
df_comedy_sp_classifier_user_number = create_df_classifier_user_number(
    dic_genre_critics=dic_comedy_sp_critics,
    dic_star_evaluation_user_genre=dic_star_evaluation_user_comedy_sp,
    dic_genre_number_comments=dic_comedy_number_comments_sp,
)

In [123]:
# Write the comedy / 'sp' table to CSV without the index column.
f = 'df_comedy_sp_classifier_user_number'
df_comedy_sp_classifier_user_number.to_csv(
    path_or_buf=f'./df_evaluations_genre/{f}',
    index=False,
)

In [68]:
# One table per film: classifier stars, user stars and comment count (action / 'ge').
df_action_ge_classifier_user_number = create_df_classifier_user_number(
    dic_genre_critics=dic_action_ge_critics,
    dic_star_evaluation_user_genre=dic_star_evaluation_user_action_ge,
    dic_genre_number_comments=dic_action_number_comments_ge,
)

In [70]:
# Write the action / 'ge' table to CSV without the index column.
f = 'df_action_ge_classifier_user_number'
df_action_ge_classifier_user_number.to_csv(
    path_or_buf=f'./df_evaluations_genre/{f}',
    index=False,
)

In [66]:
# One table per film: classifier stars, user stars and comment count (action / 'sp').
df_action_sp_classifier_user_number = create_df_classifier_user_number(
    dic_genre_critics=dic_action_sp_critics,
    dic_star_evaluation_user_genre=dic_star_evaluation_user_action_sp,
    dic_genre_number_comments=dic_action_number_comments_sp,
)

In [67]:
# Write the action / 'sp' table to CSV without the index column.
f = 'df_action_sp_classifier_user_number'
df_action_sp_classifier_user_number.to_csv(
    path_or_buf=f'./df_evaluations_genre/{f}',
    index=False,
)

In [53]:
# Film-number dictionary for drama / 'ge', used to match films across the two sets.
with open("./dic_film_number/dic_film_number_drama_ge", "r") as fp:
    dic_film_number_drama_ge = json.loads(fp.read())

In [54]:
# Film-number dictionary for drama / 'sp', used to match films across the two sets.
with open("./dic_film_number/dic_film_number_drama_sp", "r") as fp:
    dic_film_number_drama_sp = json.loads(fp.read())

In [34]:
# Film-number dictionary for comedy / 'ge', used to match films across the two sets.
with open("./dic_film_number/dic_film_number_comedy_ge", "r") as fp:
    dic_film_number_comedy_ge = json.loads(fp.read())

In [35]:
# Film-number dictionary for comedy / 'sp', used to match films across the two sets.
with open("./dic_film_number/dic_film_number_comedy_sp", "r") as fp:
    dic_film_number_comedy_sp = json.loads(fp.read())

In [36]:
# Film-number dictionary for action / 'ge', used to match films across the two sets.
with open("./dic_film_number/dic_film_number_action_ge", "r") as fp:
    dic_film_number_action_ge = json.loads(fp.read())

In [39]:
# Film-number dictionary for action / 'sp', used to match films across the two sets.
with open("./dic_film_number/dic_film_number_action_sp", "r") as fp:
    dic_film_number_action_sp = json.loads(fp.read())

In [65]:
# Drama films present in both sets, placed side by side: 'ge' columns first, then 'sp'.
# NOTE(review): both halves carry the same column names, so the CSV header repeats them.
df1 = get_df_same_film_names_genre(
    list_genre=list_drama_ge,
    dic_genre_critics=dic_drama_ge_critics,
    dic_star_evaluation_user_genre=dic_star_evaluation_user_drama_ge,
    dic_genre_number_comments=dic_drama_number_comments_ge,
)
df2 = get_df_same_film_names_genre(
    list_genre=list_drama_sp,
    dic_genre_critics=dic_drama_sp_critics,
    dic_star_evaluation_user_genre=dic_star_evaluation_user_drama_sp,
    dic_genre_number_comments=dic_drama_number_comments_sp,
)
df_drama_sp_ge = pd.concat(objs=[df1, df2], axis=1)
f = 'df_drama_sp_ge'
df_drama_sp_ge.to_csv(path_or_buf=f'./df_evaluations_genre/{f}', index=False)

In [135]:
# Comedy films present in both sets, placed side by side: 'ge' columns first, then 'sp'.
# NOTE(review): both halves carry the same column names, so the CSV header repeats them.
df1 = get_df_same_film_names_genre(
    list_genre=list_comedy_ge,
    dic_genre_critics=dic_comedy_ge_critics,
    dic_star_evaluation_user_genre=dic_star_evaluation_user_comedy_ge,
    dic_genre_number_comments=dic_comedy_number_comments_ge,
)
df2 = get_df_same_film_names_genre(
    list_genre=list_comedy_sp,
    dic_genre_critics=dic_comedy_sp_critics,
    dic_star_evaluation_user_genre=dic_star_evaluation_user_comedy_sp,
    dic_genre_number_comments=dic_comedy_number_comments_sp,
)
df_comedy_sp_ge = pd.concat(objs=[df1, df2], axis=1)
f = 'df_comedy_sp_ge'
df_comedy_sp_ge.to_csv(path_or_buf=f'./df_evaluations_genre/{f}', index=False)

In [146]:
# Action films present in both sets, placed side by side: 'ge' columns first, then 'sp'.
# NOTE(review): both halves carry the same column names, so the CSV header repeats them.
df1 = get_df_same_film_names_genre(
    list_genre=list_action_ge,
    dic_genre_critics=dic_action_ge_critics,
    dic_star_evaluation_user_genre=dic_star_evaluation_user_action_ge,
    dic_genre_number_comments=dic_action_number_comments_ge,
)
df2 = get_df_same_film_names_genre(
    list_genre=list_action_sp,
    dic_genre_critics=dic_action_sp_critics,
    dic_star_evaluation_user_genre=dic_star_evaluation_user_action_sp,
    dic_genre_number_comments=dic_action_number_comments_sp,
)
df_action_sp_ge = pd.concat(objs=[df1, df2], axis=1)
f = 'df_action_sp_ge'
df_action_sp_ge.to_csv(path_or_buf=f'./df_evaluations_genre/{f}', index=False)