In [None]:

import json
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt



In [None]:

# Load the scraped movie records. JSON text is UTF-8 by specification, so the
# encoding is stated explicitly instead of relying on the platform default.
with open(
    "../datasets/rotten_tomatoes_movies_data_with_score_panels.json",
    "r",
    encoding="utf-8",
) as fp:
    data = json.load(fp)

# Number of top-level entries in the loaded JSON object.
print(len(data))


In [None]:
def append_row(df, row):
    """Return a new DataFrame made of ``df`` followed by ``row`` as its last row.

    Parameters
    ----------
    df : pd.DataFrame
        Frame to extend; it is not modified.
    row : pd.Series
        Row to add; its index supplies the column labels of the new row.

    Returns
    -------
    pd.DataFrame
        The stacked frame with a fresh 0..n-1 index.
    """
    row_frame = pd.DataFrame([row], columns=row.index)
    stacked = pd.concat([df, row_frame])
    return stacked.reset_index(drop=True)



## Movie_Data_2k:

- This is the smallest version of the collected information, yet the most comprehensive one with respect to the columns it keeps for features and network data.

In [None]:


issues = []
errors = []

# Column order of the resulting frame. 'All Genres' comes last because it was
# not among the originally declared columns and therefore ended up after them;
# later cells index the NumPy view of this frame by position, so the order of
# the leading columns must not change.
columns_2k = [
    'Title', 'Synopsis', 'Original Language', 'Runtime',
    'Director', 'Producer', 'Writer', 'Top Cast',
    'Distributor', 'Production Co',
    'Box Office (Gross USA)',
    'Tomato Meter', 'Audience Score', 'No. Reviews', 'Genre',
    'Release Date (Theaters)', 'Release Date (Streaming)', 'Link',
    'All Genres',
]

# Rows are collected as plain dicts and turned into a DataFrame once at the
# end; concatenating a one-row frame per movie is quadratic in the row count.
records_2k = []

for k, v in data.items():
    try:
        # Runtime is expected as "<H>h <M>m". Anything else (e.g. a movie
        # shorter than an hour, "45m") raises here and the movie is skipped.
        runtime_parts = v['Info']['Runtime'].split()
        run_time = int(runtime_parts[0].split("h")[0]) * 60 + int(runtime_parts[1].split("m")[0])

        # Text after the "$" sign, e.g. "12.3M" or "450.1K".
        tmp_box_office = v['Info']['Box Office (Gross USA)'].strip().split("$")[1]

        if "M" in tmp_box_office:
            box_office = float(tmp_box_office.split("M")[0]) * 1000000
        elif "K" in tmp_box_office:
            box_office = float(tmp_box_office.split("K")[0]) * 1000
        else:
            # NOTE(review): amounts without an M/K suffix are recorded as 0 —
            # confirm this is intended.
            box_office = 0.

        records_2k.append({
            'Title': v['Title'].strip(),
            'Synopsis': v['Synopsis'],
            'Original Language': v['Info']['Original Language'].strip(),
            'Runtime': run_time,
            'Director': v['Info']['Director'].strip(),
            'Producer': v['Info']['Producer'].strip(),
            'Writer': v['Info']['Writer'].strip(),
            'Top Cast': v["Top Cast"],
            'Distributor': v['Info']['Distributor'].strip(),
            'Production Co': v['Info']['Production Co'].strip(),
            'Box Office (Gross USA)': box_office,
            # Score panel entries are read by position: [2] tomato meter,
            # [4] review count text, [5] audience score.
            'Tomato Meter': float(v["Score Panel"][2].strip("%"))/100,
            'Audience Score': float(v["Score Panel"][5].strip("%"))/100,
            'No. Reviews': int(v["Score Panel"][4].split(" ")[0]),
            'All Genres': v['Info']['Genre'].strip(),
            # Only the first listed genre is kept as the primary genre.
            'Genre': v['Info']['Genre'].strip().split(", ")[0],
            'Release Date (Theaters)': v['Info']['Release Date (Theaters)'].strip(),
            'Release Date (Streaming)': v['Info']['Release Date (Streaming)'].strip(),
            'Link': k.strip()
        })
    except Exception as error:
        # Best effort: any movie with a missing or unparsable field is
        # reported, remembered in issues/errors, and left out of the frame.
        print(
            f"In {k} \n"
            f"{error} \n"
            f"occurred !"
        )
        issues.append(k)
        errors.append(error)

movies_data_2k = pd.DataFrame(records_2k, columns=columns_2k)
        
        
                

In [None]:



# Abbreviate every language name to its first three characters.
# Assigning the sliced column back avoids calling replace(..., inplace=True) on
# a column selection, which does not update the frame under pandas
# Copy-on-Write, and gives the same result as replacing each unique value by
# its own prefix.
movies_data_2k['Original Language'] = movies_data_2k['Original Language'].str[:3]
    

In [None]:

# (rows, columns) of the assembled 2k frame.
movies_data_2k.shape 



In [None]:

# Turn the collected exceptions into their messages and discard the
# "invalid literal" ones (run-time parse errors for movies shorter than an hour).
errors = [message for message in map(str, errors) if "invalid literal" not in message]

# Histogram of the remaining error messages.
plt.figure(figsize=(17, 5))
plt.hist(errors)
plt.show()




In [None]:

# Display the assembled 2k frame.
movies_data_2k

In [None]:


# (rows, columns) of the 2k frame right before it is exported.
movies_data_2k.shape



In [None]:


# Write the 2k frame to CSV without the row index.
movies_data_2k.to_csv("../datasets/movies_data_2k.csv", index=False)



## Movie_Data_4k:

- The "Box Office", "Distributor", "Production Co", "Release Date (Theaters)" and "Release Date (Streaming)" columns are dropped from the set of features to increase the number of data points.

- The "Producer" column is also dropped from the corresponding network columns to increase the number of data points.

In [None]:




issues_4k = []
errors_4k = []

# Column order of the resulting frame. 'All Genres' comes last because it was
# not among the originally declared columns and therefore ended up after them.
# 'Writer' stays excluded, as in the original (commented-out) declaration.
columns_4k = [
    'Title', 'Synopsis', 'Original Language', 'Runtime',
    'Director',
    'Top Cast',
    'Tomato Meter', 'Audience Score', 'No. Reviews', 'Genre',
    'Link',
    'All Genres',
]

# Rows are collected as plain dicts and turned into a DataFrame once at the
# end; concatenating a one-row frame per movie is quadratic in the row count.
records_4k = []

for k, v in data.items():
    try:
        # Runtime is expected as "<H>h <M>m". Anything else (e.g. a movie
        # shorter than an hour, "45m") raises here and the movie is skipped.
        runtime_parts = v['Info']['Runtime'].split()
        run_time = int(runtime_parts[0].split("h")[0]) * 60 + int(runtime_parts[1].split("m")[0])

        records_4k.append({
            'Title': v['Title'].strip(),
            'Synopsis': v['Synopsis'].strip(),
            'Original Language': v['Info']['Original Language'].strip(),
            'Runtime': run_time,
            'Director': v['Info']['Director'].strip(),
            'Top Cast': v["Top Cast"],
            # Score panel entries are read by position: [2] tomato meter,
            # [4] review count text, [5] audience score.
            'Tomato Meter': float(v["Score Panel"][2].strip("%"))/100,
            'Audience Score': float(v["Score Panel"][5].strip("%"))/100,
            'No. Reviews': int(v["Score Panel"][4].split(" ")[0]),
            'All Genres': v['Info']['Genre'].strip(),
            # Only the first listed genre is kept as the primary genre.
            'Genre': v['Info']['Genre'].strip().split(", ")[0],
            'Link': k.strip()
        })
    except Exception as error:
        # Best effort: any movie with a missing or unparsable field is
        # reported, remembered in issues_4k/errors_4k, and left out of the frame.
        print(
            f"In {k} \n"
            f"{error} \n"
            f"occurred !"
        )
        issues_4k.append(k)
        errors_4k.append(error)

movies_data_4k = pd.DataFrame(records_4k, columns=columns_4k)
        
        
                

In [None]:

# Turn the collected exceptions into their messages and discard the
# "invalid literal" ones (run-time parse errors for movies shorter than an hour).
errors_4k = [message for message in map(str, errors_4k) if "invalid literal" not in message]

# Histogram of the remaining error messages.
plt.figure(figsize=(17, 5))
plt.hist(errors_4k)
plt.show()




In [None]:

# Distinct error messages left after the filtering above.
set(errors_4k)


In [None]:

# (rows, columns) of the assembled 4k frame.
movies_data_4k.shape



In [None]:



# Abbreviate every language name to its first three characters.
# Assigning the sliced column back avoids calling replace(..., inplace=True) on
# a column selection, which does not update the frame under pandas
# Copy-on-Write, and gives the same result as replacing each unique value by
# its own prefix.
movies_data_4k['Original Language'] = movies_data_4k['Original Language'].str[:3]
    

In [None]:
# Display the assembled 4k frame.
movies_data_4k


In [None]:



# Write the 4k frame to CSV without the row index.
movies_data_4k.to_csv("../datasets/movies_data_4k.csv", index=False)



# EDA

In [None]:

# Reload both exported CSV files; the EDA below works on these frames
# (suffix _s for the 2k file, _m for the 4k file).
movies_data_s = pd.read_csv("../datasets/movies_data_2k.csv", )
movies_data_m = pd.read_csv("../datasets/movies_data_4k.csv", )
print(movies_data_s.shape, movies_data_m.shape)




## Data with around 2,000 movies



In [None]:


# Column names of the 2k dataset as declared when the frame was built.
features_s = [
    'Title', 'Synopsis', 'Original Language', 'Runtime', 
    'Director', 'Producer', 'Writer', 'Top Cast',
    'Distributor', 'Production Co', 
    'Box Office (Gross USA)', 
    'Tomato Meter', 'Audience Score', 'No. Reviews', 'Genre', 
    'Release Date (Theaters)', 'Release Date (Streaming)', 'Link'
]

# Numeric columns; these feed the correlation matrix and the pair plot below.
q_features_s = ['Runtime', 'Box Office (Gross USA)', 'Tomato Meter', 'Audience Score', 'No. Reviews', ]

# Presumably the categorical columns (not used in the visible cells).
c_features_s = ['Original Language', 'Genre',]

# Presumably the people columns from which the movie network is built.
network_columns_s = [ 'Director', 'Producer', 'Writer', 'Top Cast']



In [None]:



# Pairwise correlation (DataFrame.corr with default settings) between the
# numeric 2k columns, drawn as a heat map with the column names as tick labels.
corr_s = movies_data_s[q_features_s].corr()
plt.figure(figsize=(10, 10))
plt.imshow(corr_s)
plt.xticks(range(len(corr_s.columns)), corr_s.columns)
plt.yticks(range(len(corr_s.columns)), corr_s.columns)
plt.colorbar()
plt.title("with ~2K movies")
plt.show()



In [None]:

# Distinct primary genres in the 2k dataset.
movies_data_s["Genre"].unique()



In [None]:

# Histogram of the primary genre column (2k dataset).
plt.figure(figsize=(20, 10))
movies_data_s["Genre"].hist()
plt.show()


In [None]:

# Histogram of the abbreviated original-language column (2k dataset).
plt.figure(figsize=(20, 10))
movies_data_s["Original Language"].hist()
plt.show()




In [None]:

# Pairwise scatter plots of the numeric 2k columns.
# sns.pairplot is a figure-level function that creates its own figure, so the
# plt.figure(...) call that used to precede it only produced an extra blank
# figure and has been removed.
sns.pairplot(movies_data_s[q_features_s])
plt.show()


In [None]:


# Per distinct (Director, Producer, Writer) combination, count the non-null
# entries of every other column.
movies_data_s.groupby(["Director", "Producer", "Writer"]).count()



In [None]:


# (rows, columns) of the reloaded 2k dataset.
movies_data_s.shape



## Data with around 4,000 movies


In [None]:


# Column names of the 4k dataset as declared when the frame was built.
# The box-office, distributor, production-company, release-date, producer and
# writer columns are not part of that dataset, so they are not listed here.
features_m = [
    'Title', 'Synopsis', 'Original Language', 'Runtime',
    'Director', 'Top Cast',
    'Tomato Meter', 'Audience Score', 'No. Reviews', 'Genre',
    'Link'
]

# Numeric columns; these feed the correlation matrix and the pair plot below.
q_features_m = ['Runtime', 'Tomato Meter', 'Audience Score', 'No. Reviews', ]

# Presumably the categorical columns (not used in the visible cells).
c_features_m = ['Original Language', 'Genre',]

# People columns available in the 4k dataset ('Producer' and 'Writer' were dropped).
network_columns_m = ['Director', 'Top Cast']



In [None]:



# Pairwise correlation (DataFrame.corr with default settings) between the
# numeric 4k columns, drawn as a heat map with the column names as tick labels.
corr_m = movies_data_m[q_features_m].corr()
plt.figure(figsize=(10, 10))
plt.imshow(corr_m)
plt.xticks(range(len(corr_m.columns)), corr_m.columns)
plt.yticks(range(len(corr_m.columns)), corr_m.columns)
plt.colorbar()
plt.title("with ~4K movies")
plt.show()




In [None]:


# Distinct primary genres in the 4k dataset.
movies_data_m["Genre"].unique()


In [None]:

# Histogram of the primary genre column (4k dataset).
plt.figure(figsize=(25, 10))
movies_data_m["Genre"].hist()
plt.show()


In [None]:


# Histogram of the abbreviated original-language column (4k dataset).
plt.figure(figsize=(20, 10))
movies_data_m["Original Language"].hist()
plt.show()



In [None]:



# Pairwise scatter plots of the numeric 4k columns.
# sns.pairplot is a figure-level function that creates its own figure, so the
# plt.figure(...) call that used to precede it only produced an extra blank
# figure and has been removed.
sns.pairplot(movies_data_m[q_features_m])
plt.show()



In [None]:

# Per distinct Director value, count the non-null entries of every other column.
movies_data_m.groupby(["Director", ]).count()


In [None]:


# First two rows of the 2k dataset.
movies_data_s.iloc[:2]


In [None]:

# Display the declared 2k column names.
features_s


In [None]:


# NumPy array holding the values of the 2k frame; columns are addressed by
# position from here on.
movies_data_s_np = movies_data_s.values
# Sanity check: the array and the frame have the same shape.
movies_data_s_np.shape == movies_data_s.shape



In [None]:
# Print each value of the first row next to its column position, to look up
# which index holds which field.
for ii, first_row_value in enumerate(movies_data_s_np[0, :]):
    print(ii, first_row_value)

In [None]:
def get_list_of_casts(x):
    """Parse the text form of a cast list into a set of names.

    ``x`` is the string representation of a list, e.g. ``"['A B', 'C D']"``.
    The previous implementation split on ", " and only removed the opening
    bracket: its branch for the last element could never run (``k == len(x)``
    is never true inside ``range(len(x))``), so the last name kept its closing
    "]", and every name kept its quote characters.

    Parameters
    ----------
    x : str
        Text of a list of cast names.

    Returns
    -------
    set of str
        The cast names without brackets or quotes; empty for an empty list.
    """
    import ast

    text = x.strip()

    # Preferred path: the text is a valid Python list literal.
    try:
        parsed = ast.literal_eval(text)
    except (ValueError, SyntaxError):
        parsed = None
    if isinstance(parsed, (list, tuple, set)):
        return {str(name).strip() for name in parsed}

    # Fallback for text that is not a valid literal (e.g. unquoted names):
    # drop the surrounding brackets, split on ", " and strip stray quotes.
    if text.startswith("["):
        text = text[1:]
    if text.endswith("]"):
        text = text[:-1]
    return {
        name.strip().strip("'\"")
        for name in text.split(", ")
        if name.strip()
    }

In [None]:
def get_list_of_others(x):
    """Split a ", "-separated string of names into a set of names.

    Parameters
    ----------
    x : str
        Names joined by ", ".

    Returns
    -------
    set of str
        The distinct pieces of ``x``; an empty string yields ``{""}``.
    """
    names = x.split(", ")
    return set(names)


In [None]:
def get_edge_weight(a, b):
    """Return the symmetric overlap between two sets, in the range 0..1.

    The weight is the mean of the share of ``a`` that is also in ``b`` and the
    share of ``b`` that is also in ``a``.

    Parameters
    ----------
    a, b : set
        The two sets to compare.

    Returns
    -------
    float
        The overlap weight; ``0.0`` when either set is empty (this case used to
        raise ZeroDivisionError).
    """
    if not a or not b:
        return 0.0

    # The intersection is symmetric, so it is computed once.
    n_shared = len(a.intersection(b))
    return (n_shared / len(a) + n_shared / len(b)) / 2
    

In [None]:
# def append_row(df, row):
#     return pd.concat([
#         df, 
#         pd.DataFrame([row], columns=row.index)]
#     ).reset_index(drop=True)

In [None]:

# Empty frame with one column per movie title except the first one (values[1:]).
data_movies_data_s_a = pd.DataFrame(columns=movies_data_s.Title.values[1:])
data_movies_data_s_a



In [None]:

# Empty frame with one column per movie title.
data_movies_s_a = pd.DataFrame(columns=movies_data_s.Title, )
data_movies_s_a

In [None]:

# Kept from the original cell: an empty frame whose columns are all titles
# except the first one. A later cell displays it.
data_movies_data_s_a = pd.DataFrame(columns=movies_data_s.Title.values[1:])

# The original loop stopped once i reached 3, i.e. it only covered the first
# four movies; that preview size is kept and made explicit.
n_preview_rows = 4

titles = movies_data_s["Title"].tolist()
n_movies = len(titles)

# Parse every movie's people sets once instead of once per (i, j) pair.
# Columns are addressed by name rather than by position in the NumPy view.
network_sets = [
    [get_list_of_others(value) for value in movies_data_s["Director"]],   # director(s)
    [get_list_of_others(value) for value in movies_data_s["Producer"]],   # producer(s)
    [get_list_of_others(value) for value in movies_data_s["Writer"]],     # writer(s)
    [get_list_of_casts(value) for value in movies_data_s["Top Cast"]],    # casts
]

weight_rows = []
for i in range(min(n_preview_rows, n_movies)):
    row_weights = []

    for j in range(n_movies):
        if i != j:
            # Edge weight = sum of the director, producer, writer and cast overlaps.
            weight = sum(
                get_edge_weight(a=people[i], b=people[j]) for people in network_sets
            )
        else:
            # No self-loop: a movie has weight 0 with itself.
            weight = 0
        row_weights.append(weight)

    # Weights from movie i to every movie, labelled by title; the last one
    # stays available for inspection in the cells below.
    edge_weights = pd.Series(row_weights, index=titles)
    weight_rows.append(row_weights)

# One row per processed movie, one column per movie title. Built in a single
# step from plain lists so that duplicate titles keep their own columns.
data_movies_s_a = pd.DataFrame(weight_rows, columns=titles)
        


In [None]:
# Display data_movies_data_s_a.
# NOTE(review): the loop cell above stores its rows in data_movies_s_a, not in
# this frame — confirm which of the two is meant to be shown here.
data_movies_data_s_a



In [None]:
# Display the edge weights left over from the last iteration of the loop cell.
edge_weights

In [None]:
# Row of the 2k dataset at position i; i is whatever value the loop cell left behind.
movies_data_s.iloc[i, :]

In [None]:


# Column 4 of row i, split on ", " into a set of names.
set(movies_data_s_np[i, 4].split(", "))



In [None]:
# set() applied directly to the value: for a string this yields its individual
# characters rather than names. The trailing comma wraps the result in a 1-tuple.
set(movies_data_s_np[i, 4]), 

In [None]:
# NOTE(review): no visible cell assigns `a` at top level (it only appears as a
# function parameter), so this cell appears to rely on leftover kernel state.
a

In [None]:
# NOTE(review): no visible cell assigns `b` at top level (it only appears as a
# function parameter), so this cell appears to rely on leftover kernel state.
b

In [None]:

# Symmetric overlap: mean of |a ∩ b| / |a| and |a ∩ b| / |b|.
# NOTE(review): `a` and `b` are not assigned in any visible cell.
weight = ((len(a.intersection(b)) / len(a)) + len(b.intersection(a)) / len(b)) / 2
weight


In [None]:

# Alternative weight: size of the intersection over size of the union (Jaccard index).
# NOTE(review): `a` and `b` are not assigned in any visible cell.
weight = len(a.intersection(b)) / len(a.union(b))
weight


In [None]:
# Back to the symmetric overlap: mean of |a ∩ b| / |a| and |a ∩ b| / |b|.
# NOTE(review): `a` and `b` are not assigned in any visible cell.
weight = ((len(a.intersection(b)) / len(a)) + len(b.intersection(a)) / len(b)) / 2
