In [22]:
import numpy as np
import csv
import json
import pandas as pd
import nltk
from collections import Counter

In [18]:
# Read the movie corpus from disk; json.load parses straight from the open
# file handle.
with open('data/movie_data.json', encoding='utf-8') as data_file:
    data = json.load(data_file)

In [31]:
# Report how many records the loaded corpus contains.
corpus_size = len(data)
print('Corpus size:', corpus_size)

Corpus size: 2000


In [20]:
# Flatten every record's 'genre' entries into one list, so a genre appears
# once for each record that carries it.
all_genres = [genre for movie in data for genre in movie['genre']]

In [21]:
# Tally how often each genre occurs, then rank the (genre, count) pairs from
# most to least frequent. Ties keep first-seen order.
genre_counter = Counter(all_genres)
genres_occurrence = list(genre_counter.items())
genres_sorted = genre_counter.most_common()
genres_sorted

[('Drama', 1174),
 ('Comedy', 559),
 ('Thriller', 552),
 ('Romance', 413),
 ('Crime', 385),
 ('Action', 315),
 ('Documentary', 260),
 ('Adventure', 257),
 ('Fantasy', 194),
 ('Mystery', 188),
 ('Sci-Fi', 164),
 ('Biography', 152),
 ('Family', 137),
 ('Music', 131),
 ('Horror', 120),
 ('Animation', 117),
 ('History', 96),
 ('War', 83),
 ('Sport', 72),
 ('Musical', 49),
 ('Western', 28),
 ('Film-Noir', 11),
 ('News', 3),
 ('Short', 1)]

In [40]:
# Token count and type-token ratio (TTR) for every raw (unprocessed) summary.
# TTR = number of distinct tokens / total number of tokens. Tokens are compared
# exactly as produced by the tokenizer, so the comparison is case-sensitive.
words_counts = []
ttr_all = []
for movie in data:
    words = nltk.word_tokenize(movie['summary'])
    num_words = len(words)
    words_counts.append(num_words)
    types = len(set(words))
    # A summary that yields no tokens has no defined TTR. Record NaN instead of
    # dividing by zero, so ttr_all stays index-aligned with data.
    ttr = types / num_words if num_words else float('nan')
    ttr_all.append(ttr)

# NaN is the only float that is not equal to itself, so this keeps just the
# summaries whose TTR is defined.
ttr_defined = [ttr for ttr in ttr_all if ttr == ttr]

print('Minimum words:', min(words_counts))
# int() truncates the mean towards zero; it does not round.
print('Average words per summary:', int(sum(words_counts)/len(words_counts)))
print('Maximum words:', max(words_counts))

print('Minimum TTR:', min(ttr_defined))
print('Average TTR per summary:', sum(ttr_defined)/len(ttr_defined))
print('Maximum TTR:', max(ttr_defined))

# Caveat: the summaries are not cleaned yet. Some contain the names of the
# actors in brackets, and some even name the movie house that wrote the
# summary, which affects the results above.

Minimum words: 6
Average words per summary: 64
Maximum words: 498
Minimum TTR: 0.5020080321285141
Average TTR per summary: 0.827550611169027
Maximum TTR: 1.0


In [38]:
# Sentence count of each summary, as split by NLTK's sentence tokenizer.
sentences_counts = [len(nltk.sent_tokenize(movie['summary'])) for movie in data]

total_sentences = sum(sentences_counts)
print('Minimum sentences:', min(sentences_counts))
# int() truncates the mean towards zero; it does not round.
print('Average sentences per summary:', int(total_sentences/len(sentences_counts)))
print('Maximum sentences:', max(sentences_counts))

Minimum sentences: 1
Average sentences per summary: 2
Maximum sentences: 23
