In [37]:
import pandas as pd
from collections import Counter
from nltk.corpus import stopwords
import string
import csv
import nltk
import json

In [None]:
#run once
import nltk
nltk.download('stopwords')
nltk.download('punkt')

**Estatísticas descritivas dos datasets**

Obrigatórias:
Word quantity

Vou tentar:
Word frequency
Vocabulary size
Sentence length

In [36]:
def print_stats(stats):
  for stat, value in stats.items():
    if stat == 'word_freq':
      continue
    else:
      print(f'{stat}: {value}')

def clean_strings(strings):
  if isinstance(strings, list):
    cleaned_utterance = []
    for utterance in strings:
      utterance = utterance.replace("'s", "")
      cleaned_utterance.append(utterance.translate(str.maketrans('', '', string.punctuation)))
    return cleaned_utterance

  if isinstance(strings, str):
    return strings.translate(str.maketrans('', '', string.punctuation))

def get_word_count(list_of_strings):
  punctuation = set(string.punctuation)
  word_stats = []
  for strings in list_of_strings:
    words = clean_strings(strings).split()
    words = [word for word in words if word.lower() not in punctuation]
    word_stats.append(len(words))
  return {
      'total_words': len(list_of_strings),
      'average_count': sum(word_stats)/len(word_stats),
      'max_count': max(word_stats),
      'min_count': min(word_stats)
  }

def get_word_frequency(words, language):
  stop_words = set(stopwords.words(language))
  filtered_words = [word for word in words if word.lower() not in stop_words]
  word_frequency = Counter(filtered_words)
  return dict(sorted(word_frequency.items(), key=lambda item: item[1], reverse=True))

def get_vocab_size(list_of_strings, words):
  total_vocab = len(set(words))
  word_stats = []
  for string in list_of_strings:
    words = ' '.join(clean_strings(string)).split()
    word_stats.append(len(set(words)))
  return {
      'total_vocab': total_vocab,
      'average_vocab': sum(word_stats)/len(word_stats),
      'max_vocab': max(word_stats),
      'min_vocab': min(word_stats)
  }

def get_sent_len(list_of_strings):
  combined_text = ' '.join(list_of_strings)
  sentences = nltk.sent_tokenize(combined_text)
  sentence_lengths = []
  for sentence in sentences:
      words = nltk.word_tokenize(sentence)
      words = [word.lower() for word in words]
      sentence_lengths.append(len(words))

  return{
      'avg_sentence_length' : sum(sentence_lengths) / len(sentence_lengths),
      'min_sentence_length' : min(sentence_lengths),
      'max_sentence_length' : max(sentence_lengths)
  }


def get_stats(list_of_strings, language):
  words_cleaned = ' '.join(clean_strings(list_of_strings)).split()

  return {
      'word_freq' : get_word_frequency(words_cleaned, language),
      'word_count': get_word_count(list_of_strings),
      'sent_len': get_sent_len(list_of_strings),
      'vocab_size': get_vocab_size(list_of_strings, words_cleaned)
  }


In [38]:
rt = pd.read_json('./rotten/rottentomatoes.json')

language = 'english'
consensus_stats = get_stats(rt._critic_consensus.tolist(), language)
print_stats(consensus_stats)

critics_list = rt._critics.tolist()
all_critics = []
for critics in critics_list:
  for critic in critics.values():
    if critic == ".": continue
    all_critics.append(critic)

critics_stats = get_stats(all_critics, language)
print_stats(critics_stats)

word_count: {'total_words': 3731, 'average_count': 21.3017957652104, 'max_count': 92, 'min_count': 2}
sent_len: {'avg_sentence_length': 22.99596875787352, 'min_sentence_length': 2, 'max_sentence_length': 63}
vocab_size: {'total_vocab': 12488, 'average_vocab': 24.042079871348164, 'max_vocab': 37, 'min_vocab': 9}
word_count: {'total_words': 372036, 'average_count': 20.474424518057393, 'max_count': 54, 'min_count': 1}
sent_len: {'avg_sentence_length': 20.50988070768046, 'min_sentence_length': 1, 'max_sentence_length': 143}
vocab_size: {'total_vocab': 144190, 'average_vocab': 22.27281768431012, 'max_vocab': 42, 'min_vocab': 2}


In [39]:
def save_json(file, data):
  with open(file, 'w') as f:
      json.dump(data, f)

save_json('rotten/rotten_stats/input_stats.json', critics_stats)
save_json('rotten/rotten_stats/label_stats.json', consensus_stats)

In [None]:
# language = 'portuguese'

# with open("./datasets/br_wac2wiki/output.csv", "r", encoding='UTF-8') as f:
#     reader = csv.reader(f, delimiter="\t")
#     list_of_strings = [row[0] for row in reader]
#     wac_output = get_stats(list_of_strings, language)
#     print_stats(wac_output)

In [None]:
import nltk
from nltk.corpus import stopwords
import string

# Download stopwords and punkt (run once)
nltk.download('stopwords')
nltk.download('punkt')

# Sample list of strings
list_of_strings = [
    "This is a sample sentence.",
    "Another sentence for demonstration purposes.",
    "A sample sentence similar to the first one."
]

# Combine all strings into a single text
combined_text = ' '.join(list_of_strings)

# Tokenize text into sentences
sentences = nltk.sent_tokenize(combined_text)

# Get English stopwords and punctuation from NLTK
stop_words = set(stopwords.words('english'))
punctuation = set(string.punctuation)

# Calculate sentence lengths
sentence_lengths = []
for sentence in sentences:
    # Tokenize sentence into words
    words = nltk.word_tokenize(sentence)
    
    # Remove stopwords and punctuation from words
    words = [word.lower() for word in words if word.lower() not in stop_words and word.lower() not in punctuation]
    
    # Append length of the remaining words (excluding stopwords and punctuation)
    sentence_lengths.append(len(words))

# Calculate statistics
num_sentences = len(sentence_lengths)
if num_sentences > 0:
    avg_sentence_length = sum(sentence_lengths) / num_sentences
    min_sentence_length = min(sentence_lengths)
    max_sentence_length = max(sentence_lengths)
else:
    avg_sentence_length = 0
    min_sentence_length = 0
    max_sentence_length = 0

# Display sentence length statistics
print(f"Average Sentence Length: {avg_sentence_length:.2f}")
print(f"Minimum Sentence Length: {min_sentence_length}")
print(f"Maximum Sentence Length: {max_sentence_length}")