In [None]:
# Import librerie
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from wordcloud import WordCloud
import os

# Configuration: input CSV location and output directory for saved figures.
DATASET_PATH = "../artifacts/imdb_sample.csv"
FIGURES_DIR = "figures"

# Create the output directory up front; exist_ok makes re-running harmless.
os.makedirs(FIGURES_DIR, exist_ok=True)

# Load the dataset and preview the first rows.
df = pd.read_csv(DATASET_PATH)
print("Primi 5 record:")
display(df.head())

# General information about the dataset.
# DataFrame.info() writes its report to stdout itself and returns None, so it
# must not be wrapped in print() (that would emit a stray "None" line).
print("\nInformazioni sul dataset:")
df.info()

# Label distribution: one bar per distinct value of the 'label' column.
fig, ax = plt.subplots(figsize=(6, 4))
sns.countplot(data=df, x='label', ax=ax)
ax.set_title('Distribuzione delle etichette')
ax.set_xlabel('Etichetta (0=negativo, 1=positivo)')
ax.set_ylabel('Frequenza')
# Write the figure to disk, then display it.
fig.savefig(os.path.join(FIGURES_DIR, 'label_distribution.png'))
plt.show()

# Text length in characters, stored as a new 'text_length' column.
# Missing cells are loaded by pandas as NaN (a float), on which len() raises
# TypeError; treat them as empty strings so they count as length 0.
df['text_length'] = df['text'].fillna('').astype(str).str.len()

# Histogram of the lengths (50 bins) with a kernel density estimate overlay.
plt.figure(figsize=(8, 5))
sns.histplot(df['text_length'], bins=50, kde=True)
plt.title('Distribuzione della lunghezza dei testi')
plt.xlabel('Numero di caratteri')
plt.ylabel('Frequenza')
plt.savefig(os.path.join(FIGURES_DIR, 'text_length_distribution.png'))
plt.show()

# Parole più comuni
from collections import Counter
import re

def preprocess_text(text):
    r"""Lowercase *text* and split it into word tokens.

    A token is a maximal run of characters matched by the regex ``\w+``;
    everything else (whitespace, punctuation) acts as a separator.

    Args:
        text: The string to tokenize. Non-string values (for example the
            float NaN that pandas uses for missing cells, or None) are
            treated as having no content.

    Returns:
        A list of lowercase tokens in order of appearance; empty when *text*
        is empty, contains no word characters, or is not a string.
    """
    # Guard against missing values instead of failing on .lower().
    if not isinstance(text, str):
        return []
    return re.findall(r'\w+', text.lower())

# Flatten the per-review token lists into one corpus-wide list of words.
all_words = [
    token
    for review in df['text']
    for token in preprocess_text(review)
]

# Keep the 30 most frequent words as a two-column table for plotting.
word_counts = Counter(all_words).most_common(30)
words_df = pd.DataFrame(word_counts, columns=['Word', 'Count'])

# Horizontal bar chart: one bar per word, bar length is its count.
fig, ax = plt.subplots(figsize=(10, 6))
sns.barplot(data=words_df, x='Count', y='Word', ax=ax)
ax.set_title('Parole più frequenti')
ax.set_xlabel('Frequenza')
ax.set_ylabel('Parola')
fig.tight_layout()
fig.savefig(os.path.join(FIGURES_DIR, 'top_words.png'))
plt.show()

# Word cloud built from the whole corpus joined into one space-separated string.
corpus_text = ' '.join(all_words)
cloud_builder = WordCloud(width=800, height=400, background_color='white')
wordcloud = cloud_builder.generate(corpus_text)

# Render the cloud as an image without axis ticks or frame, save, then show.
fig, ax = plt.subplots(figsize=(12, 6))
ax.imshow(wordcloud, interpolation='bilinear')
ax.axis("off")
fig.savefig(os.path.join(FIGURES_DIR, 'wordcloud.png'))
plt.show()