In [1]:
# 📦 Importações necessárias
import pandas as pd
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
import numpy as np

# Download the NLTK resources used below (only needs to succeed once per machine).
# 'punkt_tab' is the tokenizer data that word_tokenize loads on NLTK >= 3.9;
# 'punkt' is kept for older NLTK releases that still read the pickled models.
for _resource in ('punkt', 'punkt_tab', 'stopwords', 'wordnet'):
    nltk.download(_resource)

# Load the dataset; later steps read its 'Plot' and 'Title' columns.
df = pd.read_csv('movies_plot.csv')

# 1. Pre-processing: remove_noise tokenizes the text, drops punctuation/numbers
#    and stopwords, and lemmatizes what is left.
_NOISE_RESOURCES = {}


def _noise_resources():
    """Return the shared (lemmatizer, stop_words) pair, building it on first use."""
    if not _NOISE_RESOURCES:
        # Build both values before storing them so a failed lookup (e.g. the
        # corpus is not downloaded yet) never leaves a half-filled cache behind.
        stop_words = frozenset(stopwords.words('english'))
        _NOISE_RESOURCES.update(
            lemmatizer=WordNetLemmatizer(),
            stop_words=stop_words,
        )
    return _NOISE_RESOURCES['lemmatizer'], _NOISE_RESOURCES['stop_words']


def remove_noise(text):
    """Turn raw text into a list of cleaned, lemmatized tokens.

    The text is lower-cased and tokenized with ``word_tokenize``; tokens that
    are not purely alphabetic (punctuation, numbers) or that are English
    stopwords are dropped, and the survivors are lemmatized.

    Args:
        text: The document to clean. Non-string values (for example ``None``
            or a NaN coming from a missing cell) are treated as empty.

    Returns:
        A list of lemmatized tokens, in their original order.
    """
    if not isinstance(text, str):
        return []

    # The lemmatizer and the stopword set are built once and reused: this
    # function is called once per document by the vectorizer.
    lemmatizer, stop_words = _noise_resources()

    return [
        lemmatizer.lemmatize(token)
        for token in word_tokenize(text.lower())
        if token.isalpha() and token not in stop_words
    ]

# 2. TF-IDF vectorization
vectorizer = TfidfVectorizer(
    max_df=0.8,
    min_df=0.2,
    max_features=50,
    tokenizer=remove_noise,  # custom tokenizer defined above
    token_pattern=None,      # unused with a custom tokenizer; None silences the sklearn warning
    lowercase=False,         # remove_noise already lower-cases the text
    stop_words=None,         # remove_noise already removes stopwords
)

# Missing plots (NaN) make fit_transform raise, so treat them as empty documents.
X = vectorizer.fit_transform(df['Plot'].fillna(''))

# 3. K-Means clustering
num_clusters = 3
# n_init is pinned explicitly: its default changed from 10 to 'auto' in
# scikit-learn 1.4, so leaving it implicit makes the clusters depend on the
# installed version (and triggers a FutureWarning on 1.2/1.3).
kmeans = KMeans(n_clusters=num_clusters, n_init=10, random_state=42)
# fit_predict fits the model and returns the same labels as kmeans.labels_.
df['Cluster'] = kmeans.fit_predict(X)

# 4. Top terms per cluster
terms = vectorizer.get_feature_names_out()
# Each row holds the feature indices of one centroid, highest weight first.
order_centroids = kmeans.cluster_centers_.argsort()[:, ::-1]

print("📌 Principais termos por cluster:\n")
for cluster_id, ranked_features in enumerate(order_centroids):
    # Keep the ten highest-weighted terms of this centroid.
    top_terms = terms[ranked_features[:10]]
    joined_terms = ', '.join(top_terms)
    print(f"Cluster {cluster_id}: {joined_terms}")
print()

# 5. Print the titles of the documents assigned to each cluster
for cluster_id in range(num_clusters):
    print(f"===================== Cluster {cluster_id} =====================")
    in_cluster = df['Cluster'] == cluster_id
    for movie_title in df.loc[in_cluster, 'Title'].to_numpy():
        print(f"- {movie_title}")
    print("\n")


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Renata\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Renata\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Renata\AppData\Roaming\nltk_data...


📌 Principais termos por cluster:

Cluster 0: back, go, get, take, find, police, two, one, kill, tell
Cluster 1: father, family, mother, life, take, home, find, one, friend, new
Cluster 2: film, love, wife, man, life, one, woman, friend, new, end

- The Ballad of Cable Hogue
- Monsters vs. Aliens
- Broken Arrow
- The Astounding She-Monster
- Conan the Barbarian
- Star Kid
- Halloween
- Seven Chances
- The War Wagon
- Beauty for Sale
- Eagle Squadron
- The Prince Who Was a Thief
- Flame of Calcutta
- A Shot in the Dark
- Dragonslayer
- The 4th Floor
- Disturbing Behavior
- True Believer
- Don't Bet on Love
- Kidnapped
- Eaten Alive
- The Package
- Oculus
- Highlander
- Hoodwinked Too! Hood vs. Evil
- Killers from Space
- The Bad Seed
- Sniper
- Union Station
- Rover Dangerfield
- Run for Cover
- Valerian and the City of a Thousand Planets
- Boomerang
- The Lego Ninjago Movie
- Black Beauty
- Eight Iron Men
- The Girl on the Train
- Queen Bee
- Coach Carter
- Sinner Take All
-  Laid to Re