In [21]:
import pandas as pd

# Membaca dataset
data = pd.read_csv('IMDB Dataset.csv')

print(data.head())

                                              review sentiment
0  One of the other reviewers has mentioned that ...  positive
1  A wonderful little production. <br /><br />The...  positive
2  I thought this was a wonderful way to spend ti...  positive
3  Basically there's a family where a little boy ...  negative
4  Petter Mattei's "Love in the Time of Money" is...  positive


In [23]:
import string
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
import nltk

# Download stopwords dan punkt
nltk.download('stopwords')
nltk.download('punkt')

# Fungsi pemrosesan teks lengkap
def preprocess_text(text):
    # Mengubah teks menjadi huruf kecil
    text = text.lower()
    
    # Menghapus tanda baca
    text = text.translate(str.maketrans('', '', string.punctuation))
    
    # Tokenisasi
    tokens = word_tokenize(text)
    
    # Menghilangkan stopwords
    stop_words = set(stopwords.words('english'))
    tokens = [word for word in tokens if word not in stop_words]
    
    # Menggabungkan kembali menjadi string
    cleaned_text = ' '.join(tokens)
    return cleaned_text

# Terapkan pemrosesan ke kolom review
data['cleaned_review'] = data['review'].apply(preprocess_text)
print("\nDataset Setelah Pembersihan:")
print(data[['review', 'cleaned_review']].head())

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\sarwa\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\sarwa\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!



Dataset Setelah Pembersihan:
                                              review  \
0  One of the other reviewers has mentioned that ...   
1  A wonderful little production. <br /><br />The...   
2  I thought this was a wonderful way to spend ti...   
3  Basically there's a family where a little boy ...   
4  Petter Mattei's "Love in the Time of Money" is...   

                                      cleaned_review  
0  one reviewers mentioned watching 1 oz episode ...  
1  wonderful little production br br filming tech...  
2  thought wonderful way spend time hot summer we...  
3  basically theres family little boy jake thinks...  
4  petter matteis love time money visually stunni...  


In [41]:
# Menghitung TF-IDF dengan pembatasan
vectorizer = TfidfVectorizer(max_features=10000, min_df=5, max_df=0.8)
X_tfidf = vectorizer.fit_transform(data['cleaned_review'])

# Mengkonversi hasil TF-IDF ke DataFrame
tfidf_df = pd.DataFrame(X_tfidf.toarray(), columns=vectorizer.get_feature_names_out())

# Menampilkan beberapa baris pertama dari matriks TF-IDF
print("\nMatriks TF-IDF:")
print(tfidf_df.head())


Matriks TF-IDF:
   007  010        10  100  1000  10000  101  1010  10br   11  ...  zellweger  \
0  0.0  0.0  0.000000  0.0   0.0    0.0  0.0   0.0   0.0  0.0  ...        0.0   
1  0.0  0.0  0.000000  0.0   0.0    0.0  0.0   0.0   0.0  0.0  ...        0.0   
2  0.0  0.0  0.000000  0.0   0.0    0.0  0.0   0.0   0.0  0.0  ...        0.0   
3  0.0  0.0  0.073793  0.0   0.0    0.0  0.0   0.0   0.0  0.0  ...        0.0   
4  0.0  0.0  0.000000  0.0   0.0    0.0  0.0   0.0   0.0  0.0  ...        0.0   

   zero  zhang  zizek  zoey  zombi    zombie  zombies  zone  zoom  
0   0.0    0.0    0.0   0.0    0.0  0.000000      0.0   0.0   0.0  
1   0.0    0.0    0.0   0.0    0.0  0.000000      0.0   0.0   0.0  
2   0.0    0.0    0.0   0.0    0.0  0.000000      0.0   0.0   0.0  
3   0.0    0.0    0.0   0.0    0.0  0.112911      0.0   0.0   0.0  
4   0.0    0.0    0.0   0.0    0.0  0.000000      0.0   0.0   0.0  

[5 rows x 10000 columns]


In [43]:
from sklearn.cluster import KMeans

# Menggunakan K-Means untuk pengelompokan
num_clusters = 5  # Misalnya, kita ingin 5 cluster
kmeans = KMeans(n_clusters=num_clusters, random_state=42)
kmeans.fit(X_tfidf)

# Mendapatkan label cluster untuk setiap review
data['cluster'] = kmeans.labels_

print("\nHasil Pengelompokan:")
print(data[['review', 'cluster']].head(10))




Hasil Pengelompokan:
                                              review  cluster
0  One of the other reviewers has mentioned that ...        3
1  A wonderful little production. <br /><br />The...        1
2  I thought this was a wonderful way to spend ti...        0
3  Basically there's a family where a little boy ...        0
4  Petter Mattei's "Love in the Time of Money" is...        1
5  Probably my all-time favorite movie, a story o...        0
6  I sure would like to see a resurrection of a u...        0
7  This show was an amazing, fresh & innovative i...        3
8  Encouraged by the positive comments about this...        2
9  If you like original gut wrenching laughter yo...        4


In [45]:
# Melihat beberapa review dalam setiap cluster
for i in range(num_clusters):
    print(f"\nCluster {i}:")
    print(data[data['cluster'] == i]['review'].sample(5).tolist())


Cluster 0:
['I saw this in a sneak two days before the official opening, and I must say I was extremely disappointed. And I have to put the majority of these problems on the decision to cast Claire Danes in the lead role. Depending on what you think about Danes, she was either horribly miscast, or is so far in over her head that she should be the early favorite for the 2007 Razzie for Worst Actress. I think we were supposed to be sympathetic to her. Instead, she is completely unlikeable. The other "great" actresses do an OK job, but certainly don\'t light up the screen. Out of all the "great" actresses in this movie, I\'d say the one who did the best job was Natasha Richardson. Streep is barely in the picture, and only appears near the very end.<br /><br />Horrible screenplay as well. It comes off more as them reading lines than truly being "in character."', "At least it's not full of sensless violence or fluff. It's also not very full of thought or a smooth storyline. This story had 