In [1]:
# Taken from https://www.kaggle.com/code/suvroo/complete-nlp-pipeline#RoPE-(Robust-Positional-Embeddings)

from sklearn.feature_extraction.text import CountVectorizer
import numpy as np
import pandas as pd
from sklearn.metrics import accuracy_score,confusion_matrix
import re
from nltk.corpus import stopwords
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

# Load the raw IMDB reviews CSV; the relative path is resolved against the
# notebook's working directory.
# (Project-tree location noted by the author: research-methods/imdb/IMDB_dataset.csv)
df = pd.read_csv('IMDB_dataset.csv')
# Bare expression: render the loaded frame as the cell output for a quick look.
df

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive
...,...,...
49995,I thought this movie did a down right good job...,positive
49996,"Bad plot, bad dialogue, bad acting, idiotic di...",negative
49997,I am a Catholic taught in parochial elementary...,negative
49998,I'm going to have to disagree with the previou...,negative


In [2]:
# Count rows that repeat an earlier row across all columns.
df.duplicated().sum()

418

In [3]:
# Drop exact duplicate rows (the first occurrence is kept). Rebinding the name
# instead of mutating in place keeps the cell idempotent on re-run and avoids
# silently changing a frame that an earlier cell already displayed.
df = df.drop_duplicates()

Stopword lists include negation words (e.g. "not", "no"), so removing stopwords would strip sentiment-bearing context from the reviews.

Stopword removal is disabled.

In [4]:
# import nltk
# nltk.download('stopwords')

In [5]:
def remove_tags(raw_text):
    """Strip HTML/XML-style tags from a string.

    Every non-greedy ``<...>`` span is replaced with a single space rather
    than deleted outright, so words separated only by markup (for example
    ``"bad<br />acting"``) are not fused into one token. Whitespace runs left
    behind are not collapsed here.

    Note: ``.`` does not match a newline, so a tag that spans several lines
    is left untouched.

    Args:
        raw_text: The string to clean.

    Returns:
        ``raw_text`` with each tag replaced by one space.
    """
    cleaned_text = re.sub(r'<.*?>', ' ', raw_text)
    return cleaned_text
    

def clean_text_column(df, text_col_name):
    """Normalise a text column and drop rows that hold no usable text.

    Steps, in order:
      1. Drop rows whose text is missing or whitespace-only.
      2. Lowercase the text, remove URLs, remove every character other than
         ASCII letters, digits, whitespace and ``! ? . ,``, then collapse
         whitespace runs to a single space and strip the ends.
      3. Drop rows whose cleaned text is only punctuation/whitespace or is
         shorter than 4 characters.

    The caller's frame is not modified; a new frame is returned. A one-line
    before/after row-count summary is printed.

    Args:
        df: DataFrame that contains the text column.
        text_col_name: Name of the column holding the text (string values).

    Returns:
        A new DataFrame with the cleaned column; surviving rows keep their
        original index labels.
    """
    start_len = len(df)

    # Drop rows with missing text first so the .str accessor only sees strings.
    df = df[df[text_col_name].notnull()]
    # Drop whitespace-only text. The explicit copy makes the column assignment
    # below write to an independent frame instead of a filtered slice.
    df = df[df[text_col_name].str.strip().ne('')].copy()

    def clean_text(text):
        """Lowercase, strip URLs and unwanted symbols, and tidy whitespace."""
        text = text.lower()
        # Remove http(s):// and www. URLs wherever they occur; substitute a
        # space so the words on either side of the URL are not glued together.
        text = re.sub(r'(?:https?://|www\.)\S+', ' ', text)
        # Keep only letters, digits, whitespace and the symbols ! ? . ,
        text = re.sub(r'[^a-zA-Z0-9\s!?.,]', '', text)
        # Collapse any whitespace run into one space.
        text = re.sub(r'\s+', ' ', text)
        return text.strip()

    df[text_col_name] = df[text_col_name].apply(clean_text)

    def has_valid_content(text):
        """Return True when the cleaned text carries real content."""
        # Reject text made up solely of the retained punctuation / whitespace.
        if re.fullmatch(r'[!?.,\s]*', text):
            return False
        # Reject text shorter than 4 characters.
        if len(text.strip()) < 4:
            return False
        return True

    df = df[df[text_col_name].apply(has_valid_content)]
    cur_len = len(df)
    print(f"Clean dataset's text. started with {start_len} rows, after cleaning: {cur_len}")
    return df

# Column that holds the review text.
text_col_name = 'review'

# Strip HTML markup from every review before the general clean-up.
df[text_col_name] = df[text_col_name].map(remove_tags)

# Normalise the text and drop rows left without usable content.
print(f'text column name:{text_col_name}')
df = clean_text_column(df, text_col_name)

# Stopword removal is intentionally left disabled:
# sw_list = stopwords.words('english')
# df['review'] = df['review'].apply(lambda x: [item for item in x.split() if item not in sw_list]).apply(lambda x:" ".join(x))

text column name:review
Clean dataset's text. started with 49582 rows, after cleaning: 49582


In [2]:
# Normalise the sentiment strings, then encode them as a binary integer label
# (positive -> 1, negative -> 0). Any other value maps to NaN, which makes the
# integer cast fail rather than produce a silent bad label.
sentiment_to_label = {'positive': 1, 'negative': 0}
df['sentiment'] = df['sentiment'].str.lower()
df['label'] = df['sentiment'].map(sentiment_to_label).astype(int)

In [3]:
# Persist the preprocessed frame; index=False keeps the row index out of the CSV.
df.to_csv("preprocessed_imdb.csv", index=False)

In [2]:
import pandas as pd
import numpy as np
from pathlib import Path
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
# ── 0.  tunable parameters ─────────────────────────────────────────────────────
N_TOP_FEATURES = 50   # how many highest-importance embedding dimensions to keep
N_CLUSTERS     = 40   # k for KMeans
RANDOM_SEED    = 42   # fixed seed so cluster assignments are reproducible

# ── 1.  set your paths ──────────────────────────────────────────────────────────
# NOTE: absolute, machine-specific location — adjust base_dir for your environment.
base_dir = Path("/sise/home/saarbu/research-methods/imdb")
feature_imp_path = base_dir / "clustering results" / "feature_importance_scores.csv"
embeddings_path  = base_dir / "full_embeddings_cleaned.parquet"
sentiment_path   = base_dir / "data_cleaned_from_nan_embeddings.csv"
out_path         = base_dir / "data_cleaned_from_nan_embeddings_with_clusters.csv"

# ── 2.  load feature-importance file & grab the top-N indices ──────────────────
imp_df = pd.read_csv(feature_imp_path)
top50_idx = (
    imp_df.sort_values("importance_score", ascending=False)
          .head(N_TOP_FEATURES)["feature_index"]
          .astype(int)
          .tolist()
)

# ── 3.  load embeddings and flatten if stored as a list column ─────────────────
emb_df = pd.read_parquet(embeddings_path)

# if the parquet has a single column holding one list per row, expand it so
# that every embedding dimension becomes its own column
if emb_df.shape[1] == 1:
    emb_df = pd.DataFrame(emb_df.iloc[:, 0].tolist())

# make the column labels integers so they can be selected by feature_index
emb_df.columns = emb_df.columns.astype(int)

# ── 4.  load the sentiment / text file and validate it BEFORE clustering ───────
# Checking here means a mismatched or incomplete file fails immediately instead
# of after the (slow) KMeans fit.
sent_df = pd.read_csv(sentiment_path)
assert len(sent_df) == len(emb_df), "Row count mismatch between sentiment file and embeddings!"

cols_needed = ["review", "label", "cluster"]
# "cluster" is created below, so only the other columns must already exist.
missing = [c for c in cols_needed if c != "cluster" and c not in sent_df.columns]
if missing:
    raise KeyError(f"Column(s) not found in sentiment file: {missing}")

# ── 5.  build the feature matrix with the most important dimensions ────────────
X = emb_df[top50_idx].values
# Feature scaling is currently disabled:
# X = StandardScaler().fit_transform(X)

# ── 6.  cluster ────────────────────────────────────────────────────────────────
kmeans   = KMeans(n_clusters=N_CLUSTERS, init="k-means++", n_init=10, random_state=RANDOM_SEED)
clusters = kmeans.fit_predict(X)          # one cluster id per embedding row

# ── 7.  append cluster labels, keep only requested columns & save ──────────────
sent_df = sent_df.reset_index(drop=True)  # positional index so labels align row-by-row
sent_df["cluster"] = clusters

sent_df[cols_needed].to_csv(out_path, index=False)
print(f"✓ Saved {out_path} with columns: {cols_needed}")


✓ Saved /sise/home/saarbu/research-methods/imdb/data_cleaned_from_nan_embeddings_with_clusters.csv with columns: ['review', 'label', 'cluster']
