In [None]:
import os
import re
import psutil
from tqdm import tqdm
import pickle
import swifter
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.stem.porter import PorterStemmer
from nltk.tokenize import word_tokenize
# NLTK resources this notebook needs at runtime.
nltk.download('stopwords')   # stop-word lists read via stopwords.words(...)
nltk.download('punkt_tab')   # tokenizer data for word_tokenize
nltk.download('wordnet')     # corpus required by WordNetLemmatizer.lemmatize
# Register progress_apply on pandas objects.
tqdm.pandas()

In [None]:
# Shared NLTK normalisers; text_preprocess reads these as module-level globals.
stemmer = PorterStemmer()
lemmatizer = WordNetLemmatizer()

# Load the reviews and keep the raw review strings as a plain list.
df = pd.read_csv("IMDB Dataset.csv")
text = df['review'].tolist()

# Column / dtype / memory summary of the loaded frame.
df.info()

In [None]:
# Stop-word sets keyed by language, filled lazily so the NLTK corpus is read
# once per language instead of once per document.
_STOPWORD_SETS = {}


def text_preprocess(text, language='english'):
    """Normalise one raw document into a space-joined string of processed tokens.

    Steps, in order: lowercase, strip URLs, replace HTML tags with a space,
    strip punctuation, split on whitespace, drop stop words, Porter-stem,
    then lemmatize each stem as a verb.

    Relies on the module-level ``stemmer`` and ``lemmatizer`` objects and on
    the NLTK ``stopwords`` corpus reader.

    Args:
        text: Raw document text (``str``).
        language: Name of the NLTK stop-word list to filter with.

    Returns:
        The processed tokens joined by single spaces; ``''`` when no token
        survives filtering.
    """
    # 1. Lowercase
    text = text.lower()

    # 2. Remove URLs
    text = re.sub(r'https?://\S+|www\.\S+', '', text)

    # 3. Replace HTML tags with a space. Deleting them outright would fuse the
    #    words on either side of a tag such as "<br />" into one token.
    text = re.sub(r'<.*?>', ' ', text)

    # 4. Remove punctuation
    text = re.sub(r'[^\w\s]', '', text)

    # 5. Tokenize
    word_tokens = text.split()

    # 6. Remove stopwords (set is built on first use per language, then reused)
    stop_words = _STOPWORD_SETS.get(language)
    if stop_words is None:
        stop_words = _STOPWORD_SETS[language] = set(stopwords.words(language))
    filtered_tokens = [word for word in word_tokens if word not in stop_words]

    # 7. Stemming
    stems = [stemmer.stem(word) for word in filtered_tokens]

    # 8. Lemmatization
    lemmas = [lemmatizer.lemmatize(word, pos='v') for word in stems]

    return ' '.join(lemmas)

In [None]:
def preprocess_with_cache(df, text_col='review', output_col='clean_text', cache_file='cleaned_texts.pkl'):
    """Add a preprocessed-text column to ``df``, reusing an on-disk pickle cache.

    If ``cache_file`` exists, is readable, and holds as many entries as ``df``
    has rows, its contents are assigned to ``df[output_col]``. Otherwise every
    value of ``df[text_col]`` is run through ``text_preprocess`` (via swifter)
    and the result is written back to ``cache_file``.

    The cache is validated by length only; it is not checked against the
    actual contents of ``df[text_col]``.

    Args:
        df: DataFrame containing ``text_col``. It is modified in place.
        text_col: Name of the column holding the raw text.
        output_col: Name of the column that receives the processed text.
        cache_file: Path of the pickle file used as the cache.

    Returns:
        The same ``df`` object, with ``output_col`` set.
    """
    if os.path.exists(cache_file):
        print(f"Loading preprocessed texts from cache: {cache_file}")
        try:
            # NOTE(security): pickle.load can execute arbitrary code. Only point
            # cache_file at files written by this notebook.
            with open(cache_file, 'rb') as f:
                cleaned_texts = pickle.load(f)
        except (EOFError, pickle.UnpicklingError) as exc:
            # A truncated or corrupt cache is treated as a cache miss.
            print(f"Cache unreadable ({exc}). Reprocessing from scratch...")
        else:
            if len(cleaned_texts) == len(df):
                df[output_col] = cleaned_texts
                print(f"Loaded {len(cleaned_texts)} cached texts successfully.")
                return df
            print("Cache size mismatch. Reprocessing from scratch...")

    mem = psutil.virtual_memory()
    print(f"Starting preprocessing | Memory used: {mem.percent}% | Available: {round(mem.available / (1024**3), 2)} GB")
    df[output_col] = df[text_col].swifter.apply(text_preprocess)

    # Write to a sibling temp file and rename it over the cache, so an
    # interrupted run cannot leave a half-written cache behind.
    tmp_file = f"{cache_file}.tmp"
    with open(tmp_file, 'wb') as f:
        pickle.dump(df[output_col].tolist(), f)
    os.replace(tmp_file, cache_file)

    mem = psutil.virtual_memory()
    print(f"Preprocessing complete | Memory used: {mem.percent}% | Saved to {cache_file}")
    return df

In [None]:
# Attach the cleaned-text column (computed, or read back from the pickle cache).
df = preprocess_with_cache(df)
# Cleaned documents as a plain Python list.
text = df['clean_text'].to_list()
# Every cleaned document joined into one space-separated string.
all_text = ' '.join(df['clean_text'].astype(str))
# NLTK word tokens of that single concatenated string.
tokens = word_tokenize(all_text)

In [None]:
import re
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords

# English stop words as a set, used for membership tests by the functions below.
stop_words = set(stopwords.words('english'))

# Regex listing emoji ranges, ASCII punctuation and, as its last alternative,
# any non-ASCII character.
# NOTE(review): the pattern is anchored with ^ ... $, so re.sub() with it only
# replaces anything when the *entire* string consists of matching characters;
# it does not strip such characters from inside ordinary text. Confirm that
# this is the intended behaviour.
# NOTE(review): the \ud83c / \ud83d alternatives are UTF-16 surrogate halves;
# presumably a Python str never holds emoji in that form, so those branches
# may never match. Verify.
emoji_pattern = r'^(?:[\u2700-\u27bf]|(?:\ud83c[\udde6-\uddff]){1,2}|(?:\ud83d[\udc00-\ude4f]){1,2}|[\ud800-\udbff][\udc00-\udfff]|[\u0021-\u002f\u003a-\u0040\u005b-\u0060\u007b-\u007e]|\u3299|\u3297|\u303d|\u3030|\u24c2|\ud83c[\udd70-\udd71]|\ud83c[\udd7e-\udd7f]|\ud83c\udd8e|\ud83c[\udd91-\udd9a]|\ud83c[\udde6-\uddff]|\ud83c[\ude01-\ude02]|\ud83c\ude1a|\ud83c\ude2f|\ud83c[\ude32-\ude3a]|\ud83c[\ude50-\ude51]|\u203c|\u2049|\u25aa|\u25ab|\u25b6|\u25c0|\u25fb|\u25fc|\u25fd|\u25fe|\u2600|\u2601|\u260e|\u2611|[^\u0000-\u007F])+$'

def preprocess_sentence(text):
    """Normalise one sentence into lowercase, stop-word-free, lemmatized text.

    Stop words are filtered twice: once on the raw lowercase tokens, so that
    entries containing punctuation (e.g. contractions) still match the list,
    and once after digits and punctuation are stripped, so that tokens such as
    ``"the,"`` are also removed. Lemmatization runs last, on the cleaned
    lowercase tokens.

    Relies on the module-level ``stop_words``, ``emoji_pattern`` and
    ``lemmatizer`` objects.

    Args:
        text: Sentence text (``str``).

    Returns:
        The surviving lemmas joined by single spaces; ``''`` if none survive.
    """
    # First pass: drop stop words that match before any characters are removed.
    raw_tokens = [word for word in text.lower().split() if word not in stop_words]
    text = ' '.join(raw_tokens)

    # Strip digits, punctuation and emoji_pattern matches, then tidy whitespace.
    text = re.sub(r'[0-9]+', '', text)
    text = re.sub(r'[^\w\s]', '', text)
    text = re.sub(emoji_pattern, '', text)
    text = re.sub(r'\s+', ' ', text).strip()

    # Second pass: stop words exposed by the clean-up, then lemmatize.
    lemmas = [lemmatizer.lemmatize(word) for word in text.split() if word not in stop_words]
    return ' '.join(lemmas)

# Concatenate every cleaned review into a single space-separated string.
all_text = ' '.join(df['clean_text'].astype(str))

# Break the text after '.', '!' or '?' followed by whitespace (regex-based, so
# NLTK's punkt models are not needed).
# NOTE(review): text_preprocess strips punctuation before 'clean_text' is
# stored, so this split presumably finds no boundaries and returns a single
# element. Verify, and consider building the corpus per review instead.
sentences_list = re.split(r'(?<=[.!?])\s+', all_text)

# Normalise each piece to build the corpus.
corpus = list(map(preprocess_sentence, sentences_list))

In [None]:
# Creating a Bag of Words
from sklearn.feature_extraction.text import CountVectorizer

# Learn the vocabulary and count term occurrences for each corpus entry.
vectorizer = CountVectorizer()
X = vectorizer.fit_transform(corpus)

# Dense copy of the count matrix, plus the term label of each column.
X_array = X.toarray()
feature_names = vectorizer.get_feature_names_out()

print("Unique Word List: \n", feature_names)
print("Bag of Words Matrix: \n", X_array)

In [None]:
# Labelled view of the dense counts: one column per vocabulary term, one row
# per corpus entry. Bound to its own name so the reviews DataFrame held in
# `df` is not replaced by an unrelated table.
bow_df = pd.DataFrame(data=X_array, columns=feature_names, index=corpus)

In [None]:
# Reload the raw reviews and re-attach the cleaned-text column (read from the
# pickle cache when one with a matching number of entries exists).
df = pd.read_csv("IMDB Dataset.csv")
df = preprocess_with_cache(df)

# Preprocessing function
def preprocess_text(text):
    """Clean one document and return its lemmatized, stop-word-free tokens.

    The input is coerced to ``str`` and lowercased; digits, punctuation and
    ``emoji_pattern`` matches are removed; whitespace is collapsed. The
    remaining words are filtered against the module-level ``stop_words`` and
    passed through the module-level ``lemmatizer``.

    Args:
        text: Any value; it is converted with ``str()`` first.

    Returns:
        The surviving lemmas joined by single spaces.
    """
    cleaned = str(text).lower()

    # Strip digits, then punctuation, then whatever emoji_pattern matches.
    for pattern in (r'[0-9]+', r'[^\w\s]', emoji_pattern):
        cleaned = re.sub(pattern, '', cleaned)
    cleaned = re.sub(r'\s+', ' ', cleaned).strip()

    # Keep non-stop-words only, lemmatizing each one that is kept.
    kept = []
    for word in cleaned.split():
        if word in stop_words:
            continue
        kept.append(lemmatizer.lemmatize(word))

    return ' '.join(kept)

# Cleaned reviews as a list of strings, one entry per row.
all_text = df['clean_text'].astype(str).tolist()

# Apply preprocess_text to every cleaned review, showing a tqdm progress bar.
tqdm.pandas()  # registers progress_apply on pandas objects
df['processed_text'] = df['clean_text'].progress_apply(preprocess_text)

# Preview the first rows of the new column.
print(df['processed_text'].head())

from sklearn.feature_extraction.text import TfidfVectorizer

# TF-IDF features for the processed reviews, with max_features=5000.
# Note: this rebinds the name `vectorizer`.
vectorizer = TfidfVectorizer(max_features=5000)
X_tfidf = vectorizer.fit_transform(df['processed_text'])
print(X_tfidf.shape)

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Fit TF-IDF over `corpus` with default settings (no max_features cap).
tfidf_vectorizer = TfidfVectorizer()
tfidf_matrix = tfidf_vectorizer.fit_transform(corpus)
# Term label for each column of tfidf_matrix.
terms = tfidf_vectorizer.get_feature_names_out()
# Dense, labelled copy of the matrix; .toarray() materialises every cell.
df1 = pd.DataFrame(tfidf_matrix.toarray(), columns=terms)
# Display the frame as the cell output.
df1

In [None]:
# Processed reviews as a plain list of strings.
cleaned_texts = df['processed_text'].astype(str).tolist()
# Fit `vectorizer` on the processed reviews and rebind `tfidf_matrix` to the
# result. In top-to-bottom execution `vectorizer` is presumably the
# TfidfVectorizer(max_features=5000) created earlier; verify if cells are run
# out of order.
tfidf_matrix = vectorizer.fit_transform(cleaned_texts)

In [None]:
from wordcloud import WordCloud
import matplotlib.pyplot as plt

# Size each word by its total TF-IDF weight across all documents.
# `vectorizer.vocabulary_` maps term -> column index, not a frequency, so
# passing it to generate_from_frequencies would scale words by their position
# in the vocabulary rather than by how much they matter.
cloud_terms = vectorizer.get_feature_names_out()
assert tfidf_matrix.shape[1] == len(cloud_terms), "tfidf_matrix was not produced by `vectorizer`"
term_weights = np.asarray(tfidf_matrix.sum(axis=0)).ravel().tolist()
word_weights = dict(zip(cloud_terms, term_weights))

wordcloud = WordCloud(width=800, height=400, background_color='white').generate_from_frequencies(word_weights)

plt.figure(figsize=(10, 5))
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis('off')
plt.show()

In [None]:
import nltk
from sklearn.preprocessing import OneHotEncoder
import numpy as np

# Download NLTK data
nltk.download('punkt')

# Lowercase and word-tokenize every corpus entry with NLTK.
tokenized_corpus = [nltk.word_tokenize(entry.lower()) for entry in corpus]

# One flat list holding every token occurrence, in corpus order.
all_words = [token for entry_tokens in tokenized_corpus for token in entry_tokens]

# Vocabulary: the distinct tokens, sorted.
vocab = sorted(set(all_words))
print("Vocabulary:", vocab)

# OneHotEncoder expects a 2D input: one row per token occurrence, one column.
word_array = np.array(all_words).reshape(len(all_words), 1)

# Fit the encoder and encode every occurrence; the result is a sparse matrix.
one_hot_encoder = OneHotEncoder(sparse_output=True)
one_hot_encoded = one_hot_encoder.fit_transform(word_array)

print("One-hot encoded matrix:\n", one_hot_encoded)

In [None]:
from gensim.models import Word2Vec

# Hyper-parameters shared by both models; only the `sg` flag differs.
# NOTE(review): gensim is understood to clip any single sentence to 10,000
# tokens. If tokenized_corpus holds one very long entry, most of it would be
# ignored during training. Verify.
w2v_params = dict(
    sentences=tokenized_corpus,
    vector_size=100,
    window=5,
    min_count=2,
    epochs=10,
)

# sg=0 trains CBOW, sg=1 trains Skip-Gram.
cbow_model = Word2Vec(sg=0, **w2v_params)
skipgram_model = Word2Vec(sg=1, **w2v_params)

# Persist both trained models to the working directory.
cbow_model.save('cbow_model.model')
skipgram_model.save('skipgram_model.model')

In [None]:
print(f"CBOW vocab size: {len(cbow_model.wv)}")
print(f"Skip-Gram vocab size: {len(skipgram_model.wv)}")

# Similarity probe. Both words must be present in the trained vocabulary;
# report which ones are missing instead of silently printing nothing.
probe_a, probe_b = 'word2vec', 'gensim'
missing_words = [word for word in (probe_a, probe_b) if word not in cbow_model.wv]
if not missing_words:
    sim = cbow_model.wv.similarity(probe_a, probe_b)
    print(f"CBOW similarity: {sim}")
else:
    print(f"CBOW similarity skipped; not in vocabulary: {missing_words}")