In [11]:
# Read the reduced train/test CSV files into DataFrames.
df_train, df_test = (
    pd.read_csv(csv_path) for csv_path in ("train_small.csv", "test_small.csv")
)

In [14]:
# Readable names for the numeric sentiment labels.
label_map = {1: "negative", 2: "positive"}

# Add the readable label as a new column on both splits.
for frame in (df_train, df_test):
    frame["sentiment_text"] = frame["label"].map(label_map)

# Preview the numeric label next to its text form.
df_train[["label", "sentiment_text"]].head()

Unnamed: 0,label,sentiment_text
0,2,positive
1,2,positive
2,1,negative
3,2,positive
4,2,positive


In [45]:
# Run clean_text over the raw "review" column of both splits and keep the
# result in a new "clean_review" column.
# NOTE(review): this cell works on df_train_small / df_test_small, while
# neighbouring cells read "clean_review" from df_train — confirm the run order.
for split in (df_train_small, df_test_small):
    split["clean_review"] = split["review"].apply(clean_text)

In [20]:
def _review_words(reviews):
    """Return every whitespace-separated word of every review, in order.

    Missing entries (NaN/None) are skipped: an empty cleaned review that was
    written to CSV is read back as NaN, which ``" ".join`` cannot handle.
    Splitting review by review also avoids building one giant joined string.
    """
    return [word for review in reviews.dropna() for word in review.split()]


is_positive = df_train["sentiment_text"] == "positive"
is_negative = df_train["sentiment_text"] == "negative"

pos_words = _review_words(df_train.loc[is_positive, "clean_review"])
neg_words = _review_words(df_train.loc[is_negative, "clean_review"])

# 20 most frequent words within each sentiment class.
pos_counts = Counter(pos_words).most_common(20)
neg_counts = Counter(neg_words).most_common(20)

print("Top positive words:\n", pos_counts)
print("\nTop negative words:\n", neg_counts)

Top positive words:
 [('the', 181201), ('and', 111201), ('i', 102520), ('a', 95802), ('to', 89381), ('it', 82073), ('of', 75724), ('is', 69126), ('this', 67892), ('in', 46193), ('for', 41277), ('that', 36607), ('you', 34923), ('s', 29921), ('with', 28471), ('was', 26420), ('my', 25667), ('book', 25324), ('on', 25245), ('but', 24084)]

Top negative words:
 [('the', 203650), ('i', 123779), ('to', 98461), ('and', 98401), ('a', 93496), ('it', 92441), ('of', 74271), ('this', 74165), ('is', 64211), ('in', 44270), ('that', 42434), ('for', 39975), ('was', 39120), ('not', 38125), ('you', 32180), ('t', 31358), ('but', 30934), ('on', 29281), ('with', 27464), ('s', 26823)]


In [22]:
# Features (X): cleaned review text. Labels (y): numeric sentiment label
# (1 = negative, 2 = positive).
X, y = df_train["clean_review"], df_train["label"]

In [46]:
# Write both reduced splits to CSV, leaving the DataFrame index out of the file.
for split_frame, csv_path in (
    (df_train_small, "train_small.csv"),
    (df_test_small, "test_small.csv"),
):
    split_frame.to_csv(csv_path, index=False)

In [47]:
# Cleaned review text split by numeric label (2 -> pos_reviews, 1 -> neg_reviews),
# selected with a single .loc call instead of chained indexing.
pos_reviews = df_train_small.loc[df_train_small["label"] == 2, "clean_review"]
neg_reviews = df_train_small.loc[df_train_small["label"] == 1, "clean_review"]

from sklearn.feature_extraction.text import CountVectorizer

def get_top_ngrams(corpus, n=None, ngram_range=(2,2)):
    """Return the most frequent n-grams in ``corpus`` with their total counts.

    Parameters
    ----------
    corpus : iterable of str
        Documents to count n-grams in.
    n : int or None, default None
        Number of leading entries to return; ``None`` returns every n-gram.
    ngram_range : tuple of (int, int), default (2, 2)
        Forwarded to ``CountVectorizer``; the default counts bigrams only.

    Returns
    -------
    list of (ngram, count) tuples
        Sorted by count, highest first.
    """
    vec = CountVectorizer(ngram_range=ngram_range)
    # fit_transform learns the vocabulary and builds the count matrix in one
    # pass. The previous fit() + transform() tokenised the corpus twice and
    # would count nothing for a one-shot iterable exhausted by fit().
    bag = vec.fit_transform(corpus)
    totals = bag.sum(axis=0)  # single row holding the corpus-wide count per term
    words_freq = [(word, totals[0, idx]) for word, idx in vec.vocabulary_.items()]
    words_freq.sort(key=lambda pair: pair[1], reverse=True)
    return words_freq[:n]

# 20 most frequent bigrams for the positive and the negative reviews.
top_pos_bigrams, top_neg_bigrams = (
    get_top_ngrams(class_reviews, n=20, ngram_range=(2,2))
    for class_reviews in (pos_reviews, neg_reviews)
)

!pip install rake-nltk
from rake_nltk import Rake

# Module-level RAKE extractor built with default arguments; extract_key_phrases
# reads it as a global.
r = Rake()

def extract_key_phrases(text_series, top_n=20):
    """Return the ``top_n`` highest-ranked RAKE phrases for ``text_series``.

    Feeds every entry of ``text_series`` to the module-level ``Rake`` instance
    ``r`` and returns the leading ``top_n`` items of its ranked phrase list.
    Each call re-runs extraction on the shared ``r``.
    NOTE(review): the ranked list may contain repeated phrases depending on how
    ``r`` is configured — confirm whether duplicates are wanted in the top N.
    """
    r.extract_keywords_from_sentences(text_series)
    ranked = r.get_ranked_phrases()
    return ranked[:top_n]

# Key phrases are extracted from only the first 2000 reviews of each class.
pos_phrases = extract_key_phrases(pos_reviews.head(2000))
neg_phrases = extract_key_phrases(neg_reviews.head(2000))

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation

# Bag-of-words counts of the cleaned training reviews: English stop words are
# removed, as are terms in more than 90% of reviews or in fewer than 10 reviews.
vectorizer = CountVectorizer(stop_words='english', min_df=10, max_df=0.9)
# NOTE(review): X is also the name an earlier cell gives to the raw review text;
# confirm nothing later expects that meaning after this cell has run.
X = vectorizer.fit_transform(df_train_small["clean_review"])

# Five-topic LDA with a fixed seed; fit() returns the fitted estimator itself.
lda = LatentDirichletAllocation(random_state=42, n_components=5).fit(X)

words = vectorizer.get_feature_names_out()

# Print each topic's 10 highest-weighted words, ordered from lowest to highest weight.
for idx, topic in enumerate(lda.components_):
    top_ids = topic.argsort()[-10:]
    print(f"Topic #{idx}:")
    print([words[i] for i in top_ids])
    print()

Collecting rake-nltk
  Downloading rake_nltk-1.0.6-py3-none-any.whl.metadata (6.4 kB)
Downloading rake_nltk-1.0.6-py3-none-any.whl (9.1 kB)
Installing collected packages: rake-nltk
Successfully installed rake-nltk-1.0.6
Topic #0:
['time', 'author', 'reading', 'just', 'good', 'like', 'books', 'story', 'read', 'book']

Topic #1:
['love', 'really', 'little', 'good', 'great', 'water', 'product', 'use', 'just', 'like']

Topic #2:
['film', 'songs', 'great', 'music', 'just', 'good', 'album', 'cd', 'like', 'movie']

Topic #3:
['received', 'great', 'got', 'time', 'year', 'product', 'bought', 'dvd', 'old', 'amazon']

Topic #4:
['buy', 'like', 'time', 'bought', 'great', 'work', 'good', 'product', 'just', 'use']

