In [3]:
import pandas as pd
import nltk
import spacy
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import LatentDirichletAllocation

# --- NLTK setup (run once) ---
#nltk.download('stopwords') # Downloads the list of "stopwords" (a, the, is, etc.)
#nltk.download('punkt') # Downloads the tokenizer model
# -----------------------------

# Load the spaCy model we downloaded
# We disable parts we don't need (parser, ner) to make it faster
nlp = spacy.load('en_core_web_sm', disable=['parser', 'ner'])

# Load our reviews from Phase 1
try:
    df = pd.read_csv('reviews.csv')
    print(f"Successfully loaded reviews.csv. Shape: {df.shape}")
    print(df.head())
except FileNotFoundError:
    print("ERROR: reviews.csv not found. Did you complete Phase 1?")

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\purva\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\purva\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping tokenizers\punkt.zip.


Successfully loaded reviews.csv. Shape: (10, 2)
   rating                                        review_text
0     5.0  Very good ac, its new and its cooling is excel...
1     5.0  Very good ac, its new and its cooling is excel...
2     5.0  Very good ac, its new and its cooling is excel...
3     1.0  Delivered on time and installation was done sa...
4     4.0  Even though I purchased the product online the...


In [4]:
# 1. Initialize the VADER analyzer
analyzer = SentimentIntensityAnalyzer()

# 2. Define a function to get the sentiment
def get_vader_sentiment(text):
    # VADER's polarity_scores() returns a dictionary
    # We just want the 'compound' score, which is a single number from -1 (v. neg) to +1 (v. pos)
    return analyzer.polarity_scores(text)['compound']

# 3. Apply this function to every review in our 'review_text' column
# and store it in a new 'sentiment' column
df['sentiment'] = df['review_text'].apply(get_vader_sentiment)

# 4. Check our work
print("DataFrame with sentiment scores:")
print(df.head())

# Let's see the most positive and negative reviews
print("\n--- Most Positive Review ---")
print(df.loc[df['sentiment'].idxmax()]['review_text'])

print("\n--- Most Negative Review ---")
print(df.loc[df['sentiment'].idxmin()]['review_text'])

DataFrame with sentiment scores:
   rating                                        review_text  sentiment
0     5.0  Very good ac, its new and its cooling is excel...     0.9689
1     5.0  Very good ac, its new and its cooling is excel...     0.9689
2     5.0  Very good ac, its new and its cooling is excel...     0.9689
3     1.0  Delivered on time and installation was done sa...     0.2617
4     4.0  Even though I purchased the product online the...     0.8351

--- Most Positive Review ---
Very satisfied with Daikin. Build quality is excellent. Services of AC Planet, installation agency are also outstanding. Mr. Souvik who installed the AC is very efficient. Thanks to Daikin, AC planet and Amazon team. All are requested to extend their best services to all customers in future also.Read more

--- Most Negative Review ---
Where do I even begin?I rarely write reviews, but after the nightmare I’ve had with Daikin, I feel it’s my responsibility to warn others. And to make things worse — my 

In [5]:
# Get the list of stopwords from NLTK
stopwords = nltk.corpus.stopwords.words('english')

# You can add custom stopwords that are common in your reviews but not useful
# e.g., 'amazon', 'product', 'review', 'buy', 'bought'
custom_stopwords = ['amazon', 'product', 'review', 'buy', 'bought', 'get']
stopwords.extend(custom_stopwords)


# Define our cleaning and lemmatizing function
def preprocess_text(text):
    # 1. Create a "doc" object using spaCy
    doc = nlp(text.lower()) # Lowercase the text
    
    # 2. Create a list of lemmatized tokens (words)
    #    that are not stopwords, not punctuation, and are alphabetic
    processed_tokens = [
        token.lemma_ 
        for token in doc 
        if not token.is_stop and not token.is_punct and token.is_alpha
    ]
    
    # 3. Join the tokens back into a single string
    return ' '.join(processed_tokens)

# 4. Apply this function to all reviews
#    This can take a minute or two if you have thousands of reviews
print("Starting text pre-processing...")
df['processed_text'] = df['review_text'].apply(preprocess_text)
print("Pre-processing complete.")

# 5. Check the difference
print("\n--- Original vs. Processed ---")
print("Original:", df['review_text'].iloc[0][:150])
print("Processed:", df['processed_text'].iloc[0][:150])

Starting text pre-processing...
Pre-processing complete.

--- Original vs. Processed ---
Original: Very good ac, its new and its cooling is excellent. If you know how to use a AC efficiently, this is a great choice. I set it at 27 degrees, and it us
Processed: good ac new cooling excellent know use ac efficiently great choice set degree use rate load hour night hope work like use extra voltage stabiliser bui


In [6]:
# 1. Vectorize: Convert text into a matrix of word frequencies
#    TfidfVectorizer finds words that are important (frequent in one doc, rare in others)
vectorizer = TfidfVectorizer(
    max_df=0.95, # Ignore words that appear in > 95% of docs
    min_df=2,    # Ignore words that appear in < 2 docs
    ngram_range=(1,2) # Consider single words (1,1) and two-word phrases (1,2)
)

# "Fit" the vectorizer to our processed text and transform the text into a matrix
tfidf_matrix = vectorizer.fit_transform(df['processed_text'])
print(f"TF-IDF matrix shape: {tfidf_matrix.shape}")


# 2. Run LDA (Latent Dirichlet Allocation)
#    This is the core topic modeling algorithm
num_topics = 5 # Let's look for 5 main topics

lda = LatentDirichletAllocation(
    n_components=num_topics, 
    random_state=42 # For reproducible results
)

# Fit the LDA model to our matrix
lda.fit(tfidf_matrix)
print("LDA model fitting complete.")

TF-IDF matrix shape: (10, 154)
LDA model fitting complete.


In [7]:
# Helper function to print the topics
def print_topics(model, vectorizer, n_top_words):
    words = vectorizer.get_feature_names_out()
    for topic_idx, topic in enumerate(model.components_):
        # Get the top words for this topic
        top_words = [words[i] for i in topic.argsort()[:-n_top_words - 1:-1]]
        print(f"\n--- Topic #{topic_idx} ---")
        print(', '.join(top_words))

# Print the top 10 words for each of our 5 topics
print("Top words for each topic:")
print_topics(model=lda, vectorizer=vectorizer, n_top_words=10)

Top words for each topic:

--- Topic #0 ---
performance, efficient, daikin ac, ac, daikin, product, cool, start, service, good

--- Topic #1 ---
like, work, excellent, room, good, customer, amazon, thank, swing, brand

--- Topic #2 ---
like, work, excellent, room, good, customer, amazon, thank, swing, brand

--- Topic #3 ---
use, installation, fan, set, ac, good, use extra, voltage stabiliser, use rate, voltage

--- Topic #4 ---
installation, service, daikin, product, support, purchase, charge, job, research, online
