In [10]:
# Import all necessary libraries
import pandas as pd
import nltk
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from sklearn.metrics import confusion_matrix, classification_report

# Download NLTK data (run this cell once)
# Note: 'all' is very large; typically only 'punkt', 'stopwords', 'wordnet', 'vader_lexicon' are needed.
print("Downloading NLTK data...")
nltk.download('all')
print("NLTK data download complete.")

Downloading NLTK data...


[nltk_data] Downloading collection 'all'
[nltk_data]    | 
[nltk_data]    | Downloading package abc to /root/nltk_data...
[nltk_data]    |   Package abc is already up-to-date!
[nltk_data]    | Downloading package alpino to /root/nltk_data...
[nltk_data]    |   Package alpino is already up-to-date!
[nltk_data]    | Downloading package averaged_perceptron_tagger to
[nltk_data]    |     /root/nltk_data...
[nltk_data]    |   Package averaged_perceptron_tagger is already up-
[nltk_data]    |       to-date!
[nltk_data]    | Downloading package averaged_perceptron_tagger_eng to
[nltk_data]    |     /root/nltk_data...
[nltk_data]    |   Package averaged_perceptron_tagger_eng is already
[nltk_data]    |       up-to-date!
[nltk_data]    | Downloading package averaged_perceptron_tagger_ru to
[nltk_data]    |     /root/nltk_data...
[nltk_data]    |   Package averaged_perceptron_tagger_ru is already
[nltk_data]    |       up-to-date!
[nltk_data]    | Downloading package averaged_perceptron_tagger_r

NLTK data download complete.


[nltk_data]    |   Package word2vec_sample is already up-to-date!
[nltk_data]    | Downloading package wordnet to /root/nltk_data...
[nltk_data]    |   Package wordnet is already up-to-date!
[nltk_data]    | Downloading package wordnet2021 to /root/nltk_data...
[nltk_data]    |   Package wordnet2021 is already up-to-date!
[nltk_data]    | Downloading package wordnet2022 to /root/nltk_data...
[nltk_data]    |   Package wordnet2022 is already up-to-date!
[nltk_data]    | Downloading package wordnet31 to /root/nltk_data...
[nltk_data]    |   Package wordnet31 is already up-to-date!
[nltk_data]    | Downloading package wordnet_ic to /root/nltk_data...
[nltk_data]    |   Package wordnet_ic is already up-to-date!
[nltk_data]    | Downloading package words to /root/nltk_data...
[nltk_data]    |   Package words is already up-to-date!
[nltk_data]    | Downloading package ycoe to /root/nltk_data...
[nltk_data]    |   Package ycoe is already up-to-date!
[nltk_data]    | 
[nltk_data]  Done downloa

In [11]:
# Load the dataset
df = pd.read_excel('seeds_labeled.xlsx')
print("Initial DataFrame Head:")
print(df.head())

# Handle missing 'Comment' values and assign 'Label' 0
# Use reliable non-inplace assignment to avoid FutureWarning
df['Comment'] = df['Comment'].fillna('good')
df.loc[df['Comment'] == 'good', 'Label'] = 0

print("\nDataFrame size after loading and cleaning:")
print(df.shape)
print("\nDataFrame Head after filling NaNs:")
df.head()

Initial DataFrame Head:
              Customer Name  Gender  \
0       Natraj B, Bangalore       1   
1  Sandhya Rani, Bangalore        0   
2        Shovan Chakraborty       1   
3               Placeholder      -1   
4              Vinit Ranjan       1   

                                             Comment    Website  \
0                                       good quality  Bigbasket   
1  become black , bad smell , bad quality , old s...  Bigbasket   
2                                               good     Amazon   
3                                               nice     Amazon   
4  good product , nicely packed - even check auth...   Flipkart   

         Seed Type       Date  Label  
0    Pumpkin Seeds 2024-02-12      1  
1  Sunflower Seeds 2024-06-11     -1  
2       Flax Seeds 2023-08-10      1  
3    Seasame Seeds 2024-06-30      1  
4    Pumpkin Seeds 2024-01-29      1  

DataFrame size after loading and cleaning:
(2981, 7)

DataFrame Head after filling NaNs:


Unnamed: 0,Customer Name,Gender,Comment,Website,Seed Type,Date,Label
0,"Natraj B, Bangalore",1,good quality,Bigbasket,Pumpkin Seeds,2024-02-12,1
1,"Sandhya Rani, Bangalore",0,"become black , bad smell , bad quality , old s...",Bigbasket,Sunflower Seeds,2024-06-11,-1
2,Shovan Chakraborty,1,good,Amazon,Flax Seeds,2023-08-10,0
3,Placeholder,-1,nice,Amazon,Seasame Seeds,2024-06-30,1
4,Vinit Ranjan,1,"good product , nicely packed - even check auth...",Flipkart,Pumpkin Seeds,2024-01-29,1


In [12]:
# Initialize Lemmatizer
lemmatizer = WordNetLemmatizer()

# Define preprocess_text function
def preprocess_text(text):
    # Ensure text is a string
    if not isinstance(text, str):
        return ""

    # Tokenize the text and convert to lowercase
    tokens = word_tokenize(text.lower())

    # Remove stop words and punctuation
    stop_words_list = stopwords.words('english')
    filtered_tokens = [
        token for token in tokens
        if token.isalnum() and token not in stop_words_list
    ]

    # Lemmatize the tokens
    lemmatized_tokens = [lemmatizer.lemmatize(token) for token in filtered_tokens]

    # Join the tokens back into a string
    processed_text = ' '.join(lemmatized_tokens)

    return processed_text

# Apply the preprocessing function to the 'Comment' column
print("Applying text preprocessing to comments...")
df['Comment_Processed'] = df['Comment'].apply(preprocess_text)

print("\nSample of Processed Data:")
print(df[['Comment', 'Comment_Processed']].head())

Applying text preprocessing to comments...

Sample of Processed Data:
                                             Comment  \
0                                       good quality   
1  become black , bad smell , bad quality , old s...   
2                                               good   
3                                               nice   
4  good product , nicely packed - even check auth...   

                                   Comment_Processed  
0                                       good quality  
1       become black bad smell bad quality old stock  
2                                               good  
3                                               nice  
4  good product nicely packed even check authenti...  


In [13]:
# Initialize VADER Sentiment Analyzer
analyzer = SentimentIntensityAnalyzer()

# Define get_sentiment function
def get_sentiment(text):
    scores = analyzer.polarity_scores(text)
    # VADER uses a compound score for overall sentiment
    if scores['compound'] > 0.05:
        return 1  # Positive
    elif scores['compound'] < -0.05:
        return -1 # Negative
    else:
        return 0  # Neutral

# Apply the function to the processed comments
print("Applying VADER sentiment analysis...")
df['Sentiment_VADER'] = df['Comment_Processed'].apply(get_sentiment)

print("\nDataFrame Head with VADER Sentiment:")
print(df.head(15)[['Comment_Processed', 'Label', 'Sentiment_VADER']])

Applying VADER sentiment analysis...

DataFrame Head with VADER Sentiment:
                                    Comment_Processed  Label  Sentiment_VADER
0                                        good quality      1                1
1        become black bad smell bad quality old stock     -1               -1
2                                                good      0                1
3                                                nice      1                1
4   good product nicely packed even check authenti...      1                1
5                                         bad product     -1               -1
6                                      superfood chia      1                0
7                      lesser quantity mentio ed sent      0                0
8                            nice healthy used remedy      1                1
9                                       super quality      1                1
10                                                job      0       

In [14]:
from google.colab import files # Ensure this is imported if not already in Cell 1

# Ensure the 'Label' column is treated as integers for comparison
df['Label'] = df['Label'].astype(int)

# --- Confusion Matrix ---
print("--- Confusion Matrix (Label vs. VADER Sentiment) ---")
# Rows are True Labels (Label), Columns are Predicted Labels (Sentiment_VADER)
cm = confusion_matrix(df['Label'], df['Sentiment_VADER'])
print(cm)
print(f"\nTotal samples: {len(df)}")
print(f"Accuracy: {cm.trace() / len(df):.4f}")

# --- Classification Report ---
print("\n--- Classification Report ---")
# Note: The target names correspond to the labels -1 (Negative), 0 (Neutral), 1 (Positive)
print(classification_report(df['Label'], df['Sentiment_VADER']))

# --- Rename Columns for Uniformity ---
print("\nRenaming columns for uniformity...")
df = df.rename(columns={
    'Comment_Processed': 'Comment', # Revert processed comment column name
    'Sentiment_VADER': 'Sentiment' # Use 'Sentiment' for the VADER result
})

# Drop the original, unprocessed 'Comment' column if it still exists (optional cleanup)
if 'Comment' in df.columns and 'Comment_Processed' in df.columns:
    # This prevents dropping the wrong column if the rename failed, but in this structure, it's safer to keep the new names.
    # Since we renamed 'Comment_Processed' to 'Comment', the original 'Comment' column might be the unprocessed one.
    # Let's assume you want to keep the PROCESSED comment under the 'Comment' name.
    # If the original 'Comment' column was never overwritten:
    if 'Comment_Original' not in df.columns:
        df = df.drop(columns=['Comment']).rename(columns={'Comment_Processed': 'Comment'})
    else:
        # If Comment_Processed was renamed to Comment, the below keeps only essential columns.
        pass

# Final DataFrame structure check
print("\nFinal DataFrame Head with renamed columns:")
print(df[['Comment', 'Label', 'Sentiment']].head())


# --- Save and Download Processed File ---
output_filename = 'seeds_sentement.xlsx'

# Save the DataFrame to an Excel file
df.to_excel(output_filename, index=False)

# Download the file in the Colab environment
print(f"\nSuccessfully saved processed data to {output_filename}")
files.download(output_filename)

--- Confusion Matrix (Label vs. VADER Sentiment) ---
[[ 251   65  167]
 [  27  221  501]
 [   9   54 1686]]

Total samples: 2981
Accuracy: 0.7239

--- Classification Report ---
              precision    recall  f1-score   support

          -1       0.87      0.52      0.65       483
           0       0.65      0.30      0.41       749
           1       0.72      0.96      0.82      1749

    accuracy                           0.72      2981
   macro avg       0.75      0.59      0.63      2981
weighted avg       0.73      0.72      0.69      2981


Renaming columns for uniformity...

Final DataFrame Head with renamed columns:

Successfully saved processed data to seeds_sentement.xlsx


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>