In [1]:
# Import all necessary libraries
import pandas as pd
import plotly.express as px
from google.colab import files # Ensure files is imported
from sklearn.feature_extraction.text import CountVectorizer
from tabulate import tabulate # Used for cleaner output display

In [2]:
# Step 1: Load your data
df = pd.read_excel('seeds_sentement.xlsx')
print("DataFrame Loaded Successfully.")

# Basic EDA: preview the first rows and count missing values per column
print("\nDataset Head:")
print(df.head())
print("\nMissing Values:")
print(df.isnull().sum())

# Add comment length and word count features.
# Both features are derived from one normalised text Series:
#   * calling len() directly on a missing cell (NaN is a float) raises TypeError;
#   * str(NaN) is the text "nan", which would be counted as a one-word comment.
# Missing comments therefore become '' and score 0 for both features, while
# non-string cells (e.g. numbers) are measured on their text representation.
comment_text = df['Comment'].fillna('').astype(str)
df['comment_length'] = comment_text.str.len()
df['word_count'] = comment_text.str.split().str.len()  # split() with no pattern splits on whitespace

print("\nDataFrame Head with new features:")
print(df[['Comment', 'comment_length', 'word_count']].head())

DataFrame Loaded Successfully.

Dataset Head:
              Customer Name  Gender  \
0       Natraj B, Bangalore       1   
1  Sandhya Rani, Bangalore        0   
2        Shovan Chakraborty       1   
3               Placeholder      -1   
4              Vinit Ranjan       1   

                                             Comment    Website  \
0                                       good quality  Bigbasket   
1  become black , bad smell , bad quality , old s...  Bigbasket   
2                                               good     Amazon   
3                                               nice     Amazon   
4  good product , nicely packed - even check auth...   Flipkart   

         Seed Type       Date  Label  \
0    Pumpkin Seeds 2024-02-12      1   
1  Sunflower Seeds 2024-06-11     -1   
2       Flax Seeds 2023-08-10      0   
3    Seasame Seeds 2024-06-30      1   
4    Pumpkin Seeds 2024-01-29      1   

                                           Comment.1  Sentiment  
0        

In [6]:
print("Generating Interactive Plotly Visualizations...")

# Scatter of characters against words per comment, one point per row.
# Useful as a text-integrity check: long comments should also be wordy.
fig_scatter = px.scatter(
    data_frame=df,
    x='comment_length',
    y='word_count',
    color='Label',
    hover_data=['Comment'],
    labels=dict(
        comment_length='Comment Length (Characters)',
        word_count='Word Count',
    ),
    title='Comment Length vs Word Count (by Label)',
)
fig_scatter.show()

# Histogram of comment sizes in characters, split by Label.
fig_length = px.histogram(
    data_frame=df,
    x='comment_length',
    color='Label',
    nbins=30,
    labels=dict(comment_length='Comment Length (Characters)'),
    title='Distribution of Comment Lengths (by Label)',
)
fig_length.show()

# Histogram of comment sizes in words, split by Label.
fig_word_count = px.histogram(
    data_frame=df,
    x='word_count',
    color='Label',
    nbins=30,
    labels=dict(word_count='Word Count'),
    title='Distribution of Word Counts (by Label)',
)
fig_word_count.show()

Generating Interactive Plotly Visualizations...


In [9]:
def get_ngrams(comments, n=2, top_n=20):
    """
    Rank the most frequent n-grams in a collection of comments.

    The comments are tokenised with a CountVectorizer restricted to
    sequences of exactly ``n`` words, with English stop words removed.

    Parameters:
        comments: iterable of text documents to analyse.
        n: number of words per n-gram (2 = bigrams, 3 = trigrams, ...).
        top_n: how many of the most frequent n-grams to keep.

    Returns:
        A DataFrame indexed by n-gram text with a single 'count' column,
        sorted by count in descending order and limited to ``top_n`` rows.
    """
    # ngram_range=(n, n) keeps only n-grams of exactly n words
    vec = CountVectorizer(stop_words='english', ngram_range=(n, n))

    # Document-term matrix: one row per comment, one column per n-gram
    doc_term = vec.fit_transform(comments)

    # Column totals give the corpus-wide frequency of each n-gram;
    # .A1 flattens the 1 x vocabulary matrix into a 1-D array
    totals = doc_term.sum(axis=0).A1

    frequency_table = pd.DataFrame(
        {'count': totals},
        index=vec.get_feature_names_out(),
    )

    ranked = frequency_table.sort_values('count', ascending=False)
    return ranked.head(top_n)

# Prepare comments for N-gram analysis.
# Drop missing values BEFORE casting to str: astype(str) turns NaN into the
# literal text "nan", after which dropna() has nothing left to remove and the
# token "nan" would be counted as if it were review text.
comments = df['Comment'].dropna().astype(str)

# --- 4a. Top 20 Bigrams ---
bigrams = get_ngrams(comments, n=2, top_n=20)
print("\nTop 20 Bigrams:")
print(tabulate(bigrams, headers=bigrams.columns, tablefmt='pipe'))

# Bar plot for bigrams.
# The n-grams live in the frame's (unnamed) index; expose them as a named
# column so the axis label is keyed on an explicit column name rather than
# on whatever name the plotting library infers for the index.
fig_bigrams = px.bar(
    bigrams.rename_axis('ngram').reset_index(),
    x='ngram',
    y='count',
    title='Top 20 Bigrams',
    labels={'ngram': 'Bigrams', 'count': 'Count'}
)
fig_bigrams.show()

# --- 4b. Top 20 Trigrams ---
trigrams = get_ngrams(comments, n=3, top_n=20)
print("\nTop 20 Trigrams:")
print(tabulate(trigrams, headers=trigrams.columns, tablefmt='pipe'))

# Bar plot for trigrams (same named-column approach as the bigram plot)
fig_trigrams = px.bar(
    trigrams.rename_axis('ngram').reset_index(),
    x='ngram',
    y='count',
    title='Top 20 Trigrams',
    labels={'ngram': 'Trigrams', 'count': 'Count'}
)
fig_trigrams.show()


Top 20 Bigrams:
|                 |   count |
|:----------------|--------:|
| good quality    |     289 |
| good product    |     229 |
| chia seed       |     127 |
| quality good    |      83 |
| nice product    |      80 |
| flax seed       |      75 |
| quality product |      73 |
| good taste      |      62 |
| seed good       |      61 |
| value money     |      59 |
| product good    |      57 |
| quality seed    |      54 |
| taste good      |      46 |
| pumpkin seed    |      43 |
| bad quality     |      41 |
| good health     |      40 |
| sunflower seed  |      38 |
| really good     |      35 |
| poor quality    |      33 |
| seed fresh      |      31 |



Top 20 Trigrams:
|                      |   count |
|:---------------------|--------:|
| good quality product |      35 |
| good quality seed    |      26 |
| seed good quality    |      20 |
| organic flax seed    |      12 |
| quality chia seed    |      12 |
| good quality taste   |      11 |
| sorich organic flax  |      10 |
| element chia seed    |      10 |
| true element chia    |      10 |
| quality flax seed    |       9 |
| product good quality |       9 |
| omega fatty acid     |       8 |
| quality seed good    |       8 |
| good quality good    |       8 |
| chia seed good       |       8 |
| seed quality good    |       7 |
| good product nice    |       7 |
| product quality good |       7 |
| flax seed good       |       7 |
| good value money     |       7 |
