Large Language Models (LLMs) can be extremely useful for analyzing text data, including comments associated with cases, to extract common themes and patterns within a specified timeframe. LLMs provide a deep understanding of language context, sentiment, and thematic elements in text data.

In [None]:
pip install transformers torch



In [None]:
pip install sentence-transformers



The cells above install the Hugging Face Transformers library, PyTorch, and Sentence-Transformers.

In [None]:
import html
import re

def preprocess_text(text):
    """Clean and normalise a single comment before it is embedded.

    Steps, in order:
      1. Non-string input (``None``, or the float NaN pandas uses for a
         missing value) yields an empty string instead of raising.
      2. HTML entities are decoded, so ``&#39;`` becomes an apostrophe rather
         than leaving a stray ``39`` in the text.
      3. Every character that is neither ASCII alphanumeric nor whitespace is
         removed.
      4. Runs of whitespace (including newlines and tabs) collapse to a single
         space, and leading/trailing whitespace is dropped, so words on
         adjacent lines are not fused together.
      5. The result is lowercased.

    Args:
        text: The raw comment.

    Returns:
        The cleaned, lowercased comment; ``""`` when ``text`` is not a string.
    """
    if not isinstance(text, str):
        return ""
    # Decode entities first so their numeric codes are not kept as "words"
    text = html.unescape(text)
    # Drop punctuation/symbols but keep whitespace as the word separator
    text = re.sub(r'[^A-Za-z0-9\s]+', '', text)
    # Collapse any whitespace run to one space and trim the ends
    text = ' '.join(text.split())
    # Lowercase the text
    return text.lower()

In [None]:
from sentence_transformers import SentenceTransformer
from sklearn.cluster import KMeans
# Define a function for clustering comments
def cluster_comments(comments, num_clusters=5, random_state=42):
    """Group comments into themes by clustering their sentence embeddings.

    Args:
        comments: Sized collection of comment strings.
        num_clusters: Number of KMeans clusters to fit.
        random_state: Seed handed to KMeans so repeated runs produce the same
            cluster labels. Pass ``None`` for unseeded behaviour.

    Returns:
        The fitted KMeans model's ``labels_``: one cluster index per comment,
        in input order.

    Raises:
        ValueError: If ``comments`` is empty, ``num_clusters`` is below 1, or
            there are fewer comments than clusters. Checked up front so the
            embedding model is not loaded for input that cannot be clustered.
    """
    n_comments = len(comments)
    if n_comments == 0:
        raise ValueError("cluster_comments() needs at least one comment")
    if num_clusters < 1:
        raise ValueError(f"num_clusters must be >= 1, got {num_clusters}")
    if num_clusters > n_comments:
        raise ValueError(
            f"num_clusters={num_clusters} exceeds the number of comments ({n_comments})"
        )

    # Encode the comments into numerical vector representations
    model = SentenceTransformer('all-MiniLM-L6-v2')
    embeddings = model.encode(comments)

    # n_init is spelled out because scikit-learn's default for it differs
    # between releases; random_state makes the labels reproducible
    clustering_model = KMeans(
        n_clusters=num_clusters, n_init=10, random_state=random_state
    )

    # Fitting the KMeans model to comment embeddings
    clustering_model.fit(embeddings)
    return clustering_model.labels_


In [None]:
from transformers import pipeline

# Defining a function for sentiment analysis
def analyze_sentiment(
    comments,
    model="distilbert/distilbert-base-uncased-finetuned-sst-2-english",
    revision="af0f99b",
):
    """Classify the sentiment of each comment with a Transformers pipeline.

    Args:
        comments: A list of comment strings (a single string is passed through
            to the pipeline unchanged).
        model: Hub id of the classification checkpoint. The default is the
            checkpoint this notebook's earlier runs reported falling back to.
        revision: Checkpoint revision to load, pinned for reproducibility.

    Returns:
        Whatever the pipeline returns for ``comments``; for a list this
        notebook's runs show one ``{'label': ..., 'score': ...}`` dict per
        comment. An empty list/tuple returns ``[]`` without loading the model.
    """
    if isinstance(comments, (list, tuple)) and not comments:
        return []

    # Name the model and revision explicitly: the library warns that relying
    # on the implicit default is not recommended
    sentiment_pipeline = pipeline("sentiment-analysis", model=model, revision=revision)

    # truncation=True clips over-long reviews to the model's input limit
    # instead of failing on them
    return sentiment_pipeline(comments, truncation=True)

In [None]:
def summarize_comments(comments, model="sshleifer/distilbart-cnn-12-6"):
    """Generate one summary string per comment.

    Args:
        comments: Iterable of comment strings.
        model: Hub id of the summarization checkpoint.
            NOTE(review): the default is believed to be the checkpoint the
            library falls back to when none is given; confirm against the
            installed transformers version.

    Returns:
        A list with the ``'summary_text'`` of each comment, in input order.
        Empty input returns ``[]`` without loading the model.
    """
    comments = list(comments)
    if not comments:
        return []

    summarizer = pipeline("summarization", model=model)

    # Generate summaries for each comment in the input list; truncation=True
    # clips inputs that exceed the model's maximum length
    summaries = [
        summarizer(comment, truncation=True)[0]['summary_text']
        for comment in comments
    ]
    return summaries

I took two comments as examples, ran the clustering and sentiment analysis on them, and printed the resulting sentiment scores.

In [None]:
# Two hand-written comments used to try out the helper functions
comments_list = [
    "This is a great product. I had a wonderful experience.",
    "Terrible service, will not be coming back!",
    # Add more comments as needed
]

# Clean each comment before it is embedded
preprocessed_comments = list(map(preprocess_text, comments_list))

# Group the cleaned comments into themes
num_clusters = 2  # Adjust based on your data
cluster_assignments = cluster_comments(preprocessed_comments, num_clusters=num_clusters)
print("Cluster Assignments:", cluster_assignments)

# Score sentiment on the raw (uncleaned) comments
sentiments = analyze_sentiment(comments_list)
print("Sentiments:", sentiments)



No model was supplied, defaulted to distilbert/distilbert-base-uncased-finetuned-sst-2-english and revision af0f99b (https://huggingface.co/distilbert/distilbert-base-uncased-finetuned-sst-2-english).
Using a pipeline without specifying a model name and revision in production is not recommended.


Cluster Assignments: [0 1]
Sentiments: [{'label': 'POSITIVE', 'score': 0.9998854398727417}, {'label': 'NEGATIVE', 'score': 0.9991433620452881}]


Next, I took a dataset of food reviews and ran a few cases through the same steps to check the scores. Only the comment text from the dataset is used.

In [None]:
# Colab-only helpers for Drive access and browser file uploads
from google.colab import drive, files

# Mount Google Drive
drive.mount('/content/drive')

# Upload the CSV file: running this cell opens a file picker;
# choose the CSV from your local system
uploaded = files.upload()


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


Saving tastereview.csv to tastereview (2).csv


In [None]:
import io

import pandas as pd

# Colab renames a re-uploaded file (the upload cell reported saving
# "tastereview (2).csv"), so a fixed file name can silently read an older copy.
# Prefer the bytes returned by the upload cell when they are available, and
# fall back to the fixed path otherwise.
CSV_PATH = 'tastereview.csv'
uploaded_files = globals().get('uploaded') or {}

if uploaded_files:
    # assumes files.upload() returned {file name: file bytes} — TODO confirm
    uploaded_name = next(iter(uploaded_files))
    df = pd.read_csv(io.BytesIO(uploaded_files[uploaded_name]))
else:
    # Read the CSV file into a DataFrame
    df = pd.read_csv(CSV_PATH)

# Last expression of the cell: rendered as a table instead of wrapped text
df.head()


   Unnamed: 0  recipe_number  recipe_code         recipe_name  \
0           0              1        14299  Creamy White Chili   
1           1              1        14299  Creamy White Chili   
2           2              1        14299  Creamy White Chili   
3           3              1        14299  Creamy White Chili   
4           4              1        14299  Creamy White Chili   

                                        comment_id         user_id  \
0  sp_aUSaElGf_14299_c_2G3aneMRgRMZwXqIHmSdXSG1hEM  u_9iFLIhMa8QaG   
1  sp_aUSaElGf_14299_c_2FsPC83HtzCsQAtOxlbL6RcaPbY  u_Lu6p25tmE77j   
2  sp_aUSaElGf_14299_c_2FPrSGyTv7PQkZq37j92r9mYGkP  u_s0LwgpZ8Jsqq   
3  sp_aUSaElGf_14299_c_2DzdSIgV9qNiuBaLoZ7JQaartoC  u_fqrybAdYjgjG   
4  sp_aUSaElGf_14299_c_2DtZJuRQYeTFwXBoZRfRhBPEXjI  u_XXWKwVhKZD69   

    user_name  user_reputation  created_at  reply_count  thumbs_up  \
0     Jeri326                1  1665619889            0          0   
1     Mark467               50  1665277687      

In [None]:
# Keep only real text. A missing review loads as NaN (a float), which the
# regex-based preprocessing and the sentiment pipeline cannot handle.
# NOTE: dropping rows means positions no longer line up one-to-one with `df`.
comments_list = df['text'].dropna().astype(str).tolist()

In [None]:
# Run every dataset comment through the text-cleaning helper
preprocessed_comments = list(map(preprocess_text, comments_list))

In [None]:
# Group the cleaned dataset comments into themes
num_clusters = 2  # Adjust based on your data
cluster_assignments = cluster_comments(preprocessed_comments, num_clusters)
print("Cluster Assignments:", cluster_assignments)

Cluster Assignments: [0 1 0 1 0 1 1 1 0 1 0 0 1 0 0 1 0]




In [None]:
# Analyze sentiment of the raw (uncleaned) dataset comments and print the
# label/score result returned for each one
sentiments = analyze_sentiment(comments_list)
print("Sentiments:", sentiments)


No model was supplied, defaulted to distilbert/distilbert-base-uncased-finetuned-sst-2-english and revision af0f99b (https://huggingface.co/distilbert/distilbert-base-uncased-finetuned-sst-2-english).
Using a pipeline without specifying a model name and revision in production is not recommended.


Sentiments: [{'label': 'POSITIVE', 'score': 0.9997697472572327}, {'label': 'NEGATIVE', 'score': 0.9997864365577698}, {'label': 'POSITIVE', 'score': 0.9967472553253174}, {'label': 'NEGATIVE', 'score': 0.9997155070304871}, {'label': 'POSITIVE', 'score': 0.9997637867927551}, {'label': 'POSITIVE', 'score': 0.9998810291290283}, {'label': 'POSITIVE', 'score': 0.999842643737793}, {'label': 'POSITIVE', 'score': 0.9991225600242615}, {'label': 'POSITIVE', 'score': 0.9981573224067688}, {'label': 'POSITIVE', 'score': 0.9991816878318787}, {'label': 'POSITIVE', 'score': 0.9995021820068359}, {'label': 'POSITIVE', 'score': 0.9876536726951599}, {'label': 'POSITIVE', 'score': 0.9992671608924866}, {'label': 'POSITIVE', 'score': 0.999502420425415}, {'label': 'POSITIVE', 'score': 0.9864811897277832}, {'label': 'POSITIVE', 'score': 0.9998772144317627}, {'label': 'POSITIVE', 'score': 0.9998201727867126}]


# New Section

Using the same dataset, I tried to extract a few keywords.

In [None]:
import numpy as np
import pandas as pd
# Preview the first rows of the reviews DataFrame loaded earlier
df.head()

Unnamed: 0.1,Unnamed: 0,recipe_number,recipe_code,recipe_name,comment_id,user_id,user_name,user_reputation,created_at,reply_count,thumbs_up,thumbs_down,stars,best_score,text
0,0,1,14299,Creamy White Chili,sp_aUSaElGf_14299_c_2G3aneMRgRMZwXqIHmSdXSG1hEM,u_9iFLIhMa8QaG,Jeri326,1,1665619889,0,0,0,5,527,"I tweaked it a little, removed onions because ..."
1,1,1,14299,Creamy White Chili,sp_aUSaElGf_14299_c_2FsPC83HtzCsQAtOxlbL6RcaPbY,u_Lu6p25tmE77j,Mark467,50,1665277687,0,7,0,5,724,Not very good .It does not taste good at all.
2,2,1,14299,Creamy White Chili,sp_aUSaElGf_14299_c_2FPrSGyTv7PQkZq37j92r9mYGkP,u_s0LwgpZ8Jsqq,Barbara566,10,1664404557,0,3,0,5,710,I have a very complicated white chicken chili ...
3,3,1,14299,Creamy White Chili,sp_aUSaElGf_14299_c_2DzdSIgV9qNiuBaLoZ7JQaartoC,u_fqrybAdYjgjG,jeansch123,1,1661787808,2,2,0,0,581,"very Bad taste , I did not like it at all"
4,4,1,14299,Creamy White Chili,sp_aUSaElGf_14299_c_2DtZJuRQYeTFwXBoZRfRhBPEXjI,u_XXWKwVhKZD69,camper77,10,1664913823,1,7,0,0,820,Wonderful! I made this for a &#34;Chili/Stew&#...


In [None]:
# Build a frame holding only the review text column, then display it
texts = pd.DataFrame(data=df, columns=['text'])
texts

Unnamed: 0,text
0,"I tweaked it a little, removed onions because ..."
1,Not very good .It does not taste good at all.
2,I have a very complicated white chicken chili ...
3,"very Bad taste , I did not like it at all"
4,Wonderful! I made this for a &#34;Chili/Stew&#...
5,amazing! my boyfriend loved it so much! going ...
6,Wow!!! This recipe is excellent as written!! ...
7,This is delicious and I make it often. One suc...
8,I absolutely love this recipe. I&#39;ve tweake...
9,I make this a lot … my kids and there friends ...


In [None]:
import pandas as pd
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation

In [None]:
import nltk
# Fetch the NLTK stop-word corpus, read later via stopwords.words('english')
nltk.download('stopwords')



[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

Removing stop words is a common preprocessing step in text analysis; it streamlines the analysis and improves the quality of the results.

In [None]:
import html

import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer

# Download NLTK resources
nltk.download('punkt')
# NOTE(review): recent NLTK releases load the tokenizer data from 'punkt_tab'
# rather than 'punkt'; fetching both keeps word_tokenize working either way
nltk.download('punkt_tab')

# Stemming is the process of reducing words to their root or base form, which
# helps in reducing variations in word forms
stemmer = PorterStemmer()
stop_words = set(stopwords.words('english'))


def tokenize_comment(text):
    """Tokenize one comment, drop stop words, and stem what remains.

    HTML entities are decoded before tokenizing. Without that step an entity
    such as ``&#39;`` splits into pieces and its numeric code ("39") survives
    the alphanumeric filter and ends up as a topic keyword.

    Args:
        text: The raw comment; any non-string value (e.g. NaN) is treated as
            missing.

    Returns:
        The stemmed, lowercased tokens joined by single spaces, or ``""`` when
        ``text`` is not a string.
    """
    if not isinstance(text, str):
        return ""
    tokens = word_tokenize(html.unescape(text))
    kept = [
        stemmer.stem(word.lower())
        for word in tokens
        # keep purely alphanumeric tokens that are not stop words
        if word.isalnum() and word.lower() not in stop_words
    ]
    return " ".join(kept)


# Tokenize, remove stopwords, and stem the comments
tokenized_comments = [tokenize_comment(text) for text in df['text']]

# Add the tokenized comments back to the DataFrame
df['tokenized_comments'] = tokenized_comments


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [None]:
# Bag-of-words step: scikit-learn's CountVectorizer learns the vocabulary of
# the tokenized comments, then turns each comment into a row of token counts
vectorizer = CountVectorizer()
vectorizer.fit(tokenized_comments)
X = vectorizer.transform(tokenized_comments)

Latent Dirichlet Allocation (LDA) is used for unsupervised topic modeling, extracting hidden topics from a collection of documents without prior labeling, aiding in document understanding and organization.

In [None]:
# Topic modeling: fit a 2-component Latent Dirichlet Allocation model on the
# token-count matrix, with a fixed random_state
lda = LatentDirichletAllocation(n_components=2, random_state=42).fit(X)
lda

In [None]:
# Start from an empty list on every run. The list was previously never created
# here, so a fresh kernel raised NameError, and re-running the cell kept
# appending to the old list and produced duplicated topics.
topic_keywords = []

# Look the vocabulary up once instead of once per word
feature_names = vectorizer.get_feature_names_out()

for topic in lda.components_:
    top_words_idx = topic.argsort()[-5:][::-1]  # Get indices of top 5 words
    topic_keywords.append([feature_names[i] for i in top_words_idx])


In [None]:
# Printing the keywords for each topic; topics are numbered from 1 for display
for i, keywords in enumerate(topic_keywords):
    print(f"Topic {i + 1} Keywords:", keywords)

Topic 1 Keywords: ['recip', '39', 'make', 'made', 'use']
Topic 2 Keywords: ['chili', 'recip', 'made', 'make', 'delici']
Topic 3 Keywords: ['recip', '39', 'make', 'made', 'use']
Topic 4 Keywords: ['chili', 'recip', 'made', 'make', 'delici']
Topic 5 Keywords: ['recip', '39', 'make', 'made', 'use']
Topic 6 Keywords: ['chili', 'recip', 'made', 'make', 'delici']
