<a href="https://colab.research.google.com/github/Stephanie9606/zeno_dashboard_nlp/blob/main/zeno_khc_twitter_ngram.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **KHC Dashboard Pulsar Data Testing File - Sentiment + Keywords**

In [None]:
import pandas as pd
import numpy as np

In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


# Data Cleaning

--- KHC H1 Data Jan - Jun 2023 ---

In [None]:
khc_df = pd.read_excel('/content/drive/MyDrive/Zeno/khc_H1_twitter_nodirt.xlsx')

In [None]:
# Delete web links from the tweet text.
# A regex is used instead of splitting on 'https' so that any text that follows
# a link is preserved and plain 'http://' links are removed as well.
khc_df['content'] = khc_df['content'].str.replace(r'https?://\S+', '', regex=True).str.strip()

In [None]:
# check num of rows
khc_df.shape[0]

4185

In [None]:
# Work on an independent copy so the cleaning steps that follow do not also
# modify khc_df (plain assignment would only create a second name for the same frame).
khc_df_tw = khc_df.copy()

--- Remove @mentions (keeping KHC-related handles) and drop emoji-only content ---

In [None]:
import re
# tagged usernames to keep
allowed_names = ['@MiracleWhip', '@KraftHeinzCo', '@CheesyVelveeta'] # self define

# Matches any @mention that is not exactly one of the allowed handles.
# - re.escape keeps the handles literal.
# - the inner (?!\w) makes the comparison whole-handle, so '@MiracleWhipFan'
#   is not mistaken for '@MiracleWhip'.
# - IGNORECASE because Twitter handles are not case-sensitive.
# Compiled once here instead of being rebuilt for every row.
_MENTION_PATTERN = re.compile(
    r"@(?!(?:" + "|".join(re.escape(name.lstrip("@")) for name in allowed_names) + r")(?!\w))\w+",
    flags=re.IGNORECASE,
)

# function to clean the content
def clean_content(row):
    """Remove tagged usernames from ``row["content"]`` except the allowed handles.

    Parameters
    ----------
    row : mapping
        Anything indexable by ``"content"`` (a DataFrame row when used with
        ``DataFrame.apply(..., axis=1)``).

    Returns
    -------
    str
        The content with every non-allowed ``@mention`` replaced by an empty
        string. Non-string content (e.g. NaN from a blank cell) yields ``""``
        instead of raising inside ``re``.
    """
    content = row["content"]
    if not isinstance(content, str):
        return ""
    return _MENTION_PATTERN.sub("", content)

# apply to df 'content' column
khc_df_tw["content"] = khc_df_tw.apply(clean_content, axis=1)

In [None]:
# rm empty content
khc_df_tw = khc_df_tw[khc_df_tw["content"] != ""]

In [None]:
#!pip3 install emoji==1.7.0

In [None]:
# rm content contains only emoji
import emoji
def contains_only_emojis(text):
    """Return True when ``text`` is made up solely of emoji and whitespace.

    Each character is looked up in ``emoji.UNICODE_EMOJI['en']``. The zero-width
    joiner and the emoji variation selector are accepted too: they are not
    emoji on their own but appear inside multi-code-point emoji, so a
    character-by-character check would otherwise reject such content.

    An empty string returns True (nothing in it is a non-emoji character).
    Non-string values (e.g. NaN) return False.
    """
    if not isinstance(text, str):
        return False
    emoji_table = emoji.UNICODE_EMOJI['en']
    # U+200D ZERO WIDTH JOINER, U+FE0F VARIATION SELECTOR-16
    emoji_glue = ('\u200d', '\ufe0f')
    return all(
        char in emoji_table or char in emoji_glue or char.isspace()
        for char in text
    )

# Filter and delete rows where the 'content' column contains only emojis
khc_df_tw = khc_df_tw[~khc_df_tw['content'].apply(contains_only_emojis)]

In [None]:
# rm rows with duplicated contents
# NOTE(review): keep=False marks every row whose content occurs more than once,
# so all copies are removed, including the first occurrence. Use keep='first'
# if one copy of each duplicated tweet should be retained -- confirm intent.
khc_df_tw = khc_df_tw[~khc_df_tw.duplicated(subset='content', keep=False)]

In [None]:
khc_df_tw.shape[0]

2912

In [None]:
# run pulsar query again, extract relevant content (security -> food security, too much noise)
keywords = ["Kraft Heinz", "KHC", "Kraft Mac and Cheese", "Kraft Singles", "Miracle Whip", "Heinz", "Heinz Ketchup", "Lunchables", "Velveeta"]
topics = ["supply chain", "supply", "food security", "supplier", "recycled", "recycle", "recycling", "single-use", "plastic", "package", "packaging", "waste", "environment", "nature", "product", "quality", "planet", "crisis", "challenge", "problem", "criticism", "reputation", "image"]

# Escape every term so it is matched literally even if a future entry contains
# regex metacharacters. A row is kept only when it mentions at least one brand
# keyword AND at least one topic (case-insensitive substring match).
keyword_pattern = '|'.join(re.escape(term) for term in keywords)
topic_pattern = '|'.join(re.escape(term) for term in topics)

# na=False treats missing content as "no match" so the mask stays purely boolean.
mask = (
    khc_df_tw['content'].str.contains(keyword_pattern, case=False, na=False)
    & khc_df_tw['content'].str.contains(topic_pattern, case=False, na=False)
)

khc_df_tw = khc_df_tw[mask]

In [None]:
khc_df_tw.head()

In [None]:
# save to new excel file
khc_df_tw.to_excel('/content/drive/MyDrive/Zeno/khc_H1_twitter_clean_nodirt.xlsx', index=False)

# Directly read cleaned file

In [None]:
khc_df_tw = pd.read_excel('/content/drive/MyDrive/Zeno/khc_H1_twitter_clean_nodirt.xlsx')

# Keywords: KeyBert

--- 2912 rows take approximately 18 mins ---

In [None]:
#!pip3 install keybert
from keybert import KeyBERT

In [None]:
# enhance performance instead of n-gram
#!pip3 install keyphrase-vectorizers
#from keyphrase_vectorizers import KeyphraseCountVectorizer

In [None]:
# init default vectorizer
#vectorizer = KeyphraseCountVectorizer()
#print(vectorizer.get_params()) # print parameters

# specify model
kw_model = KeyBERT('all-MiniLM-L12-v2')
# specify number of keywords to extract
n_keywords = 10
ngram = 3  # specify ngram of keywords

In [None]:
# function to extract keywords for a batch of rows
def get_keywords_batch(rows):
    """Extract keyphrases for every row of ``rows`` with the shared KeyBERT model.

    Parameters
    ----------
    rows : pandas.DataFrame
        A slice of the data; only its ``'content'`` column is read.

    Returns
    -------
    list[list[str]]
        One list of keyphrases per input row, in row order (scores are dropped).

    Notes
    -----
    Reads the module-level ``kw_model``, ``n_keywords`` and ``ngram``.
    NOTE(review): per the KeyBERT documentation ``diversity`` only takes effect
    together with ``use_mmr=True``; as called here it is presumably ignored --
    confirm whether MMR was intended.
    """
    contents = rows['content'].tolist()
    if not contents:
        return []

    keywords_batch = kw_model.extract_keywords(contents, stop_words='english', highlight=False, top_n=n_keywords, keyphrase_ngram_range=(1, ngram), diversity=0.2)

    # For a single document KeyBERT returns that document's (keyword, score)
    # list directly rather than a list of lists; re-wrap it so a batch of one
    # row still produces exactly one result entry.
    if len(contents) == 1 and (not keywords_batch or isinstance(keywords_batch[0], tuple)):
        keywords_batch = [keywords_batch]

    return [[keyword[0] for keyword in keywords] for keywords in keywords_batch]

In [None]:
# split the data into batches (higher efficiency)
batch_size = 100
batch_starts = range(0, len(khc_df_tw), batch_size)
# each batch is a positional row slice of at most batch_size rows
batches = [khc_df_tw.iloc[start:start + batch_size] for start in batch_starts]

In [None]:
# extract keywords batch by batch and flatten into one list with an entry per row
keyword_lists = [
    row_keywords
    for batch in batches
    for row_keywords in get_keywords_batch(batch)
]

In [None]:
# add the keyword lists as a new column to the DataFrame
khc_df_tw['keywords'] = keyword_lists

In [None]:
# format to better fit in Excel: turn each keyword list into one comma-separated string
khc_df_tw['keywords'] = khc_df_tw['keywords'].apply(', '.join)

# Sentiment: Flair

--- Flair pre-trained model (non-labeled data) ---

--- 2912 rows take approximately 4 mins ---

In [None]:
#!pip3 install flair
from flair.models import TextClassifier
from flair.data import Sentence
import multiprocessing

In [None]:
# load the classifier outside the function to avoid loading it multiple times
classifier = TextClassifier.load('en-sentiment')

In [None]:
# mapping from flair sentiment labels to preferred sentiment labels
flair_to_preferred_mapping = {
    'POSITIVE': 'positive',
    'NEGATIVE': 'negative',
    'NEUTRAL': 'neutral'
}

In [None]:
# function to predict sentiment and score for a list of texts
def predict_sentiments(texts):
    """Classify each text with the shared Flair classifier.

    Parameters
    ----------
    texts : iterable of str
        Texts to classify.

    Returns
    -------
    list[tuple[str, float]]
        One ``(sentiment, score)`` pair per text, in input order. The sentiment
        is the Flair label translated through ``flair_to_preferred_mapping``
        (labels missing from the mapping become ``'neutral'``); a text that
        receives no label is reported as ``('neutral', 0.0)``.
    """
    sentences = [Sentence(text) for text in texts]
    classifier.predict(sentences)

    results = []
    for sentence in sentences:
        if sentence.labels:
            # only the first label attached to the sentence is used
            raw_sentiment = sentence.labels[0].value
            confidence = sentence.labels[0].score
        else:
            raw_sentiment, confidence = 'NEUTRAL', 0.0
        results.append((flair_to_preferred_mapping.get(raw_sentiment, 'neutral'), confidence))

    return results

In [None]:
# split the texts into chunks for parallel processing
def chunks(lst, n):
    """Yield consecutive slices of ``lst``, each holding at most ``n`` items.

    The final slice is shorter when ``len(lst)`` is not a multiple of ``n``.
    """
    yield from (lst[start:start + n] for start in range(0, len(lst), n))

In [None]:
# number of CPU cores available
num_cores = multiprocessing.cpu_count()

In [None]:
# split the data into chunks for parallel processing
# Ceiling division gives at most num_cores chunks, and max(1, ...) prevents a
# chunk size of 0 (which makes range() raise) when there are fewer rows than cores.
chunk_size = max(1, -(-len(khc_df_tw) // num_cores))
texts_list = list(chunks(khc_df_tw['content'].tolist(), chunk_size))

# predict sentiments in parallel; the context manager shuts the worker
# processes down once the (blocking) map call has returned
with multiprocessing.Pool(processes=num_cores) as pool:
    predicted_sentiments_list = pool.map(predict_sentiments, texts_list)

# concatenate the per-chunk results into a single list of (sentiment, score) pairs
predicted_sentiments = [sentiment for sublist in predicted_sentiments_list for sentiment in sublist]

# assign the predicted sentiments and scores to df
khc_df_tw['flair_sentiment'], khc_df_tw['flair_sentiment_score'] = zip(*predicted_sentiments)

In [None]:
khc_df_tw.head()

# Check sentiment quality

In [None]:
# check unique output
khc_df_tw['flair_sentiment'].unique()

array(['negative', 'positive'], dtype=object)

In [None]:
# check on neutral prediction
khc_df_tw['flair_sentiment'].str.contains('neutral', case=False).any()

False

In [None]:
# Rows with a low-confidence Flair prediction.
# NOTE(review): low_df is not defined in any visible cell, so this preview
# raises NameError on a fresh run. The 0.7 cut-off is assumed from the threshold
# in the commented-out neutral re-labelling cell that follows -- confirm.
low_df = khc_df_tw[khc_df_tw['flair_sentiment_score'] < 0.7]
# select by name instead of position so the view does not depend on column order
low_df[['content', 'flair_sentiment', 'flair_sentiment_score']]

Unnamed: 0,content,flair_sentiment,flair_sentiment_score
77,"Also a word of advice, if you're gonna try l...",positive,0.620976
135,That first pose...🔥😋,negative,0.568740
177,Buffett praising $COST's Kirkland brand. He st...,positive,0.500105
193,It's your problem you took it any other way!...,negative,0.657860
426,See More at: Hersmiles Store Find more designs...,positive,0.657918
...,...,...,...
35526,Sometimes! If it is quality stuff.Not Heinz.,negative,0.527366
36312,This looks like a food security marker.Heinz ...,negative,0.562701
36365,"Freelance creative director, Elliott Starr, wa...",negative,0.637750
36949,For real. You'd have to challenge in court (...,negative,0.585367


In [None]:
# if sentiment score < 0.7 --> set to neutral
#khc_df_tw.loc[khc_df_tw['flair_sentiment_score'] < 0.7, 'flair_sentiment'] = 'neutral'

# Sentiment: TwitterBERT

In [None]:
# NOTE(review): this workbook is written by the export cell at the very end of
# the notebook, so on a fresh top-to-bottom run it must already exist from a
# previous session; the in-memory khc_df_tw is replaced by its contents.
khc_df_tw = pd.read_excel('/content/drive/MyDrive/Zeno/khc_H1_twitter_0808.xlsx')

In [None]:
# First 100 rows as a small test set. An explicit copy is taken so that adding
# columns to it later does not trigger pandas' SettingWithCopyWarning.
khc_twbert_test = khc_df_tw.iloc[:100, :].copy()

In [None]:
from transformers import AutoModelForSequenceClassification, AutoTokenizer, AutoConfig
from scipy.special import softmax

MODEL = "cardiffnlp/twitter-roberta-base-sentiment-latest"
tokenizer = AutoTokenizer.from_pretrained(MODEL)
config = AutoConfig.from_pretrained(MODEL)
model = AutoModelForSequenceClassification.from_pretrained(MODEL)

In [None]:
sentiments = []  # predicted class index for each text, in row order
for text in khc_twbert_test['content']:  # one forward pass per text
    # truncation=True keeps an over-long text within the model's maximum input
    # length instead of failing in the forward pass (same setting as the
    # batched cell further down)
    encoded_input = tokenizer(text, return_tensors='pt', truncation=True)
    output = model(**encoded_input)
    # output[0] holds the logits; [0] selects the single sequence in this call
    scores = output[0][0].detach().numpy()
    scores = softmax(scores)
    predicted_sentiment = int(scores.argmax())  # index of the highest-scoring class
    sentiments.append(predicted_sentiment)

# Add the sentiment analysis results to a new column in the DataFrame
khc_twbert_test['twbert_sentiment'] = sentiments

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  khc_twbert_test['twbert_sentiment'] = sentiments


In [None]:
# class index -> label
sentiment_map = {0: 'negative', 1: 'neutral', 2: 'positive'}

# Series.map performs the dictionary lookup directly. np.vectorize is only a
# Python-level loop and raises on an empty input unless otypes is given.
khc_twbert_test['twbert_sentiment1'] = khc_twbert_test['twbert_sentiment'].map(sentiment_map)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  khc_twbert_test['twbert_sentiment1'] = np.vectorize(sentiment_map.get)(khc_twbert_test['twbert_sentiment'])


In [None]:
# BATCHBATCHBATCH
import tensorflow as tf
from transformers import TFAutoModelForSequenceClassification, AutoTokenizer, AutoConfig
from scipy.special import softmax

# Batched TensorFlow variant of the TwitterBERT sentiment prediction.
MODEL = "cardiffnlp/twitter-roberta-base-sentiment-latest"
tokenizer = AutoTokenizer.from_pretrained(MODEL)
config = AutoConfig.from_pretrained(MODEL)
model = TFAutoModelForSequenceClassification.from_pretrained(MODEL)

batch_size = 16  # Adjust the batch size as needed
texts = khc_twbert_test['content'].tolist()

sentiments = []  # predicted class index for each text, in row order

for batch_start in range(0, len(texts), batch_size):
    batch_texts = texts[batch_start:batch_start + batch_size]
    # pad to the longest text in the batch and truncate anything over the model limit
    encoded_inputs = tokenizer(batch_texts, return_tensors='tf', padding=True, truncation=True)
    output = model(encoded_inputs)
    scores = output.logits
    scores = tf.nn.softmax(scores, axis=1)
    predicted_sentiments = tf.argmax(scores, axis=1).numpy()  # Convert to numpy array
    # store plain Python ints so the column lookup below works on ordinary keys
    sentiments.extend(predicted_sentiments.tolist())

# Add the sentiment analysis results to a new column in the DataFrame
khc_twbert_test['twbert_sentiment'] = sentiments

# class index -> label
sentiment_map = {0: 'negative', 1: 'neutral', 2: 'positive'}

# Series.map performs the dictionary lookup directly. np.vectorize is only a
# Python-level loop and raises on an empty input unless otypes is given.
khc_twbert_test['twbert_sentiment1'] = khc_twbert_test['twbert_sentiment'].map(sentiment_map)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  khc_twbert_test['twbert_sentiment'] = sentiments
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  khc_twbert_test['twbert_sentiment1'] = np.vectorize(sentiment_map.get)(khc_twbert_test['twbert_sentiment'])


In [None]:
# Compare the Pulsar label with the TwitterBERT label; columns are selected by
# name so the view does not depend on the frame's column order.
khc_twbert_test[['content', 'sentiment class', 'twbert_sentiment1']]

Unnamed: 0,content,sentiment class,twbert_sentiment1
0,"You mean ""Catsup."" Seriously, Hunts had a br...",neutral,Negative
1,I smell a charity stream here…Each mileston...,positive,Negative
2,if anyone wants to get fired up ask them abou...,neutral,Neutral
3,"just so you know, I work in a supermarket and ...",positive,Negative
4,So hard to believe Velveeta Voldemort suppo...,negative,Neutral
...,...,...,...
95,Right! I know that paw patrol Kraft mac and c...,negative,Negative
96,"The Kraft Heinz Company, Microsoft, Amazon, an...",neutral,Negative
97,good morning I have not slept and I wish to se...,negative,Neutral
98,Kraft-heinz and their product wants to see an...,negative,Neutral


# Difference between Pulsar and Flair sentiment

In [None]:
# Twitter: rows where the Flair label disagrees with Pulsar's 'sentiment class'
different_df = khc_df_tw[khc_df_tw['flair_sentiment'] != khc_df_tw['sentiment class']]
# Columns are selected by name because positional indices shift once 'keywords'
# has been moved next to 'content'.
# NOTE(review): the former position 18 is assumed to be 'sentiment class' -- confirm.
different_df = different_df[[different_df.columns[0], 'content', 'sentiment class', 'flair_sentiment', 'flair_sentiment_score']]
different_df

In [None]:
# rows where the TwitterBERT label disagrees with Pulsar's 'sentiment class'
different_df1 = khc_twbert_test[khc_twbert_test['twbert_sentiment1'] != khc_twbert_test['sentiment class']]
# Selected by name: the positional index used here before (18) disagreed with
# the preview cell, which reads 'sentiment class' from position 19 of this frame.
different_df1 = different_df1[['content', 'sentiment class', 'twbert_sentiment1']]
different_df1

# Export the data to an excel file

In [None]:
# move 'keywords' so that it directly follows 'content'
# Done by name rather than by position (pop(-3) / insert(6)) so that re-running
# the cell, or running it on a frame whose columns are already reordered,
# cannot move the wrong column.
columns = [col for col in khc_df_tw.columns if col != 'keywords']
columns.insert(columns.index('content') + 1, 'keywords')
khc_df_tw = khc_df_tw[columns]

In [None]:
khc_df_tw.head()

In [None]:
# save to new excel file
khc_df_tw.to_excel('/content/drive/MyDrive/Zeno/khc_H1_twitter_0808.xlsx', index=False)