<a href="https://colab.research.google.com/github/Stephanie9606/zeno_khc_dash_nlp/blob/main/KHC_News_JanAug_Key_Sent_0915.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **KHC Dashboard Pulsar Data Testing File - Sentiment + Keywords**

In [None]:
import pandas as pd
import numpy as np

In [None]:
from google.colab import drive
drive.mount('/content/drive')

# Data cleaning

--- KHC Data Jan - Jun 2023 ---

In [None]:
khc_df_16 = pd.read_excel('/content/drive/MyDrive/Zeno_KHC_News/khc_H1_newsblogs_clean_nodirt.xlsx')

In [None]:
# only keep 'news' (rm blogs)
khc_df_16 = khc_df_16[khc_df_16['source'] == 'Online News']

--- KHC Data July - Aug 2023 ---

In [None]:
khc_df_78 = pd.read_excel('/content/drive/MyDrive/Zeno_KHC_News/khc_news_JulyAug.xlsx')

In [None]:
# Stack the Jan-Jun and Jul-Aug frames into one dataset.
# ignore_index=True gives the result a fresh 0..n-1 index; without it each
# source frame keeps its own row labels, so the same label can appear twice
# and any later label-based operation (e.g. DataFrame.drop(index=...)) would
# affect more than one row.
khc_df = pd.concat([khc_df_16, khc_df_78], axis=0, ignore_index=True)

In [None]:
# check num of rows
khc_df.shape[0]

16297

In [None]:
# rm rows with duplicated contents
# NOTE(review): keep=False flags every member of a duplicate group, so no copy
# of a repeated article survives this filter. If one representative per group
# should be kept, keep='first' would be needed -- confirm the intent.
khc_df = khc_df[~khc_df.duplicated(subset='content', keep=False)]

In [None]:
khc_df.shape[0]

13006

In [None]:
# keep only relevant columns
# Columns are selected by position, so this depends on the exact column order
# of the source Excel files.
# NOTE(review): later cells filter on 'content', so one of these positions is
# presumably that column -- verify against the sheet layout.
columns_to_keep = [0, 1, 2, 5, 6, 9, 14, 18]
khc_df_news = khc_df.iloc[:, columns_to_keep]

In [None]:
# Re-apply the Pulsar query locally: keep rows whose content mentions at least
# one brand keyword AND at least one topic term.
# ("security" was narrowed to "food security" because it produced too much noise.)
# TODO: delete the bare "Heinz" keyword in the future.
keywords = ["Kraft Heinz", "KHC", "Kraft Mac and Cheese", "Heinz", "Kraft Singles", "Miracle Whip", "Heinz Ketchup", "Lunchables", "Velveeta"]
topics = ["supply chain", "supply", "food security", "supplier", "recycled", "recycle", "recycling", "single-use", "plastic", "package", "packaging", "waste", "environment", "nature", "product", "quality", "planet", "crisis", "challenge", "problem", "criticism", "reputation"]

# na=False: a missing 'content' value counts as "no match". Without it the
# mask would contain NaN and boolean indexing below would raise.
has_keyword = khc_df_news['content'].str.contains('|'.join(keywords), case=False, na=False)
has_topic = khc_df_news['content'].str.contains('|'.join(topics), case=False, na=False)
mask = has_keyword & has_topic

khc_df_news = khc_df_news[mask]

In [None]:
khc_df_news.shape[0]

12652

--- rm content with high similarity: 90% ---

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel

# Near-duplicate removal: drop every article whose similarity to an EARLIER
# article reaches the threshold, so the first occurrence is the one kept.

# Calculate TF-IDF vectors
tfidf_vectorizer = TfidfVectorizer()
tfidf_matrix = tfidf_vectorizer.fit_transform(khc_df_news['content'])

# Calculate cosine similarity
cosine_sim = linear_kernel(tfidf_matrix, tfidf_matrix)

# Set a threshold for similarity
threshold = 0.9

# The strict upper triangle (k=1) keeps only pairs (i, j) with i < j, so
# column j is flagged when any earlier row i is at least `threshold` similar.
# This replaces the nested Python loop over all row pairs.
is_near_duplicate = np.triu(cosine_sim >= threshold, k=1).any(axis=0)

# Row POSITIONS that are removed (kept as a list for inspection).
rows_to_remove = np.flatnonzero(is_near_duplicate).tolist()

# Filter by position, not by index label: the rows of cosine_sim follow the
# row order of khc_df_news, whose index labels are no longer 0..n-1 after the
# earlier filtering steps, so dropping by label would remove the wrong rows.
khc_df_news = khc_df_news[~is_near_duplicate].reset_index(drop=True)

In [None]:
khc_df_news.shape[0]

5660

In [None]:
khc_df_news.head(20)

In [None]:
khc_df_news.tail(20)

In [None]:
# save to new excel file
khc_df_news.to_excel('/content/drive/MyDrive/Zeno_KHC_News/khc_news_JanAug_clean.xlsx', index=False)

# Directly read cleaned file

In [None]:
# 5660 rows
khc_df_news = pd.read_excel('/content/drive/MyDrive/Zeno_KHC_News/khc_news_JanAug_clean.xlsx')

# Keywords: KeyBert

--- every 3000 rows takes approximately  mins ---

In [None]:
# split into 2 df, each data has about 3000 rows
# .copy() makes this an independent frame, so adding the 'keywords' column
# later does not write into a slice of khc_df_news (SettingWithCopyWarning).
khc_df_n1 = khc_df_news.iloc[:3000, :].copy()

In [None]:
# Remaining rows (position 3000 onward); .copy() so columns can be added to
# this frame without writing into a slice of khc_df_news.
khc_df_n2 = khc_df_news.iloc[3000:, :].copy()

In [None]:
#!pip3 install keybert
from keybert import KeyBERT

In [None]:
#!pip3 install keyphrase-vectorizers
#from keyphrase_vectorizers import KeyphraseCountVectorizer

In [None]:
#!pip3 install transformers
# init default vectorizer
#vectorizer = KeyphraseCountVectorizer()
#print(vectorizer.get_params()) # print parameters

# specify model
kw_model = KeyBERT('all-MiniLM-L12-v2')
# specify number of keywords to extract
n_keywords = 5
ngram = 2  # specify ngram of keywords

In [None]:
# function to extract keywords for a batch of rows
def get_keywords_batch(rows):
    """Extract keyword phrases for every row of a DataFrame batch.

    Args:
        rows: DataFrame (or slice of one) with a 'content' column of text.

    Returns:
        One list of keyword strings per input row, in row order. The scores
        returned by the model are discarded.

    Uses the notebook-level ``kw_model``, ``n_keywords`` and ``ngram``.
    """
    contents = rows['content'].tolist()
    if not contents:
        # Nothing to extract; do not call the model with an empty batch.
        return []

    # NOTE(review): per the KeyBERT docs `diversity` is only applied together
    # with use_mmr=True, so it presumably has no effect here -- confirm before
    # relying on it. Left unchanged to keep results comparable.
    keywords_batch = kw_model.extract_keywords(contents, stop_words='english', highlight=False, top_n=n_keywords, keyphrase_ngram_range=(1, ngram), diversity=0.2)

    # For a one-document batch KeyBERT may hand back that document's flat list
    # of (keyword, score) tuples instead of a list of lists; re-wrap it so the
    # comprehension below always iterates over documents.
    if len(contents) == 1 and (not keywords_batch or isinstance(keywords_batch[0], tuple)):
        keywords_batch = [keywords_batch]

    return [[keyword[0] for keyword in keywords] for keywords in keywords_batch]

In [None]:
### change n1 and n2 df to proceed
# split the data into batches of `batch_size` rows (higher efficiency)
batch_size = 100
batches = []
for start in range(0, len(khc_df_n1), batch_size):
    batches.append(khc_df_n1[start:start + batch_size])

In [None]:
# Run keyword extraction batch by batch and flatten the per-batch results
# into a single list holding one keyword list per row.
keyword_lists = [
    row_keywords
    for batch in batches
    for row_keywords in get_keywords_batch(batch)
]

In [None]:
### change n1 and n2 df to proceed
# Store each row's keywords in a new 'keywords' column as one comma-separated
# string, which fits an Excel cell better than a Python list.
khc_df_n1['keywords'] = [', '.join(row_keywords) for row_keywords in keyword_lists]

In [None]:
# merge n1 + n2
khc_news_key = pd.concat([khc_df_n1, khc_df_n2], axis=0)

In [None]:
khc_news_key.head()

In [None]:
khc_news_key.to_excel('/content/drive/MyDrive/Zeno_KHC_News/khc_news_JanAug_key5gram2_0920.xlsx', index=False)

# KeyBERT: Optimized with SpaCy (POS: noun, verb, adj) --> no improvement

In [None]:
#!pip3 install --upgrade transformers
import transformers

keybert = KeyBERT('all-MiniLM-L12-v2')

In [None]:
# The only other import of KeyphraseCountVectorizer in this notebook is
# commented out, so import it here; otherwise this cell raises NameError on a
# fresh kernel. Requires the keyphrase-vectorizers package.
from keyphrase_vectorizers import KeyphraseCountVectorizer

# Candidate phrases are restricted by part-of-speech pattern: zero or more
# adjectives followed by one or more nouns.
vectorizer = KeyphraseCountVectorizer(spacy_pipeline='en_core_web_sm', pos_pattern='<J.*>*<N.*>+', stop_words='english', lowercase=True)

In [None]:
# First 20 rows as a small test sample. .copy() makes it an independent frame
# so the 'keybert' column added below is not written into a slice of
# khc_df_n1000 (which triggers SettingWithCopyWarning).
# NOTE: khc_df_n1000 is loaded in the "Directly read data with keywords"
# section further down; that cell must have been run first.
khc_test20 = khc_df_n1000.iloc[:20, :].copy()

In [None]:
# Run KeyBERT with the POS-pattern vectorizer over the test sample, feeding
# the 'content' texts to the model `batch_size` documents at a time.
keybert_keywords = []
batch_size = 32
test_contents = khc_test20['content']
for start in range(0, len(khc_test20), batch_size):
    docs = test_contents.iloc[start:start + batch_size].tolist()
    keybert_keywords.extend(
        keybert.extract_keywords(docs, vectorizer=vectorizer, stop_words='english', top_n=10)
    )

2023-08-25 21:17:56,188 - KeyphraseVectorizer - INFO - It looks like you do not have downloaded a list of stopwords yet. It is attempted to download the stopwords now.
INFO:KeyphraseVectorizer:It looks like you do not have downloaded a list of stopwords yet. It is attempted to download the stopwords now.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [None]:
# One comma-separated keyword string per row (scores dropped).
# A plain list is assigned positionally; wrapping it in a new pd.Series would
# align on index labels instead and produce NaN for any row whose label is not
# in 0..n-1.
khc_test20['keybert'] = [', '.join(keyword[0] for keyword in keywords) for keywords in keybert_keywords]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  khc_test20['keybert'] = pd.Series([', '.join([keyword[0] for keyword in keywords]) for keywords in keybert_keywords])


In [None]:
khc_test20.iloc[:,[5,-1,-2]]

# KeyBERT Fine-tune

In [None]:
#!pip3 install datasets
# load the data
from datasets import load_dataset

# get entire dataset
ft_df = load_dataset("midas/inspec", "generation")

Repo card metadata block was not found. Setting CardData to empty.


In [None]:
print(type(ft_df))

<class 'datasets.dataset_dict.DatasetDict'>


In [None]:
ft_df_train = ft_df['train'].to_pandas()

In [None]:
# 1000 rows
ft_df_train.shape[0]

In [None]:
!pip3 install transformers
import tensorflow as tf
from transformers import TFRobertaForSequenceClassification, RobertaTokenizer
from transformers import Trainer, TrainingArguments
import datasets

Installing collected packages: tokenizers, safetensors, transformers
Successfully installed safetensors-0.3.3 tokenizers-0.13.3 transformers-4.32.1


In [None]:
### CHATGPT
# Draft (generated) fine-tuning recipe. It is not runnable as written:
#   - `df` is never defined in this notebook (its read_csv line is commented out).
#   - `label_list` is only defined in a later cell, so it does not exist here
#     on a fresh kernel.
# NOTE(review): the DistilRoberta* class names are presumably not exported by
# transformers ('distilroberta-base' is normally loaded with the Roberta* or
# Auto* classes) -- confirm against the installed version.
import pandas as pd
from transformers import DistilRobertaTokenizer, DistilRobertaForSequenceClassification, Trainer, TrainingArguments

# Load your dataset (assuming you have a DataFrame with 'content' and 'keywords' columns)
# df = pd.read_csv('your_dataset.csv')

# Tokenizer and model
tokenizer = DistilRobertaTokenizer.from_pretrained('distilroberta-base')
model = DistilRobertaForSequenceClassification.from_pretrained('distilroberta-base', num_labels=len(label_list))  # Assuming label_list is defined

# Tokenize and encode the data
# Content and keywords are passed as a text pair; no label tensor is built.
encoded_data = tokenizer(
    list(df['content']),  # Assuming 'content' contains your text content
    list(df['keywords']),  # Assuming 'keywords' contains your keywords
    padding=True,
    truncation=True,
    max_length=128,  # Adjust this as needed
    return_tensors='pt',
)

# Define the training arguments
# NOTE(review): evaluation_strategy='steps' is set but no eval_dataset is
# passed to the Trainer below -- presumably this fails at evaluation time; confirm.
training_args = TrainingArguments(
    output_dir='./output',
    per_device_train_batch_size=8,
    evaluation_strategy='steps',
    eval_steps=500,  # Adjust as needed
    save_total_limit=5,
    num_train_epochs=3,  # Adjust as needed
    learning_rate=5e-5,  # Adjust as needed
    logging_dir='./logs',
)

# Create a Trainer instance
# NOTE(review): train_dataset receives the raw tokenizer output rather than a
# Dataset of per-example items with labels -- verify this is what Trainer expects.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=encoded_data,
    data_collator=None,  # You can customize data collation if needed
)

# Fine-tune the model
trainer.train()

# Save the fine-tuned model
trainer.save_model('./fine_tuned_model')


In [None]:
# after fine-tune
# Draft inference step for the fine-tuned checkpoint. It is not runnable as written:
#   - `torch` is used below but is not imported anywhere in this notebook.
#   - "new_data.csv" is a placeholder path.
import pandas as pd
from transformers import DistilRobertaTokenizer, DistilRobertaForSequenceClassification

# Load the fine-tuned model and tokenizer
model_checkpoint = "./fine_tuned_model"  # Change to the path where your fine-tuned model is saved
tokenizer = DistilRobertaTokenizer.from_pretrained(model_checkpoint)
model = DistilRobertaForSequenceClassification.from_pretrained(model_checkpoint)

# Load the new DataFrame with the "content" column
new_df = pd.read_csv("new_data.csv")  # Change to the path of your new dataset CSV file

# Tokenize and encode the new data
encoded_data = tokenizer(
    list(new_df['content']),  # Assuming 'content' contains your new text content
    padding=True,
    truncation=True,
    max_length=128,  # Adjust this as needed
    return_tensors='pt',
)

# Generate keywords using the fine-tuned model
with torch.no_grad():
    outputs = model(**encoded_data)

# Extract the logits or probabilities as keywords
logits = outputs.logits
probabilities = torch.softmax(logits, dim=1)

# You can extract the keywords based on the probabilities, e.g., top-k keywords
# NOTE(review): for a sequence-classification head the top-k indices are
# presumably class ids, yet they are passed to convert_ids_to_tokens as if
# they were vocabulary ids -- the resulting "keywords" look meaningless; confirm.
top_k = 5  # Adjust the number of top keywords as needed
keywords = [tokenizer.convert_ids_to_tokens(token_ids) for token_ids in probabilities.topk(top_k).indices.tolist()]

# Add the keywords to the new DataFrame
new_df['keywords'] = [', '.join(keyword) for keyword in keywords]

# Print the updated DataFrame
print(new_df.head())

In [None]:
!pip3 install datasets
from datasets import load_dataset

# Load the "extraction" configuration of the midas/kptimes dataset.
dataset = load_dataset("midas/kptimes", "extraction")
# Distinct tag values found across all training documents' 'doc_bio_tags'
# (np.unique returns them sorted).
label_list = np.unique([item for sublist in dataset['train']['doc_bio_tags'] for item in sublist])
# Map the first three tag values to integer ids.
# NOTE(review): the mapping depends on the sorted order of label_list and
# assumes at least three distinct tags (presumably B, I, O) -- verify.
label_dict = {label_list[0]: 1, label_list[1]: 2, label_list[2]: 0}

# Directly read data with keywords

In [None]:
khc_df_n1000 = pd.read_excel('/content/drive/MyDrive/Zeno/news_w_keywords/khc_H1_khc_n1000_kys.xlsx')

In [None]:
khc_df_n2000 = pd.read_excel('/content/drive/MyDrive/Zeno/news_w_keywords/khc_H1_khc_n2000_kys.xlsx')

In [None]:
khc_df_n3000 = pd.read_excel('/content/drive/MyDrive/Zeno/news_w_keywords/khc_H1_khc_n3000_kys.xlsx')

In [None]:
khc_df_n4000 = pd.read_excel('/content/drive/MyDrive/Zeno/news_w_keywords/khc_H1_khc_n4000_kys.xlsx')

In [None]:
khc_df_n5000 = pd.read_excel('/content/drive/MyDrive/Zeno/news_w_keywords/khc_H1_khc_n5000_kys.xlsx')

In [None]:
khc_df_n6000 = pd.read_excel('/content/drive/MyDrive/Zeno/news_w_keywords/khc_H1_khc_n6000_kys.xlsx')

In [None]:
# Stack the six keyword-annotated chunks into one frame; ignore_index=True
# renumbers the rows 0..n-1 in the same step.
keyword_chunks = [khc_df_n1000, khc_df_n2000, khc_df_n3000, khc_df_n4000, khc_df_n5000, khc_df_n6000]
khc_df_keys = pd.concat(keyword_chunks, axis=0, ignore_index=True)

# Data Cleaning: remove content with >= 95% similarity

In [None]:
khc_df_keys.shape[0]

5908

Remove content with high(%) similarity

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel

# Near-duplicate removal: drop every article whose similarity to an EARLIER
# article reaches the threshold, so the first occurrence is the one kept.

# Calculate TF-IDF vectors
tfidf_vectorizer = TfidfVectorizer()
tfidf_matrix = tfidf_vectorizer.fit_transform(khc_df_keys['content'])

# Calculate cosine similarity
cosine_sim = linear_kernel(tfidf_matrix, tfidf_matrix)

# Set a threshold for similarity
threshold = 0.95

# The strict upper triangle (k=1) keeps only pairs (i, j) with i < j, so
# column j is flagged when any earlier row i is at least `threshold` similar.
# This replaces the nested Python loop over all row pairs and yields each
# removed position once.
is_near_duplicate = np.triu(cosine_sim >= threshold, k=1).any(axis=0)

# Row positions that are removed (kept as a list for inspection).
rows_to_remove = np.flatnonzero(is_near_duplicate).tolist()

# Filter by position so the result does not depend on the frame's index labels.
khc_df_keys_nosim = khc_df_keys[~is_near_duplicate].reset_index(drop=True)

In [None]:
khc_df_keys_nosim.shape[0]

2883

In [None]:
khc_df_keys_nosim.iloc[:,[5,-1]]

In [None]:
# data without 95% similar content
khc_df_keys_nosim.to_excel('/content/drive/MyDrive/Zeno/khc_H1_0824.xlsx', index=False)

Extract the 3 sentences before and after each keyword match?

In [None]:
import re

keywords = ["Kraft Heinz", "KHC", "Kraft Mac and Cheese", "Kraft Singles", "Miracle Whip", "Heinz Ketchup", "Lunchables", "Velveeta", "supply chain", "supply", "security", "supplier", "recycled", "recycle", "recycling", "single-use", "plastic", "package", "packaging", "waste", "environment", "nature", "product", "quality", "planet", "crisis", "challenge", "problem", "criticism", "reputation", "image"]

# Directly read data with keywords, already cleaned by removing content with >= 95% similarity

In [None]:
khc_news_keys = pd.read_excel('/content/drive/MyDrive/Zeno/khc_H1_0824.xlsx')

In [None]:
khc_news_keys.shape[0]

2883

# Sum: T5 / BART-CNN (originally planned as BERTSum)

In [None]:
# First 10 articles as a small sample for the summarisation experiments.
# .copy() so the 'summary' column is written to an independent frame instead
# of a slice of khc_df_n1000.
test_khc_df_n1000 = khc_df_n1000.iloc[:10, :].copy()

In [None]:
import pandas as pd
import tensorflow as tf
import torch
from transformers import T5ForConditionalGeneration, T5Tokenizer

# Load the T5 model and tokenizer.
# T5ForConditionalGeneration is the PyTorch class, so every tensor passed to
# it must be a PyTorch tensor. The earlier version routed the inputs through
# tf.data / tf.convert_to_tensor, which a PyTorch model cannot consume.
model_name = "t5-small"
model = T5ForConditionalGeneration.from_pretrained(model_name)
tokenizer = T5Tokenizer.from_pretrained(model_name)
model.eval()

# Load and preprocess the data
data = test_khc_df_n1000["content"].tolist()
batch_size = 8

# T5 selects its task from a text prefix.
input_data = [f"summarize: {item}" for item in data]

# Generate summaries batch by batch
summaries = []

for start in range(0, len(input_data), batch_size):
    tokenized_batch = tokenizer(
        input_data[start:start + batch_size],
        padding=True,
        truncation=True,
        return_tensors="pt",
        max_length=256,
    )
    # Inference only: no gradients needed. The attention mask tells the model
    # to ignore the padding added to the shorter inputs of the batch.
    with torch.no_grad():
        summary_ids = model.generate(
            tokenized_batch["input_ids"],
            attention_mask=tokenized_batch["attention_mask"],
            max_length=100,
            num_beams=4,
            early_stopping=True,
        )
    batch_summaries = tokenizer.batch_decode(summary_ids, skip_special_tokens=True)  # Directly decode summaries
    summaries.extend(batch_summaries)

# Limit summaries to 100 words
max_summary_length = 100
for i in range(len(summaries)):
    summary_words = summaries[i].split()
    if len(summary_words) > max_summary_length:
        summaries[i] = ' '.join(summary_words[:max_summary_length])

# Add summaries to the DataFrame
test_khc_df_n1000["summary"] = summaries


In [None]:
import pandas as pd
import numpy as np
import tensorflow as tf
from transformers import BartTokenizer, TFBartForConditionalGeneration

# TensorFlow BART (CNN/DailyMail checkpoint) and its tokenizer
model_name = 'facebook/bart-large-cnn'
tokenizer = BartTokenizer.from_pretrained(model_name)
bart_model = TFBartForConditionalGeneration.from_pretrained(model_name)

# Load your DataFrame
# khc_df_n1000 = pd.read_csv("your_data.csv")

# Texts to summarise
texts = test_khc_df_n1000["content"].tolist()

# Batch size for processing
batch_size = 4
max_summary_length = 100  # Maximum length of the summary in tokens

# Number of samples and of batches (the last batch may be shorter)
num_samples = len(texts)
num_batches = int(np.ceil(num_samples / batch_size))

# Summaries collected in input order
all_summaries = []

# Walk over the texts in steps of batch_size
for start_idx in range(0, num_samples, batch_size):
    end_idx = min(start_idx + batch_size, num_samples)

    batch_encoded_texts = tokenizer(
        texts[start_idx:end_idx],
        padding=True,
        truncation=True,
        return_tensors='tf',
        max_length=512,
    )

    # Generate summaries with length constraint
    batch_summary_ids = bart_model.generate(**batch_encoded_texts, max_length=max_summary_length)

    # Decode every generated sequence back to text
    for summary_id in batch_summary_ids:
        all_summaries.append(tokenizer.decode(summary_id, skip_special_tokens=True))

# Add the summaries to the DataFrame
test_khc_df_n1000["summary"] = all_summaries

# Save the DataFrame with summaries
# khc_df_n1000.to_csv("your_processed_data_with_summaries.csv", index=False)

# Save the DataFrame with summaries
# khc_df_n1000.to_csv("your_processed_data_with_summaries.csv", index=False)


In [None]:
test_khc_df_n1000

# Sum: BART

In [None]:
#!pip3 install transformers
from transformers import BartTokenizer, BartForConditionalGeneration
import tensorflow as tf

In [None]:
model_name = "facebook/bart-large-xsum" #"facebook/bart-large-cnn"
tokenizer = BartTokenizer.from_pretrained(model_name)
model = BartForConditionalGeneration.from_pretrained(model_name)

In [None]:
# define a function to generate summaries using BART for a batch of texts
def generate_summaries_batch(texts):
    """Summarise a batch of texts with the notebook-level BART model.

    Args:
        texts: list of input strings.

    Returns:
        List of summary strings, one per input text, in input order.

    Uses the notebook-level ``tokenizer`` and ``model``.
    """
    if not texts:
        # Nothing to summarise; skip tokenisation and generation.
        return []

    inputs = tokenizer.batch_encode_plus(texts, return_tensors="pt", max_length=1024, truncation=True, padding=True)
    # padding=True pads shorter texts up to the longest one in the batch; the
    # attention mask must be passed so generation ignores those pad tokens.
    summary_ids = model.generate(
        inputs["input_ids"],
        attention_mask=inputs["attention_mask"],
        max_length=100,
        min_length=30,
        length_penalty=2.0,
        num_beams=6,
        no_repeat_ngram_size=2,
        early_stopping=True,
    )
    summaries = [tokenizer.decode(summary, skip_special_tokens=True) for summary in summary_ids]
    return summaries

In [None]:
# First 100 articles for the BART summary trial. .copy() so the
# 'bart_summary' column is added to an independent frame instead of a slice
# of khc_news_keys (which triggers SettingWithCopyWarning).
khc_sum_n100 = khc_news_keys.iloc[:100, :].copy()

In [None]:
khc_sum_n100.iloc[:,[5,-1]]

In [None]:
# number of texts summarised per model call
batch_size = 4

# cut the 'content' column into consecutive lists of at most batch_size texts
contents = khc_sum_n100["content"]
text_batches = []
for start in range(0, len(khc_sum_n100), batch_size):
    text_batches.append(contents[start:start + batch_size].tolist())

# summarise each batch and flatten the results into one list, in row order
summaries = [
    summary
    for batch in text_batches
    for summary in generate_summaries_batch(batch)
]

In [None]:
# Add the summaries to the DataFrame
khc_sum_n100["bart_summary"] = summaries

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  khc_sum_n100["bart_summary"] = summaries


In [None]:
khc_sum_n100.iloc[:,[5,-1]]

In [None]:
# save to new excel file
khc_sum_n100.to_excel('/content/drive/MyDrive/Zeno/news_w_sum/khc_H1_khc_n100_BARTxsum.xlsx', index=False)

# Sentiment: Flair

--- Flair pre-trained model (non-labeled data) ---

--- 1000 rows takes approximately 20 mins ---

In [None]:
#!pip3 install flair
from flair.models import TextClassifier
from flair.data import Sentence
import multiprocessing

In [None]:
# load the classifier outside the function to avoid loading it multiple times
classifier = TextClassifier.load('en-sentiment')

In [None]:
# mapping from flair sentiment labels to preferred sentiment labels
flair_to_preferred_mapping = {
    'POSITIVE': 'positive',
    'NEGATIVE': 'negative',
    'NEUTRAL': 'neutral'
}

In [None]:
# function to predict sentiment and score for a list of texts
def predict_sentiments(texts):
    """Classify the sentiment of each text with the notebook-level classifier.

    Args:
        texts: list of strings.

    Returns:
        List of ``(label, score)`` tuples in input order. ``label`` is the
        classifier's label translated through ``flair_to_preferred_mapping``
        ('neutral' for unknown labels); a text that received no label yields
        ``('neutral', 0.0)``.
    """
    sentences = [Sentence(text) for text in texts]
    # predict() annotates the Sentence objects in place.
    classifier.predict(sentences)

    results = []
    for sentence in sentences:
        labels = sentence.labels
        if labels:
            top_label = labels[0]
            raw_label, confidence = top_label.value, top_label.score
        else:
            # No prediction attached: fall back to neutral with zero score.
            raw_label, confidence = 'NEUTRAL', 0.0
        results.append((flair_to_preferred_mapping.get(raw_label, 'neutral'), confidence))

    return results

In [None]:
# split the texts into chunks for parallel processing
def chunks(lst, n):
    """Yield consecutive slices of ``lst`` holding at most ``n`` items each.

    The final slice is shorter when ``len(lst)`` is not a multiple of ``n``.
    """
    yield from (lst[start:start + n] for start in range(0, len(lst), n))

In [None]:
# number of CPU cores available
num_cores = multiprocessing.cpu_count()

In [None]:
# split the data into chunks for parallel processing
# Ceiling division gives at most num_cores chunks, and max(1, ...) keeps the
# chunk size positive when there are fewer rows than cores (floor division
# would give 0 there, which makes chunks() fail).
texts = khc_df_n4000['content'].tolist()
chunk_size = max(1, -(-len(texts) // num_cores))
texts_list = list(chunks(texts, chunk_size))

# init a pool of workers for multiprocessing; the context manager shuts the
# worker processes down once the map has finished
with multiprocessing.Pool(processes=num_cores) as pool:
    # predict sentiments in parallel, one chunk per task
    predicted_sentiments_list = pool.map(predict_sentiments, texts_list)

# concatenate the results from all chunks into a single list
predicted_sentiments = [sentiment for sublist in predicted_sentiments_list for sentiment in sublist]

# assign the predicted sentiments and scores to df
# (two comprehensions instead of zip(*...), which cannot be unpacked into two
# columns when the result list is empty)
khc_df_n4000['flair_sentiment'] = [sentiment for sentiment, _ in predicted_sentiments]
khc_df_n4000['flair_sentiment_score'] = [score for _, score in predicted_sentiments]

In [None]:
khc_df_n2000.tail()

In [None]:
khc_df_n4000.to_excel('/content/drive/MyDrive/Zeno/news_w_sentiment/khc_H1_khc_n4000_sentiment.xlsx', index=False)

# Check sentiment quality

In [None]:
# check unique output
khc_df_n1000['flair_sentiment'].unique()

array(['negative', 'positive'], dtype=object)

In [None]:
# check on neutral prediction
khc_df_n1000['flair_sentiment'].str.contains('neutral', case=False).any()

False

In [None]:
neutral_df = khc_df_n1000[khc_df_n1000['flair_sentiment'] == 'neutral']
neutral_df.head()

Unnamed: 0,id,search,source,application,title,content,date,parent,language,url,...,human rights,labor practices,community and society,workforce protection,local content,post subtype,post type,keywords,flair_sentiment,flair_sentiment_score


In [None]:
# check the score
low_df = khc_df_tw[khc_df_tw['flair_sentiment_score'] < 0.7]
low_df.shape[0]

182

In [None]:
low_df.iloc[:,[5,-2,-1]]

Unnamed: 0,content,flair_sentiment,flair_sentiment_score
14,Buffett praising $COST's Kirkland brand. He st...,positive,0.500105
17,It's your problem you took it any other way!...,negative,0.657860
43,Lord Cornwallis: {{short description|1955 film...,positive,0.610610
84,"Heinz say:Because of its natural acidity, Hei...",positive,0.603075
91,Today on Garret Lewis i spoke about happened a...,negative,0.690468
...,...,...,...
2921,Sometimes! If it is quality stuff.Not Heinz.,negative,0.527366
2950,This looks like a food security marker.Heinz ...,negative,0.562701
2954,"Freelance creative director, Elliott Starr, wa...",negative,0.637750
2988,For real. You'd have to challenge in court (...,negative,0.585367


--- if sentiment score < 0.7 --> set to neutral ---

In [None]:
#khc_df_tw.loc[khc_df_tw['flair_sentiment_score'] < 0.7, 'flair_sentiment'] = 'neutral'

# Difference between Pulsar and Flair sentiment

In [None]:
# Twitter
different_df = khc_df_n1000[khc_df_n1000['flair_sentiment'] != khc_df_n1000['sentiment class']]
different_df = different_df.iloc[:, [0,5,18,-2,-1]]
different_df

# Export the data to an excel file

In [None]:
# move 'keywords' so that it directly follows 'content'
# The columns are located by name rather than by position, so the cell gives
# the same result when re-run (a positional pop/insert would move a different
# column the second time) and does not depend on how many columns follow.
columns = khc_df_tw.columns.tolist()
columns.remove('keywords')
columns.insert(columns.index('content') + 1, 'keywords')
khc_df_tw = khc_df_tw[columns]

In [None]:
khc_df_tw.head()

Unnamed: 0,id,search,source,application,title,content,keywords,date,parent,language,...,biodiversity,human rights,labor practices,community and society,workforce protection,local content,post subtype,post type,flair_sentiment,flair_sentiment_score
0,0_104815_8_1609539758241484801,104815,Twitter,Twitter for iPad,,"You mean ""Catsup."" Seriously, Hunts had a br...","ketchup, catsup, heinz trademarks, heinz, hunt...",2023-01-01 13:19:16,0_104815_8_1609425583464943624,en,...,,,,,,,reply,engagement,negative,0.759982
1,0_104815_8_1609589331206836224,104815,Twitter,Twitter for iPhone,,I smell a charity stream here…Each mileston...,"heinz beans, different heinz beans product, he...",2023-01-01 16:36:15,0_104815_8_1609588968877883393,en,...,,,,,,,reply,engagement,positive,0.952374
2,0_104815_8_1609705628557275136,104815,Twitter,Twitter for iPhone,,if anyone wants to get fired up ask them abou...,"heinz plastic, heinz, kit, plastic, anyone",2023-01-02 00:18:23,0,en,...,,,,,,,original post,post,positive,0.927865
3,0_104815_8_1609732418335899651,104815,Twitter,Twitter for Android,,"just so you know, I work in a supermarket and ...","plastic free packaging, heinz beans, packaging...",2023-01-02 02:04:50,0,en,...,,,,,,,original post,post,negative,0.988997
4,0_104815_8_1609742063314309120,104815,Twitter,Twitter Web App,,So hard to believe Velveeta Voldemort suppo...,"kevin mccarthy, velveeta voldemort, own party,...",2023-01-02 02:43:10,0_104815_8_1609735600881504257,en,...,,,,,,,reply,engagement,negative,0.937185


In [None]:
# save to new excel file
khc_df_tw.to_excel('/content/drive/MyDrive/Zeno/khc_H1_updated0804.xlsx', index=False)