In [1]:
import pandas as pd
import numpy as np
from ast import literal_eval

# Feature Engineering

In [44]:
# Function to convert string items to ints
def convert_list_of_strings_to_ints(lst):
    return [int(item) for item in lst]

# Convert a string into the list contained within the string
def safe_literal_eval(val):
    try:
        return literal_eval(val)
    except (ValueError, SyntaxError):
        return None  

data = pd.read_csv('encoding_data.csv', encoding='utf-8', converters={
    'Phrases': lambda x: literal_eval(x.replace(r"\\", "")),
    'Values': lambda x: convert_list_of_strings_to_ints(safe_literal_eval(x))
})

### Statistical Features of Each Sentence 

In [2]:
import statistics

# Function to extract features from each sentence such as 
    # Mean of confidence scores and differences between original confidence score and score with phrase/word missing
    # std of confidence scores and differences between original confidence score and score with phrase/word missing
    # Max of confidence scores and differences between original confidence score and score with phrase/word missing
    # Min of confidence scores and differences between original confidence score and score with phrase/word missing
    # Range of confidence scores and differences between original confidence score and score with phrase/word missing
    # Amount of phrases/words that when removed have a confidence difference above 0.3, 0.2, 0.1
    # Amount of phrases/words that when removed have a confidence difference below -0.1, -0.2, -0.3
    # Amount of phrases/words that when removed have a confidence difference between 0.5 and -0.5
    # Number of words in a sentence
    # Number of words/phrases that when removed cause the sentiment label of the sentence to change
def get_features(row):
    differences = []
    confidence_scores = []
    amount_change = 0
    
    sent_results = sentiment_analyzer(row['Sentence'])
    sent_label = sent_results[0]['label']
    sent_score = sent_results[0]['score']
    
    above_0_05 = 0
    above_0_1 = 0
    above_0_2 = 0
    above_0_3 = 0
    
    below_0_05 = 0
    below_0_1 = 0
    below_0_2 = 0
    below_0_3 = 0
    
    between_0_05  = 0

    for phrase in row['Phrases']:
        new_sent = row['Sentence']
        new_sent = new_sent.replace(phrase, "")
        results = sentiment_analyzer(new_sent)
        new_score = results[0]['score']
        new_label = results[0]['label']
        difference = sent_score - new_score
        if new_label == sent_label:
            differences.append((sent_score - new_score))
            confidence_scores.append(new_score)
        else:
            amount_change += 1
            
        if difference > 0.3:
            above_0_3 += 1
        elif difference > 0.2:
            above_0_2 += 1
        elif difference > 0.1:
            above_0_1 += 1
        elif difference > 0.05:
            above_0_05 += 1
        elif difference < -0.3:
            below_0_3 += 1
        elif difference < -0.2:
            below_0_2 += 1
        elif difference < -0.1:
            below_0_1 += 1
        elif difference < -0.05:
            below_0_05 += 1
        else:
            between_0_05 += 1
            
            
    num_words = len(row['Sentence'].split(' '))
    
    row['word_number'] = num_words
    row['mean_confidence_difference'] = statistics.mean(differences) if differences else 0
    row['max_confidence_difference'] = max(differences) if differences else 0
    row['min_confidence_difference'] = min(differences) if differences else 0
    row['range_confidence_difference'] = (max(differences) - min(differences)) if differences else 0
    row['std_confidence_difference'] = statistics.stdev(differences) if len(differences) > 1 else 0
    
    row['mean_confidence'] = statistics.mean(confidence_scores) if confidence_scores else 0
    row['max_confidence'] = max(confidence_scores) if confidence_scores else 0
    row['min_confidence'] = min(confidence_scores) if confidence_scores else 0
    row['range_confidence'] = (max(confidence_scores) - min(confidence_scores)) if confidence_scores else 0
    row['std_confidence'] = statistics.stdev(confidence_scores) if len(confidence_scores) > 1 else 0
    
    row['original_confidence'] = sent_score
    row['original_label'] = sent_label
    
    row['amount_label_switched'] = amount_change
    
    row['above_0.05'] = above_0_05
    row['above_0.1'] = above_0_1
    row['above_0.2'] = above_0_2
    row['above_0.3'] = above_0_3
    row['below_-0.05'] = below_0_05
    row['below_-0.1'] = below_0_1
    row['below_-0.2'] = below_0_2
    row['below_-0.3'] = below_0_3
    row['between_+-0.05'] = between_0_05
    
    print(row['Unnamed: 0'])
    return row
    

In [None]:
# Extract statistical features from the data
data = data.apply(lambda x: get_features(x), axis=1)

In [96]:
#data.to_csv('data_w_features.csv')

### Confidence Scores and Differences for Each Phrase 

In [101]:
expanded_rows = []

# Create a new row for each phrase/word in a sentence
for _, row in data.iterrows():
    for phrase, value in zip(row['Phrases'], row['Values']):
        # Create a copy of the row for each phrase
        new_row = row.copy()
        # Replace the Phrases and Values columns with the single phrase and corresponding value
        new_row['Phrase'] = phrase
        new_row['Value'] = value
        expanded_rows.append(new_row)

# Create a new DataFrame from the expanded rows
expanded_df = pd.DataFrame(expanded_rows)

In [103]:
#expanded_df.to_csv('data_w_features_expanded.csv')

In [3]:
expanded_df = pd.read_csv('data_w_features_expanded.csv')

In [10]:
# Make sure each phrase to a string
expanded_df['Phrase'] = expanded_df['Phrase'].apply(lambda x: str(x))

In [11]:
# Function to extract features from each phrase/word such as
    # Difference between the original confidence score and the confidence score of the sentence when said phrase/word is removed
    # New label
    # New confidence score when phrase/word is removed
    # Whether or not the label switched when the phrase/word was removed
def get_confidence(row):
    sent_score = row['original_confidence']
    new_sent = row['Sentence']
    new_sent = new_sent.replace(row['Phrase'], "")
    results = sentiment_analyzer(new_sent)
    new_score = results[0]['score']
    new_label = results[0]['label']
    
    row['new_confidence_score'] = new_score
    row['new_confidence_label'] = new_label
    row['confidence_difference'] = sent_score - new_score
    if new_label == row['original_label']:
        row['switched'] = 0
    else:
        row['switched'] = 1
    print(row['Unnamed: 0'])
    return row

In [None]:
# Extract new features from each phrase
expanded_df = expanded_df.apply(lambda x: get_confidence(x), axis=1)

In [13]:
expanded_df.columns

Index(['Unnamed: 0.1', 'Unnamed: 0', 'Sentence', 'Sentence Encodings',
       'Phrases', 'Values', 'word_number', 'mean_confidence_difference',
       'max_confidence_difference', 'min_confidence_difference',
       'range_confidence_difference', 'std_confidence_difference',
       'mean_confidence', 'max_confidence', 'min_confidence',
       'range_confidence', 'std_confidence', 'original_confidence',
       'original_label', 'amount_label_switched', 'above_0.05', 'above_0.1',
       'above_0.2', 'above_0.3', 'below_-0.05', 'below_-0.1', 'below_-0.2',
       'below_-0.3', 'between_+-0.05', 'Phrase', 'Value',
       'new_confidence_score', 'new_confidence_label', 'confidence_difference',
       'switched'],
      dtype='object')

In [14]:
# Mapping to turn string data into ints
label_mapping = {'POS' : 0, 'NEG' : 1, 'NEU' : 2}

expanded_df['original_label'] = expanded_df['original_label'].apply(lambda x: label_mapping[x])
expanded_df['new_confidence_label'] = expanded_df['new_confidence_label'].apply(lambda x: label_mapping[x])

In [527]:
#expanded_df.to_csv('data_w_features_expanded.csv')

### Part of Speech for Each Word & Phrase 

In [None]:
import nltk
from nltk import pos_tag
from nltk.tokenize import word_tokenize

# Download the required resources
nltk.download('averaged_perceptron_tagger')
nltk.download('punkt')

In [518]:
# Function to find the POS of each word/word in each phrase
def part_of_speech_generator(row):
    
    # Tokenize the sentence in the 'Phrase' column of the row
    tokens = word_tokenize(row['Phrase'])
    
    # Generate POS tags for each token
    tagged_words = pos_tag(tokens)
    
    # Extract the POS tags using list comprehension
    row['POS_list'] = [word[1] for word in tagged_words]
    
    return row

In [519]:
# Clean the DataFrame 'Phrase' column
expanded_df['Phrase'] = expanded_df['Phrase'].fillna('').astype(str)

# Apply the part_of_speech_generator function to each row of the DataFrame
expanded_df = expanded_df.apply(part_of_speech_generator, axis=1)

In [520]:
# Flatten all lists in the 'POS' column into a single list
flattened_list = [pos_tag for sublist in expanded_df['POS_list'] for pos_tag in sublist]

# Convert the flattened list into a set to get unique POS tags
unique_pos_tags = set(flattened_list)

print(unique_pos_tags)

{',', 'JJ', 'PRP', 'RBR', 'VBZ', 'FW', '(', 'EX', ')', '.', 'VBG', 'MD', 'POS', 'PRP$', 'WP', 'NNS', 'CD', 'IN', 'NNPS', 'WRB', 'JJS', 'VBN', 'VB', 'RB', 'NN', 'TO', 'DT', 'CC', 'VBP', '$', "''", 'RBS', 'JJR', 'VBD', 'NNP', 'WDT', 'PDT', '``', ':'}


In [521]:
# Elements to remove
elements_to_remove = {',', '(', ')', '``', ':', '$', "''"}

# Remove multiple elements using difference_update
unique_pos_tags.difference_update(elements_to_remove)

print(unique_pos_tags)

{'JJ', 'PRP', 'RBR', 'VBZ', 'FW', 'EX', '.', 'VBG', 'MD', 'POS', 'PRP$', 'WP', 'NNS', 'CD', 'IN', 'NNPS', 'WRB', 'JJS', 'VBN', 'VB', 'RB', 'NN', 'TO', 'DT', 'CC', 'VBP', 'RBS', 'JJR', 'VBD', 'NNP', 'WDT', 'PDT'}


In [522]:
# Function used to one hot encode the POS for each phrase/word
def check_pos(row, pos):
    if pos in row['POS_list']:
        row[pos] = 1
    else:
        row[pos] = 0
    return row

In [523]:
# One hot encode each POS for each phrase
for pos in unique_pos_tags:
    expanded_df = expanded_df.apply(lambda x: check_pos(x, pos), axis=1)

# Model Training

In [867]:
import pandas as pd
import numpy as np
from ast import literal_eval
from transformers import pipeline

# Define sentiment analyser model
sentiment_analyzer = pipeline("sentiment-analysis", model="finiteautomata/bertweet-base-sentiment-analysis")

emoji is not installed, thus not converting emoticons or emojis into text. Install emoji: pip3 install emoji==0.6.0


In [868]:
expanded_df = pd.read_csv('data_w_features_expanded.csv')

In [869]:
# Shuffle df to get rid of any influence on the model from the ordering of sentences
shuffled_df = expanded_df.sample(frac=1).reset_index(drop=True)

In [870]:
# Create list of feature columns
features = ['word_number', 'mean_confidence_difference',
       'max_confidence_difference', 'min_confidence_difference',
       'range_confidence_difference', 'std_confidence_difference',
       'mean_confidence', 'max_confidence', 'min_confidence',
       'range_confidence', 'std_confidence', 'original_confidence',
       'original_label', 'amount_label_switched', 'above_0.05', 'above_0.1',
       'above_0.2', 'above_0.3', 'below_-0.05', 'below_-0.1', 'below_-0.2',
       'below_-0.3', 'between_+-0.05',
       'new_confidence_score', 'new_confidence_label', 'confidence_difference',
       'switched', 'POS', 'JJ', 'PRP',
       'RBR', 'VBZ', 'FW', 'EX', '.', 'VBG', 'MD', 'PRP$', 'WP', 'NNS', 'CD',
       'IN', 'NNPS', 'WRB', 'JJS', 'VBN', 'VB', 'RB', 'NN', 'TO', 'DT', 'CC',
       'VBP', 'RBS', 'JJR', 'VBD', 'NNP', 'WDT', 'PDT']

In [871]:
# Define feature df and label df
x = shuffled_df[features]
y = shuffled_df['Value']

In [873]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report

In [874]:
# Split dfs for training and testing sets
X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)

In [876]:
# Define a Scaler
scaler = StandardScaler()

# Fit the scaler on the training data and transform both the training and test data
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [877]:
# Define a logistic regression model
model = LogisticRegression()

# Train the model using the scaled training data
model.fit(X_train_scaled, y_train)

In [878]:
from sklearn.metrics import accuracy_score, classification_report, f1_score

# Define predictions for each word/phrase as being important to the sentiment or not
y_prob = model.predict_proba(X_test_scaled)[:, 1]


best_threshold = 0.0
best_f1 = 0.0  # Initialize to keep track of the highest F1 score
best_accuracy = 0.0  # To store the accuracy corresponding to the best F1 score
best_report = ""  # To store the classification report corresponding to the best F1 score

# Loop through thresholds from 0.0 to 1.0 with a step of 0.01
for threshold in range(0, 101):
    threshold /= 100  # Convert to float between 0.0 and 1.0

    y_pred_custom = (y_prob >= threshold).astype(int)

    # Evaluate the model
    accuracy = accuracy_score(y_test, y_pred_custom)
    report = classification_report(y_test, y_pred_custom, output_dict=True)
    
    # Extract F1 score for class 1
    f1_class_1 = report['1']['f1-score']

    # Check if the current F1 score is the highest
    if f1_class_1 > best_f1:
        best_f1 = f1_class_1
        best_threshold = threshold
        best_accuracy = accuracy
        best_report = classification_report(y_test, y_pred_custom)

# Print the best threshold and corresponding metrics
print(f"Best Threshold: {best_threshold}")
print(f"Accuracy at Best Threshold: {best_accuracy:.4f}")
print(f"F1 Score for Class 1 at Best Threshold: {best_f1:.4f}")
print(best_report)


Best Threshold: 0.41
Accuracy at Best Threshold: 0.8220
F1 Score for Class 1 at Best Threshold: 0.5887
              precision    recall  f1-score   support

           0       0.85      0.92      0.89       432
           1       0.68      0.52      0.59       141

    accuracy                           0.82       573
   macro avg       0.77      0.72      0.74       573
weighted avg       0.81      0.82      0.81       573



In [879]:
# Get the coefficients of the model
coefficients = model.coef_[0]

# Create a DataFrame to view the feature importance more clearly
feature_importance = pd.DataFrame({
    'Feature': x.columns,
    'Coefficient': coefficients
})

# Sort the features by the absolute value of their coefficient
feature_importance['Absolute Coefficient'] = feature_importance['Coefficient'].abs()
feature_importance.sort_values(by='Absolute Coefficient', ascending=False, inplace=True)

print(feature_importance)


                        Feature  Coefficient  Absolute Coefficient
26                     switched     0.968311              0.968311
29                          PRP    -0.891549              0.891549
25        confidence_difference     0.500795              0.500795
48                           NN     0.487481              0.487481
12               original_label     0.446260              0.446260
24         new_confidence_label    -0.428841              0.428841
45                          VBN     0.368012              0.368012
47                           RB     0.354162              0.354162
21                   below_-0.3     0.345858              0.345858
28                           JJ     0.308801              0.308801
18                  below_-0.05     0.294600              0.294600
3     min_confidence_difference     0.284242              0.284242
35                          VBG     0.281971              0.281971
57                          WDT    -0.281383              0.28

## (FAILED- Might Utilize with More Training Data) Threshold Predictor Usinge Linear Regression

In [763]:
predictions = pd.DataFrame(y_prob)

In [764]:
shuffled_df['predictions'] = predictions

In [770]:
from sklearn.metrics import f1_score
import numpy as np

def find_optimal_threshold_for_f1(subset, predictions_column):
    best_threshold = 0.1  # Start with a default threshold of 0.1
    best_f1 = 0  # Track the best overall F1 score

    thresholds = np.arange(0, 1.01, 0.01)  # Thresholds from 0 to 1 in steps of 0.01
    for threshold in thresholds:
        predictions = (subset[predictions_column] >= threshold).astype(int)
        f1 = f1_score(subset['Value'], predictions, average='macro')  # Calculate F1 score
        if f1 > best_f1:
            best_f1 = f1
            best_threshold = threshold
            
    return best_threshold, best_f1

In [771]:
# Find the threshold that optimizes the f1 score for each sentence
for sentence in shuffled_df['Sentence'].unique():
    subset = shuffled_df[shuffled_df['Sentence'] == sentence]
    threshold, f1 = find_optimal_threshold(subset, 'predictions')
    shuffled_df.loc[shuffled_df['Sentence'] == sentence, 'Optimal_Threshold'] = threshold
    shuffled_df.loc[shuffled_df['Sentence'] == sentence, 'Best_f1'] = f1

In [772]:
shuffled_df.sort_values(by='Sentence')['Best_f1'].unique()

array([0.62820513, 0.5       , 0.54444444, 0.58152174, 0.62592593,
       0.61083744, 0.68478261, 0.5877193 , 0.57142857, 0.56666667,
       0.6       , 0.66666667, 0.51052632, 0.66296296, 0.53282828,
       0.58333333, 0.55384615, 0.75      , 0.72222222, 0.54511278,
       0.54166667, 0.58198381, 0.57619048, 0.57236842, 0.58653846,
       0.53571429, 0.55111111, 0.60606061, 0.55555556, 0.93181818,
       0.63636364, 0.55529954, 0.55387205, 0.6372549 , 0.63333333,
       0.66883117, 0.56149733, 0.60902256, 0.6659919 , 0.65      ,
       0.63387097, 0.57894737, 0.52545455, 0.57222222, 0.58730159,
       0.60833333, 0.64642857, 0.59285714, 0.72058824, 0.58516484,
       0.57368421, 0.54040404])

In [775]:
# Define feature columns
features_threshold_cols = ['word_number',
       'mean_confidence_difference', 'max_confidence_difference',
       'min_confidence_difference', 'range_confidence_difference',
       'std_confidence_difference', 'mean_confidence', 'max_confidence',
       'min_confidence', 'range_confidence', 'std_confidence',
       'original_confidence', 'original_label', 'amount_label_switched',
       'above_0.05', 'above_0.1', 'above_0.2', 'above_0.3', 'below_-0.05',
       'below_-0.1', 'below_-0.2', 'below_-0.3', 'between_+-0.05']

In [776]:
# Combine feature columns and label column
sentence_df = pd.concat([shuffled_df[features_threshold_cols], shuffled_df['Optimal_Threshold']], axis=1)

In [777]:
# Drop duplicates
sentence_df = sentence_df.drop_duplicates()

In [779]:
# Create feature df and label df
features_threshold_X = sentence_df[features_threshold_cols]
features_threshold_y = sentence_df['Optimal_Threshold']

In [780]:
from sklearn.feature_selection import mutual_info_regression

mi_scores = mutual_info_regression(features_threshold_X, features_threshold_y)

mi_scores_df = pd.DataFrame({
    'Feature': features_threshold_X.columns,
    'Mutual Information Score': mi_scores
}).sort_values(by='Mutual Information Score', ascending=False)

train_cols = list(mi_scores_df['Feature'])
print(mi_scores_df)

                        Feature  Mutual Information Score
8                min_confidence                  0.105349
20                   below_-0.2                  0.093282
16                    above_0.2                  0.074062
11          original_confidence                  0.060861
2     max_confidence_difference                  0.056648
6               mean_confidence                  0.036496
15                    above_0.1                  0.018154
4   range_confidence_difference                  0.008981
9              range_confidence                  0.008981
7                max_confidence                  0.007976
13        amount_label_switched                  0.003259
21                   below_-0.3                  0.000000
19                   below_-0.1                  0.000000
18                  below_-0.05                  0.000000
17                    above_0.3                  0.000000
0                   word_number                  0.000000
14            

In [781]:
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score

for i in range(1, len(train_cols)):
    
    # Split the data into training and testing sets
    X_train, X_test, y_train, y_test = train_test_split(features_threshold_X[train_cols[0:i]], features_threshold_y, test_size=0.2, random_state=42)

    scaler = StandardScaler()

    # Fit the scaler on the training data and transform both the training and test data
    X_train = scaler.fit_transform(X_train)
    X_test = scaler.transform(X_test)
    
    # Initialize the Linear Regression model
    model = LinearRegression()

    # Fit the model to the training data
    model.fit(X_train, y_train)

    # Predict on the test set
    y_pred = model.predict(X_test)

    print(f"Number of cols: {i}")
    
    # Calculate Mean Squared Error
    mse = mean_squared_error(y_test, y_pred)
    print(f"Mean Squared Error: {mse:.2f}")

    # Calculate R-squared
    r2 = r2_score(y_test, y_pred)
    print(f"R-squared: {r2:.2f}")
    
    print('')


Number of cols: 1
Mean Squared Error: 0.08
R-squared: -0.16

Number of cols: 2
Mean Squared Error: 0.08
R-squared: -0.21

Number of cols: 3
Mean Squared Error: 0.08
R-squared: -0.13

Number of cols: 4
Mean Squared Error: 0.08
R-squared: -0.12

Number of cols: 5
Mean Squared Error: 0.08
R-squared: -0.13

Number of cols: 6
Mean Squared Error: 0.07
R-squared: -0.08

Number of cols: 7
Mean Squared Error: 0.08
R-squared: -0.13

Number of cols: 8
Mean Squared Error: 0.08
R-squared: -0.22

Number of cols: 9
Mean Squared Error: 0.08
R-squared: -0.22

Number of cols: 10
Mean Squared Error: 0.07
R-squared: -0.11

Number of cols: 11
Mean Squared Error: 0.08
R-squared: -0.13

Number of cols: 12
Mean Squared Error: 0.07
R-squared: -0.08

Number of cols: 13
Mean Squared Error: 0.06
R-squared: 0.08

Number of cols: 14
Mean Squared Error: 0.06
R-squared: 0.05

Number of cols: 15
Mean Squared Error: 0.06
R-squared: 0.06

Number of cols: 16
Mean Squared Error: 0.07
R-squared: 0.03

Number of cols: 17
Me

# Sentiment Analysis with Reasoning - Application of Model

In [880]:
import spacy
import nltk
from spacy.matcher import Matcher

nlp = spacy.load('en_core_web_sm')

def extract_phrases(text):
    doc = nlp(text)
    phrases = set()

    # Extract noun chunks
    for chunk in doc.noun_chunks:
        phrases.add(chunk.text)
    
    # Extract verb phrases using dependency parsing
    for token in doc:
        if token.pos_ == 'VERB':
            verb_phrase = ' '.join([child.text for child in token.children if child.dep_ in {'aux', 'neg', 'advmod'}] + [token.text])
            phrases.add(verb_phrase)
    
    # Extract additional phrases using patterns
    matcher = Matcher(nlp.vocab)
    patterns = [
        [{"POS": "ADJ"}, {"POS": "NOUN"}],  # Adjective + Noun
        [{"POS": "NOUN"}, {"POS": "NOUN"}],  # Noun + Noun
        [{"POS": "ADV"}, {"POS": "VERB"}],   # Adverb + Verb
    ]
    matcher.add("PhrasePatterns", patterns)
    
    matches = matcher(doc)
    for match_id, start, end in matches:
        span = doc[start:end]
        phrases.add(span.text)

    return list(phrases)

def part_of_speech_generator(phrase, phrase_dict):
    
    tokens = word_tokenize(phrase)
    
    # Generate POS tags for each token
    tagged_words = pos_tag(tokens)
    
    # Extract the POS tags using list comprehension
    return [word[1] for word in tagged_words]
    
    
# Extract statistical features for each sentence
def get_features_sentence(sentence, phrases):
    row = {}
    differences = []
    confidence_scores = []
    amount_change = 0
    
    sent_results = sentiment_analyzer(sentence)
    sent_label = sent_results[0]['label']
    sent_score = sent_results[0]['score']
    
    above_0_05 = 0
    above_0_1 = 0
    above_0_2 = 0
    above_0_3 = 0
    
    below_0_05 = 0
    below_0_1 = 0
    below_0_2 = 0
    below_0_3 = 0
    
    between_0_05  = 0
    
    phrase_dict = {}
    
    for phrase in phrases:
        
        phrase_dict[phrase] = part_of_speech_generator(phrase, phrase_dict)
        
        new_sent = sentence
        new_sent = new_sent.replace(phrase, "")
        results = sentiment_analyzer(new_sent)
        new_score = results[0]['score']
        new_label = results[0]['label']
        difference = sent_score - new_score
        if new_label == sent_label:
            differences.append((sent_score - new_score))
            confidence_scores.append(new_score)
        else:
            amount_change += 1
            
        if difference > 0.3:
            above_0_3 += 1
        elif difference > 0.2:
            above_0_2 += 1
        elif difference > 0.1:
            above_0_1 += 1
        elif difference > 0.05:
            above_0_05 += 1
        elif difference < -0.3:
            below_0_3 += 1
        elif difference < -0.2:
            below_0_2 += 1
        elif difference < -0.1:
            below_0_1 += 1
        elif difference < -0.05:
            below_0_05 += 1
        else:
            between_0_05 += 1
                    
    num_words = len(sentence.split(' '))
    
    row['word_number'] = [num_words]
    row['mean_confidence_difference'] = [statistics.mean(differences) if differences else 0]
    row['max_confidence_difference'] = [max(differences) if differences else 0]
    row['min_confidence_difference'] = [min(differences) if differences else 0]
    row['range_confidence_difference'] = [(max(differences) - min(differences)) if differences else 0]
    row['std_confidence_difference'] = [statistics.stdev(differences) if len(differences) > 1 else 0]
    
    row['mean_confidence'] = [statistics.mean(confidence_scores) if confidence_scores else 0]
    row['max_confidence'] = [max(confidence_scores) if confidence_scores else 0]
    row['min_confidence'] = [min(confidence_scores) if confidence_scores else 0]
    row['range_confidence'] = [(max(confidence_scores) - min(confidence_scores)) if confidence_scores else 0]
    row['std_confidence'] = [statistics.stdev(confidence_scores) if len(confidence_scores) > 1 else 0]
    
    row['original_confidence'] = [sent_score]
    row['original_label'] = [1 if sent_label == 'POS' else 0]
    
    row['amount_label_switched'] = [amount_change]
    
    row['above_0.05'] = [above_0_05]
    row['above_0.1'] = [above_0_1]
    row['above_0.2'] = [above_0_2]
    row['above_0.3'] = [above_0_3]
    row['below_-0.05'] = [below_0_05]
    row['below_-0.1'] = [below_0_1]
    row['below_-0.2'] = [below_0_2]
    row['below_-0.3'] = [below_0_3]
    row['between_+-0.05'] = [between_0_05]
    return row, phrase_dict

# Extract features for each phrase/words effect on confidence score and label
def get_confidence(sentence, original_confidence, original_confidence_label, phrase_df, phrase):
    sent_score = original_confidence
    new_sent = sentence
    new_sent = new_sent.replace(phrase, "")
    results = sentiment_analyzer(new_sent)
    new_score = results[0]['score']
    new_label = results[0]['label']
    
    phrase_df['new_confidence_score'] = new_score
    phrase_df['new_confidence_label'] = [
        1 if new_label == 'POS' else 2 if new_label == 'NEG' else 0
    ]
    phrase_df['confidence_difference'] = sent_score - new_score
    if new_label == original_confidence_label:
        phrase_df['switched'] = 0
    else:
        phrase_df['switched'] = 1
    return phrase_df

In [927]:
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

nltk.download('stopwords')
nltk.download('punkt')
stop_words = set(stopwords.words('english'))


pos_tags = ['POS', 'JJ', 'PRP', 'RBR', 'VBZ', 'FW', 'EX', '.', 'VBG', 'MD', 'PRP$', 'WP',
       'NNS', 'CD', 'IN', 'NNPS', 'WRB', 'JJS', 'VBN', 'VB', 'RB', 'NN', 'TO',
       'DT', 'CC', 'VBP', 'RBS', 'JJR', 'VBD', 'NNP', 'WDT', 'PDT']

# Function that gives the sentiment of each sentence along with an explanation of the words 
# important to its sentiment using the model
def generate_explanation(sentence):
    sent_results = sentiment_analyzer(sentence)
    sent_label = sent_results[0]['label']
    sent_score = sent_results[0]['score']

    phrases = extract_phrases(sentence)
    words = sentence.split(' ')
    
    for word in words:
        if (word not in phrases) and (word.lower() not in stop_words):
            phrases.append(word)
    
    features, phrase_dict = get_features_sentence(sentence, phrases)
    
    important = ''
    amount = 0
    for phrase in phrases:
        phrase_df = pd.DataFrame(features)
        
        phrase_df = get_confidence(sentence, sent_score, sent_label, phrase_df, phrase)
        
        for pos in pos_tags:
            if pos in phrase_dict[phrase]:
                phrase_df[pos] = 1
            else:
                phrase_df[pos] = 0

        phrase_df_scaled = scaler.transform(phrase_df)
        
        
        pred = model.predict_proba(phrase_df_scaled)[:, 1]
        
        if pred >= 0.41:
            amount += 1
            important += '"'
            important += phrase
            important += '," '
        
    important = important[:-3] + important[-3 + 1:]
    important = important[:-1]
    if amount >= 2:
        important = important.split(" ")
        important.insert(len(important) + -1, 'and')
        important = " ".join(important)
        sing_plur = 'words/phrases'
    else:
        sing_plur = 'word/phrase'
            
    label_mapping = {'NEU' : 'neutral', 'POS' : 'positive', 'NEG' : 'negative'}
    explanation = f"This sentence is {label_mapping[sent_label]} because it contains the {sing_plur} {important}."
    return explanation

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\17028\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\17028\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [928]:
new_sentences = pd.read_csv('pos_neg_sentences.csv')

In [929]:
sentence_list = list(new_sentences['Sentence'][100:500])

In [935]:
# Print the sentiment and important phrases/words for each sentence in sentence_list
for sentence in sentence_list:
    print(sentence)
    print("")
    explanation = generate_explanation(sentence)
    print(explanation)
    print("_____________________________________")
    print("")

The best AI image generator if you have a reference photo you'd like for the AI image generator to use as inspiration in either structure or style when rendering a new image. 

This sentence is positive because it contains the words/phrases "a reference photo," "new image," "a new image," "The best AI image generator," and "best".
_____________________________________

“Today, I am fired up to endorse Kamala Harris for President of the United States,” Whitmer wrote in a statement.

This sentence is positive because it contains the word/phrase "fired".
_____________________________________

And OpenAI has made a lot of advancements since then. For example: The paid version of ChatGPT includes capabilities for web search with real-time info in partnership with Bing, and plug-ins for travel companies allow users to access information about flights and more.

This sentence is positive because it contains the words/phrases "has made," "The paid version," "advancements," "real-time info," an

This sentence is negative because it contains the words/phrases "sickens," "has recently and found".
_____________________________________

By contrast, the new system can operate directly in speech without needing to lean on other models to prop it up, speeding up responses and allowing it to acknowledge quirks such as tone of voice.

This sentence is positive because it contains the words/phrases "allowing," "needing," "the new system," "responses," "speeding," "new," "lean," "models," and "up,".
_____________________________________

Listeria is especially dangerous to women who are pregnant, people 65 and older and those with weakened immune systems. The CDC says people in those categories should not eat meat sliced at any deli counter unless it is reheated to an internal temperature of 165 degrees Fahrenheit.

This sentence is negative because it contains the word/phrase "should not counter eat".
_____________________________________

Beyond the fact that voters had virtually no c

This sentence is positive because it contains the words/phrases "a lot," "support," "will bring," "bring," and "lot".
_____________________________________

If you don’t understand what that means, you are both not alone and in a safe space. Welcome to “Bratology 101.”

This sentence is positive because it contains the words/phrases "a safe space," "means," "safe space," and "safe".
_____________________________________

“I think we all support Vice President Harris,” he said. “I think when I look at it, both narratively and logistically, she’s the right fit for where we are.”

This sentence is positive because it contains the words/phrases "narratively ’s," "right fit," "he," "the right fit," "support," "think," "narratively," "logistically,," "right," and "fit".
_____________________________________

High-quality renditions, four images per prompt, and fun expressive chip feature

This sentence is positive because it contains the words/phrases "High-quality renditions," "expressive c

This sentence is negative because it contains the words/phrases "The deadly outbreak strain," "deadly outbreak," and "deadly".
_____________________________________

On Tuesday, Biden condemned the protesters’ use of the word “intifada,” as well as the group of Columbia University students who illegally occupied and barricaded a campus building. Biden also alluded to the campus turmoil in his proclamation this week marking Jewish American Heritage Month.

This sentence is negative because it contains the words/phrases "Columbia University students," and "condemned".
_____________________________________

The Kremlin has severely crippled the Russian opposition in recent years. Top figures are either in jail or in exile abroad, and the death last month of Alexei Navalny, who was Putin’s most vocal opponent, raised even more questions about what lies ahead for them.

This sentence is negative because it contains the words/phrases "has severely crippled," "even more questions," "severely 

This sentence is positive because it contains the words/phrases "helping," "a bounce," "advancing," "medical product," "also gets," "enhancing," "the new budget," "new and budget".
_____________________________________

Analysts at BMO Capital Markets also highlighted CRTO as the net beneficiary, adding that The Trade Desk (NASDAQ:TTD) also appears well-positioned with no deprecation.

This sentence is positive because it contains the words/phrases "adding," "well positioned," "also appears," "also highlighted," "net beneficiary," "no deprecation," "the net beneficiary," "also," "highlighted," "net," "beneficiary,," "(NASDAQ:TTD)," "well-positioned," and "deprecation.".
_____________________________________

The two deaths occurred in Illinois and New Jersey.

This sentence is negative because it contains the word/phrase "Jersey.".
_____________________________________

That's why it's hard to find Ukrainians these days who speak well of any Russians.

This sentence is negative because

This sentence is negative because it contains the word/phrase "are n't getting".
_____________________________________

“We’ve been thrown into an emotional tailspin,” is how Levin described President Joe Biden’s abrupt departure from the 2024 campaign this weekend and his quick endorsement of Vice President Kamala Harris. “What next? An enormous feeling of relief along with an enormous feeling of anxiety.”

This sentence is negative because it contains the words/phrases "abrupt departure," "anxiety," "President Joe Biden’s abrupt departure," "thrown," "tailspin,”," "next?," and "anxiety.”".
_____________________________________

Location is only one part of healthy living. Ultimately, it’s individual behaviors executed on a daily basis that have the most impact on getting and staying healthy. The best part is that you don’t have to do it alone. 

This sentence is positive because it contains the words/phrases "staying," "a daily basis," "the most impact," "individual behaviors," "do n

This sentence is negative because it contains the word/phrase "cannot".
_____________________________________

Beshear, 46, was elected governor in 2019 and recently won reelection in November in a five-point victory, a significant win for Democrats in a deeply red state. Beshear was also elected the state’s attorney general in 2015.

This sentence is positive because it contains the words/phrases "a significant win," "significant win," "also elected," "significant," and "win".
_____________________________________

“Obviously, it’s going to be a compressed period of time and Gov. Shapiro has an amazing record that’s worthy of consideration,” Bradford continued, “but again, today is about President Biden and his decades of service to our country.”

This sentence is positive because it contains the words/phrases "amazing record," "Obviously ’s going," "compressed period," "a compressed period," "an amazing record," "“Obviously,," "compressed," "amazing," and "continued,".
______________

This sentence is negative because it contains the word/phrase "sick".
_____________________________________

The penultimate demonstration was an impressive display of GPT-4o's linguistic abilities, as it simultaneously translated two languages – English and Italian – out loud. 

This sentence is positive because it contains the words/phrases "simultaneously loud translated," "The penultimate demonstration," "simultaneously translated," "an impressive display," "impressive display," "linguistic abilities," "impressive," "simultaneously," and "translated".
_____________________________________

Seven New Yorkers contracted listeria linked to sliced deli counter meat in a multistate outbreak that killed two and landed more than a dozen people in the hospital, health officials said Friday.

This sentence is negative because it contains the words/phrases "sliced deli counter meat," "sliced and deli".
_____________________________________

Presidential budget proposals are typically seen as

This sentence is negative because it contains the words/phrases "systemic torture," and "torture".
_____________________________________

Israeli forces seized the main border crossing between Egypt and southern Gaza earlier today, in what Israel and the US say is a limited offensive rather than a full-out offensive on the city.

This sentence is negative because it contains the words/phrases "seized," "Israeli and forces".
_____________________________________

OpenAI's popular ChatGPT was launched in Nov. 2022, and quickly amassed over 100 million users. ((Photo by Harun Ozalp/Anadolu Agency via Getty Images) / Getty Images)

This sentence is positive because it contains the words/phrases "quickly amassed," "over 100 million and users".
_____________________________________

Lexington, Kentucky, and Miami had the most significant jumps in ranking. The former gained 31 places since last year to claim the 49th spot, and the latter occupied the 13th position this year, an improvement of

This sentence is negative because it contains the words/phrases "controversial elements," "is not doing," "the most controversial and elements".
_____________________________________

NASA is celebrating the 25th anniversary of its Chandra X-ray Observatory launch by sharing never-before-seen photos of the largest known spiral galaxy in the universe. 

This sentence is positive because it contains the words/phrases "the largest known spiral galaxy," "is celebrating," "never before seen," "spiral galaxy," "known," "celebrating," and "largest".
_____________________________________

The full multimodal features with the ability to talk naturally using speech-to-speech are still being rolled out slowly, but even the chat version — conversing in text and pictures — is faster and more responsive than its predecessors.

This sentence is positive because it contains the words/phrases "multimodal features," "using," "its predecessors," "even the chat version," "naturally using," "are still slo

This sentence is positive because it contains the words/phrases "has already surpassed," "the 2023 to 2024 season," "already surpassed," "celebrating," "one term," "most presidents," "already," and "surpassed".
_____________________________________

President Joe Biden on Monday released a budget proposal aimed at getting voters' attention: It would offer tax breaks for families, lower health care costs, smaller deficits and higher taxes on the wealthy and corporations.

This sentence is positive because it contains the words/phrases "would offer," "lower health," "higher taxes," "smaller deficits," "aimed," "care costs," "released," "lower health care costs," "offer," "lower," "health," "smaller," and "higher".
_____________________________________

Signs and symptoms of listeria infection can vary. For intestinal illness, which usually starts within 24 hours after eating contaminated food and lasts around 1 to 3 days, symptoms include diarrhea and vomiting.

This sentence is negative

This sentence is positive because it contains the words/phrases "could signify," "significantly enhance," "driven," "away moving," "successfully launches," "new search," "search models," "this new search product," "user experience," "more integrated, AI-driven responses," "a significant shift," "internet searches," "OpenAI," "traditional search," "significant shift," "search product," "traditional search models," "successfully," "product,," "signify," "significant," "internet," "conducted—moving," "away," "integrated,," "significantly," and "enhance".
_____________________________________

It is not immediately clear whether the murdered man is Israeli, as some Egyptian outlets claim.

This sentence is negative because it contains the words/phrases "murdered," "the murdered man," "clear," and "Israeli,".
_____________________________________

"This protest is beautiful and we all here in the West, we can see people against Putin, but it doesn't change Putin's regime."

This sentence is

This sentence is positive because it contains the words/phrases "respect," and "beat".
_____________________________________

BIDEN, AT HOLOCAUST REMEMBRANCE CEREMONY, SAYS HATRED AGAINST JEWS BROUGHT TO LIFE BY HAMAS ATTACK

This sentence is negative because it contains the word/phrase "HATRED".
_____________________________________

Biden last week delivered brief remarks denouncing "chaos" on college campuses, saying that things had gone too far.

This sentence is negative because it contains the words/phrases "gone," and "far.".
_____________________________________

Which is exactly why, despite the challenges we face, we just took a big gulp and joined forces with the Center for Investigative Reporting, a team of ace journalists who create the amazing podcast and public radio show Reveal.

This sentence is positive because it contains the words/phrases "amazing podcast," "the amazing podcast," "joined," and "amazing".
_____________________________________

Microsoft and OpenAI ha

This sentence is positive because it contains the words/phrases "the right running mate," "running mate," "presidential nominee," "No one," "the presumptive Democratic presidential nominee," "right running," "better selecting," and "better".
_____________________________________

Lifehacker supports Group Black and its mission to increase greater diversity in media voices and media ownerships.

This sentence is positive because it contains the words/phrases "greater diversity," "Lifehacker," and "greater".
_____________________________________

Wirtfschafter referred to a soon-to-be-released text-to-video generative artificial intelligence model as the kind of technology that could pose danger in the upcoming election. 

This sentence is negative because it contains the words/phrases "upcoming election," "the upcoming election," "could pose," "kind," and "upcoming".
_____________________________________

Earlier this year, OpenAI dropped the requirement to sign up for accounts. On Mond

This sentence is negative because it contains the words/phrases "peaceful protests," ""not," and "peaceful".
_____________________________________

Rudaeff went out to join the battle for the kibbutz, and later sent a message that he had been hurt. He sent his love to his wife, Yaffa, and his four children, and since then, all communication was lost.

This sentence is negative because it contains the words/phrases "all communication," "lost," "hurt.," "then,," and "lost.".
_____________________________________

As you can see above, the images rendered of the Yorkies are high-quality, realistic, and detailed. Additionally, the biggest standout features of this chatbot are its Structure Reference and Style Reference features. 

This sentence is positive because it contains the words/phrases "standout features," "rendered," "biggest standout," "the biggest standout features," and "biggest".
_____________________________________

“They want people to feel like they have a voice in the sys

This sentence is negative because it contains the word/phrase .
_____________________________________

At least two people have died and 28 were hospitalized in an outbreak of listeria food poisoning linked to meat sliced at grocery store deli counters across a dozen states.

This sentence is negative because it contains the words/phrases "At least two people," and "least".
_____________________________________

“I will do everything in my power to unite the Democratic Party — and unite our nation — to defeat Donald Trump and his extreme Project 2025 agenda,” she said.

This sentence is positive because it contains the words/phrases "to defeat," "will do," and "defeat".
_____________________________________

The White House is also proposing further investments in the “Trusted Workforce 2.0” strategy, which aims to bolster cybersecurity and personnel vetting procedures governmentwide.

This sentence is positive because it contains the words/phrases "further investments," "procedures go

This sentence is positive because it contains the words/phrases "would boost," "market share," "captured," "would," and "boost".
_____________________________________

What’s next. Google hasn’t provided a specific timeline for the new approach, likely to avoid the pitfalls of previous delays.

This sentence is negative because it contains the words/phrases "has n’t likely provided," "hasn’t," and "provided".
_____________________________________

Context: There was a surge in police reform talks following the murder of George Floyd, a Black man who was killed by Minneapolis police in 2020. But the talks ultimately collapsed in September 2021, with negotiators saying their differences were simply too vast.

This sentence is negative because it contains the word/phrase .
_____________________________________

A multi-state listeria outbreak linked to deli meat has left 28 people hospitalized and two dead.

This sentence is negative because it contains the word/phrase .
_________________

This sentence is negative because it contains the word/phrase "warned".
_____________________________________

Remember: Earlier today, GOP Rep. Tim Burchett described Harris as a “DEI hire,” a comment characterized as racist and offensive by Congressional Black Caucus Chairman Steve Horsford.

This sentence is negative because it contains the word/phrase .
_____________________________________

The FBI Boston Division recovered 22 artifacts stolen from Japan, including the artwork above. During World War II, various treasures from the Ryukyu Kingdom were stolen.

This sentence is negative because it contains the words/phrases "stolen," "treasures," and "stolen.".
_____________________________________

Listeria symptoms usually start within two weeks of eating food contaminated with the bacteria, but may start as early as the same day or as late as 10 weeks after, the CDC said. Symptoms include fever, muscle aches, tiredness, headache, stiff neck, confusion, loss of balance and seizure