In [1]:
import nltk
# Fetch the NLTK resources used later by extract_keywords:
#   - 'punkt' / 'punkt_tab': sentence/word tokenizer data for word_tokenize.
#     Recent NLTK releases (3.9+) load 'punkt_tab' instead of the pickled
#     'punkt' models, so both are requested to keep the notebook runnable
#     on a fresh runtime regardless of the installed NLTK version.
#   - 'stopwords': the English stopword list.
# nltk.download reports a failure by returning False rather than raising,
# so an unknown resource name on an older release does not stop the cell.
for resource in ('punkt', 'punkt_tab', 'stopwords'):
    nltk.download(resource)

import pandas as pd
import torch
from transformers import BertTokenizer, BertForSequenceClassification
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.probability import FreqDist

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [2]:
# Mount Google Drive at /content/drive so that files stored there can be
# opened by ordinary filesystem paths from this Colab runtime.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
# Load the saved model and tokenizer
import os
import torch
from transformers import BertTokenizer, BertForSequenceClassification, BertConfig
# All inference in this notebook runs on the CPU.
device = torch.device('cpu')

# Directory on the mounted Drive that holds the saved model files.
model_path = '/content/drive/My Drive/BERT_SurveySparrow_Model'

# Restore the tokenizer, the configuration and the classification model
# from that same directory, then place the model on the chosen device.
tokenizer = BertTokenizer.from_pretrained(model_path)
config = BertConfig.from_pretrained(model_path)
model = BertForSequenceClassification.from_pretrained(model_path, config=config)
model.to(device)

def predict_sentiment(query, positive_threshold=0.6, negative_threshold=0.4):
    """Classify the sentiment of a single query with the loaded BERT model.

    The query is tokenized (truncated to at most 64 tokens), passed through
    the module-level ``model`` with gradient tracking disabled, and the
    softmax probability at class index 1 is compared with the two thresholds.

    Args:
        query: Text to classify.
        positive_threshold: Scores strictly above this value are labelled
            "Positive". Defaults to 0.6.
        negative_threshold: Scores strictly below this value are labelled
            "Negative". Defaults to 0.4.

    Returns:
        "Positive", "Negative", or "Neutral" (the score lies between the
        two thresholds, bounds included).

    Raises:
        ValueError: If ``negative_threshold`` is greater than
            ``positive_threshold``; the bands would overlap and the
            "Positive" branch would silently win.
    """
    if negative_threshold > positive_threshold:
        raise ValueError(
            f"negative_threshold ({negative_threshold}) must not exceed "
            f"positive_threshold ({positive_threshold})"
        )

    inputs = tokenizer(query, return_tensors="pt", truncation=True, padding=True, max_length=64)
    # Move every input tensor to the same device as the model.
    inputs = {k: v.to(device) for k, v in inputs.items()}

    with torch.no_grad():
        outputs = model(**inputs)

    probabilities = torch.softmax(outputs.logits, dim=1)
    # Index 1 is treated as the positive class.
    # NOTE(review): confirm this against the model's label mapping.
    sentiment_score = probabilities[0][1].item()

    if sentiment_score > positive_threshold:
        return "Positive"
    if sentiment_score < negative_threshold:
        return "Negative"
    return "Neutral"

def extract_keywords(query, num_keywords=5):
    """Return the most frequent non-stopword tokens of a query.

    The text is lower-cased and tokenized with NLTK's ``word_tokenize``;
    English stopwords and tokens that are not purely alphanumeric (e.g.
    punctuation) are dropped, and the remaining tokens are ranked by
    frequency.

    Args:
        query: Text to analyse. Non-string values (such as ``None`` or the
            float NaN that pandas produces for an empty CSV cell) and blank
            strings yield no keywords.
        num_keywords: Maximum number of keywords to return. Defaults to 5.

    Returns:
        A list of at most ``num_keywords`` lower-case tokens, most frequent
        first.
    """
    # Non-text input has no .lower(); report "no keywords" instead of
    # raising so one bad row does not abort a whole dataset run.
    if not isinstance(query, str) or not query.strip():
        return []

    tokens = word_tokenize(query.lower())

    # Keep alphanumeric tokens that are not English stopwords.
    stop_words = set(stopwords.words('english'))
    candidates = [token for token in tokens if token.isalnum() and token not in stop_words]

    # FreqDist counts occurrences; most_common returns (word, count) pairs.
    return [word for word, _ in FreqDist(candidates).most_common(num_keywords)]

def analyze_query(query):
    """Run sentiment classification and keyword extraction on one query.

    Returns a dict holding the original text under "query", the label from
    predict_sentiment under "sentiment", and the list from extract_keywords
    under "keywords".
    """
    report = {"query": query}
    report["sentiment"] = predict_sentiment(query)
    report["keywords"] = extract_keywords(query)
    return report

In [7]:
# Read the escalation dataset from the Colab workspace
df = pd.read_csv('/content/Agent_escalation.csv')

# Run the sentiment + keyword analysis once per entry of the 'Query' column
results = [analyze_query(query) for query in df['Query']]

# One row per analysed query, in the same order as df
results_df = pd.DataFrame(results)

# Place the predicted columns next to the original ones (aligned on index)
final_df = pd.concat(
    [df, results_df[['sentiment', 'keywords']]],
    axis=1,
)

# Preview the combined frame
print(final_df.head())

# Persist the combined frame without the index column
final_df.to_csv('sentimental_analyzed_dataset.csv', index=False)
print("Analysis complete. Results saved to 'sentimental_analyzed_dataset.csv'")

                                               Query            Escalation  \
0            Can I set an expiry date for my survey?  No escalation needed   
1  Could you explain how to how do i connect my g...     Escalation needed   
2  I need assistance with account management: wha...     Escalation needed   
3  I'm having trouble with pricing and billing: i...     Escalation needed   
4  Can you help me understand how to the survey i...     Escalation needed   

  Sentiment             Category sentiment  \
0  Positive      Data Collection  Negative   
1   Neutral         Integrations  Positive   
2  Positive   Account Management  Positive   
3  Negative  Pricing and Billing  Positive   
4   Neutral     Technical Issues  Positive   

                                           keywords  
0                       [set, expiry, date, survey]  
1      [could, explain, connect, google, analytics]  
2  [need, assistance, account, management, process]  
3     [billing, trouble, pricing, disc