# In [None]:
import pandas as pd
import nltk
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import LatentDirichletAllocation
from wordcloud import WordCloud
import matplotlib.pyplot as plt
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

# Build the VADER sentiment scorer once; it is reused for every query below.
analyzer = SentimentIntensityAnalyzer()

# Read the customer queries. The CSV is expected to provide a "query" column;
# a missing column surfaces as a KeyError on the lookup below.
data = pd.read_csv('customer_queries.csv')
queries = data['query']

# Preprocess Text
def preprocess_text(text: object) -> str:
    """Normalize one customer query for bag-of-words processing.

    The text is lowercased and tokenized with ``nltk.word_tokenize``; only
    tokens made up entirely of alphanumeric characters are kept, and the
    survivors are rejoined with single spaces. Note that this drops every
    token containing any other character, not just stand-alone punctuation
    marks.

    Args:
        text: The raw query. Anything that is not a ``str`` (for example
            ``None`` or the float NaN that pandas produces for empty CSV
            cells) is treated as an empty query.

    Returns:
        The cleaned, space-separated token string; ``''`` for non-string
        input.
    """
    if not isinstance(text, str):
        # Missing cells would otherwise fail on ``.lower()`` with an
        # AttributeError and abort the whole ``apply``.
        return ''
    tokens = nltk.word_tokenize(text.lower())
    return ' '.join(token for token in tokens if token.isalnum())

# Normalize the raw column once: empty CSV cells arrive as NaN, which neither
# the cleaning step nor the sentiment scorer can handle as text.
raw_queries = data['query'].fillna('').astype(str)

# Cleaned text feeds the bag-of-words steps further down (TF-IDF, LDA, word cloud).
data['cleaned_query'] = raw_queries.apply(preprocess_text)

# Sentiment Analysis
# Score the RAW text, not the cleaned text. The cleaning step lowercases the
# query and discards every token that contains a non-alphanumeric character,
# which removes the capitalization, punctuation and contraction cues that
# VADER's rule-based scoring is documented to rely on.
data['sentiment'] = raw_queries.apply(
    lambda text: analyzer.polarity_scores(text)['compound']
)

# Visualize Sentiment Distribution
plt.hist(data['sentiment'], bins=20, color='skyblue')
plt.title('Sentiment Distribution of Customer Queries')
plt.xlabel('Sentiment Score')
plt.ylabel('Frequency')
plt.show()

# TF-IDF Vectorization
# Keep at most the 1000 highest-ranked terms and drop English stop words.
tfidf_vectorizer = TfidfVectorizer(max_features=1000, stop_words='english')
tfidf_matrix = tfidf_vectorizer.fit_transform(data['cleaned_query'])

# Topic Modeling with LDA
# Five topics; the fixed random_state keeps the fitted topics reproducible.
# NOTE(review): LDA is usually fitted on raw term counts rather than TF-IDF
# weights -- confirm TF-IDF input is intentional here.
lda = LatentDirichletAllocation(n_components=5, random_state=42)
lda.fit(tfidf_matrix)

# Display top words in each topic
# Fetch the vocabulary once instead of once per printed word.
feature_names = tfidf_vectorizer.get_feature_names_out()
for i, topic in enumerate(lda.components_):
    # argsort is ascending, so the last ten indices are the ten heaviest
    # terms; reverse them so the strongest term is listed first.
    top_indices = topic.argsort()[-10:][::-1]
    topic_words = [feature_names[index] for index in top_indices]
    print(f"Topic {i+1}: {' '.join(topic_words)}")

# Word cloud of the most frequent words across all cleaned queries.
# Every cleaned query is merged into one space-separated corpus string first.
corpus_text = ' '.join(data['cleaned_query'])
cloud_builder = WordCloud(width=800, height=400, background_color='white')
wordcloud = cloud_builder.generate(corpus_text)

# Render the cloud as an image with the axes hidden.
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis('off')
plt.show()
