# Data Visualization 
# Week 9: Text Data Visualization

In [None]:
# Import the required packages
import pandas as pd
import matplotlib.pyplot as plt
import nltk
import warnings
warnings.filterwarnings('ignore')

In [None]:
# Please install the NLTK library before use
# !pip install nltk

# 1. Text Preprocessing

## 1.1 Tokenization

### Sentence Tokenization

In [None]:
# The sentence below was taken from https://en.wikipedia.org/wiki/Natural_language_processing. 

sentence = "Natural language processing (NLP) is an interdisciplinary subfield of linguistics and computer science. It is primarily concerned with processing natural language datasets, such as text corpora or speech corpora, using either rule-based or probabilistic machine learning approaches. The goal is a computer capable of understanding the contents of documents, including the contextual nuances of the language within them. The technology can then accurately extract information and insights contained in the documents as well as categorize and organize the documents themselves."

In [None]:
# Import the sentence tokenize function 

from nltk.tokenize import sent_tokenize

In [None]:
# Tokenizing a given text into individual sentences
tokenized_sent = sent_tokenize(sentence)

for i, sent in enumerate(tokenized_sent):
    print(i, ':', sent)

### Word Tokenization

In [None]:
# Import the word tokenize function 

from nltk.tokenize import word_tokenize

In [None]:
# Tokenizing a given text into individual words

tokenized_word = word_tokenize(sentence)
print(tokenized_word)

In [None]:
for i, word in enumerate(tokenized_word):
    print(i, ':', word)

## 1.2 Normalization

### Lower Casing the Data 

In [None]:
# Convert all the characters in the string to lowercase
lowercase_sent = sentence.lower()

In [None]:
lowercase_tokenized_word = word_tokenize(lowercase_sent)
print(lowercase_tokenized_word)

In [None]:
len(lowercase_tokenized_word)

## 1.3 Stopwords

In [None]:
# Import the stopwords module

from nltk.corpus import stopwords

In [None]:
# Load the English stopwords

stop_words=set(stopwords.words("english"))
print(stop_words)

In [None]:
# Remove stopwords: keep only the tokens that are not English stopwords.
# `stop_words` is a set, so each membership test is a hash lookup.
filtered_tokens = [
    token for token in lowercase_tokenized_word if token not in stop_words
]

In [None]:
print(len(lowercase_tokenized_word))
print(len(filtered_tokens))

## 1.4 Removing punctuation

In [None]:
# Import the string module
import string

In [None]:
# Create a list of punctuations
punct = list(string.punctuation)
print(punct)

In [None]:
# Remove punctuation: drop every token that is exactly one of the characters
# listed in `punct`. Tokens longer than one character are never in `punct`,
# so they are always kept.
filtered_tokens_v2 = [token for token in filtered_tokens if token not in punct]

In [None]:
print(len(lowercase_tokenized_word))
print(len(filtered_tokens))
print(len(filtered_tokens_v2))

In [None]:
print(lowercase_tokenized_word)
print()
print(filtered_tokens)
print()
print(filtered_tokens_v2)

## 1.5 Stemming
#### Stemming is the process of reducing a word to its root form (its stem).

In [None]:
# Import the PorterStemmer class from the NLTK library
from nltk.stem import PorterStemmer

In [None]:
# Stem every remaining token with the Porter stemmer
ps = PorterStemmer()

stemmed_words = [ps.stem(token) for token in filtered_tokens_v2]

In [None]:
# Print the original word vs stemmed version
for w in filtered_tokens_v2:
    print(w, " : ", ps.stem(w))

In [None]:
print(stemmed_words)
print(len(stemmed_words))

## 1.6 Lemmatization

In [None]:
# Import the WordNetLemmatizer class from the NLTK library.
from nltk.stem import WordNetLemmatizer 

In [None]:
lemmatizer = WordNetLemmatizer()

In [None]:
lemmatized_words = [lemmatizer.lemmatize(word) for word in filtered_tokens_v2]

In [None]:
print(lemmatized_words)
print(len(lemmatized_words))

## Part-of-Speech (POS) Tagging

In [None]:
# Perform part-of-speech tagging
tagged_word = nltk.pos_tag(lemmatized_words)
print(tagged_word)

# 2. Text Visualization

## 2.1 Frequency distribution

In [None]:
# Import the FreqDist class
from nltk.probability import FreqDist

In [None]:
freq_word = FreqDist(lemmatized_words)
print(freq_word)   #Could you please explain the result.

In [None]:
# Show the five most common words
freq_word.most_common(5)

In [None]:
# Plot the frequency distribution 
plt.figure(figsize = (6, 3))
freq_word.plot(25, cumulative = False)  # Plot the top 25 most common words.
plt.show()

In [None]:
freq_word = FreqDist(filtered_tokens_v2)
freq_word.most_common(5)

In [None]:
plt.figure(figsize = (4, 3))
freq_word.plot(10, cumulative = False)
plt.show()

## 2.2 Word Cloud

In [None]:
# Please install the wordcloud library before use
# !pip install wordcloud

In [None]:
from wordcloud import WordCloud

In [None]:
# Create a string from the "lemmatized_words"
filtered_tokens_v3 = " ".join(lemmatized_words)

In [None]:
# Create a WordCloud object
wordcloud = WordCloud(width = 600, height = 600,
                      background_color = 'white',
                      min_font_size = 8,
                      colormap = 'viridis').generate(filtered_tokens_v3)

In [None]:
# Display WordCloud 
#plt.figure(figsize=(5, 5), facecolor = None)
plt.imshow(wordcloud)
plt.axis("off")
plt.tight_layout(pad=0)

plt.show()

# 3. Sentiment analysis

In [None]:
# !pip install textblob

In [None]:
# Import textblob
from textblob import TextBlob

In [None]:
# Example of textblob on sentiment analysis
review1 = "The pizza was delicious. The crust was perfectly crispy, and the toppings were generous. I'll definitely be coming back for more."
review2 = "I cannot believe how terrible the food was at this restaurant. The pizza had a cardboard-like crust, and the toppings tasted like they were old and stale. I've never had such a disappointing dining experience in my life. I wouldn't recommend this place to my worst enemy."


# Create a TextBlob object
blob1 = TextBlob(review1)
blob2 = TextBlob(review2)

# Perform sentiment analysis and print the results
print('Review1 :', blob1.sentiment)
print('Review2 :', blob2.sentiment)

In [None]:
# Dataset from https://www.kaggle.com/datasets/jessicali9530/kuc-hackathon-winter-2018
df = pd.read_csv('drugsComTest_raw.csv') 
df.head()

In [None]:
df.info()

In [None]:
df = df.dropna()
df.isna().sum()

In [None]:
# Filter the DataFrame by a string value ("Depression") in the "condition" column
filtered_df = df[df['condition'].str.contains('Depression')]
filtered_df.sample(5)

In [None]:
# Filter the DataFrame to only specific columns
filtered_df = filtered_df.loc[:, ['uniqueID', 'drugName', 'condition', 'review']]
filtered_df.sample(5)

In [None]:
filtered_df.info()

In [None]:
from nltk import pos_tag
import html

In [None]:
# Download the NLTK resources used in this notebook (the downloader reports
# resources that are already up to date instead of fetching them again).
# 'punkt_tab' and 'averaged_perceptron_tagger_eng' are the resource names
# looked up by recent NLTK releases; the older names are kept so the
# notebook also runs on earlier releases.
nltk_resources = [
    'punkt',
    'punkt_tab',
    'stopwords',
    'wordnet',
    'averaged_perceptron_tagger',
    'averaged_perceptron_tagger_eng',
    'omw-1.4',
]

for resource in nltk_resources:
    nltk.download(resource)

In [None]:
# Function for text preprocessing

from functools import lru_cache


@lru_cache(maxsize=None)
def _english_stop_words():
    """Return the English stopword list as a frozenset, loaded only once."""
    return frozenset(stopwords.words('english'))


@lru_cache(maxsize=None)
def _shared_lemmatizer():
    """Return a single WordNetLemmatizer instance that is reused across calls."""
    return WordNetLemmatizer()


def preprocess_text(text):
    """Reduce a raw review to its lowercase, lemmatized nouns and adjectives.

    Steps: decode HTML character references, tokenize, keep alphanumeric
    tokens in lowercase, drop English stopwords, lemmatize, POS-tag, and keep
    only the tokens whose tag starts with 'NN' or 'JJ'.

    Args:
        text: The raw review text.

    Returns:
        The kept tokens joined by single spaces. An empty string is returned
        when `text` is not a string (for example a missing value) or contains
        only whitespace.
    """
    # Guard: missing values and blank strings produce no tokens, so return
    # early without touching the tokenizer or the tagger.
    if not isinstance(text, str) or not text.strip():
        return ""

    # Convert HTML character references (e.g. "&#039;") to the characters
    # they stand for
    cleaned_text = html.unescape(text)

    # Tokenization
    tokens = word_tokenize(cleaned_text)

    # Remove punctuation and convert to lowercase
    words = [word.lower() for word in tokens if word.isalnum()]

    # Remove stopwords (the set is built once and reused for every call)
    stop_words = _english_stop_words()
    words = [word for word in words if word not in stop_words]

    # Lemmatization (the lemmatizer is created once and reused for every call)
    lemmatizer = _shared_lemmatizer()
    lemmatized_words = [lemmatizer.lemmatize(word) for word in words]

    # Part-of-speech tagging
    pos_tags = pos_tag(lemmatized_words)

    # Filter nouns and adjectives
    filtered_words = [word for word, pos in pos_tags if pos.startswith(('NN', 'JJ'))]

    return " ".join(filtered_words)

In [None]:
# Apply the preprocessing function to the 'review' column
filtered_df['preprocessed_review'] = filtered_df['review'].apply(preprocess_text)

In [None]:
filtered_df.sample(5)

In [None]:
# Calculate the polarity of the preprocessed reviews using the TextBlob 
filtered_df['polarity'] = filtered_df['preprocessed_review'].apply(lambda x: (TextBlob(x).sentiment.polarity))
filtered_df.sample(5)

In [None]:
# Create 'get_sentiment' function that converts sentiment scores into sentiment labels. 

def get_sentiment(score):
    """Map a polarity score to a sentiment label.

    Returns 'Negative' for a score below zero, 'Neutral' for a score equal
    to zero, and 'Positive' in every other case.
    """
    if score < 0:
        return 'Negative'
    if score == 0:
        return 'Neutral'
    return 'Positive'

In [None]:
filtered_df['sentiment'] = filtered_df['polarity'].apply(get_sentiment)
filtered_df.sample(5)

In [None]:
# Count how many rows carry each sentiment label
sentiment_counts = filtered_df['sentiment'].value_counts()

# Plot the label counts as a bar chart (bars follow the value_counts() order)
plt.figure(figsize=(5, 4))
sentiment_counts.plot(kind='bar', color='blue')
plt.xlabel("Sentiments")
plt.ylabel("Count")
plt.xticks(rotation=0)  # keep the label names horizontal
plt.show()

In [None]:
# For each sentiment label, plot how often each drug-name token occurs
# among the rows carrying that label.
sentiments = ['Negative', 'Neutral', 'Positive']

for sentiment in sentiments:
    # Join the 'drugName' values of the matching rows into one string
    text = " ".join(filtered_df[filtered_df['sentiment'] == sentiment]['drugName'])
    
    # Tokenize the text
    # NOTE(review): tokenizing the joined string presumably splits multi-word
    # drug names into separate tokens -- confirm that this is intended.
    words = nltk.word_tokenize(text)
    
    # Count how many times each token occurs
    freq_dist = FreqDist(words)
    
    # Plot the frequency distribution
    plt.figure(figsize=(6, 4))
    freq_dist.plot(30, title=f"Top 30 words in {sentiment} Sentiment")
    plt.show()

In [None]:
# Create a WordCloud for each sentiment
sentiments = ['Negative', 'Neutral', 'Positive']

for sentiment in sentiments:
    # Join the 'drugName' values of the rows with this sentiment label
    text = " ".join(filtered_df[filtered_df['sentiment'] == sentiment]['drugName'])

    # WordCloud.generate() raises ValueError when it is given no words, so skip
    # a sentiment label that has no rows instead of aborting the whole loop.
    if not text.strip():
        print(f"No drug names found for {sentiment} sentiment; skipping.")
        continue

    wordcloud = WordCloud(width = 800, height = 800,
                          background_color = 'white',
                          min_font_size = 10).generate(text)

    plt.figure(figsize = (5, 5))
    plt.imshow(wordcloud)
    plt.title(f"{sentiment} Sentiment")
    plt.axis("off")
    plt.tight_layout(pad=0)

    plt.show()