<a href="https://colab.research.google.com/github/Sophie-X31/JSC270-Assignment4/blob/main/JSC_Assignment_4.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Colab Link: [Link to Collab](https://colab.research.google.com/drive/1ZuIrgB7B7K97vvAgT9wXiWBkouHAC7W2?usp=sharing)

Github Link: https://github.com/Sophie-X31/JSC270-Assignment4

Original Dataset: https://www.kaggle.com/datasets/thoughtvector/customer-support-on-twitter?resource=download

**Group Members**: Terry Tian, Sophie Xu

Preamble: The work for this project was split evenly, where each member contributed to both performing the analytics as well as writing the report and preparing the presentation. The joint effort meant both members participated in all segments of the assignment.

In [None]:
## Import Statements
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import nltk
import re
import plotly.graph_objs as go
from nltk.stem.porter import *
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
from nltk.sentiment import SentimentIntensityAnalyzer
from nltk.tokenize import regexp_tokenize
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, roc_curve
from wordcloud import WordCloud, STOPWORDS
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay, classification_report, roc_curve, auc
from sklearn.preprocessing import label_binarize
from sklearn.multiclass import OneVsRestClassifier
from plotly.subplots import make_subplots

# Part 1: Sentiment Analysis

A. Initial Observation

The proportion of observations in the training set can be summarized as 

1.   Positive = 18046/41157 $\approx$ 43.85%
2.   Neutral = 15398/41157 $\approx$ 37.41%
3.   Negative = 7713/41157 $\approx$ 18.74%

In [None]:
## Read Dataset
covid_test = pd.read_csv('https://raw.githubusercontent.com/Sophie-X31/JSC270-Assignment4/main/Corona_NLP_test.csv', header=None)
covid_train = pd.read_csv('https://raw.githubusercontent.com/Sophie-X31/JSC270-Assignment4/main/Corona_NLP_train.csv', header=None, encoding='latin-1')

## Data Wrangling
def wrangle_df(df):
  df.columns = ['UserName', 'ScreenName', 'Location', 'TweetAt', 'OriginalTweet', 'Sentiment']
  df.drop(index=df.index[0], axis=0, inplace=True)
  df['Sentiment'] = df['Sentiment'].replace(['Extremely Positive', 'Positive'], 2)
  df['Sentiment'] = df['Sentiment'].replace('Neutral', 1)
  df['Sentiment'] = df['Sentiment'].replace(['Extremely Negative', 'Negative'], 0) 
wrangle_df(covid_train)
wrangle_df(covid_test)

## Count Proportion
covid_train.info()
covid_train['Sentiment'].value_counts(dropna = False)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 41157 entries, 1 to 41157
Data columns (total 6 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   UserName       41157 non-null  object
 1   ScreenName     41157 non-null  object
 2   Location       32567 non-null  object
 3   TweetAt        41157 non-null  object
 4   OriginalTweet  41157 non-null  object
 5   Sentiment      41157 non-null  int64 
dtypes: int64(1), object(5)
memory usage: 1.9+ MB


2    18046
0    15398
1     7713
Name: Sentiment, dtype: int64

B. Tokenize Tweets

In [None]:
## Tokenize
def tokenize(df):
  df['OriginalTweet'] = df['OriginalTweet'].apply(str.strip) # Remove excess whitespace
  nltk.download('punkt')
  df['tokens'] = df['OriginalTweet'].apply(nltk.word_tokenize)

C. Remove Hyperlink

In [None]:
## Remove URL
def remove_url(df):
  tokens_no_url = []
  for row in df['tokens']:
    tokens_no_url.append([re.sub('^http','', w) for w in row])
  df['tokens'] = tokens_no_url

D. Remove Special Characters

In [None]:
## Remove punctuations
def remove_punc(df):
  tokens_no_punct = []
  for row in df['tokens']:
    tokens_no_punct.append([re.sub('[^\w\s]','', w) for w in row])
  df['tokens'] = tokens_no_punct
  df['OriginalTweet'] = df['OriginalTweet'].apply(str.lower) # Convert to lowercase

E. Stem Tokens

In [None]:
## Stemming
def stem(df):  
  stemmer = PorterStemmer()
  stemmed_tokens = []
  for row in df['tokens']:
    stemmed_tokens.append([stemmer.stem(w) for w in row])
  df['tokens'] = stemmed_tokens

## Lemmatization
def lemm(df):
  nltk.download('wordnet') 
  lemmatizer = WordNetLemmatizer()
  lem_tokens = []
  for row in df['tokens']:
    lem_tokens.append([lemmatizer.lemmatize(w) for w in row])
  df['tokens'] = lem_tokens

F. Remove Stop Words

In [None]:
## Remove stopwords
def remove_stopw(df):
  nltk.download('stopwords')
  sw = stopwords.words('english')[:100]
  tokens_no_sw = []
  for row in df['tokens']:
    tokens_no_sw.append([w for w in row if w not in sw])
  df['tokens'] = tokens_no_sw

  no_blanks = [] # Remove blank tokens
  for row in df['tokens']:
    no_blanks.append([w for w in row if w != ''])
  df['tokens'] = no_blanks

G. Vectorize Token Collection with Count Vectorizer

In [None]:
## Put everything together
def tokenize_df(df: pd.DataFrame, lem: bool = False) -> None:
  tokenize(df)
  remove_url(df)
  remove_punc(df)
  if (lem):
    lemm(df)
  else:
    stem(df)
  remove_stopw(df)

train1, test1 = covid_train.copy(), covid_test.copy()
tokenize_df(train1)
x_train, y_train = train1['tokens'].to_numpy(), train1['Sentiment'].to_numpy()
tokenize_df(test1)
x_test, y_test = test1['tokens'].to_numpy(), test1['Sentiment'].to_numpy()

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [None]:
## Count Vectorizer
def override_func(doc):
  return doc

count_vec = CountVectorizer(
    analyzer='word',
    tokenizer= override_func,
    preprocessor= override_func,
    token_pattern= None,
    max_features = 1000)
counts_train = count_vec.fit_transform(x_train)
counts_test = count_vec.fit_transform(x_test)

H. Fit Naive Bayes Model

In [None]:
x_train = counts_train.toarray()
x_test = counts_test.toarray()
 
## Fit Model
model = MultinomialNB()
model.fit(x_train, y_train)

train_pred = model.predict(x_train)
test_pred = model.predict(x_test)

train_acc = accuracy_score(y_train, train_pred)
test_acc = accuracy_score(y_test, test_pred)

print("Training Accuracy is: " + str(train_acc) + "\n")
print("Testing Accuracy is: " + str(test_acc))


Training Accuracy is: 0.6827271181087057

Testing Accuracy is: 0.41495523959978936


In [None]:
# This function take a while to run
import pprint
services = [0, 1, 2]


def concatenate_tokens(df, service):
  service_df = df[df['Sentiment'] == service]
  word_bank = ' '
  for index in service_df.index:
    lst = service_df['tokens'][index]
    for i in range(len(lst)):
      word_bank = word_bank + lst[i] + ' '
  return word_bank

## Helper: All tokens of a company
def full_tokens(df, service):
  word_bank = concatenate_tokens(df, service)
  return word_bank.split()

# Finding Most Common Words
def most_common_all(df, k):
  all = dict()
  for service in services:
    fd = nltk.FreqDist(full_tokens(df, service))
    words_tuple = fd.most_common(k)
    # fd.tabulate(50)
    all[service] = words_tuple
  return all

most_common = most_common_all(train1, 5)

In [None]:
from pprint import pprint
pprint(most_common)


{0: [('s', 10117),
     ('coronaviru', 6737),
     ('covid19', 4610),
     ('price', 4347),
     ('food', 3639)],
 1: [('s', 6287),
     ('coronaviru', 3812),
     ('covid19', 2567),
     ('store', 1588),
     ('supermarket', 1442)],
 2: [('s', 12911),
     ('coronaviru', 7512),
     ('covid19', 5684),
     ('store', 3918),
     ('thi', 3783)]}


I. Fit ROC Curve

It would not be appropriate to fit a ROC curve in this scenario because this dataset is imbalanced. Moreover, in this situation of having three labels, we can only plot ROC curves for one class versus the rest, which gives limited information.

J. Vectorize Token Collection with TF_IDF

In [None]:
## TF-IDF Vectorize
tfidf = TfidfTransformer()
x_tf_train = tfidf.fit_transform(counts_train).toarray()
x_tf_test = tfidf.fit_transform(counts_test).toarray()

## Fit Model
model = MultinomialNB()
model.fit(x_tf_train, y_train)

train_pred = model.predict(x_tf_train)
test_pred = model.predict(x_tf_test)

train_acc = accuracy_score(y_train, train_pred)
test_acc = accuracy_score(y_test, test_pred)

print("Training Accuracy is: " + str(train_acc) + "\n")
print("Testing Accuracy is: " + str(test_acc))

Training Accuracy is: 0.6667395582768423

Testing Accuracy is: 0.432596103212217


K. Use Lemmatization

In [None]:
## Tokenize Tweets
train2, test2 = covid_train.copy(), covid_test.copy()
tokenize_df(train2, True)
x_train2, y_train2 = train2['tokens'].to_numpy(), train2['Sentiment'].to_numpy()
tokenize_df(test2, True)
x_test2, y_test2 = test2['tokens'].to_numpy(), test2['Sentiment'].to_numpy()

## Vectorize with TF-IDF
counts_train2 = count_vec.fit_transform(x_train2)
counts_test2 = count_vec.fit_transform(x_test2)
x_tf_train2 = tfidf.fit_transform(counts_train2).toarray()
x_tf_test2 = tfidf.fit_transform(counts_test2).toarray()

## Fit Model
model = MultinomialNB()
model.fit(x_tf_train2, y_train)

train_pred = model.predict(x_tf_train2)
test_pred = model.predict(x_tf_test2)

train_acc = accuracy_score(y_train, train_pred)
test_acc = accuracy_score(y_test, test_pred)

print("Training Accuracy is: " + str(train_acc) + "\n")
print("Testing Accuracy is: " + str(test_acc))


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Training Accuracy is: 0.6405714702237773

Testing Accuracy is: 0.43417588204318064


BONUS: 

The Naive Bayes model is generative because, unlike a logistic regression where we can only predict whether an input has the label, the Bayes model allows us to predict the label for multiple classes.

# Part 2: NLP using Twitter API

Step 1: Data Wrangling

In [None]:
## Concatenate DataFrame
apple = pd.read_csv('https://raw.githubusercontent.com/Sophie-X31/JSC270-Assignment4/main/AppleSupport.csv', header=None, encoding='latin-1')
air = pd.read_csv('https://raw.githubusercontent.com/Sophie-X31/JSC270-Assignment4/main/AirAsiaSupport.csv', header=None, encoding='latin-1')
rbc = pd.read_csv('https://raw.githubusercontent.com/Sophie-X31/JSC270-Assignment4/main/AskRBC.csv', header=None, encoding='latin-1')
mcdon = pd.read_csv('https://raw.githubusercontent.com/Sophie-X31/JSC270-Assignment4/main/McDonalds.csv', header=None, encoding='latin-1')
nike = pd.read_csv('https://raw.githubusercontent.com/Sophie-X31/JSC270-Assignment4/main/NikeSupport.csv', header=None, encoding='latin-1')
spotify = pd.read_csv('https://raw.githubusercontent.com/Sophie-X31/JSC270-Assignment4/main/SpotifyCares.csv', header=None, encoding='latin-1')
walmart = pd.read_csv('https://raw.githubusercontent.com/Sophie-X31/JSC270-Assignment4/main/Walmart.csv', header=None, encoding='latin-1')

def drop_row(df):
  df.columns = ['tweet_id', 'author_id', 'inbound', 'created_at', 'text', 'response_tweet_id', 'in_response_to_tweet_id']
  df.drop(index=df.index[0], axis=0, inplace=True)
frames = [apple, air, rbc, mcdon, nike, spotify, walmart]
for name in frames:
  drop_row(name)
df = pd.concat(frames)

In [None]:
## Overview of Dataset
print(df.info())
df.head()

Step 2: Exploratory Data Analysis

In [None]:
## Tokenize
def tokenize2(df):
  df['text'] = df['text'].apply(str.strip)
  df['text'] = df['text'].apply(str.lower)
  nltk.download('punkt')
  df['tokens'] = df['text'].apply(nltk.word_tokenize)

def remove_punc2(df):
  tokens_no_punct = []
  for row in df['tokens']:
    tokens_no_punct.append([re.sub('[^\w\s]','', w) for w in row])
  df['tokens'] = tokens_no_punct

def tokenize_df2(df) -> None:
  tokenize2(df)
  remove_url(df)
  remove_punc2(df)
  remove_stopw(df)
  lemm(df)

## Vectorize
tokenize_df2(df)
x, y = df['tokens'].to_numpy(), df['author_id'].to_numpy()
count_x = count_vec.fit_transform(x)
X = TfidfTransformer().fit_transform(count_x).toarray()

In [None]:
## WordCloud Image
def concatenate_tokens(df, service):
  service_df = df[df['author_id'] == service]
  word_bank = ' '
  for index in service_df.index:
    lst = service_df['tokens'][index]
    for i in range(len(lst)):
      word_bank = word_bank + lst[i] + ' '
  return word_bank

def plot_WordCloud(df, service):
  wordcloud = WordCloud(width = 800, height = 800,
                        background_color ='white',
                        stopwords = set(STOPWORDS),
                        min_font_size = 10).generate(concatenate_tokens(df, service))                     
  plt.figure(figsize = (8, 8), facecolor = None)
  plt.imshow(wordcloud)
  plt.axis("off")
  plt.tight_layout(pad = 0)
  plt.show()

services = ['AppleSupport', 'AirAsiaSupport', 'AskRBC', 'McDonalds', 'NikeSupport', 'SpotifyCares', 'Walmart']
for s in services:
  plot_WordCloud(df, s)

In [None]:
## Helper: All tokens of a company
def full_tokens(df, service):
  word_bank = concatenate_tokens(df, service)
  return word_bank.split()

## Frequency Distribution for Top 50 Most Common Words of Each Company
def most_common_all(df, k):
  all = dict()
  for service in services:
    fd = nltk.FreqDist(full_tokens(df, service))
    words_tuple = fd.most_common(k)
    # fd.tabulate(50)
    all[service] = words_tuple
  return all
most_common = most_common_all(df, 50)

## Helper: Only words
def set_words_all(most_common):
  all = dict()
  for service in services:
    coll = set()
    words = most_common[service]
    for i in range(len(words)):
      coll.add(words[i][0])
    all[service] = coll
  return all
set_most_common = set_words_all(most_common)

## Helper: List of all words without duplicates
def union_list(set_most_common):
  union_set = []
  for service in services:
    words_set = set_most_common[service]
    union_set.extend(words_set)
  return union_set
union_lst = union_list(set_most_common)

## Condense into pd DataFrame
most_common_words = pd.DataFrame(columns = ['company', 'words', 'frequency'])
for service in services:
  word_list = most_common[service]
  for i in range(len(word_list)):
    most_common_words = most_common_words.append({'company' : service, 'words' : word_list[i][0], 'frequency' : word_list[i][1]}, ignore_index=True)

In [None]:
## Find duplicated common words
def find_duplicates(union_lst, k):
  duplicates = []
  fd = nltk.FreqDist(union_lst)
  distr = fd.most_common(k)
  for i in range(len(distr)):
    if distr[i][1] > 1:
      duplicates.append(distr[i][0])
  return (distr, duplicates)
distr = find_duplicates(union_lst, 100)[0]
duplicates = find_duplicates(union_lst, 100)[1]

## Helper: Check highest frequency for the given word
def compare_freq(word, freq, service, most_common_words):
  other_companies = services.copy()
  other_companies.remove(service)
  for ser in other_companies:
    word_freq_lst = most_common_words[most_common_words['company'] == ser]
    for index in range(len(word_freq_lst.index)):
      if (word == word_freq_lst['words'].iloc[index]) and (freq < word_freq_lst['frequency'].iloc[index]):
        return False
  return True

## Find the top k unique most frequent words
def unique(most_common_words, service, k):
  unique = pd.DataFrame(columns = ['words', 'frequency'])
  word_freq_lst = most_common_words[most_common_words['company'] == service]
  count, index = 0, 0
  while count < k:
    word = word_freq_lst['words'].iloc[index]
    freq = word_freq_lst['frequency'].iloc[index]
    if word in duplicates:
      if compare_freq(word, freq, service, most_common_words):
        unique = unique.append({'words' : word, 'frequency' : freq}, ignore_index=True)
        count = count + 1
    else:
      unique = unique.append({'words' : word, 'frequency' : freq}, ignore_index=True)
      count = count + 1
    index = index + 1
  return unique
unique(most_common_words, services[1], 10) # Index 1 for AirAsiaSupport

## Find the top k most commonly used words
def top_duplicated_common_words(distr, k):
  dist = pd.DataFrame(columns = ['words', 'frequency'])
  count, index = 0, 0
  while count < k:
    group = distr[index]
    word = group[0]
    freq = group[1]
    if len(word) > 1:
      dist = dist.append({'words' : word, 'frequency' : freq}, ignore_index=True)
      count = count + 1
    index = index + 1
  return dist
top_duplicated_common_words(distr, 10)

In [None]:
## Sentiment Assignment
nltk.download('vader_lexicon')
sia = SentimentIntensityAnalyzer()
df['sentiment'] = [sia.polarity_scores(txt).get('compound') for txt in df['text']]

## Condense into pd DataFrame
avg_sentiment = pd.DataFrame(columns = ['company', 'average_polarity_score'])
for i in range(len(services)):
  avg = df[df['author_id'] == services[i]]['sentiment'].mean()
  avg_sentiment = avg_sentiment.append({'company' : services[i], 'average_polarity_score' : avg}, ignore_index=True)
avg_sentiment

## Graph Results
ax = sns.barplot(x="company", y="average_polarity_score", data=avg_sentiment, palette='crest')
ax.set_xticklabels(ax.get_xticklabels(), rotation=40, ha="right")
for i in ax.containers:
    ax.bar_label(i,)

Step 3: Development and Evaluation of Model

In [None]:
## Split Dataset
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 42)

## Model (Testing)
model = MultinomialNB()
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
print(accuracy_score(y_test,y_pred))

## Model (Training)
model2 = MultinomialNB()
model2.fit(X_train, y_train)
y_pred2 = model.predict(X_train)
print(accuracy_score(y_train,y_pred2))

In [None]:
# Classification report
cr_dict = classification_report(y_test, y_pred, output_dict=True)
# Define the columns of the table
columns = ['precision', 'recall', 'f1-score', 'support']

# Create a figure and axes object
fig, ax = plt.subplots()

# Remove the axes spines and ticks
ax.axis('off')

# Add a title
ax.set_title('Classification Report')

# Define the table format
table_data = []

for label, scores in cr_dict.items():
    if label == 'accuracy':
        continue
    row_data = [label] + [scores[col] for col in columns]
    table_data.append(row_data)

# Add the table to the plot
columns = ['Company', 'precision', 'recall', 'f1-score', 'support']
table = ax.table(cellText=table_data, colLabels=columns, loc='center')

# Set the color of the column labels to red
for j in range(len(columns)):
    cell = table.get_celld()[(0, j)]
    cell.set_facecolor('#ede3daff')

# Adjust te figure size and padding
fig.set_size_inches(8, 4)  # Change the figure size to 8x4 inches
table.set_fontsize(14)  # Set the font size of the table
table.scale(1.5, 1.5)  # Scale the table by 1.5x in both directions
ax.axis('off')  # Turn off the axis

# Show the plot
plt.show()

In [None]:
## Confusion Matrix
cm = confusion_matrix(y_test, y_pred, labels=model.classes_)
disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=model.classes_)
disp.plot(xticks_rotation=30)
plt.show()

In [None]:
## ROC and AUC
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

# Fit a naive Bayes classifier on the training data
model = MultinomialNB()
model.fit(X_train, y_train)

# Binarize the labels
y_test_bin = label_binarize(y_test, classes=np.unique(y))

# Compute the predicted scores for each class
y_score = model.predict_proba(X_test)

# Compute the ROC curve and AUC score for each class
fpr = dict()
tpr = dict()
roc_auc = dict()
for i in range(len(np.unique(y))):
    fpr[i], tpr[i], _ = roc_curve(y_test_bin[:, i], y_score[:, i])
    roc_auc[i] = auc(fpr[i], tpr[i])

# Set the number of rows and columns for the plot grid
num_rows = int(np.ceil(len(np.unique(y))/3))
num_cols = 3

# Set the figure size
plt.figure(figsize=(15, 9))

# Plot the ROC curve for each class on a separate subplot
lst = np.unique(y)

for i in range(len(np.unique(y))):
    c = lst[i]
    plt.subplot(num_rows, num_cols, i+1)
    plt.plot(fpr[i], tpr[i], color='darkorange', lw=2, label='ROC curve (area = %0.2f)' % roc_auc[i])
    plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('ROC curve for class '+ c)
    plt.legend(loc="lower right")
    
# Adjust the layout of the subplots
plt.tight_layout()

# Show the plot
plt.show()