# Introduction to Text Analytics

### Data Science 350
### Stephen Elston

## Introduction 

This notebook contains a tutorial introduction to 

In [1]:
## Read the tweet data set
tweets = read.csv('Binary Classification_ Twitter sentiment analysis.csv', 
                   header = TRUE, stringsAsFactors = FALSE)
colnames(tweets) <- c("sentiment", "tweets") # Set the column names
tweets[, 'sentiment'] = ifelse(tweets$sentiment == 4, 1, 0)  # set sentiment to {0,1}
head(tweets) # Have a look at the data frame

sentiment,tweets
1,"@elephantbird Hey dear, Happy Friday to You Already had your rice's bowl for lunch ?"
1,Ughhh layin downnnn Waiting for zeina to cook breakfast
0,"@greeniebach I reckon he'll play, even if he's not 100%...but i know nothing!! ;) It won't be the same without him."
0,@vaLewee I know! Saw it on the news!
0,very sad that http://www.fabchannel.com/ has closed down. One of the few web services that I've used for over 5 years
0,@Fearnecotton who sings 'I Remember'? i alwaysss hear it on Radio 1 but never catch the artist


In [2]:
## Create a tm text corpus from the tweets
library(tm)  ## tm package for text mining
tweet.corpus <- Corpus(VectorSource(tweets['tweets']))
class(tweet.corpus) # What is the class of the corpus

ERROR: Error in library(tm): there is no package called 'tm'


In [3]:
## Normalize tweets text
tweet.corpus <- tm_map(tweet.corpus, content_transformer(removeNumbers))
tweet.corpus <- tm_map(tweet.corpus, content_transformer(removePunctuation))
tweet.corpus <- tm_map(tweet.corpus, content_transformer(stripWhitespace))
tweet.corpus <- tm_map(tweet.corpus, content_transformer(tolower))

ERROR: Error in eval(expr, envir, enclos): could not find function "tm_map"


In [4]:
## ----- Convert the corpus to a term document matrix
to.tdm = function(corpus, sparse = 0.998){
  require(tm)
  ## Compute a term-document matrix and then 
  require(slam) # Sparse matrix package
  tdm <- TermDocumentMatrix(corpus, control = list(stopwords = FALSE))
  tdm <- removeSparseTerms(tdm, sparse)
  tdm
}
tdm = to.tdm(tweet.corpus) # Create a term document matrix
str(tdm) # Look at sparse tdm
findFreqTerms(tdm, 2000) # Words that occur at least 2000 times


Loading required package: tm
"there is no package called 'tm'"Loading required package: slam
"there is no package called 'slam'"

ERROR: Error in to.tdm(tweet.corpus): could not find function "TermDocumentMatrix"


In [5]:
## Compute the word fequency from the tdm
to.wf = function(tdm){
  ## compute the word frequencies.
  require(slam)
  freq <- row_sums(tdm, na.rm = T)   
  ## Sort the word frequency and build a dataframe
  ## including the cumulative frequecy of the words.
  freq <- sort(freq, decreasing = TRUE)
  word.freq <- data.frame(word = factor(names(freq), levels = names(freq)), 
                          frequency = freq)
  word.freq['Cumulative'] <- cumsum(word.freq['frequency'])/sum(word.freq$frequency)
  word.freq
}
wf = to.wf(tdm)
head(wf, n = 10)


Loading required package: slam
"there is no package called 'slam'"

ERROR: Error in to.wf(tdm): could not find function "row_sums"


In [6]:
## Make a bar chart of the word frequency
word.bar = function(wf, num = 50){
  require(ggplot2)
  ggplot(wf[1:num,], aes(word, frequency)) +
    geom_bar(stat = 'identity') +
    ggtitle('Frequency of common words') +
    ylab('Frequency') +
    theme(axis.text.x = element_text(angle = 90, hjust = 1))
}
word.bar(wf)


Loading required package: ggplot2
"package 'ggplot2' was built under R version 3.3.2"

ERROR: Error in ggplot(wf[1:num, ], aes(word, frequency)): object 'wf' not found


In [7]:
## Make cumulative distribution plots of the most frequent words
word.cdf = function(wf, num = 50){
  require(ggplot2)
  ggplot(wf[1:num,], aes(word, Cumulative)) +
    geom_bar(stat = 'identity') +
    ggtitle('Cumulative fraction of common words') +
    ylab('Cumulative frequency') +
    theme(axis.text.x = element_text(angle = 90, hjust = 1))
}
word.cdf(wf)


ERROR: Error in ggplot(wf[1:num, ], aes(word, Cumulative)): object 'wf' not found


In [8]:
## ----------------------------------------------------------
## -----------Stop words -------------------------------------
##
## Load stop words from a file and ensure they are 
stopWords = read.csv('stopwords.csv', header = TRUE, stringsAsFactors = FALSE)
stopWords = unique(stopWords) # Ensure the list is unique
stopWords[1:100,] # Look at the first 100 stop words


In [9]:
## Remove the stop words from the corpus
tweet.corpus <- tm_map(tweet.corpus, removeWords, stopWords[, 'words'])


ERROR: Error in eval(expr, envir, enclos): could not find function "tm_map"


In [10]:
## Remove the stop words from the corpus
tweet.corpus <- tm_map(tweet.corpus, removeWords, stopWords[, 'words'])

ERROR: Error in eval(expr, envir, enclos): could not find function "tm_map"


In [11]:
## View the results
tdm = to.tdm(tweet.corpus) # Create a term document matrix
findFreqTerms(tdm, 2000) # Words that occur at least 2000 times
wf = to.wf(tdm)  # Compute word fequency
head(wf, n = 10)  # Look at the most common words
word.bar(wf) # Plot word frequency
word.cdf(wf) # Plot cdf

Loading required package: tm
"there is no package called 'tm'"Loading required package: slam
"there is no package called 'slam'"

ERROR: Error in to.tdm(tweet.corpus): could not find function "TermDocumentMatrix"


In [12]:
## --------------------------------------------------
## ------------ Stem the words ----------------------
##
## Use the porter stemmer in Snowball package
##
require(SnowballC) ## For Porter stemming words
tweet.corpus <- tm_map(tweet.corpus, stemDocument)

Loading required package: SnowballC
"there is no package called 'SnowballC'"

ERROR: Error in eval(expr, envir, enclos): could not find function "tm_map"


In [13]:
## View the results
tdm = to.tdm(tweet.corpus, sparse = 0.99) # Create a term document matrix
findFreqTerms(tdm, 2000) # Words that occur at least 2000 times
wf = to.wf(tdm)  # Compute word fequency
head(wf, n = 10)  # Look at the most common words
word.bar(wf) # Plot word frequency
word.cdf(wf) # Plot cdf

Loading required package: tm
"there is no package called 'tm'"Loading required package: slam
"there is no package called 'slam'"

ERROR: Error in to.tdm(tweet.corpus, sparse = 0.99): could not find function "TermDocumentMatrix"


In [14]:
## View the results
tdm = to.tdm(tweet.corpus, sparse = 0.99) # Create a term document matrix
findFreqTerms(tdm, 2000) # Words that occur at least 2000 times
wf = to.wf(tdm)  # Compute word fequency
head(wf, n = 10)  # Look at the most common words
word.bar(wf) # Plot word frequency
word.cdf(wf) # Plot cdf

Loading required package: tm
"there is no package called 'tm'"Loading required package: slam
"there is no package called 'slam'"

ERROR: Error in to.tdm(tweet.corpus, sparse = 0.99): could not find function "TermDocumentMatrix"


In [15]:
## Create the a container for the tdm and label
tweet.cont = create_container(tdm.tools,tweets$sentiment, trainSize = 1:120000, virgin=TRUE)

ERROR: Error in eval(expr, envir, enclos): could not find function "create_container"


In [16]:
## Compute a logistic regresson model for sentiment classification
tweet.glmnet <- train_model(tweet.cont, "GLMNET")

ERROR: Error in eval(expr, envir, enclos): could not find function "train_model"


In [17]:
## Test classification
tweet.class = classify_model(tweet.cont, tweet.glmnet)
tweet.metrics = create_analytics(tweet.cont, tweet.class)

ERROR: Error in eval(expr, envir, enclos): could not find function "classify_model"


In [18]:
## Examine some raw metrics
tweet.metrics@label_summary
cbind(head(tweet.metrics.TfIdf@document_summary, n = 10), head(tweets$sentiment, n = 10))
cbind(head(tweet.metrics@document_summary, n = 10), head(tweets$sentiment, n = 10))

ERROR: Error in eval(expr, envir, enclos): object 'tweet.metrics' not found


In [19]:
create_precisionRecallSummary(tweet.cont, tweet.class)

ERROR: Error in eval(expr, envir, enclos): could not find function "create_precisionRecallSummary"


In [20]:
#----------------------------------------------
## Compute TFIdf weighted tdm
## Compute a tdm
tdm.tools2 = create_matrix(tweet.frame$tweets, language = "english", removeNumbers = FALSE,
                          stemWords = FALSE, removeSparseTerms = .998, 
                          removeStopwords = FALSE, stripWhitespace = FALSE,
                          toLower = FALSE, weighting = tm::weightTfIdf)

ERROR: Error in eval(expr, envir, enclos): could not find function "create_matrix"


In [21]:
## Create the a container for the TfIdf weighted tdm and label
tweet.cont = create_container(tdm.tools2,tweets$sentiment, trainSize = 1:120000, virgin=TRUE)

ERROR: Error in eval(expr, envir, enclos): could not find function "create_container"


In [22]:
## Compute a logistic regresson model for sentiment classification
tweet.glmnet.TfIdf <- train_model(tweet.cont,"GLMNET")

ERROR: Error in eval(expr, envir, enclos): could not find function "train_model"


In [23]:
## Test classification
tweet.class.TfIdf = classify_model(tweet.cont, tweet.glmnet.TfIdf)
tweet.metrics.TfIdf = create_analytics(tweet.cont, tweet.class.TfIdf)

ERROR: Error in eval(expr, envir, enclos): could not find function "classify_model"


In [24]:
## Examine some raw metrics
tweet.metrics.TfIdf@label_summary
results = head(tweet.metrics.TfIdf@document_summary, n = 20)
results

ERROR: Error in eval(expr, envir, enclos): object 'tweet.metrics.TfIdf' not found


In [25]:
## Look at the confusion matrix and compare to the unweighte tdf model
create_precisionRecallSummary(tweet.cont, tweet.class.TfIdf)
create_precisionRecallSummary(tweet.cont, tweet.class)

ERROR: Error in eval(expr, envir, enclos): could not find function "create_precisionRecallSummary"


In [26]:
## ---------------------------------------------------
## ---------- Exploring Term Document Matrix
##
## Load the data set as a vector corpus of 20 documents
library(tm)
data(crude)
writeLines(as.character(crude[[1]]))

ERROR: Error in library(tm): there is no package called 'tm'


In [27]:
## Compute the term document matrix
crude.tdm = TermDocumentMatrix(crude, control = list(removePunctuation = TRUE,
                                                     tolower = TRUE,
                                                     removePunctuation = TRUE,
                                                     removeNumbers = TRUE,
                                                     stopwords = TRUE,
                                                     stemming = TRUE))

ERROR: Error in eval(expr, envir, enclos): could not find function "TermDocumentMatrix"


In [28]:
## Have a look at the tdm 
inspect(crude.tdm[202:205, 1:5])

ERROR: Error in eval(expr, envir, enclos): could not find function "inspect"


In [29]:
## Which terms occur 10 times or more?
crudeTDMHighFreq <- findFreqTerms(crude.tdm, 10, Inf)
crudeTDMHighFreq

ERROR: Error in eval(expr, envir, enclos): could not find function "findFreqTerms"


In [30]:
# Do these terms show up in the first 5 documents?
inspect(crude.tdm[crudeTDMHighFreq, 1:5]) 

ERROR: Error in eval(expr, envir, enclos): could not find function "inspect"


In [31]:
## -------------------------------------------------------
## Apply a topic model to the news articles\
##
##load topic models library
library(topicmodels)

#Set parameters for Gibbs sampling
burnin <- 4000
iter <- 2000
thin <- 500
seed <-list(2003,5,63,100001,765)
nstart <- 5
best <- TRUE

#Number of topics
k <- 5

## Compute the LDA model
crude.dtm = DocumentTermMatrix(crude, control = list(removePunctuation = TRUE,
                                                     stopwords = TRUE))
crude.dtm  ## Check the drm

ERROR: Error in library(topicmodels): there is no package called 'topicmodels'


In [32]:
ldaOut = LDA(crude.dtm, k, method= "Gibbs", control = list(nstart = nstart, seed = seed, best = best, burnin = burnin, iter = iter, thin=thin))

ERROR: Error in eval(expr, envir, enclos): could not find function "LDA"


In [33]:
## Examine the topics
ldaOut.topics <- as.matrix(topics(ldaOut))
head(ldaOut.topics)

ERROR: Error in as.matrix(topics(ldaOut)): could not find function "topics"


In [34]:
## And the terms
ldaOut.terms <- as.matrix(terms(ldaOut,6))
head(ldaOut.terms)

ERROR: Error in terms(ldaOut, 6): object 'ldaOut' not found


In [35]:
#probabilities associated with each topic assignment
topicProbabilities <- as.data.frame(ldaOut@gamma)
head(topicProbabilities)

ERROR: Error in as.data.frame(ldaOut@gamma): object 'ldaOut' not found


In [36]:
#Find relative importance of top 2 topics
topic1ToTopic2 <- lapply(1:nrow(crude.dtm),function(x)
  sort(topicProbabilities[x,])[k]/sort(topicProbabilities[x,])[k-1])
unlist(topic1ToTopic2)

ERROR: Error in nrow(crude.dtm): object 'crude.dtm' not found


In [37]:
#Find relative importance of second and third most important topics
topic2ToTopic3 <- lapply(1:nrow(crude.dtm),function(x)
  sort(topicProbabilities[x,])[k-1]/sort(topicProbabilities[x,])[k-2])
topic2ToTopic3

ERROR: Error in nrow(crude.dtm): object 'crude.dtm' not found
