-
Notifications
You must be signed in to change notification settings - Fork 4
/
Copy pathscoreEmoticons.R
133 lines (91 loc) · 3.94 KB
/
scoreEmoticons.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
setwd('C:/etc/Projects/Data/_Ongoing/EmoticonSentiment/')
#####################################################
##Part 0: Acquire Bundle of tweets with emoticons####
tweets <- readLines("data/tweetswithemoticons.txt")
#a vector of characterstrings, each element correpsonding to the text part of exactly one tweet
#######################################
######## Part 1: Score Tweets #########
sentiment.file <- "data/AFINN-111.txt"
#Initialize sentiment dictionary
df.sentiments <- read.table(sentiment.file,header=F,sep="\t",quote="",col.names=c("term","score"))
df.sentiments$term <- gsub("[^[:alnum:]]", " ",df.sentiments$term)
ScoreTerm <- function(term){
df.sentiments[match(term,df.sentiments[,"term"]),"score"]
}
ScoreText <- function(text){
text <- tolower(gsub("[^[:alnum:]]", " ",text))
text <- do.call(c,strsplit(text," "))
text <- text[text!=""]
length(text)
scores <- ScoreTerm(text)
scores[is.na(scores)] <- 0
sum(scores)
}
tweet.scores <- sapply(tweets,ScoreText)
#########################################
#### Part 2: Extract the Emoticon(s) ####
emoticon.list <- c( "\\:\\-\\)","\\:\\)","\\=\\)",
"\\:\\-D","\\:D","8\\-D","8D","x\\-D","xD","X\\-D","XD",
"\\:\\-\\(","\\:\\(",
"\\:\\-\\|","\\:\\|",
"\\:\\'\\-\\(","\\:\\'\\(\\)",
"\\:\\'\\-\\)","\\:\\'\\)",
"\\:\\-o","\\:\\-O","\\:o","\\:O",
"o_O","o\\.O","O_o","O\\.o",
"\\:\\*","\\;\\-\\)","\\;\\)",
"\\%\\-\\)",
"\\<3","\\<\\/3" )
## oh, oh, oh! Take care that none is a substring of another.
## You can't have ":)" and ":))"; or ">:-)" and ":-)", etc.
## The second kind may actually be a concern
## If you do want them, you'll need to change the matching code somewhat to make sure it only matches the longest one or something
## However, it's perfectly okay to have multiple emoticons in the same text. Like "Oh my! :-) :-O"
FindMatches <- function(emoticon,tweets){
#This is a simple function, but I'm separating this out because I might want to replace it with a more complex logic that takes care of substrings, etc.
grep(emoticon,tweets)
#This will return a vector of elements from <tweets> which contain <emoticon>
}
emoticon.score.distributions <-
sapply(emoticon.list, function(e){
containing.indices <- FindMatches(e,tweets)
score.dist <- tweet.scores[containing.indices]
as.numeric(score.dist)
})
###############################################
##### Part 3: Post-processing The results #####
## 1: Mean,sd
emoticon.sentiment.means <- sapply(emoticon.score.distributions,mean)
emoticon.sentiment.sds <- sapply(emoticon.score.distributions,sd)
##2: Plot
clean.scores <- emoticon.score.distributions[sapply(emoticon.score.distributions,length)!=0]
emoticon.tags <-do.call(c,
sapply(1:length(clean.scores), function(x)
rep(names(clean.scores[x]),length(clean.scores[[x]])) )
)
allscores <- do.call(c, clean.scores)
df.scores<-data.frame(score=as.numeric(allscores),emoticon=emoticon.tags)
row.names(df.scores)<- 1:nrow(df.scores)
#The above is a pretty roundabout way to achieve this. If you can think of something better, let me know
library(ggplot2)
p <- ggplot(data=df.scores,aes(
x=score,
y=reorder(emoticon,score,mean),
color=emoticon,
group=emoticon))
p.nonzero <- ggplot(data=df.scores[df.scores$score!=0,],aes(
x=score,y=reorder(emoticon,score,mean),
color=emoticon,
group=emoticon))
p + geom_point(size=2,alpha=0.6, position = position_jitter(height = 0.2)) +
geom_errorbarh(stat = "vline", xintercept = "mean",
height=0.6, size=1,
aes(xmax=..x..,xmin=..x..),color="black") +
theme(legend.position="none") +
xlab("Score") + ylab("Emoticon") +
scale_y_discrete(labels=function(s) { gsub('\\\\','',s) }) #To remove the \\s from the emoticons
## 3: Oddities
which(grepl("\\:\\-\\(",tweets) & tweet.scores>5)
####################################################
#To Do (2013-09-09):
## Decide how to handle the substring cases, if any
####################################################