In [6]:
library(tidyverse)
library(tidytext)

── [1mAttaching core tidyverse packages[22m ─────────────────────────────────────────────────────────── tidyverse 2.0.0 ──
[32m✔[39m [34mdplyr    [39m 1.1.4     [32m✔[39m [34mreadr    [39m 2.1.5
[32m✔[39m [34mforcats  [39m 1.0.0     [32m✔[39m [34mstringr  [39m 1.5.1
[32m✔[39m [34mggplot2  [39m 3.5.1     [32m✔[39m [34mtibble   [39m 3.2.1
[32m✔[39m [34mlubridate[39m 1.9.3     [32m✔[39m [34mtidyr    [39m 1.3.1
[32m✔[39m [34mpurrr    [39m 1.0.2     
── [1mConflicts[22m ───────────────────────────────────────────────────────────────────────────── tidyverse_conflicts() ──
[31m✖[39m [34mdplyr[39m::[32mfilter()[39m masks [34mstats[39m::filter()
[31m✖[39m [34mdplyr[39m::[32mlag()[39m    masks [34mstats[39m::lag()
[36mℹ[39m Use the ]8;;http://conflicted.r-lib.org/conflicted package]8;; to force all conflicts to become errors


### Get and Clean Data

In [48]:
# On-disk size of the news corpus in megabytes (bytes / 1000 / 1000)
file.info('data/en_US.news.txt', extra_cols = FALSE)$size / 1000 / 1000

[1] 205.8119

In [120]:
# Blogs: read the file line by line into a one-column tibble (`text`),
# one row per line of the file
blogs <- tibble(text = read_lines('data/en_US.blogs.txt'))
# Preview the first three rows
blogs[1:3, ]

[38;5;246m# A tibble: 3 × 1[39m
  text                                                                                                             
  [3m[38;5;246m<chr>[39m[23m                                                                                                            
[38;5;250m1[39m In the years thereafter, most of the Oil fields and platforms were named after pagan “gods”.                     
[38;5;250m2[39m We love you Mr. Brown.                                                                                           
[38;5;250m3[39m Chad has been awesome with the kids and holding down the fort while I work later than usual! The kids have been …

In [79]:
# News: read the file line by line into a one-column tibble (`text`),
# one row per line of the file
news <- tibble(text = read_lines('data/en_US.news.txt'))
# Preview the first three rows
news[1:3, ]

[38;5;246m# A tibble: 3 × 1[39m
  text                                                                          
  [3m[38;5;246m<chr>[39m[23m                                                                         
[38;5;250m1[39m He wasn't home alone, apparently.                                             
[38;5;250m2[39m The St. Louis plant had to close. It would die of old age. Workers had been m…
[38;5;250m3[39m WSU's plans quickly became a hot topic on local online sites. Though most peo…

In [80]:
# Tweets: read the file line by line into a one-column tibble (`text`),
# one row per line of the file.
# read_lines() emits a "parsing issues" warning for this file, but no
# problems are reported afterwards.
# NOTE(review): presumably caused by embedded NUL characters in the raw
# file - TODO confirm and check whether any affected lines were truncated.
tweets <- tibble(text = read_lines('data/en_US.twitter.txt'))
# Preview the first three rows
tweets[1:3, ]

[1m[22mOne or more parsing issues, call `problems()` on your data frame for details,
e.g.:
  dat <- vroom(...)
  problems(dat) 


[38;5;246m# A tibble: 3 × 1[39m
  text                                                                          
  [3m[38;5;246m<chr>[39m[23m                                                                         
[38;5;250m1[39m How are you? Btw thanks for the RT. You gonna be in DC anytime soon? Love to …
[38;5;250m2[39m When you meet someone special... you'll know. Your heart will beat more rapid…
[38;5;250m3[39m they've decided its more fun if I don't.                                      

In [81]:
# Dimensions of the tweets tibble: number of rows, number of columns
dim(tweets)

[1] 2360148       1

### Exploratory Data Analysis

In [121]:
# Gather the three corpora into one nested tibble: one row per source, with
# the source label in `type` and the corpus tibble in the `data` list-column.
df <- tibble(type = c('blogs', 'news', 'tweets')) |>
  mutate(data = list(blogs, news, tweets))
df

[38;5;246m# A tibble: 3 × 2[39m
  type   data                    
  [3m[38;5;246m<chr>[39m[23m  [3m[38;5;246m<list>[39m[23m                  
[38;5;250m1[39m blogs  [38;5;246m<tibble [899,288 × 1]>[39m  
[38;5;250m2[39m news   [38;5;246m<tibble [1,010,242 × 1]>[39m
[38;5;250m3[39m tweets [38;5;246m<tibble [2,360,148 × 1]>[39m

In [122]:
# Longest document (in characters) in each corpus.
# rowwise() makes `data` refer to the single nested tibble of the current row,
# so `data$text` is that corpus's character vector.
# str_length() is already vectorised: apply it to the whole column at once
# rather than calling it once per document through map_int().
df |>
  rowwise() |>
  mutate(maxLength = max(str_length(data$text)))

[38;5;246m# A tibble: 3 × 3[39m
[38;5;246m# Rowwise: [39m
  type   data                     maxLength
  [3m[38;5;246m<chr>[39m[23m  [3m[38;5;246m<list>[39m[23m                       [3m[38;5;246m<int>[39m[23m
[38;5;250m1[39m blogs  [38;5;246m<tibble [899,288 × 1]>[39m       [4m4[24m[4m0[24m833
[38;5;250m2[39m news   [38;5;246m<tibble [1,010,242 × 1]>[39m     [4m1[24m[4m1[24m384
[38;5;250m3[39m tweets [38;5;246m<tibble [2,360,148 × 1]>[39m       140

In [62]:
# Row 1 of the `data` list-column, returned as a 1 x 1 tibble.
# NOTE(review): the saved output shows 1,010,242 rows, but row 1 of `df` as
# built above is blogs (899,288 rows). This cell last ran as In [62], before
# `df` was rebuilt in In [121] - re-run it so the output matches.
df[1, 'data']

[38;5;246m# A tibble: 1 × 1[39m
  data                    
  [3m[38;5;246m<list>[39m[23m                  
[38;5;250m1[39m [38;5;246m<tibble [1,010,242 × 1]>[39m