#

In [1]:
import (
    "fmt"
    "io/ioutil"
    "github.com/kniren/gota/dataframe"
    "github.com/kniren/gota/series"
    "strings"
    "strconv"
)

In [58]:
var kitchenReviews = "../datasets/words/processed_acl/kitchen"

Load the data

In [59]:
positives, err := ioutil.ReadFile(kitchenReviews + "/positive.review")
negatives, err2 := ioutil.ReadFile(kitchenReviews + "/negative.review")
if err != nil || err2 != nil {
    fmt.Println("Error(s)", err, err2)
}

The data consists of word(s):frequency pairs separated by spaces:

In [60]:
string(positives)[0:100]

them_it:1 hovering:1 and_occasional:1 cousin_the:2 fictional_baudelaire:1 their_struggles:1 unfortun

Create a struct to hold the word/frequency pair and use gota's LoadStructs() to convert this to a dataframe

In [61]:
pairsPositive := strings.Split(strings.Replace(string(positives), "\n", " ", -1), " ")
pairsNegative := strings.Split(strings.Replace(string(negatives), "\n", " ", -1), " ")

In [62]:
fmt.Println("Positive pairs", len(pairsPositive))
fmt.Println("Negative Pairs", len(pairsPositive))
fmt.Printf("Example pair: `%s`", pairsPositive[0])

Positive pairs 132222
Negative Pairs 132222
Example pair: `them_it:1`

25 <nil>

In [63]:
type Pair struct {
    Phrase string 
    Frequency int
}

//  pairsAndFiltesr returns a slice of Pair, split by : to obtain the phrase and frequency,
//  as well as a map of the phrases that can be used as a lookup table later.
func pairsAndFilters(splitPairs []string) ([]Pair, map[string]bool) {
    var (
        pairs []Pair
        m map[string]bool
    )
    m = make(map[string]bool)
    for _, pair := range splitPairs {
        p := strings.Split(pair, ":")
        phrase := p[0]
        m[phrase] = true
        if len(p) < 2 {
            continue
        }
        freq, err := strconv.Atoi(p[1])
        if err != nil {
            continue
        }
        pairs = append(pairs, Pair{
            Phrase: phrase,
            Frequency: freq,
        })
      }
    return pairs, m
}

//  exclude returns a slice of Pair that does not contain the phrases in the exclusion map
func exclude(pairs []Pair, exclusions map[string]bool) []Pair{
    var ret []Pair 
    for i := range pairs{
        if !exclusions[pairs[i].Phrase]{
            ret = append(ret, pairs[i])
        }
    }
    return ret
}


In [64]:
parsedPositives, posPhrases := pairsAndFilters(pairsPositive)
parsedNegatives, negPhrases := pairsAndFilters(pairsNegative)
parsedPositives = exclude(parsedPositives, negPhrases)
parsedNegatives = exclude(parsedNegatives, posPhrases)

In [65]:
dfPos := dataframe.LoadStructs(parsedPositives)
dfNeg := dataframe.LoadStructs(parsedNegatives)

In [66]:
dfPos = dfPos.Arrange(dataframe.RevSort("Frequency"))
dfNeg = dfNeg.Arrange(dataframe.RevSort("Frequency"))

In [67]:
//most common words in positive reviews
fmt.Println(dfPos)

[46383x2] DataFrame

    Phrase       Frequency
 0: tic-tac-toe  10       
 1: wusthoff     7        
 2: emperor      7        
 3: shot_glasses 6        
 4: pulp         6        
 5: games        6        
 6: sentry       6        
 7: gravel       6        
 8: the_emperor  5        
 9: aebleskivers 5        
    ...          ...      
    <string>     <int>    



373 <nil>

In [68]:
// most common words in negative reviews
fmt.Println(dfNeg)

[45760x2] DataFrame

    Phrase          Frequency
 0: seeds           9        
 1: perculator      7        
 2: probes          7        
 3: cork            7        
 4: coffee_tank     5        
 5: brookstone      5        
 6: convection_oven 5        
 7: black_goo       5        
 8: waring_pro      5        
 9: packs           5        
    ...             ...      
    <string>        <int>    



412 <nil>