In [1]:
!pip3 install textblob




In [2]:
from textblob import TextBlob

analysis = TextBlob("The book was very good!")

In [3]:
print(dir(analysis))

['__add__', '__class__', '__contains__', '__delattr__', '__dict__', '__dir__', '__doc__', '__eq__', '__format__', '__ge__', '__getattribute__', '__getitem__', '__gt__', '__hash__', '__init__', '__init_subclass__', '__iter__', '__le__', '__len__', '__lt__', '__module__', '__ne__', '__new__', '__reduce__', '__reduce_ex__', '__repr__', '__setattr__', '__sizeof__', '__str__', '__subclasshook__', '__weakref__', '_cmpkey', '_compare', '_create_sentence_objects', '_strkey', 'analyzer', 'classifier', 'classify', 'correct', 'detect_language', 'ends_with', 'endswith', 'find', 'format', 'index', 'join', 'json', 'lower', 'ngrams', 'noun_phrases', 'np_counts', 'np_extractor', 'parse', 'parser', 'polarity', 'pos_tagger', 'pos_tags', 'raw', 'raw_sentences', 'replace', 'rfind', 'rindex', 'sentences', 'sentiment', 'sentiment_assessments', 'serialized', 'split', 'starts_with', 'startswith', 'string', 'strip', 'stripped', 'subjectivity', 'tags', 'title', 'to_json', 'tokenize', 'tokenizer', 'tokens', 'tra

In [4]:
print(analysis.translate(to='es'))

¡El libro fue muy bueno!


#### These are parts of speech. Since TextBlob is built on top of NLTK, the part of speech tags are the same.

In [5]:
print(analysis.tags)

[('The', 'DT'), ('book', 'NN'), ('was', 'VBD'), ('very', 'RB'), ('good', 'JJ')]


In [6]:
print(analysis.sentiment)

Sentiment(polarity=1.0, subjectivity=0.7800000000000001)


In [7]:
from textblob import TextBlob

pos_count = 0
pos_correct = 0

with open("positive.txt","r") as f:
    for line in f.read().split('\n'):
        analysis = TextBlob(line)
        if analysis.sentiment.polarity > 0:
            pos_correct += 1
        pos_count +=1


neg_count = 0
neg_correct = 0

with open("negative.txt","r") as f:
    for line in f.read().split('\n'):
        analysis = TextBlob(line)
        if analysis.sentiment.polarity <= 0:
            neg_correct += 1
        neg_count +=1

print("Positive accuracy = {}% via {} samples".format(pos_correct/pos_count*100.0, pos_count))
print("Negative accuracy = {}% via {} samples".format(neg_correct/neg_count*100.0, neg_count))

Positive accuracy = 71.11777944486121% via 5332 samples
Negative accuracy = 55.8702175543886% via 5332 samples


### It looks like our positive accuracy is decent, but the negative sentiment accuracy isn't all that good. It could be the case that this classifier is biased across the board, so our "zero" could be moved a bit, let's say 0.2, so we change: 
#### if analysis.sentiment.polarity > 0.2: pos_correct += 1
#### if analysis.sentiment.polarity < 0.2: neg_correct += 1

In [8]:
from textblob import TextBlob

pos_count = 0
pos_correct = 0

with open("positive.txt","r") as f:
    for line in f.read().split('\n'):
        analysis = TextBlob(line)
        if analysis.sentiment.polarity > 0.2:
            pos_correct += 1
        pos_count +=1


neg_count = 0
neg_correct = 0

with open("negative.txt","r") as f:
    for line in f.read().split('\n'):
        analysis = TextBlob(line)
        if analysis.sentiment.polarity <= 0.2:
            neg_correct += 1
        neg_count +=1

print("Positive accuracy = {}% via {} samples".format(pos_correct/pos_count*100.0, pos_count))
print("Negative accuracy = {}% via {} samples".format(neg_correct/neg_count*100.0, neg_count))

Positive accuracy = 46.17404351087772% via 5332 samples
Negative accuracy = 80.1012753188297% via 5332 samples


### Hmm, well that's better than random I guess, but not something we want to see. What if we play with the subjectivity though? Maybe we can only look at reviews that we feel are more objective?

In [9]:
from textblob import TextBlob

pos_count = 0
pos_correct = 0

with open("positive.txt","r") as f:
    for line in f.read().split('\n'):
        analysis = TextBlob(line)

        if analysis.sentiment.subjectivity < 0.9:
            if analysis.sentiment.polarity > 0:
                pos_correct += 1
            pos_count +=1


neg_count = 0
neg_correct = 0

with open("negative.txt","r") as f:
    for line in f.read().split('\n'):
        analysis = TextBlob(line)
        if analysis.sentiment.subjectivity < 0.9:
            if analysis.sentiment.polarity <= 0:
                neg_correct += 1
            neg_count +=1

print("Positive accuracy = {}% via {} samples".format(pos_correct/pos_count*100.0, pos_count))
print("Negative accuracy = {}% via {} samples".format(neg_correct/neg_count*100.0, neg_count))

Positive accuracy = 70.46771523178808% via 4832 samples
Negative accuracy = 55.10121457489878% via 4940 samples


#### Interesting, I must not understand subjectivity. Interestingly, if we require subjectivity to be BELOW 0.1, I get: 
#### Positive accuracy = 2.914389799635701% via 549 samples Negative accuracy = 98.1159420289855% via 690 samples [Finished in 6.5s]

# As we are not Concerned About Neutral Sentiments we can skip the center part and can ignore polarity between -0.5 < polarity < 0.5.

### But Doing So our number of sample we are testing will get reduced!

In [10]:
from textblob import TextBlob

pos_count = 0
pos_correct = 0

with open("positive.txt","r") as f:
    for line in f.read().split('\n'):
        analysis = TextBlob(line)

        if analysis.sentiment.polarity >= 0.5:
            if analysis.sentiment.polarity > 0:
                pos_correct += 1
            pos_count +=1


neg_count = 0
neg_correct = 0

with open("negative.txt","r") as f:
    for line in f.read().split('\n'):
        analysis = TextBlob(line)
        if analysis.sentiment.polarity <= -0.5:
            if analysis.sentiment.polarity <= 0:
                neg_correct += 1
            neg_count +=1

print("Positive accuracy = {}% via {} samples".format(pos_correct/pos_count*100.0, pos_count))
print("Negative accuracy = {}% via {} samples".format(neg_correct/neg_count*100.0, neg_count))

Positive accuracy = 100.0% via 766 samples
Negative accuracy = 100.0% via 282 samples


#### Hmm, okay so we lost a lot of samples, but got perfect accuracy. What if we change this threshold just a bit, let's go with 0.1 and -0.1 instead.

#### Positive accuracy = 100.0% via 3310 samples
#### Negative accuracy = 100.0% via 1499 samples [Finished in 6.5s]
### Now Lets try with 0.00000001!

In [11]:
from textblob import TextBlob

pos_count = 0
pos_correct = 0

with open("positive.txt","r") as f:
    for line in f.read().split('\n'):
        analysis = TextBlob(line)

        if analysis.sentiment.polarity >= 0.000000000001:
            if analysis.sentiment.polarity > 0:
                pos_correct += 1
            pos_count +=1


neg_count = 0
neg_correct = 0

with open("negative.txt","r") as f:
    for line in f.read().split('\n'):
        analysis = TextBlob(line)
        if analysis.sentiment.polarity <= -0.000000000001:
            if analysis.sentiment.polarity <= 0:
                neg_correct += 1
            neg_count +=1

print("Positive accuracy = {}% via {} samples".format(pos_correct/pos_count*100.0, pos_count))
print("Negative accuracy = {}% via {} samples".format(neg_correct/neg_count*100.0, neg_count))

Positive accuracy = 100.0% via 3790 samples
Negative accuracy = 100.0% via 2072 samples


### Thus the Problem was with polarity=0 ! By Eliminating those sentiments our accuracy was very good.

In [12]:
from textblob import TextBlob

analysis = TextBlob("the book ws good")

In [13]:
print(analysis.sentiment)

Sentiment(polarity=0.7, subjectivity=0.6000000000000001)


In [15]:
analysis.sentiment.polarity

0.7