# [Twitter sentiment analysis using Python and NLTK](http://www.laurentluce.com/posts/twitter-sentiment-analysis-using-python-and-nltk/)

> The purpose of the implementation is to be able to automatically classify a tweet as a positive or negative tweet sentiment wise.



## Implementation

In [1]:
pos_tweets = [('I love this car', 'positive'),
              ('This view is amazing', 'positive'),
              ('I feel great this morning', 'positive'),
              ('I am so excited about the concert', 'positive'),
              ('He is my best friend', 'positive')]

In [2]:
neg_tweets = [('I do not like this car', 'negative'),
              ('This view is horrible', 'negative'),
              ('I feel tired this morning', 'negative'),
              ('I am not looking forward to the concert', 'negative'),
              ('He is my enemy', 'negative')]

In [3]:
tweets = []
for (words, sentiment) in pos_tweets + neg_tweets:
    words_filtered = [e.lower() for e in words.split() if len(e) >= 3] 
    tweets.append((words_filtered, sentiment))

In [4]:
tweets

[(['love', 'this', 'car'], 'positive'),
 (['this', 'view', 'amazing'], 'positive'),
 (['feel', 'great', 'this', 'morning'], 'positive'),
 (['excited', 'about', 'the', 'concert'], 'positive'),
 (['best', 'friend'], 'positive'),
 (['not', 'like', 'this', 'car'], 'negative'),
 (['this', 'view', 'horrible'], 'negative'),
 (['feel', 'tired', 'this', 'morning'], 'negative'),
 (['not', 'looking', 'forward', 'the', 'concert'], 'negative'),
 (['enemy'], 'negative')]

In [5]:
test_tweets = [
    (['feel', 'happy', 'this', 'morning'], 'positive'),
    (['larry', 'friend'], 'positive'),
    (['not', 'like', 'that', 'man'], 'negative'),
    (['house', 'not', 'great'], 'negative'),
    (['your', 'song', 'annoying'], 'negative')]

## Classifier

In [7]:
import nltk

In [10]:

def get_words_in_tweets(tweets):
    all_words = []
    for (words, sentiment) in tweets:
      all_words.extend(words)
    return all_words

def get_word_features(wordlist):
    wordlist = nltk.FreqDist(wordlist)
    word_features = wordlist.keys()
    return word_features


In [11]:
word_features = get_word_features(get_words_in_tweets(tweets))

In [12]:
word_features

dict_keys(['love', 'this', 'car', 'view', 'amazing', 'feel', 'great', 'morning', 'excited', 'about', 'the', 'concert', 'best', 'friend', 'not', 'like', 'horrible', 'tired', 'looking', 'forward', 'enemy'])

In [13]:
def extract_features(document):
    document_words = set(document)
    features = {}
    for word in word_features:
        features['contains(%s)' % word] = (word in document_words)
    return features

In [18]:
document = {'contains(not)': False,
 'contains(view)': False,
 'contains(best)': False,
 'contains(excited)': False,
 'contains(morning)': False,
 'contains(about)': False,
 'contains(horrible)': False,
 'contains(like)': False,
 'contains(this)': True,
 'contains(friend)': False,
 'contains(concert)': False,
 'contains(feel)': False,
 'contains(love)': True,
 'contains(looking)': False,
 'contains(tired)': False,
 'contains(forward)': False,
 'contains(car)': True,
 'contains(the)': False,
 'contains(amazing)': False,
 'contains(enemy)': False,
 'contains(great)': False}

In [20]:
extract_features(document)

{'contains(about)': False,
 'contains(amazing)': False,
 'contains(best)': False,
 'contains(car)': False,
 'contains(concert)': False,
 'contains(enemy)': False,
 'contains(excited)': False,
 'contains(feel)': False,
 'contains(forward)': False,
 'contains(friend)': False,
 'contains(great)': False,
 'contains(horrible)': False,
 'contains(like)': False,
 'contains(looking)': False,
 'contains(love)': False,
 'contains(morning)': False,
 'contains(not)': False,
 'contains(the)': False,
 'contains(this)': False,
 'contains(tired)': False,
 'contains(view)': False}

In [21]:
training_set = nltk.classify.apply_features(extract_features, tweets)

In [22]:
training_set

[({'contains(love)': True, 'contains(this)': True, 'contains(car)': True, 'contains(view)': False, 'contains(amazing)': False, 'contains(feel)': False, 'contains(great)': False, 'contains(morning)': False, 'contains(excited)': False, 'contains(about)': False, 'contains(the)': False, 'contains(concert)': False, 'contains(best)': False, 'contains(friend)': False, 'contains(not)': False, 'contains(like)': False, 'contains(horrible)': False, 'contains(tired)': False, 'contains(looking)': False, 'contains(forward)': False, 'contains(enemy)': False}, 'positive'), ({'contains(love)': False, 'contains(this)': True, 'contains(car)': False, 'contains(view)': True, 'contains(amazing)': True, 'contains(feel)': False, 'contains(great)': False, 'contains(morning)': False, 'contains(excited)': False, 'contains(about)': False, 'contains(the)': False, 'contains(concert)': False, 'contains(best)': False, 'contains(friend)': False, 'contains(not)': False, 'contains(like)': False, 'contains(horrible)': Fa

In [27]:
classifier = nltk.NaiveBayesClassifier.train(training_set)

### Here is a summary of what we just saw:

![alt text](http://www.laurentluce.com/images/blog/nltk/overview.png "summary")

In [35]:
classifier.show_most_informative_features(32)

Most Informative Features
           contains(not) = False          positi : negati =      1.6 : 1.0
       contains(looking) = False          positi : negati =      1.2 : 1.0
       contains(amazing) = False          negati : positi =      1.2 : 1.0
          contains(best) = False          negati : positi =      1.2 : 1.0
         contains(great) = False          negati : positi =      1.2 : 1.0
       contains(forward) = False          positi : negati =      1.2 : 1.0
        contains(friend) = False          negati : positi =      1.2 : 1.0
         contains(enemy) = False          positi : negati =      1.2 : 1.0
          contains(like) = False          positi : negati =      1.2 : 1.0
         contains(about) = False          negati : positi =      1.2 : 1.0
       contains(excited) = False          negati : positi =      1.2 : 1.0
      contains(horrible) = False          positi : negati =      1.2 : 1.0
          contains(love) = False          negati : positi =      1.2 : 1.0

## Classify

In [31]:
tweet = 'Larry is my friend'
print (classifier.classify(extract_features(tweet.split())))

positive


In [39]:
tweet = 'Your enemy is not friend'
classifier.classify(extract_features(tweet.split()))

'negative'