#Importing NLTK

In [None]:
import nltk
nltk.download("all")

#Cleaning Data

In [47]:
from nltk.corpus import twitter_samples
positive_tweets = twitter_samples.tokenized("positive_tweets.json")
negative_tweets = twitter_samples.tokenized("negative_tweets.json")
print(positive_tweets[0])
print(negative_tweets[0])

['#FollowFriday', '@France_Inte', '@PKuchly57', '@Milipol_Paris', 'for', 'being', 'top', 'engaged', 'members', 'in', 'my', 'community', 'this', 'week', ':)']
['hopeless', 'for', 'tmr', ':(']


In [48]:
def clean_data(token):
  return [item for item in token if not item.startswith("@") and not item.startswith("http")]

def to_lower(token):
  return [item.lower() for item in token]

positive_tweets = [clean_data(to_lower(item)) for item in positive_tweets]
negative_tweets = [clean_data(to_lower(item)) for item in negative_tweets]

print(positive_tweets[0])
print(negative_tweets[0])

['#followfriday', 'for', 'being', 'top', 'engaged', 'members', 'in', 'my', 'community', 'this', 'week', ':)']
['hopeless', 'for', 'tmr', ':(']


In [49]:
from nltk.stem import WordNetLemmatizer
from nltk.tag import pos_tag

pos_tag(positive_tweets[0])

[('#followfriday', 'NN'),
 ('for', 'IN'),
 ('being', 'VBG'),
 ('top', 'JJ'),
 ('engaged', 'VBN'),
 ('members', 'NNS'),
 ('in', 'IN'),
 ('my', 'PRP$'),
 ('community', 'NN'),
 ('this', 'DT'),
 ('week', 'NN'),
 (':)', 'NN')]

##Lemmatizing Data

In [50]:
def lemmatize(token):
  lemmatizer = WordNetLemmatizer()

  result = []
  for item, tag in pos_tag(token):
    if tag[0].lower() in "nva":
      result.append(lemmatizer.lemmatize(item, tag[0].lower()))
    else:
      result.append(lemmatizer.lemmatize(item))
  
  return result
positive_tweets = [lemmatize(item) for item in positive_tweets]
negative_tweets = [lemmatize(item) for item in negative_tweets]

In [51]:
positive_tweets[0]

['#followfriday',
 'for',
 'be',
 'top',
 'engage',
 'member',
 'in',
 'my',
 'community',
 'this',
 'week',
 ':)']

##Removing Stop-Words

In [52]:
from nltk.corpus import stopwords

def remove_stopwords(token, stop_words):
  return [item for item in token if item not in stop_words]

stop_words = stopwords.words('english')
positive_tweets = [remove_stopwords(item, stop_words) for item in positive_tweets]
negative_tweets = [remove_stopwords(item, stop_words) for item in negative_tweets]

In [53]:
positive_tweets[0]

['#followfriday', 'top', 'engage', 'member', 'community', 'week', ':)']

In [54]:
def transform_feature(token):
  feature_set = {}
  for feature in token:
    if feature not in feature_set:
      feature_set[feature] = 0
    feature_set[feature] += 1
  return feature_set

positive_tweets = [(transform_feature(item), "Positive") for item in positive_tweets]
negative_tweets = [(transform_feature(item), "Negative") for item in negative_tweets]

In [55]:
positive_tweets[0]

({'#followfriday': 1,
  ':)': 1,
  'community': 1,
  'engage': 1,
  'member': 1,
  'top': 1,
  'week': 1},
 'Positive')

#Creating Training and testing datasets

Then creating the model.

In [56]:
import random
dataset = positive_tweets + negative_tweets
random.shuffle(dataset)

training_data = dataset[:7000]
test_data = dataset[7000:]

In [57]:
from nltk import NaiveBayesClassifier
classifier = NaiveBayesClassifier.train(training_data)

In [58]:
from nltk import classify
print("Accuracy: ", classify.accuracy(classifier, test_data))
print(classifier.show_most_informative_features(10))

Accuracy:  0.994
Most Informative Features
                      :( = 1              Negati : Positi =   2031.6 : 1.0
                      :) = 1              Positi : Negati =   1589.9 : 1.0
                       ( = 2              Negati : Positi =     46.4 : 1.0
                follower = 1              Positi : Negati =     23.8 : 1.0
                     sad = 1              Negati : Positi =     23.3 : 1.0
                    glad = 1              Positi : Negati =     22.0 : 1.0
                  arrive = 1              Positi : Negati =     19.9 : 1.0
                     bam = 1              Positi : Negati =     18.7 : 1.0
                     x15 = 1              Negati : Positi =     17.9 : 1.0
                   enjoy = 1              Positi : Negati =     17.3 : 1.0
None
