In [41]:
from nltk.corpus import twitter_samples, stopwords
from nltk import WordNetLemmatizer
from nltk.tag import pos_tag
from nltk import NaiveBayesClassifier, classify
import random
import pickle

In [39]:
def clean_data(token):
    """Lower-case the tokens of one tweet, dropping @mentions and links.

    Args:
        token: iterable of string tokens belonging to a single tweet.

    Returns:
        A new list with the surviving tokens, lower-cased, in their
        original order.
    """
    cleaned = []
    for word in token:
        # The prefixes are tested on the raw token, before lower-casing.
        if word.startswith('@') or word.startswith('http'):
            continue
        cleaned.append(word.lower())
    return cleaned

def lemmatize(token):
    """Lemmatize the tokens of one tweet using their part-of-speech tags.

    The tokens are tagged with ``pos_tag`` and each tag is translated to the
    part-of-speech code expected by ``WordNetLemmatizer.lemmatize``.

    Args:
        token: list of string tokens belonging to a single tweet.

    Returns:
        A list with one lemma per input token, in the same order.
    """
    lemmatizer = WordNetLemmatizer()

    # pos_tag emits Penn Treebank tags by default: nouns are NN*, verbs VB*,
    # adjectives JJ* and adverbs RB*.  Adjectives therefore start with "J",
    # not "A"; "A" is kept only so tags that already matched keep working.
    first_letter_to_pos = {"N": "n", "V": "v", "J": "a", "A": "a"}

    result = []
    for item, tag in pos_tag(token):
        tag = tag.upper()
        if tag.startswith("RB"):
            pos = "r"
        else:
            # tag[:1] tolerates an empty tag; anything unmapped falls back
            # to "n", which is the lemmatizer's own default.
            pos = first_letter_to_pos.get(tag[:1], "n")
        result.append(lemmatizer.lemmatize(item, pos))
    return result

def remove_stop_words(token):
    """Drop English stop words from the tokens of one tweet.

    Args:
        token: iterable of string tokens belonging to a single tweet.

    Returns:
        A new list with every token that is not in NLTK's English
        stop-word list, in the original order.
    """
    # The corpus reader returns a list; a set makes each membership test
    # constant-time instead of a scan over the whole stop-word list.
    stop_words = set(stopwords.words('english'))
    return [item for item in token if item not in stop_words]

def transform_features(token):
    """Turn the tokens of one tweet into a presence-feature dictionary.

    Args:
        token: iterable of hashable tokens belonging to a single tweet.

    Returns:
        A dict mapping each distinct token to 1.  Keys appear in order of
        first occurrence; repeated tokens collapse into a single key.
    """
    # Every feature is a simple "word is present" flag, so there is no need
    # to initialise a counter before setting it.
    return dict.fromkeys(token, 1)

def main(train_size=7000, seed=42, model_path="model.pkl"):
    """Train a Naive Bayes sentiment classifier on the NLTK Twitter samples.

    The positive and negative sample tweets are cleaned, lemmatized, stripped
    of stop words and converted to feature dictionaries.  The labelled
    examples are shuffled, split into training and test sets, used to train
    a ``NaiveBayesClassifier``, and the trained model is pickled to disk.

    Args:
        train_size: number of shuffled examples used for training; the
            remaining examples form the test set.
        seed: seed for the shuffle so the split (and the reported accuracy)
            is reproducible.  Pass ``None`` for an unseeded shuffle.
        model_path: file the trained classifier is pickled to.

    Raises:
        ValueError: if ``train_size`` would leave the training or the test
            set empty.
    """
    # Gather data
    positive_raw = twitter_samples.tokenized('positive_tweets.json')
    negative_raw = twitter_samples.tokenized('negative_tweets.json')

    # Cleaning, lemmatization and stop-word removal
    positive_tokens = [remove_stop_words(lemmatize(clean_data(item))) for item in positive_raw]
    negative_tokens = [remove_stop_words(lemmatize(clean_data(item))) for item in negative_raw]

    # Transform each tweet into a (features, label) pair
    positive_examples = [(transform_features(tokens), "Positive") for tokens in positive_tokens]
    negative_examples = [(transform_features(tokens), "Negative") for tokens in negative_tokens]
    print(positive_examples[:5])
    print("{{{{{{{{{{}}}}}}}}}}")
    print(negative_examples[:5])

    # Create dataset
    dataset = positive_examples + negative_examples
    if not 0 < train_size < len(dataset):
        raise ValueError(
            "train_size must be between 1 and %d, got %r" % (len(dataset) - 1, train_size)
        )
    # A private Random instance keeps the shuffle reproducible without
    # reseeding the module-level generator used by other code.
    random.Random(seed).shuffle(dataset)

    training_data = dataset[:train_size]
    test_data = dataset[train_size:]

    # Training
    classifier = NaiveBayesClassifier.train(training_data)
    print("Accuracy", classify.accuracy(classifier, test_data))
    # show_most_informative_features prints its own table and returns None,
    # so wrapping it in print() would emit a stray "None".
    classifier.show_most_informative_features(10)

    with open(model_path, "wb") as f:
        pickle.dump(classifier, f)

In [40]:
# Run the whole pipeline: build the dataset, train the classifier,
# print its accuracy and pickle the trained model.
main()

[({'#followfriday': 1, 'top': 1, 'engage': 1, 'member': 1, 'community': 1, 'week': 1, ':)': 1}, 'Positive'), ({'hey': 1, 'james': 1, '!': 1, 'odd': 1, ':/': 1, 'please': 1, 'call': 1, 'contact': 1, 'centre': 1, '02392441234': 1, 'able': 1, 'assist': 1, ':)': 1, 'many': 1, 'thanks': 1}, 'Positive'), ({'listen': 1, 'last': 1, 'night': 1, ':)': 1, 'bleed': 1, 'amazing': 1, 'track': 1, '.': 1, 'scotland': 1, '?': 1, '!': 1}, 'Positive'), ({'congrats': 1, ':)': 1}, 'Positive'), ({'yeaaaah': 1, 'yippppy': 1, '!': 1, 'accnt': 1, 'verify': 1, 'rqst': 1, 'succeed': 1, 'get': 1, 'blue': 1, 'tick': 1, 'mark': 1, 'fb': 1, 'profile': 1, ':)': 1, '15': 1, 'day': 1}, 'Positive')]
{{{{{{{{{{}}}}}}}}}}
[({'hopeless': 1, 'tmr': 1, ':(': 1}, 'Negative'), ({'everything': 1, 'kid': 1, 'section': 1, 'ikea': 1, 'cute': 1, '.': 1, 'shame': 1, "i'm": 1, 'nearly': 1, '19': 1, '2': 1, 'month': 1, ':(': 1}, 'Negative'), ({'heart': 1, 'slide': 1, 'waste': 1, 'basket': 1, '.': 1, ':(': 1}, 'Negative'), ({'“': 1, ':