In [1]:
#### TEAMWORK FOR CLEANING ####
import nltk
import re, string
from nltk.tag import pos_tag
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.corpus import stopwords
from nltk.corpus import twitter_samples
from nltk import FreqDist
import random
from nltk import classify
from nltk import NaiveBayesClassifier
from nltk import DecisionTreeClassifier

positive_tweets = twitter_samples.strings('positive_tweets.json')
negative_tweets = twitter_samples.strings('negative_tweets.json')
stop_words = stopwords.words('english')

def remove_noise(tweet_tokens, stop_words = ()):

    cleaned_tokens = []

    for token, tag in pos_tag(tweet_tokens):
        token = re.sub('http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+#]|[!*(),]|'
                       '(?:%[0-9a-fA-F][0-9a-fA-F]))+','', token)
        token = re.sub("(@[A-Za-z0-9_]+)","", token)

        if tag.startswith("NN"):
            pos = 'n'
        elif tag.startswith('VB'):
            pos = 'v'
        else:
            pos = 'a'

        lemmatizer = WordNetLemmatizer()
        token = lemmatizer.lemmatize(token, pos)

        if len(token) > 0 and token not in string.punctuation and token.lower() not in stop_words:
            cleaned_tokens.append(token.lower())
    return cleaned_tokens

positive_tweet_tokens = twitter_samples.tokenized('positive_tweets.json')
negative_tweet_tokens = twitter_samples.tokenized('negative_tweets.json')
positive_cleaned_tokens_list = []
negative_cleaned_tokens_list = []

for tokens in positive_tweet_tokens:
    positive_cleaned_tokens_list.append(remove_noise(tokens, stop_words))

for tokens in negative_tweet_tokens:
    negative_cleaned_tokens_list.append(remove_noise(tokens, stop_words))
    
def get_all_words(cleaned_tokens_list):
    for tokens in cleaned_tokens_list:
        for token in tokens:
            yield token
            
all_pos_words = get_all_words(positive_cleaned_tokens_list)

freq_dist_pos = FreqDist(all_pos_words)

def get_tweets_for_model(cleaned_tokens_list):
    for tweet_tokens in cleaned_tokens_list:
        yield dict([token, True] for token in tweet_tokens)

# Split the data in train and test
positive_tokens_for_model = get_tweets_for_model(positive_cleaned_tokens_list)
negative_tokens_for_model = get_tweets_for_model(negative_cleaned_tokens_list)

positive_dataset = [(tweet_dict, "Positive") for tweet_dict in positive_tokens_for_model]
negative_dataset = [(tweet_dict, "Negative") for tweet_dict in negative_tokens_for_model]
dataset = positive_dataset + negative_dataset
random.shuffle(dataset)


In [2]:
def getFeatureVector(dataset):
    featureVector = []

    words = dataset.split() # split tweet into words
    for w in words:
        
        w = replaceTwoOrMore(w)  # replace two or more with two occurrences
        
        w = w.strip('\'"?,.') # strip punctuation
        
        val = re.search(r"^[a-zA-Z][a-zA-Z0-9]*$", w) # check if stats with an alphabet
        
        if(w in stopWords or val is None): # ignore stop word
            continue
        else:
            featureVector.append(w.lower())
    return featureVector

In [3]:
def featureExtraction():
    featureVector = getFeatureVector(dataset)
    dataset.append((featureVector, sentiment))
    return dataset

In [4]:
def get_words_in_tweets(dataset):
    all_words = []
    for (text, sentiment) in dataset:
        all_words.extend(text)
    return all_words

In [5]:
def get_word_features(wordlist):
    wordlist = nltk.FreqDist(wordlist) # frequency distrubtion of all words
    word_features = wordlist.keys()
    return word_features

word_features = get_word_features(get_words_in_tweets(dataset))

In [6]:
def extract_features(dataset):
    settweet = set(dataset)
    features = {}
    for word in word_features:
        features['contains(%s)' % word] = (word in settweet)
    return features

In [7]:
test_set = nltk.classify.apply_features(extract_features, dataset[:3000])
train_set = nltk.classify.apply_features(extract_features, dataset[3000:])

In [8]:
classifier_nb = NaiveBayesClassifier.train(train_set)

print("Accuracy is:", classify.accuracy(classifier_nb, test_set))

print(classifier_nb.show_most_informative_features(10))

Accuracy is: 0.9953333333333333
Most Informative Features
            contains(:() = True           Negati : Positi =   2071.1 : 1.0
            contains(:)) = True           Positi : Negati =   1661.3 : 1.0
           contains(sad) = True           Negati : Positi =     24.1 : 1.0
      contains(follower) = True           Positi : Negati =     23.8 : 1.0
       contains(welcome) = True           Positi : Negati =     23.8 : 1.0
           contains(bam) = True           Positi : Negati =     23.7 : 1.0
          contains(glad) = True           Positi : Negati =     22.3 : 1.0
        contains(arrive) = True           Positi : Negati =     19.0 : 1.0
      contains(followed) = True           Negati : Positi =     17.8 : 1.0
           contains(x15) = True           Negati : Positi =     17.7 : 1.0
None


In [9]:
# pip_install tweepy
# pip_install pandas
# pip_install numpy
# pip install matplotlib
# pip install textblob

from textblob import TextBlob
import tweepy
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import re

# Our Twitter API Credentials:
# (Normally, we store these in a seperate file for precautionary measures.
#  However, for the sake of simplicity, we are providing them here.)
Consumer_Key = 'j1PZszWJPTn50m9CyZaKQEL4V'
Consumer_Secret = '0VLZXNHBidBXOYELBSmAJnWpsohTlMQ2flop7cGY4IRd7gRPc8'        
Access_Token = '1334510916344360963-6ESgNupKmKn8yuCEEpi91XLO7cXttC'    
Access_Token_Secret = 'O4ghcRniTGXb9V7LIcfW5CpK93Pc6yfTiRY5vxSZReHuq'

class Twitter_Customer():

    def __init__(self, twitter_user = None):
        self.auth = Authentication().authenticate_twitter_app()
        self.twitter_client = tweepy.API(self.auth)
        self.twitter_user = twitter_user

    def twitter_customer_api(self):
        return self.twitter_client

    
class Authentication():

    def authenticate_twitter_app(self):
        auth = tweepy.OAuthHandler(Consumer_Key, Consumer_Secret)
        auth.set_access_token(Access_Token, Access_Token_Secret)
        return auth

    
class My_Stream_Listener():

    def __init__(self):
        self.twitter_autenticator = Authentication()    

    def stream_tweets(self, fetched_tweets_filename):
        listener = TwitterListener(fetched_tweets_filename)
        auth = self.twitter_autenticator.authenticate_twitter_app() 
        stream = tweepy.Stream(auth, listener)
        
class TwitterListener(tweepy.StreamListener):

    def __init__(self, fetched_tweets_filename):
        self.fetched_tweets_filename = fetched_tweets_filename

    def on_data(self, data):
        try:
            print(data)
            with open(self.fetched_tweets_filename, 'a') as tf:
                tf.write(data)
            return True
        except BaseException as e:
            print("Error on_data %s" % str(e))
        return True
          
    def on_error(self, status):
        if status == 420:
            return False
        print(status)
        
class TweetAnalyzer():

    def clean_tweet(self, tweet):
        return ' '.join(re.sub("(@[A-Za-z0-9]+)|([^0-9A-Za-z \t])|(\w+:\/\/\S+)|(RT[\s]+)", " ", tweet).split())

    def analyze_sentiment_polarity(self, tweet):
        analysis = TextBlob(self.clean_tweet(tweet))
        
        if analysis.sentiment.polarity < 0:
            return 'Negative'
        elif analysis.sentiment.polarity == 0:
            return 'Neutral'
        else:
            return 'Positive'
    
    def analyze_sentiment_subjectivity(self, tweet):
        analysis = TextBlob(self.clean_tweet(tweet))
        return round(analysis.sentiment.subjectivity, 2)

    def tweets_to_data_frame(self, tweets):
        df = pd.DataFrame(data=[tweet.text for tweet in tweets], columns=['tweets'])
        df['date'] = np.array([tweet.created_at for tweet in tweets])
        df['len'] = np.array([len(tweet.text) for tweet in tweets])
        df['likes'] = np.array([tweet.favorite_count for tweet in tweets])
        df['retweets'] = np.array([tweet.retweet_count for tweet in tweets])
        return df
 
if __name__ == '__main__':

    twitter_client = Twitter_Customer()
    tweet_analyzer = TweetAnalyzer()

    api = twitter_client.twitter_customer_api()

    #tweets = api.user_timeline(screen_name = "TheRock", count=200)
    tweets = api.search(q = "#Apple",count = 100, lang = "en")
    
    df = tweet_analyzer.tweets_to_data_frame(tweets)
    df['sentiment'] = np.array([tweet_analyzer.analyze_sentiment_polarity(tweet) for tweet in df['tweets']])
    df['subjectivity'] = np.array([tweet_analyzer.analyze_sentiment_subjectivity(tweet) for tweet in df['tweets']])
    df.to_csv('twitter_data.csv')

#df = pd.read_csv('twitter_data.csv')
#df.head()

In [11]:
df.shape

(79, 7)

In [12]:
def remove_URL(headline_text):
    url = re.compile(r'https?://\S+|www\.\S+')
    return url.sub(r'', headline_text)

In [14]:
df['tweets']=df['tweets'].apply(remove_URL)

In [16]:
def remove_html(headline_text):
    html=re.compile(r'<.*?>')
    return html.sub(r'',headline_text)

df['tweets']=df['tweets'].apply(remove_html)


In [17]:
import re, string

In [18]:
def remove_punct(headline_text):
    table=str.maketrans('','',string.punctuation)
    return headline_text.translate(table)
df['tweets']=df['tweets'].apply(remove_punct)

In [19]:
df.head(3)

Unnamed: 0,tweets,date,len,likes,retweets,sentiment,subjectivity
0,RT 907kMarketing What ya’ll think of the apple...,2020-12-09 03:59:23,127,0,1,Positive,0.1
1,Come on Apple Really 549 for headphones They B...,2020-12-09 03:59:06,120,0,0,Positive,0.35
2,I used the Apple HomePodMini speakers to liste...,2020-12-09 03:58:47,140,0,0,Positive,0.53


In [20]:
def remove_stopwords(text):
    words = [w for w in text if w not in stopwords.words('english')]
    return df['tweets']==df['tweets'].apply(remove_stopwords)

In [21]:
df.head(3)

Unnamed: 0,tweets,date,len,likes,retweets,sentiment,subjectivity
0,RT 907kMarketing What ya’ll think of the apple...,2020-12-09 03:59:23,127,0,1,Positive,0.1
1,Come on Apple Really 549 for headphones They B...,2020-12-09 03:59:06,120,0,0,Positive,0.35
2,I used the Apple HomePodMini speakers to liste...,2020-12-09 03:58:47,140,0,0,Positive,0.53


In [22]:
normalization = None
normalization = 'stemmer'
normalization = 'lemmatizer'
def stem_tokens(tokens):
    stemmer = nltk.stem.PorterStemmer()
    tokens = [stemmer.stem(token) for token in tokens]
    return tokens
def lemmatize_tokens(tokens):
    lemmatizer = nltk.stem.WordNetLemmatizer()
    tokens = [lemmatizer.lemmatize(token) for token in tokens]
    return tokens
def normalize_tokens(normalization):
    if normalization is not None:
        if normalization == 'stemmer':
            df['tweets']==df['tweets'].apply(stem_tokens)
        elif normalization == 'lemmatizer':
            df['tweets']==df['tweets'].apply(lemmatize_tokens)
        
normalize_tokens(normalization)

In [25]:
postitive_data = list(df[df['sentiment'] == 'Positive']['tweets'].apply(lambda x : x.split()))
negative_data = list(df[df['sentiment'] == 'Negative']['tweets'].apply(lambda x : x.split()))

positive_cleaned_tokens_list = []
negative_cleaned_tokens_list = []

for tokens in postitive_data:
    positive_cleaned_tokens_list.append(remove_noise(tokens, stop_words))

for tokens in negative_data:
    negative_cleaned_tokens_list.append(remove_noise(tokens, stop_words))
    
def get_all_words(cleaned_tokens_list):
    for tokens in cleaned_tokens_list:
        for token in tokens:
            yield token
            
all_pos_words = get_all_words(positive_cleaned_tokens_list)

freq_dist_pos = FreqDist(all_pos_words)

def get_tweets_for_model(cleaned_tokens_list):
    for tweet_tokens in cleaned_tokens_list:
        yield dict([token, True] for token in tweet_tokens)

# Split the data in train and test
positive_tokens_for_model = get_tweets_for_model(positive_cleaned_tokens_list)
negative_tokens_for_model = get_tweets_for_model(negative_cleaned_tokens_list)

positive_dataset = [(tweet_dict, "Positive") for tweet_dict in positive_tokens_for_model]
negative_dataset = [(tweet_dict, "Negative") for tweet_dict in negative_tokens_for_model]
dataset = positive_dataset + negative_dataset


test_data = dataset

print("Accuracy is:", classify.accuracy(classifier_nb, test_data))

print(classifier_nb.show_most_informative_features(10))

Accuracy is: 0.21428571428571427
Most Informative Features
            contains(:() = True           Negati : Positi =   2071.1 : 1.0
            contains(:)) = True           Positi : Negati =   1661.3 : 1.0
           contains(sad) = True           Negati : Positi =     24.1 : 1.0
      contains(follower) = True           Positi : Negati =     23.8 : 1.0
       contains(welcome) = True           Positi : Negati =     23.8 : 1.0
           contains(bam) = True           Positi : Negati =     23.7 : 1.0
          contains(glad) = True           Positi : Negati =     22.3 : 1.0
        contains(arrive) = True           Positi : Negati =     19.0 : 1.0
      contains(followed) = True           Negati : Positi =     17.8 : 1.0
           contains(x15) = True           Negati : Positi =     17.7 : 1.0
None
