# 1 TWITTER API

In [2]:
import tweepy
import webbrowser
import time
import re
from myconfig import *
import string
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import TweetTokenizer
from nltk import download
from textblob import TextBlob
import numpy as np
import pandas as pd
import seaborn as sb

from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import TweetTokenizer
from nltk import download

In [3]:
#Twitter class for fetching tweets

class TwitterClient(object):
    def __init__(self):
        try:
            self.auth = tweepy.OAuthHandler(twitterApiKey,twitterApiKeySecret)
            self.auth.set_access_token(twitterAccessToken,twitterAccessTokenSecret)
            self.api = tweepy.API(self.auth)
            assert self.api
        except:
            print("Error: Authentication Failed")
    
    def clean_tweet(self, tweet):
        return ' '.join(re.sub("(@[A-Za-z0-9]+)|([^0-9A-Za-z \t])|(\w+:\/\/\S+)", " ", tweet).split())
    
    def get_tweet_sentiment(self, tweet):
        # create TextBlob object of passed tweet text
        analysis = TextBlob(self.clean_tweet(tweet))
        # set sentiment
        if analysis.sentiment.polarity > 0:
            return 'positive'
        elif analysis.sentiment.polarity == 0:
            return 'neutral'
        else:
            return 'negative'
    
    def get_tweets(self, query, count = 10):
        tweets = []
        try:
            fetched_tweets = self.api.search_tweets(q = query, count = count)
            for tweet in fetched_tweets:
                all_english = True
                for c in tweet['text']:
                    if ord(c) >= 256:
                        all_english = False
                        break
                if not all_english:
                    continue
                parsed_tweet = {}
                parsed_tweet['text'] = tweet.text
                parsed_tweet['sentiment'] = self.get_tweet_sentiment(tweet.text)
                if tweet.retweet_count > 0:
                    if parsed_tweet not in tweets:
                        tweets.append(parsed_tweet)
                else:
                    tweets.append(parsed_tweet)
            return tweets
        except tweepy.TweepyException as e:
            print("Error : " + str(e))

    def fetch_tweets(self, query, count = 10):
        try:
            return self.api.search_tweets(q = query, count = count)
        except tweepy.TweepyException as e:
            print("Error : " + str(e))


In [4]:
tc = TwitterClient()

Error: Authentication Failed


In [5]:
with open('tweets.txt','w') as f:
    tweets = tc.fetch_tweets('#Modi',100)
    for tweet in tweets:
        f.write(tweet.text+"_$_")

AttributeError: 'TwitterClient' object has no attribute 'api'

# 2 Data Pre-processing

# 2.1 Reading Tweets

In [5]:
f = open('tweets.txt', 'r')
tweets = f.read().split('_$_')
f.close()
tweets[0]

'தேர்தல் சமயத்தில்தான் பாஜக கங்கையில் நீராடும்; தொற்றால் மக்கள் உயிரிழந்தபோது அதே கங்கையில் சடலங்களை தூக்கி எறிவார்க… https://t.co/yz7rKy7kB8'

# 2.2 Pre-processing each tweet

In [6]:
def process_tweet(tweet):
    '''
    Input:
        tweet: a string containing a tweet
    Output:
        tweets_clean: a list of words containing the processed tweet

    '''
    stemmer = PorterStemmer()
    stopwords_english = stopwords.words('english')
    # remove stock market tickers like $GE
    tweet = re.sub(r'\$\w*', '', tweet)
    # remove old style retweet text "RT"
    tweet = re.sub(r'^RT[\s]+', '', tweet)
    # remove hyperlinks
    tweet = re.sub(r'https?:\/\/.*[\r\n]*', '', tweet)
    # remove hashtags
    # only removing the hash # sign from the word
    tweet = re.sub(r'#', '', tweet)
    # tokenize tweets
    tokenizer = TweetTokenizer(preserve_case=False, strip_handles=True,
                               reduce_len=True)
    tweet_tokens = tokenizer.tokenize(tweet)

    tweets_clean = []
    for word in tweet_tokens:
        if (word not in stopwords_english and  # remove stopwords
            word not in string.punctuation):  # remove punctuation
            # tweets_clean.append(word)
            stem_word = stemmer.stem(word)  # stemming word
            tweets_clean.append(stem_word)

    # removing non-english words (words of other languages likes Hindi etc)
    tweets_eng = []
    for word in tweets_clean:
        flag = True
        for i in word:
            # letter > 256 => non-english letter => remove word
            if ord(i) >= 256:
                flag = False
                break
        if flag:
            tweets_eng.append(word)
    return tweets_eng

In [7]:
processed_tweets = [process_tweet(tweet) for tweet in tweets]
print(tweets[0])
print(processed_tweets[0])

தேர்தல் சமயத்தில்தான் பாஜக கங்கையில் நீராடும்; தொற்றால் மக்கள் உயிரிழந்தபோது அதே கங்கையில் சடலங்களை தூக்கி எறிவார்க… https://t.co/yz7rKy7kB8
[]


# 3 Feature Extraction

In [8]:
def count_words(tweet:list, freqs:dict, wordToTweet:dict):
    for word in tweet:
        if word in freqs:
            freqs[word] += 1
            wordToTweet[word].append(tweet)
        else:
            freqs[word] = 1
            wordToTweet[word] = [tweet]
    return freqs

In [9]:
freqs = dict()
wordToTweet = dict()
for tweet in processed_tweets:
    count_words(tweet, freqs, wordToTweet)

In [10]:
max(freqs.values())

63

# 4 Analysis

In [11]:
len(freqs.keys())

355

In [12]:
most_freq_word = np.argmax(list(freqs.values()))
list(freqs.items())[most_freq_word]

('ekadashi', 63)

In [13]:

# Sorted in descending order of frequencies
freq_sorted = list(freqs.items())
freq_sorted.sort(key = lambda x : -x[1])
freq_sorted[:10]

[('ekadashi', 63),
 ('modi', 49),
 ('shukla', 42),
 ('paksha', 42),
 ('day', 23),
 ('11', 22),
 ('1/3', 21),
 ('today', 21),
 ('guruvayoor', 21),
 ('vrishikam', 21)]

# 5 Clustering

In [14]:
def vectorize(tweet, alphabet):
    v = np.zeros(len(alphabet))
    for i in range(len(alphabet)):
        if alphabet[i] in tweet:
            v[i] += 1
    return v

def normalize(v):
    norm = np.power(np.sum(np.power(v,2)), 0.5)
    if norm == 0:
        return 0
    return v / norm

In [15]:
from collections import Counter
def len_counts(clusters):
    lens = [len(cluster) for cluster in clusters.values()]
    return dict(Counter(lens))


In [16]:
def get_avg_dist(clusterA, clusterB, vectors):
    dist_sum = 0
    for i in clusterA:
        for j in clusterB:
            dist_sum += np.sum(np.power(vectors[i] - vectors[j], 2))
    
    dist_avg = dist_sum / (len(clusterA) * len(clusterB))
    return dist_avg

In [17]:
def mergeOnce(clusters, vectors):
    min_dist = 2**30
    min_loc = (0, 0)
    for clusterInxA in clusters:
        for clusterInxB in clusters:
            if clusterInxA != clusterInxB:
                dist = get_avg_dist(clusters[clusterInxA], clusters[clusterInxB], vectors)
                if dist < min_dist:
                    min_loc = (clusterInxA, clusterInxB)
                    min_dist = dist

    clusters[min_loc[0]] = clusters[min_loc[0]] + clusters[min_loc[1]]
    clusters.pop(min_loc[1])

def mergeToK(clusters, vectors, K):
    while len(clusters.keys()) > K:
        mergeOnce(clusters, vectors)

In [18]:
def display_unique_tweets(tweets, cluster):
    c_tweets = [tweets[i] for i in cluster]
    for i in range(len(c_tweets)):
        if not c_tweets[i] in c_tweets[:i]:
            print(c_tweets[i])

In [19]:
alphabet = [i[0] for i in freq_sorted]
n = len(processed_tweets)
m = len(alphabet)
clusters = dict()
vectors = dict()
for i in range(n):
    vectors[i] = vectorize(processed_tweets[i], alphabet)
    clusters[i] = [i]
mergeToK(clusters, vectors, 15)

In [20]:
print("lengths:")
print({cluster:len(clusters[cluster]) for cluster in clusters})
print("len counts:")
print(len_counts(clusters))


lengths:
{0: 66, 20: 21, 28: 1, 32: 1, 34: 1, 44: 1, 50: 1, 60: 1, 62: 2, 75: 1, 79: 1, 82: 1, 87: 1, 89: 1, 90: 1}
len counts:
{66: 1, 21: 1, 1: 12, 2: 1}


# Classification of Tweets into Positive or Negative

In [6]:
tweets = tc.get_tweets(query = 'ssr', count = 10000)
# picking positive tweets from tweets
ptweets = [tweet for tweet in tweets if tweet['sentiment'] == 'positive']
# percentage of positive tweets
print("Positive tweets percentage: {} %".format(100*len(ptweets)/len(tweets)))
# picking negative tweets from tweets
ntweets = [tweet for tweet in tweets if tweet['sentiment'] == 'negative']
# picking neutral tweets from tweets

neutweets = [tweet for tweet in tweets if tweet['sentiment'] == 'neutral']

# percentage of negative tweets
print("Negative tweets percentage: {} %".format(100*len(ntweets)/len(tweets)))
# percentage of neutral tweets
print("Neutral tweets percentage: {} % \
    ".format(100*(len(tweets) -(len( ntweets )+len( ptweets)))/len(tweets)))
# printing first 5 positive tweets
print("\n\nPositive tweets:")
for tweet in ptweets[:10]:
    print(tweet['text'])
  
 # printing first 5 negative tweets
print("\n\nNegative tweets:")
for tweet in ntweets[:10]:
    print(tweet['text'])
    
# printing first 5 neutral tweets
print("\n\nNeutral tweets:")
for tweet in neutweets[:10]:
    print(tweet['text'])

AttributeError: 'TwitterClient' object has no attribute 'api'