In [1]:
import os
import re
import tweepy
from tweepy import OAuthHandler
from textblob import TextBlob
from myconfig import *

In [2]:


class TwitterClient(object):
    """Small wrapper around tweepy's tweet search used by this notebook.

    Credentials (``twitterApiKey``, ``twitterApiKeySecret``,
    ``twitterAccessToken``, ``twitterAccessTokenSecret``) are expected to be
    in scope; in this notebook they come from ``from myconfig import *``.
    """

    def __init__(self):
        """Build the authenticated tweepy client.

        On failure an error is printed and ``self.api`` stays ``None`` so the
        query methods can report the problem instead of raising
        ``AttributeError``.
        """
        # Defined before the try so the attribute always exists.
        self.api = None
        try:
            self.auth = OAuthHandler(twitterApiKey, twitterApiKeySecret)
            self.auth.set_access_token(twitterAccessToken, twitterAccessTokenSecret)
            self.api = tweepy.API(self.auth)
        except Exception as e:
            # `Exception` (not a bare except) lets KeyboardInterrupt/SystemExit through.
            print("Error: Authentication Failed (" + str(e) + ")")

    def clean_tweet(self, text):
        """Return ``text`` with URLs, @mentions and non-alphanumeric characters removed.

        Runs of whitespace are collapsed to single spaces.
        """
        text = re.sub(r"\w+://\S+", " ", text)        # links
        text = re.sub(r"@[A-Za-z0-9_]+", " ", text)   # user handles
        text = re.sub(r"[^0-9A-Za-z \t]", " ", text)  # punctuation, emoji, non-Latin text
        return " ".join(text.split())

    def get_tweet_sentiment(self, text):
        """Classify ``text`` as 'positive', 'neutral' or 'negative'.

        The label is the sign of TextBlob's polarity score for the cleaned text.
        """
        polarity = TextBlob(self.clean_tweet(text)).sentiment.polarity
        if polarity > 0:
            return 'positive'
        if polarity == 0:
            return 'neutral'
        return 'negative'

    def get_tweets(self, query, count = 10):
        """Search for ``query`` and return a list of parsed tweets.

        Args:
            query: search string passed to the API as ``q``.
            count: number of tweets requested from the API.

        Returns:
            A list of ``{'text': ..., 'sentiment': ...}`` dicts. Tweets with a
            non-zero retweet count are added only once. An empty list is
            returned when the client is not authenticated or the API call fails.
        """
        tweets = []
        if getattr(self, 'api', None) is None:
            print("Error : Twitter client is not authenticated")
            return tweets
        try:
            fetched_tweets = self.api.search_tweets(q = query, count = count)
        except tweepy.TweepyException as e:
            print("Error : " + str(e))
            return tweets

        for tweet in fetched_tweets:
            parsed_tweet = {
                'text': tweet.text,
                'sentiment': self.get_tweet_sentiment(tweet.text),
            }
            # Retweets can show up several times in one result set; keep one copy.
            if tweet.retweet_count > 0 and parsed_tweet in tweets:
                continue
            tweets.append(parsed_tweet)
        return tweets

    def fetch_tweets(self, query, count = 10):
        """Search for ``query`` and return the raw tweepy results.

        Args:
            query: search string passed to the API as ``q``.
            count: number of tweets requested from the API.

        Returns:
            Whatever ``api.search_tweets`` returns, or an empty list when the
            client is not authenticated or the API call fails, so callers can
            always iterate over the result.

        NOTE(review): tweepy documents ``count`` as a per-page size with a
        maximum of 100, so larger values probably still yield a single page;
        consider ``tweepy.Cursor`` for bigger pulls -- confirm against the docs.
        """
        if getattr(self, 'api', None) is None:
            print("Error : Twitter client is not authenticated")
            return []
        try:
            return self.api.search_tweets(q = query, count = count)
        except tweepy.TweepyException as e:
            print("Error : " + str(e))
            return []

In [3]:
# Single client instance, used by the cell below that fetches tweets.
tc = TwitterClient()
# Example single-tweet query, left disabled:
#tc.fetch_tweets('#FarmLaws',1)

In [4]:
## The tweets have been fetched and stored in a file. 
## Use the cached tweets for consistent results instead of fetching new from twitter.
## Twitter is only queried when the cache file is missing, so re-running the
## notebook never overwrites the cached tweets.
if not os.path.exists('farmer.txt'):
    # Fetch BEFORE opening the file: mode 'w' truncates immediately, so a failed
    # fetch after open() would leave an empty cache behind.
    fetched_tweets = tc.fetch_tweets('#FarmLaws',8000)
    # A failed fetch yields None/empty; in that case write nothing.
    if fetched_tweets:
        # Explicit UTF-8 so non-ASCII tweet text does not depend on the OS locale.
        with open('farmer.txt', 'w', encoding='utf-8') as f:
            for tweet in fetched_tweets:
                f.write(tweet.text + "_$_")

# 1. Data Preparation

## 1.1 Fetch from file

In [5]:

# Read the cached tweets; the context manager closes the file even if read() raises.
with open('farmer.txt', 'r', encoding='utf-8') as f:
    raw_text = f.read()
# Each tweet is written followed by "_$_", so splitting leaves an empty string
# after the last separator; drop empties so no blank tweet is processed.
tweets = [text for text in raw_text.split('_$_') if text]

In [6]:
tweets[0]

'RT @Kisanektamorcha: Press Note \nDate: 14th Dec 2021 \n#FarmLaws #FarmersWon #1YearOfFarmersProtest https://t.co/iFMx7mnAzD'

## 1.2 Preprocess each tweet

In [7]:
import re
import string

from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import TweetTokenizer
from nltk import download
# download('stopwords')

def process_tweet(tweet):
    '''
    Clean, tokenize and stem a single tweet.

    Input:
        tweet: a string containing a tweet
    Output:
        tweets_clean: a list of lower-cased, stemmed tokens with tickers,
            the leading "RT", hyperlinks, '#' signs, @handles, English
            stopwords and punctuation removed; tokens containing any
            character with code point >= 256 are dropped.

    '''
    stemmer = PorterStemmer()
    # A set gives O(1) membership tests instead of scanning the list per token.
    stopwords_english = set(stopwords.words('english'))
    # remove stock market tickers like $GE
    tweet = re.sub(r'\$\w*', '', tweet)
    # remove old style retweet text "RT"
    tweet = re.sub(r'^RT[\s]+', '', tweet)
    # remove hyperlinks
    # match only the URL itself (\S+); a greedy '.*' would also delete every
    # word that follows the link on the same line
    tweet = re.sub(r'https?://\S+', '', tweet)
    # remove hashtags
    # only removing the hash # sign from the word
    tweet = re.sub(r'#', '', tweet)
    # tokenize tweets
    tokenizer = TweetTokenizer(preserve_case=False, strip_handles=True,
                               reduce_len=True)
    tweet_tokens = tokenizer.tokenize(tweet)

    tweets_clean = []
    for word in tweet_tokens:
        # skip stopwords and punctuation
        if word in stopwords_english or word in string.punctuation:
            continue
        stem_word = stemmer.stem(word)  # stemming word
        # keep only tokens whose characters all have code points below 256
        if all(ord(ch) < 256 for ch in stem_word):
            tweets_clean.append(stem_word)
    return tweets_clean

In [8]:
# Clean every cached tweet, then print the first raw tweet next to its processed form.
processed_tweets = list(map(process_tweet, tweets))
for sample in (tweets[0], processed_tweets[0]):
    print(sample)

RT @Kisanektamorcha: Press Note 
Date: 14th Dec 2021 
#FarmLaws #FarmersWon #1YearOfFarmersProtest https://t.co/iFMx7mnAzD
['press', 'note', 'date', '14th', 'dec', '2021', 'farmlaw', 'farmerswon', '1yearoffarmersprotest']


# 2. Feature Extraction

## 2.1 Word counts  

In [9]:
def count_words(tweet:list, freqs:dict, wordToTweet:dict):
    """Fold one tokenized tweet into the running word statistics.

    Args:
        tweet: list of tokens for a single tweet.
        freqs: token -> occurrence count; updated in place.
        wordToTweet: token -> list of tweets containing it; updated in place.
            The tweet list itself is appended once per occurrence of the token.

    Returns:
        The same ``freqs`` dict that was passed in.
    """
    for token in tweet:
        if token not in freqs:
            # First sighting: start both entries.
            freqs[token] = 1
            wordToTweet[token] = [tweet]
            continue
        freqs[token] += 1
        wordToTweet[token].append(tweet)
    return freqs

In [10]:
# Accumulate statistics over all tweets; count_words updates both dicts in place.
freqs = {}
wordToTweet = {}
for tokens in processed_tweets:
    count_words(tokens, freqs, wordToTweet)

In [11]:
# Highest occurrence count of any single token.
max(freqs.values())

78

# 3. Analysis

In [12]:
#!pip3 install numpy pandas seaborn
import numpy as np
import pandas as pd
import seaborn as sb

In [13]:
# freqs.keys()
# list(freqs.items())

In [14]:
# Number of distinct tokens seen.
len(freqs)

119

In [15]:
# Index (in freqs' iteration order) of the largest count, then the matching (word, count) pair.
counts = list(freqs.values())
most_freq_word = np.argmax(counts)
list(freqs.items())[most_freq_word]

('farmlaw', 78)

In [16]:

# (word, count) pairs ordered from most to least frequent; show the top ten.
# sorted() is stable, so equal counts keep their original relative order.
freq_sorted = sorted(freqs.items(), key=lambda item: item[1], reverse=True)
freq_sorted[:10]

[('farmlaw', 78),
 ('press', 65),
 ('note', 64),
 ('date', 64),
 ('14th', 64),
 ('dec', 64),
 ('2021', 64),
 ('farmerswon', 64),
 ('1yearoffarmersprotest', 64),
 ('farmer', 9)]