# Mounting Drive

In [None]:
# Make Google Drive available inside the Colab runtime; the project folder
# used by the following cells lives under this mount point.
from google.colab import drive

DRIVE_MOUNT_POINT = "/content/drive"
drive.mount(DRIVE_MOUNT_POINT)

Mounted at /content/drive


In [None]:
#@title Set up Directory

# project directory
# Changes the working directory, so every relative path used by later cells
# is resolved against this Drive folder.
%cd '/content/drive/MyDrive/Soft-computing-tweet-summarization-analysis'

# event datasets
# CSV of tweet ids with sentiment/emotion labels; later cells read it with pd.read_csv.
covid_tweet_dataset = "tweetid_sentiments_emotions.csv" #@param {type: "string"}
# JSON file holding the Twitter credentials; parsed by the key-loading cell.
api_keys = "config.json" #@param {type: "string"}


/content/drive/MyDrive/Soft-computing-tweet-summarization-analysis


# Preview Covid19 Tweet Dataset
Repository Link:
Covid-19 Tweet Sentiment Analysis: https://www.openicpsr.org/openicpsr/project/120321/

Dataset Summary:
"This project aims to present a large dataset for researchers to discover public conversation on Twitter surrounding the COVID-19 pandemic. From 28 January 2020 to 1 September 2021, we collected over 198 million Twitter posts from more than 25 million unique users using four keywords: “corona”, “wuhan”, “nCov” and “covid”. Leveraging topic modeling techniques and pre-trained machine learning-based emotion analytic algorithms, we labeled each tweet with seventeen semantic attributes, including a) ten binary attributes indicating the tweet’s relevance or irrelevance to the top ten detected topics, b) five quantitative emotion attributes indicating the degree of intensity of the valence or sentiment (from 0: very negative to 1: very positive), and the degree of intensity of fear, anger, happiness and sadness emotions (from 0: not at all to 1: extremely intense), and c) two qualitative attributes indicating the sentiment category (very negative, negative, neutral or mixed, positive, very positive) and the dominant emotion category (fear, anger, happiness, sadness, no specific emotion) the tweet is mainly expressing."

In [None]:
import pandas as pd

# Show every column of the wide tweet table, but cap the number of rendered
# rows: with max_rows=None, displaying a multi-million-row frame by accident
# would try to render every row and can hang the notebook.
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', 200)

In [None]:
#@title Covid-19 dataset preview
# Load the whole labelled tweet table, report its schema, size and per-column
# missing-value counts, then display the first rows as the cell output.
covid_dataset = pd.read_csv(covid_tweet_dataset)
for label, value in (
    ("Columns:", covid_dataset.columns),
    ("Length:", len(covid_dataset)),
    ("Null entries:", covid_dataset.isnull().sum()),
):
  print(label, value)
covid_dataset.head()

Columns: Index(['tweet_ID', 'user_ID', 't1', 't2', 't3', 't4', 't5', 't6', 't7', 't8',
       't9', 't10', 'valence_intensity', 'anger_intensity', 'fear_intensity',
       'sadness_intensity', 'joy_intensity', 'sentiment_category',
       'emotion_category', 'keyword_used', 'country_region', 'date_stamp'],
      dtype='object')
Length: 6166151
Null entries: tweet_ID              0
user_ID               0
t1                    0
t2                    0
t3                    0
t4                    0
t5                    0
t6                    0
t7                    0
t8                    0
t9                    0
t10                   0
valence_intensity     0
anger_intensity       0
fear_intensity        0
sadness_intensity     0
joy_intensity         0
sentiment_category    0
emotion_category      0
keyword_used          0
country_region        0
date_stamp            0
dtype: int64


Unnamed: 0,tweet_ID,user_ID,t1,t2,t3,t4,t5,t6,t7,t8,t9,t10,valence_intensity,anger_intensity,fear_intensity,sadness_intensity,joy_intensity,sentiment_category,emotion_category,keyword_used,country_region,date_stamp
0,1224743225916825600,600031424,1,0,0,0,0,0,0,0,0,0,0.466,0.462,0.575,0.48,0.293,negative,fear,wuhan,India,2020-02-05 00:00:00
1,1224742950401273858,964089407107080193,1,0,1,0,0,0,0,0,0,0,0.402,0.491,0.468,0.454,0.233,negative,anger,wuhan,India,2020-02-05 00:00:00
2,1224742938585944064,83182871,1,0,1,0,0,0,0,0,0,0,0.662,0.329,0.39,0.316,0.499,positive,joy,wuhan,India,2020-02-05 00:00:00
3,1224742733673185280,1212648845982650368,1,0,0,0,0,0,0,0,0,0,0.522,0.387,0.486,0.425,0.354,positive,joy,wuhan,India,2020-02-05 00:00:00
4,1224742511626702848,964089407107080193,1,1,0,0,0,0,0,0,0,0,0.436,0.435,0.465,0.428,0.252,negative,fear,wuhan,India,2020-02-05 00:00:00


In [None]:
#@title Covid-19 subset preview
# Read a 10000-row window of the dataset: file lines 1..539999 are skipped
# (line 0, the header, is kept) and only four of the columns are loaded.
subset_read_options = dict(
    skiprows=range(1,540000),
    nrows=10000,
    usecols=[0,17,19,21],
)
covid_subset = pd.read_csv(covid_tweet_dataset, **subset_read_options)
for label, value in (
    ("Columns:", covid_subset.columns),
    ("Length:", len(covid_subset)),
    ("Null entries:", covid_subset.isnull().sum()),
):
  print(label, value)
covid_subset.head()

Columns: Index(['tweet_ID', 'sentiment_category', 'keyword_used', 'date_stamp'], dtype='object')
Length: 10000
Null entries: tweet_ID              0
sentiment_category    0
keyword_used          0
date_stamp            0
dtype: int64


Unnamed: 0,tweet_ID,sentiment_category,keyword_used,date_stamp
0,1241702585477693441,positive,covid,2020-03-22 00:00:00
1,1241702581795094528,neutral,covid,2020-03-22 00:00:00
2,1241702581300158471,negative,covid,2020-03-22 00:00:00
3,1241702577684623361,positive,covid,2020-03-22 00:00:00
4,1241702576975835136,positive,covid,2020-03-22 00:00:00


# Twitter API v1.1 Calls (via Tweepy) to Fetch Bulk Tweets

In [None]:
#
!pip install tweepy

In [None]:
#@title Run this cell to load your api keys
import json

# Parse the credentials file named by `api_keys`. The connection cell iterates
# over `keys` and reads CONSUMER_KEY / CONSUMER_SECRET / OAUTH_TOKEN /
# OAUTH_TOKEN_SECRET from each entry.
with open(api_keys, 'r') as infile:
  keys = json.loads(infile.read())

In [None]:
#@title Connecting to twitter using multiple accounts
from __future__ import print_function
import getopt
import logging
import os
import sys
from time import sleep
import tweepy


def _client_for(credentials):
  """Build the OAuth handler and API client for one credential entry of `keys`."""
  handler = tweepy.OAuthHandler(credentials['CONSUMER_KEY'], credentials['CONSUMER_SECRET'])
  handler.set_access_token(credentials['OAUTH_TOKEN'], credentials['OAUTH_TOKEN_SECRET'])
  client = tweepy.API(handler,wait_on_rate_limit=True,wait_on_rate_limit_notify=True)
  return handler, client


# One client per account. `auth` is left bound to the handler of the last
# account because the streaming cell further down passes it to tweepy.Stream.
api = []
for each in keys:
  auth, client = _client_for(each)
  api.append(client)

In [None]:
#@title Function to get tweets and store in csv file: get_tweets(tweet_dataframe,output_csv_file)
def get_tweets(tweet_list,output_csv,rotate_every=250):
  """Fetch the full text of every tweet in `tweet_list` and append it to a CSV.

  Rows are processed in order. Each fetched row is written to `output_csv`
  immediately (append mode, no header, frame index as first column), so an
  interrupted run keeps everything downloaded so far.

  Args:
    tweet_list: DataFrame with a `tweet_ID` column. A `full_text` column is
      added if missing and filled in place for every fetched tweet.
    output_csv: path of the CSV file to append to.
    rotate_every: the client taken from the module-level `api` list is
      `api[((count+1)//rotate_every) % len(api)]`. With the default and two
      accounts this is the same alternation the function always used; with a
      single account it simply keeps using that account.

  Returns:
    The number of tweet ids consumed (fetched or deliberately skipped),
    counted from the start of `tweet_list`. Callers add it to their saved
    offset to resume later.

  Error handling (Tweepy 3.x `TweepError.api_code`):
    34/63/144/179/401/403/404 -> the tweet is unavailable; it is skipped.
    429 -> wait 60 s times the back-off counter, then retry the SAME tweet.
    4104 -> retry the same tweet up to two times, then abort and return.
    anything else -> logged and skipped.
  """
  total = len(tweet_list)
  tweet_ids = list(tweet_list.tweet_ID)

  # Rows are addressed by position everywhere, so a frame whose index is not
  # 0..n-1 cannot send a text to the wrong row.
  if 'full_text' not in tweet_list.columns:
    tweet_list['full_text'] = None
  text_col = tweet_list.columns.get_loc('full_text')

  def percent_done(done):
    # An empty frame is trivially complete; avoids dividing by zero.
    return 100*done/total if total else 100.0

  def error_message(err):
    # API errors carry [{'code':..., 'message':...}] in args[0]; other
    # failures (e.g. connection errors) only have a plain description.
    try:
      return err.args[0][0]['message']
    except (IndexError, KeyError, TypeError):
      return str(err)

  backoff_counter = 1
  HTTP_retry = 0
  count = 0
  active = 0
  _api = api[active]

  try:
    while count < total:
      # Account rotation is derived from the position, so it also happens
      # when the boundary is crossed by a skipped tweet.
      wanted = ((count+1)//rotate_every) % len(api)
      if wanted != active:
        active = wanted
        _api = api[active]
        print("Changed api")

      try:
        status = _api.get_status(id=tweet_ids[count], include_entities=False, trim_user=True, tweet_mode='extended')
      except tweepy.error.TweepError as e:
        if e.api_code in [34,63,144,179,401,403,404]:
          # skipping tweet
          count+=1
        elif e.api_code == 429:
          print("Done", percent_done(count),"%. Error", e.api_code, "-", error_message(e), end=" ")
          print("Waiting for", 60*backoff_counter, "seconds.")
          sleep(60*backoff_counter)
          backoff_counter+=1
          # count is unchanged: the same tweet is requested again
        elif e.api_code == 4104:
          if(HTTP_retry < 2):
            print("Connection reset by peer. Retrying ", HTTP_retry, "time.")
            HTTP_retry+=1
          else:
            print("Aborting due to connection error.")
            return count
        else:
          # logging unknown error
          print("Done", percent_done(count),"%. Error", e.api_code, "-", error_message(e), "Skipping.")
          count+=1
        continue

      tweet_list.iloc[count, text_col] = status.full_text
      tweet_list.iloc[[count]].to_csv(output_csv,mode='a',header=False)
      count+=1

      #error handle resets
      HTTP_retry=0
      backoff_counter=1
  except KeyboardInterrupt:
    # Also covers an interrupt during the back-off sleep, so the caller
    # still receives the progress made so far.
    print("Keyboard Interrupt. Done", percent_done(count),"%.")
    return count

  print("Done", percent_done(count),"%.")
  print("Processed tweet ids:", count, " Remaining tweets:", total-count)
  return count

# To get all tweets

In [None]:
#@title Making a file to save progress for continuous dataset loading, preprocessing
import json
import os


def load_or_create_progress(path, defaults):
  """Return the progress stored at `path`, creating the file when needed.

  An existing, non-empty file is kept: its values win over `defaults`, and
  only keys it is missing are filled in. This makes the cell safe to re-run
  (for example with "Run all") without resetting the download offsets to 0.
  A missing or empty file is initialised from `defaults`.
  """
  saved = {}
  if os.path.exists(path) and os.path.getsize(path) > 0:
    with open(path, 'r') as infile:
      saved = json.load(infile)

  progress = dict(defaults)
  progress.update(saved)

  # Rewrite only when something was added, leaving intact files untouched.
  if progress != saved:
    with open(path, 'w') as outfile:
      outfile.write(json.dumps(progress, indent=4, sort_keys=True))
  return progress


info = load_or_create_progress('download_progress.json',
                               {'covid_last_index':0,
                                'covid_last_prep':0,
                                'covid_size':6166151,
                                'nquake_last_index':0,
                                'nquake_last_prep':0,
                                'nquake_size':100000})

In [None]:
#title This loop takes 2-3 days to download all tweets completely. It calls the above function to resume from last downloaded tweet.
%mkdir -p covid_subset

import pandas as pd

# load progress
with open('download_progress.json', 'r') as infile:
  info = json.load(infile)

print("Last tweet downloaded till:", info['covid_last_index'])

# load  remaining dataset
subset = pd.read_csv(covid_tweet_dataset, skiprows=range(1,info['covid_last_index']-1))
subset = subset.reset_index(drop=True)
subset = subset.drop('country_region', axis=1)
subset = subset.drop('user_ID',axis=1)
subset = subset.iloc[:,[0,18,19]].copy()

# update progress
count = get_tweets(subset,'covid_subset/covid_subset.csv')
info['covid_last_index'] += count
with open('download_progress.json', 'w') as outfile:
  outfile.write(json.dumps(info, indent=4, sort_keys=True))

print("Dataset done", 100*info['covid_last_index']/info['covid_size'],"%.")


# To get all tweets between two dates

In [None]:
import pandas as pd
import numpy as np

#@title Grouping by dates, to see output open tweet_count_by_dates.csv
# Only the tweet id and date columns are needed for the per-day counts.
date_groups = pd.read_csv(covid_tweet_dataset, usecols=[0,21]).groupby('date_stamp')
day_keys = list(date_groups.groups.keys())
# Group sizes are computed once instead of materialising every group
# several times inside the loop.
tweets_per_day = date_groups.size()

# The group key is the date_stamp string itself; [:-9] strips " 00:00:00".
first = day_keys[0][:-9]
last = day_keys[-1][:-9]
print(first, "and", last)
print('number of days:', len(date_groups))
print('tweets\t date\t\tcumulative count')

count=0
group_data = []
for each in day_keys:
  day_total = int(tweets_per_day[each])
  count+=day_total
  print(day_total,'\t',each[:-9],'\t',count)
  # Plain dict keeps the two counts numeric; routing them through a mixed
  # np.array turned every value into a string.
  group_data.append({'tweets': day_total, 'date': each[:-9], 'tweets_till_date': count})

group_df = pd.DataFrame(group_data)
group_df.to_csv('tweet_count_by_dates.csv',index=False)

In [None]:
#@title Get index range by month { run: "auto", vertical-output: true, display-mode: "both" }
#@markdown between 2020-01-28 and 2021-01-01
month = 12 #@param {type:"slider", min:1, max:13, step:1}
# Days covered by each slider position: 28-31 Jan 2020, Feb..Dec 2020, 1 Jan 2021.
# NOTE(review): positions into the per-day groups are used as day numbers, which
# assumes every calendar day in the span has at least one tweet - confirm.
DAYS_PER_MONTH = [4,29,31,30,31,30,31,31,30,31,30,31,1]
skip_days = sum(DAYS_PER_MONTH[:month-1])
days = DAYS_PER_MONTH[month-1] + skip_days
day_keys = list(date_groups.groups.keys())
first = date_groups.get_group(day_keys[skip_days]).iloc[0]
# The month occupies group positions skip_days..days-1. Position `days` is the
# first day of the NEXT month (and is past the end for the last slider value).
last = date_groups.get_group(day_keys[days-1]).iloc[-1]
print("till day",days,"\nfrom\n",first,'\nto\n',last)
# Arguments for pd.read_csv(skiprows=skiprow, nrows=nrow) selecting rows
# first.name..last.name inclusive: data row i sits on file line i+1 (line 0 is
# the header), so lines 1..first.name are skipped.
skiprow=range(1,first.name+1)
nrow=last.name-first.name+1

In [None]:
#@title Example: fetching December 2020 dataset
import pandas as pd

# Number of rows of this range already downloaded in earlier runs (resume offset).
count = 24700 #@param {type:"integer"}
# Row bounds of the range being downloaded (values taken from the original cell).
range_start = 5970652
range_end = 6020498
# Resuming moves the start forward by `count`, so the rows left to read SHRINK
# by `count`. Adding it instead made the read run 2*count rows past range_end.
remaining = max(range_end - range_start - count, 0)
subset = pd.read_csv(covid_tweet_dataset, skiprows=range(1,range_start+count), nrows=remaining, usecols=[0,19,21])
filename = "covid_subset/7_dec_13_dec_2020.csv" #@param {type:"string"}

# update progress
if len(subset):
  count += get_tweets(subset,filename)
print("Dataset done",count,"tweets.")

In [None]:
#@title To check how many days of tweets were downloaded, match this output with tweet_count_by_dates.csv file
import pandas as pd
filename = "covid_subset/7_dec_13_dec_2020.csv" #@param {type:"string"}
# get_tweets appends rows without a header, so the file must be read with
# header=None; otherwise the first tweet is consumed as column names and is
# missing from the counts. lineterminator='\n' matches how the preprocessing
# cell reads these files.
df=pd.read_csv(filename, header=None, lineterminator='\n')
# a: saved index, b: tweet id, c: keyword, date: date stamp, e: full text
df.columns=['a','b','c','date','e']
# Keep `df` bound to the DataFrame: the next cell calls df.groupby again,
# which fails if `df` has been replaced by a GroupBy object.
by_date=df.groupby('date')
count = 0 
print('date_stamp\tcount\tcumulative_count')
for each in by_date.groups.keys():
  day_rows = len(by_date.get_group(each))
  count+=day_rows
  print(each[:-9],'\t'+str(day_rows),'\t',str(count))

In [None]:
# Reload the downloaded file so this cell does not depend on what the previous
# cell left in `df` (there it is rebound to a GroupBy object, on which
# df.groupby('e') raises AttributeError). The file has no header row.
df = pd.read_csv(filename, header=None, lineterminator='\n')
df.columns=['a','b','c','date','e']
# `a` groups the rows by tweet text and `b` lists the distinct texts; both are
# used by the next cell.
a=df.groupby('e')
b=list(a.groups.keys())
df=df.drop_duplicates(subset=['e'])

In [None]:
# For every distinct tweet text in `b`, print the text without its last nine
# characters, how many rows share it, and the running total.
count = 0
for each in b:
  group_size = len(a.get_group(each))
  count = count + group_size
  print(each[:-9], '\t'+str(group_size), '\t', str(count))

# To get live tweet data streams (Future Scope)

In [None]:
# MyStreamListener was instantiated here without being defined anywhere in the
# notebook. This is the minimal listener from the Tweepy 3.x streaming guide.
class MyStreamListener(tweepy.StreamListener):
  """Print the text of every status delivered by the stream."""

  def on_status(self, status):
    print(status.text)

  def on_error(self, status_code):
    # Per the Tweepy 3.x guide, 420 signals rate limiting and returning False
    # disconnects the stream instead of reconnecting.
    if status_code == 420:
      return False

#create streaming object and authenticate
# `auth` is the OAuth handler left over from the account-connection cell.
l = MyStreamListener()
stream =tweepy.Stream(auth,l)
#this line filters twitter streams to capture data by keywords
stream.filter(track=['covid','corona','covid19','coronavirus','facemask','sanitizer','social-distancing'])

In [None]:
%pip install twarc
%pip install jsonlines

In [None]:
#@title Insert API Keys here
import os
from getpass import getpass
from twarc import Twarc

# SECURITY: this cell used to contain real credentials in plain text. Anything
# that was ever saved or shared with the notebook must be treated as leaked:
# revoke and regenerate those keys in the Twitter developer portal.
# Leave the form fields empty in saved copies of the notebook.
consumer_key = "" #@param {type:"string"}
consumer_secret = "" #@param {type:"string"}
access_token = "" #@param {type:"string"}
access_token_secret = "" #@param {type:"string"}


def _credential(form_value, env_name):
  """Return the form value if given, else the environment variable, else prompt."""
  return form_value or os.environ.get(env_name) or getpass(env_name + ": ")


consumer_key = _credential(consumer_key, "TWITTER_CONSUMER_KEY")
consumer_secret = _credential(consumer_secret, "TWITTER_CONSUMER_SECRET")
access_token = _credential(access_token, "TWITTER_ACCESS_TOKEN")
access_token_secret = _credential(access_token_secret, "TWITTER_ACCESS_TOKEN_SECRET")

t = Twarc(consumer_key, consumer_secret, access_token, access_token_secret)

In [None]:
#@title Set up Directory
# File names exposed as Colab form fields. No cell in the visible notebook reads
# them yet; presumably the tweet-id input and the output of the twarc workflow
# - TODO confirm.
final_tweet_ids_filename = "tweetsidd.txt" #@param {type: "string"}
output_filename = "output.csv" #@param {type: "string"}

# Preprocess and split any datasets into train and test for ML-based methods (Future Scope)

In [None]:
import pandas as pd

# The downloaded file has no header row, so column names are attached after
# loading; the file itself is never rewritten.
TWEET_COLUMNS = ['index','tweet_ID','keyword_used','date_stamp','full_text']
tweets = pd.read_csv('covid_subset/covid_subset.csv', lineterminator='\n', header=None)
tweets.columns = TWEET_COLUMNS

# Steps left disabled in this unfinished section:
#   tweets=tweets.drop(labels=['index','tweet_ID'], axis=1)                       # remove tweet IDs, and old index
#   tweets=tweets.drop_duplicates(subset=['tweet_ID','full_text'], keep='first')  # remove duplicate entries

# reset index
tweets = tweets.reset_index()

# Preview the first few tweet texts as the cell output.
tweets.full_text.head().tolist()

# incomplete