# MOUNTING DRIVE

In [None]:
# Mount Google Drive at /content/drive (Colab only); the project folder used below lives there.
from google.colab import drive
drive.mount("/content/drive")

Mounted at /content/drive


In [None]:
#@title Set up Directory

# Switch the working directory to the project folder on Drive.
# Relative paths used in later cells are resolved against this directory.
%cd '/content/drive/MyDrive/Soft-computing-eXtended'

/content/drive/MyDrive/Soft-computing-eXtended


In [None]:
import pandas as pd
# CSV of tweet ids with their labels; read with pd.read_csv in the cells below.
covid_tweet_dataset = "tweetid_sentiments_emotions.csv"
# JSON file holding the Twitter credential sets (CONSUMER_KEY, CONSUMER_SECRET, OAUTH_TOKEN, OAUTH_TOKEN_SECRET).
api_keys = "config.json" #@param {type: "string"}
# NOTE(review): plib_dir is not referenced by any other cell visible in this notebook — confirm it is still needed.
plib_dir = "/content/drive/MyDrive/plib"

# Dataset

In [None]:
#@title Covid-19 subset preview
# Load only columns 0, 17, 19 and 21 of the dataset.
# The two commented-out arguments restrict the load to a window of rows when re-enabled.
covid_subset = pd.read_csv(covid_tweet_dataset, 
                           #skiprows=range(1,5932184), 
                           #nrows=6158058-5932184, 
                           usecols=[0,17,19,21])


In [None]:
# Preview ten random rows; the fixed seed keeps the preview identical across re-runs.
covid_subset.sample(10, random_state=42)

In [None]:
# Sanity report: column names, number of rows, and total count of missing cells.
report = (
  ("Columns:", covid_subset.columns),
  ("Length:", len(covid_subset)),
  ("Null entries:\n", covid_subset.isnull().sum().sum()),
)
for label, value in report:
  print(label, value)

Columns: Index(['tweet_ID', 'sentiment_category', 'keyword_used', 'date_stamp'], dtype='object')
Length: 6166151
Null entries:
 0


# Functions to Fetch Bulk Tweets

### Loading all API keys in a list

To connect to twitter via multiple accounts

In [None]:
# Install tweepy 4.0.0.
# NOTE(review): the following cell installs tweepy again, so this pin does not decide the version actually used.
!pip -q install tweepy==4.0.0

[?25l[K     |█████▍                          | 10 kB 25.9 MB/s eta 0:00:01[K     |██████████▉                     | 20 kB 30.4 MB/s eta 0:00:01[K     |████████████████▎               | 30 kB 36.8 MB/s eta 0:00:01[K     |█████████████████████▊          | 40 kB 15.0 MB/s eta 0:00:01[K     |███████████████████████████▏    | 51 kB 16.5 MB/s eta 0:00:01[K     |████████████████████████████████| 60 kB 4.8 MB/s 
[?25h

In [None]:
!pip -q install tweepy --upgrade

[K     |████████████████████████████████| 77 kB 4.4 MB/s 
[K     |████████████████████████████████| 63 kB 1.7 MB/s 
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
google-colab 1.0.0 requires requests~=2.23.0, but you have requests 2.27.1 which is incompatible.
datascience 0.10.6 requires folium==0.2.1, but you have folium 0.8.3 which is incompatible.[0m
[?25h

In [None]:
from __future__ import print_function
import getopt
import logging
import os
import sys
from time import sleep
import tweepy

import json
# Read every credential set from the JSON file named by api_keys.
with open(api_keys, 'r') as infile:
  keys = json.load(infile)

def _build_client(credentials):
  """Return a tweepy.API client (waiting on rate limits) for one credential set."""
  handler = tweepy.OAuthHandler(credentials['CONSUMER_KEY'], credentials['CONSUMER_SECRET'])
  handler.set_access_token(credentials['OAUTH_TOKEN'], credentials['OAUTH_TOKEN_SECRET'])
  return tweepy.API(handler, wait_on_rate_limit=True)

# One client per credential set; get_tweets() selects clients from this list by position.
api = [_build_client(credentials) for credentials in keys]

### status_full_text(status):
Function to extract the full (untruncated) text from a tweepy `Status` object; for a retweet, the text of the original tweet is returned.


In [None]:
def status_full_text(status):
  """Return the untruncated text of a tweet, or None when no text is available.

  For a retweet (a status that carries ``retweeted_status``) the text is read
  from the original tweet rather than from the retweet wrapper. The text is
  looked up in ``extended_tweet["full_text"]`` first and in the ``full_text``
  attribute second.

  Args:
    status: status-like object whose attributes are inspected.

  Returns:
    The text that was found, or None when neither location provides one.
  """
  # Retweets: read from the embedded original tweet.
  source = status.retweeted_status if hasattr(status, "retweeted_status") else status

  try:
    return source.extended_tweet["full_text"]
  except (AttributeError, KeyError, TypeError):
    # No extended payload, or one without a usable "full_text" entry:
    # fall back to the plain attribute instead of propagating the error.
    pass

  return getattr(source, "full_text", None)

### get_tweets(tweet_list_dataframe, output_file):

Function to bulk-fetch the tweets whose IDs are listed in the `tweet_list` DataFrame and APPEND their `full_text` to the `output_csv` file

In [None]:
def _tweepy_error_codes(error):
  """Collect the numeric codes carried by a tweepy exception into one set.

  Gathers the ``api_code`` attribute, the ``api_codes`` list and the HTTP
  status of ``response`` — whichever of them the exception provides.
  """
  codes = set()
  single_code = getattr(error, 'api_code', None)
  if single_code is not None:
    codes.add(single_code)
  codes.update(getattr(error, 'api_codes', None) or [])
  status_code = getattr(getattr(error, 'response', None), 'status_code', None)
  if status_code is not None:
    codes.add(status_code)
  return codes


def get_tweets(tweet_list,output_csv):
  """Download the text of the tweets in tweet_list and append it to output_csv.

  Rows are processed in batches of 50. For every batch the statuses are looked
  up through the current client from the module-level ``api`` list, their text
  is matched to the rows by tweet id, and the rows (with a ``full_text``
  column) are appended to output_csv without a header.

  Args:
    tweet_list: DataFrame with a ``tweet_ID`` column.
    output_csv: path of the CSV file that receives the rows (append mode).

  Returns:
    int: number of leading rows of tweet_list that are finished (written or
    deliberately skipped). It is the row offset at which a later call should
    resume; it equals len(tweet_list) when everything was processed.
  """
  batch_size = 50 #batch size to download
  switch_every = 2500 #rows processed between two credential switches
  max_rate_limit_waits = 5 #consecutive waits on one batch before giving up
  max_connection_retries = 2
  skip_codes = {34,63,144,179,401,403,404} #errors treated as "bad tweets"

  tweet_count = len(tweet_list) #total tweets to be downloaded
  if tweet_count == 0:
    print("Processed tweet ids:", 0, " Remaining tweets:", 0)
    return 0

  backoff_counter = 1
  rate_limit_waits = 0
  HTTP_retry = 0
  current_api = 0
  _api = api[current_api]
  done = 0 #rows finished so far == position of the next batch

  # getting full text from tweet ids
  while done < tweet_count:
    stop = min(done + batch_size, tweet_count)
    try:
      df = tweet_list.iloc[done:stop].copy()
      batch = [int(tweet_id) for tweet_id in df.tweet_ID]
      statuses = _api.lookup_statuses(batch, include_entities=False, trim_user=True, tweet_mode='extended',
                                      map=True, include_ext_alt_text=False, include_card_uri=False) #tweepy v4.7.*

      # Match texts to rows by tweet id instead of by position, so a shorter or
      # reordered response can never attach a text to the wrong row; ids
      # without a status get None.
      text_by_id = {}
      for status in statuses:
        text_by_id[getattr(status, 'id', None)] = status_full_text(status)
      df['full_text'] = [text_by_id.get(tweet_id) for tweet_id in batch]

      # append to file
      df.to_csv(output_csv,mode='a',header=False)
      done = stop
      HTTP_retry = 0
      rate_limit_waits = 0

      # switch api credentials on approaching limit
      if done % switch_every == 0 and len(api) > 1:
        current_api = (current_api + 1) % len(api)
        _api = api[current_api]
        print("Changed api to", current_api, "Done", 100*done/tweet_count,"%.")
    # error handling
    except KeyboardInterrupt:
      print("Keyboard Interrupt. Done", 100*done/tweet_count,"%.")
      break
    except tweepy.errors.TweepyException as e:
      codes = _tweepy_error_codes(e)
      if codes & skip_codes:
        # skipping bad tweets
        done = stop

      elif 429 in codes:
        if rate_limit_waits >= max_rate_limit_waits:
          print("Aborting: still rate limited after", rate_limit_waits, "waits.")
          break
        print("Done", 100*done/tweet_count,"%. Error", 429, "-", e, end=" ")
        print("Waiting for", 60*backoff_counter, "seconds.")
        sleep(60*backoff_counter)
        backoff_counter += 1
        rate_limit_waits += 1
        # `done` is left untouched: the same batch is requested again.

      elif 4104 in codes:
        # NOTE(review): code 4104 is kept from the original handler — confirm which failure reports it.
        if HTTP_retry < max_connection_retries:
          print("Connection reset by peer. Retrying ", HTTP_retry, "time.")
          HTTP_retry += 1
        else:
          print("Aborting due to connection error.")
          break

      else:
        # logging unknown error and skipping the batch
        print("Done", 100*done/tweet_count,"%.", end = " ")
        print("Error", sorted(codes), "-", e, "Skipping.")
        done = stop

  print("Done", 100*done/tweet_count,"%.")

  print("Processed tweet ids:", done, " Remaining tweets:", tweet_count-done)
  return done

### resetProgress(progress_file_name, tweet_count)
Function to reset the progress file to restart streaming

In [None]:
#@title Making a file to save progress for continuous dataset loading, preprocessing
def resetProgress(progress_file, size):
  """(Re)create the progress file with every counter reset to zero.

  Writes a JSON object with ``covid_last_index`` and ``covid_last_prep`` set
  to 0 and ``covid_size`` set to size, then makes sure the ``covid_subset``
  folder exists in the current working directory.

  Args:
    progress_file: path of the JSON progress file; overwritten if it exists.
    size: value stored under ``covid_size``.
  """
  import json
  import os

  info = {'covid_last_index':0,
          'covid_last_prep':0,
          'covid_size':size}

  with open(progress_file, 'w') as outfile:
    json.dump(info, outfile, indent=4, sort_keys=True)

  # Plain Python instead of the %mkdir magic, so the function does not depend
  # on IPython or on a shell being available.
  os.makedirs('covid_subset', exist_ok=True)

### getTweetsAsync(tweet_id_csv_file, progress_json_file, save_csv_file):

This function loads last session progress, and uses the get_tweets() function to download remaining tweets.

In case of any error or manual interruptions, it saves the progress before quitting.

In [None]:
def getTweetsAsync(csv_file, progress_file, save_file):
  """Resume downloading the tweets listed in csv_file and record the progress.

  Reads ``covid_last_index`` from progress_file, loads the rows of csv_file
  after that offset, hands them to get_tweets() and adds the number of rows it
  reports to ``covid_last_index`` in progress_file.

  Args:
    csv_file: CSV with the tweet ids to download (header row + data rows).
    progress_file: JSON file created by resetProgress().
    save_file: CSV file that get_tweets() appends the downloaded rows to.

  Returns:
    Whatever get_tweets() returned, or 0 when no rows were left to download.
  """
  import json
  import pandas as pd

  # load progress
  with open(progress_file, 'r') as infile:
    info = json.load(infile)

  print("Last tweet downloaded till:", info['covid_last_index'])

  # load remaining dataset (row 0 is the header, so data rows 1..last_index are skipped)
  subset = pd.read_csv(csv_file, skiprows=range(1,info['covid_last_index']+1))
  subset = subset.reset_index(drop=True)

  if subset.empty:
    print("No tweets left to download.")
    return 0

  if(info['covid_last_index'] == 0):
    subset['full_text'] = pd.Series([None], index=[0])

  count = get_tweets(subset, save_file)
  if not isinstance(count, int):
    # Only an integer is a usable row offset; anything else is passed through
    # without touching the progress file.
    return count

  # update progress
  info['covid_last_index'] += count
  with open(progress_file, 'w') as outfile:
    outfile.write(json.dumps(info, indent=4, sort_keys=True))

  print("Dataset done", 100*info['covid_last_index']/info['covid_size'],"%.")
  return count

# DOWNLOADING DECEMBER 2020 TWEETS

### Grouping by dates

First we group the main dataset by date; the per-day tweet counts are written to `tweet_count_by_dates.csv`.

Next we can download tweets between this range

In [None]:
import pandas as pd
import numpy as np

# Group the labelled tweets (columns 0, 17, 18 and 21) by their date_stamp value.
dated_rows = pd.read_csv(covid_tweet_dataset, usecols=[0,17,18,21])
date_groups = dated_rows.groupby('date_stamp')

# First row of the first group and last row of the last group give the covered range;
# [:-9] trims the trailing nine characters of the stamp (the time part).
date_keys = list(date_groups.groups.keys())
first_group = date_groups.get_group(date_keys[0])
last_group = date_groups.get_group(date_keys[len(date_groups)-1])
first = first_group.iloc[0].date_stamp[:-9]
last = last_group.iloc[-1].date_stamp[:-9]

print("Between", first, "and", last)
print('number of days:', len(date_groups))

Between 2020-01-28 and 2021-01-01
number of days: 340


In [None]:
# Column header for the per-day listing, followed by the date key stored at position 308.
print('tweets\t date\t\tcumulative_count\tfrom_tweet_ID')
print(list(date_groups.groups.keys())[308])

tweets	 date		cumulative_count	from_tweet_ID
2020-12-03 00:00:00


In [None]:
import os

# Folders written to below; to_csv does not create missing directories.
os.makedirs("dated_subset", exist_ok=True)
os.makedirs("dated_tweets", exist_ok=True)

count=0
group_data = []

# Position 308 is the date key printed by the previous cell (2020-12-03 in the recorded output).
# NOTE(review): every pass resets the progress file and get_tweets() appends to the
# dated_tweets CSV, so re-running this cell downloads each date again — confirm that is intended.
for each in list(date_groups.groups.keys())[308:]:
  group=date_groups.get_group(each) # materialise the group once per date
  count+=len(group)
  date=group.iloc[0].date_stamp[:-9]
  print(len(group),'\t',
        date,'\t',
        count,'\t\t\t',
        group.iloc[0].tweet_ID)
  # plain dict keeps the counts numeric instead of coercing them to strings
  group_data.append({'tweets': len(group), 'date': date, 'tweets_till_date': count})
  name="dated_subset/"+date
  group.to_csv(name+".csv")
  resetProgress(name+".json",len(group))
  getTweetsAsync(name+".csv",name+".json","dated_tweets/"+date+".csv")

group_df = pd.DataFrame(group_data)
group_df.to_csv('tweet_count_by_dates.csv',index=False)

In [None]:
 #@title Example: creating November and December 2020 subset

import pandas as pd

# Write two label-based row ranges of covid_subset to their own CSV files.
# NOTE(review): .loc slices include both end labels — confirm these bounds are the first/last rows of each month.
covid_subset.loc[5675514:5924812].to_csv("november.csv")
covid_subset.loc[5924813:6158088].to_csv("december.csv")

In [None]:
# Inputs of the December download: ids to fetch, progress file, and output CSV.
load_file = 'december.csv'
progress_file = 'download_progress.json'
save_file = 'covid_subset/dec_2020.csv'
# NOTE(review): 225874 equals 6158058-5932184 (the commented-out nrows of the preview cell),
# whereas the december.csv slice 5924813..6158088 spans 233276 rows — confirm which one is intended.
size = 225874

In [None]:
#@title RUN this cell ONLY if csv file corrupt and cannot recover :(

# Resets the stored counters to zero, so the next download starts again from the first row.
# The touch is only a precaution: resetProgress() opens the progress file in write mode itself.
!touch download_progress.json
resetProgress(progress_file, size)

In [None]:
# Download the tweets listed in load_file, continuing from the offset stored in progress_file.
getTweetsAsync(load_file, progress_file, save_file)

# Checking downloaded Tweets

In [None]:
# Load the first 20500 downloaded rows. The file has no header row, so the
# column names are supplied here; column '1' becomes the index and column '0' is dropped.
downloaded_columns = ['1','0','tweet_ID', 'sentiment_category', 'keyword_used', 'date_stamp', 'full_text']
df = pd.read_csv("covid_subset/dec_2020.csv",
                 delimiter=',',
                 header=None,
                 names=downloaded_columns,
                 index_col='1',
                 nrows=20500).drop(columns=['0'])

In [None]:
# Last rows of the loaded slice, to check how far the download got.
df.tail()

# Preprocessing

### removeLinks(string):
Removes hashtags, emails, urls and user mentions

In [None]:
def removeLinks(text):
  """Strip e-mail addresses, @mentions, #hashtags and URLs from text.

  The substitutions run in a fixed order: tokens containing '@', remaining
  @mentions, #hashtags, whitespace-prefixed tokens that contain a dot, and
  finally http(s) URLs.

  Args:
    text: the string to clean.

  Returns:
    str: text with every match of the patterns removed.
  """
  # Local import: this function is defined in a cell that runs before the
  # notebook's `import re` cell.
  import re

  # Raw strings keep the patterns unchanged while avoiding invalid
  # escape-sequence warnings for sequences such as "\S" and "\/".
  text = re.sub(r"\S*@\S*\s?", "", text) #remove emails (any token containing '@', plus one trailing whitespace)
  text = re.sub(r"@[A-Za-z0-9_]+", "", text) #remove mentioned users
  text = re.sub(r"#[A-Za-z0-9_]+", "", text) #remove hashtag keyword
  # whitespace followed by a token that contains a dot (bare domains, URLs after a space)
  text = re.sub(r'(?:\s)[^\s\.]*\.[^\s]+', '', text)
  # http(s) URLs that are still present (e.g. at the very start of the text)
  text = re.sub(r'(https?:\/\/)(\s)*(www\.)?(\s)*((\w|\s)+\.)*([\w\-\s]+\/)*([\w\-]+)((\?)?[\w\s]*=\s*[\w\%&]*)*', '', text)
  return text

### cleantweets(DataFrame):
Returns a list with one cleaned sentence per row: each tweet is tokenized, filtered, and its remaining tokens are joined back with spaces

In [None]:
import nltk
import re
import string

# Download the three NLTK resources below; this only needs to happen once per environment.
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('stopwords')

from nltk.corpus import stopwords
# English stop-word list; cleantweets() filters tokens against it.
stop_words=stopwords.words('english')

from nltk.tokenize import word_tokenize
# NOTE(review): WordNetLemmatizer is imported but not used by any code visible in this notebook — confirm before removing.
from nltk.stem import WordNetLemmatizer

# Function to clean tweets: cleantweets(pandas.dataframe)
def cleantweets(df):
  """Turn the full_text column of df into a list of cleaned sentences.

  Each text is lower-cased, passed through removeLinks(), stripped of emoji
  and special characters, tokenized with word_tokenize, and joined back with
  spaces after dropping tokens that contain characters outside the accepted
  ranges and tokens found in stop_words.

  Args:
    df: DataFrame with a "full_text" column. The frame itself is not modified.

  Returns:
    list: one cleaned sentence (str) per row of df, in row order.
  """
  # Loop-invariant helpers, built once per call instead of once per tweet.
  emoji = re.compile("["
                      u"\U0001F600-\U0001FFFF"  # emoticons
                      u"\U0001F300-\U0001F5FF"  # symbols & pictographs
                      u"\U0001F680-\U0001F6FF"  # transport & map symbols
                      u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
                      u"\U00002702-\U000027B0"
                      u"\U000024C2-\U0001F251"
                      "]+", flags=re.UNICODE)
  special_chars = re.compile(r"[,.\"!@#$%^&*(){}?/;`~:<>+=-]")
  outside_accepted_ranges = re.compile("[^\u0000-\u05C0\u2100-\u214F]+")
  table = str.maketrans('', '', string.punctuation)
  stop_set = set(stop_words)

  sentences = list()
  # The string conversion happens on a temporary Series, so the caller's
  # DataFrame keeps its original full_text column.
  for text in df["full_text"].astype(str).tolist():
    #1. Convert into lower case
    text = text.lower()

    #2. Removing hashtags, emails, urls and user mentions
    text = removeLinks(text)

    #3. Removing emojies
    text = emoji.sub(r'', text)

    #4. Remove special characters
    text = special_chars.sub("", text)

    #5. Tokenize
    tokens = word_tokenize(text)

    #6. Removing punctuation
    stripped = [w.translate(table) for w in tokens]

    #7. Removing non-english characters
    res = [w for w in stripped if not outside_accepted_ranges.search(w)]

    #8. Removing stop-words
    words = [w for w in res if w not in stop_set]
    sentences.append(' '.join(words))
  return sentences


### Cleaning all

In [None]:
import pandas as pd
from pathlib import Path

# The downloaded texts live in dated_tweets/ (the files get_tweets() appends to);
# the dated_subset/ files only hold the tweet ids and labels and have no full_text column.
load_dir = Path('dated_tweets')
save_dir = 'cleaned_tweets/'
Path(save_dir).mkdir(parents=True, exist_ok=True)

# get_tweets() writes its rows with header=False, so the column names are supplied here:
# written index, index column saved in the dated_subset CSV, the four dataset columns, full_text.
# NOTE(review): 'emotion_category' is assumed to be the name of dataset column 18 — confirm.
downloaded_columns = ['row', 'source_row', 'tweet_ID', 'sentiment_category',
                      'emotion_category', 'date_stamp', 'full_text']

for item in load_dir.glob('**/2020-12-*'):
  # only CSV files can be handed to read_csv
  if item.suffix == '.csv':
    #1. Load only text data (item is used directly: the glob is recursive)
    date=pd.read_csv(item,
                     header=None,
                     names=downloaded_columns,
                     usecols=['sentiment_category','emotion_category','full_text'],
                     lineterminator='\n')

    #2. Remove rows without a downloaded text, then duplicate rows
    date=date.dropna(subset=['full_text']).drop_duplicates(subset=None, keep='first')

    print(len(date),'tweets for',item.name,':')
    print('\n\n',date.head(5),'\n\n')

    #3. Clean the dataset using cleantweets() function
    sentences=cleantweets(date)

    #4. Save cleaned data(sentences) to .txt format for summarization
    out_path = save_dir+item.stem+'.txt'
    with open(out_path, 'w', encoding="utf-8") as fp:
      for line in sentences:
        fp.write('%s.\n' %line)
    print("\nSaved", out_path, "successfully")
    

ValueError: ignored

# Summarization Functions

In [None]:
!pip -q install sumy

In [None]:
# Upgrade sumy to the newest release.
# NOTE(review): the previous cell already installs sumy without a version pin — confirm this second install is needed.
!pip -q install sumy --upgrade

In [None]:
#file save function
def save_summary(data,filepath):
  """Write the items of data to filepath, one per line, each followed by a period.

  The file is created or overwritten (UTF-8), and a confirmation line is
  printed once it has been written.
  """
  with open(filepath, 'w', encoding="utf-8") as handle:
    # generator: items are formatted and written one after another
    handle.writelines('%s.\n' % entry for entry in data)
  print("Saved", filepath, "successfully")


In [None]:
import pandas as pd
import time
from pathlib import Path

from sumy.parsers.plaintext import PlaintextParser
from sumy.nlp.tokenizers import Tokenizer
import nltk
# Fetch the 'punkt' tokenizer data; presumably required by the Tokenizer("english") used in the summarization cell — TODO confirm.
nltk.download('punkt')

from sumy.summarizers.lex_rank import LexRankSummarizer 
from sumy.summarizers.text_rank import TextRankSummarizer
from sumy.summarizers.sum_basic import SumBasicSummarizer
from sumy.summarizers.kl import KLSummarizer
from sumy.summarizers.luhn import LuhnSummarizer
from sumy.summarizers.lsa import LsaSummarizer

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


In [None]:
# Short names (used in the output file names) and the matching summarizer
# instances; both lists must be edited together.
names = ["luhn",
         #"lsa",
         #"lexr",
         #"txtr",
         #"sb",
         #"kl"
         ]
summarizers = [LuhnSummarizer(),
               #LsaSummarizer(),
               #LexRankSummarizer(),
               #TextRankSummarizer(),
               #SumBasicSummarizer(),
               #KLSummarizer()
               ]
assert len(names) == len(summarizers), "names and summarizers must have the same length"

load_directory = Path('cleaned_tweets')
load_dir = 'cleaned_tweets/'
save_dir = 'summaries/'
# save_summary() opens its target for writing and needs the folder to exist
Path(save_dir).mkdir(parents=True, exist_ok=True)

sentences=10   # number of sentences requested from every summarizer
summ_time=[]   # seconds spent in each summarizer call, in call order
summaries=[]   # one list of summaries per processed file

for item in load_directory.glob('**/2020-12-*'):
  if item.suffix in ['.txt'] and item.name[0] != '.':
    #1. Load single set (the path found by the recursive glob is used as is)
    print("Loaded",str(item))
    parser = PlaintextParser.from_file(str(item), Tokenizer("english"))

    #2. Generate Extractive summaries
    summary=[]
    for name, summarizer in zip(names, summarizers):
      # NOTE(review): this prompt makes the cell interactive; an empty summ_time
      # means no file was confirmed with 'y'.
      print("summarize",item.name,"? y/n ")
      if input().strip().lower() == 'y':
        beg_time = time.time()
        each_algo = summarizer(parser.document, sentences)
        summ_time.append(time.time()-beg_time)

        #3. Save all summaries as .json or .txt
        save_summary(each_algo,save_dir+item.stem+'_'+name+'.txt')
        summary.append(each_algo)

    summaries.append(summary)

print("Time Taken For Summarization:")
print(summ_time)

Time Taken For Summarization:
[]


# Summarization report of neutral tweets

# Summarization report of all tweets