In [1]:
#import basic libraries
import sklearn
import numpy as np
import pandas as pd
import re

#Importing stopwords from nltk library
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords

from textblob import TextBlob
import collections

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/jeanette/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


# **Part 1: Data Cleaning**

### Stopwords Removal

In [2]:
STOPWORDS = set(stopwords.words('english'))
def stopwords_f(text):
  return " ".join([word for word in str(text).split() if word not in STOPWORDS])

### Emoji and Emoticon Removal

In [3]:
def remove_emoji(string):
    emoji_pattern = re.compile("["u"\U0001F600-\U0001F64F"  # emoticons
                                  u"\U0001F300-\U0001F5FF"  # symbols & pictographs
                                  u"\U0001F680-\U0001F6FF"  # transport & map symbols
                                  u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
                                  u"\U00002702-\U000027B0"
                                  u"\U000024C2-\U0001F251""]+", flags=re.UNICODE)
    return emoji_pattern.sub(r'', string)

In [4]:
# function for removing emoticons
!pip install emot
import emot
from emot.emo_unicode import UNICODE_EMO, EMOTICONS
def remove_emoticons(text):
         emoticon_pattern = re.compile(u'(' + u'|'.join(k for k in EMOTICONS) + u')')
         return emoticon_pattern.sub(r'', text)



### Loop for all the CSV

In [5]:
filenames = ['noplastic_updated.csv', 'plasticpollutes_updated.csv', 'plasticpollution1.csv', 'plasticpollution2_updated.csv', 'sustainability_updated.csv', 'zerowaste_updated.csv' ]
for f in filenames:
      df = pd.read_csv(f'./hashtags/{f}')
      #df = df [['created_at', 'full_text', 'user']]
      df = df[df['lang'] == 'en']

      df["text_lower"] = df['full_text'].str.lower()
      df['text_punct'] = df ['text_lower'].str.replace('[^\w\s]','')
     
      df["text_stop"] = df["text_punct"].apply(stopwords_f)
      df['text_stop'] = df['text_stop'].apply(remove_emoji)
      
      df['text_stop'] = df['text_stop'].apply(remove_emoticons)

      df.to_csv('./hashtags/cleaned_{}'.format(f))

# **Part 2: Sentiment Analysis**

### Function needed for file analysis

In [6]:
def clean_tweet(tweet): 
        ''' 
        Utility function to clean tweet text by removing links, special characters 
        using simple regex statements. 
        '''
        return ' '.join(re.sub("(@[A-Za-z0-9]+)|([^0-9A-Za-z \t])|(\w+:\/\/\S+)"," ",tweet).split())

def get_tweet_sentiment(tweet): 
    ''' 
    Utility function to classify sentiment of passed tweet 
    using textblob's sentiment method 
    '''
    # create TextBlob object of passed tweet text 
    analysis = TextBlob(clean_tweet(tweet)) 
    # set sentiment 
    if analysis.sentiment.polarity > 0: 
        return 'positive'
    elif analysis.sentiment.polarity == 0: 
        return 'neutral'
    else: 
        return 'negative'

def get_tweet_objective(tweet): 
    ''' 
    Utility function to classify sentiment of passed tweet 
    using textblob's subjectivity method 
    '''
    # create TextBlob object of passed tweet text 
    analysis = TextBlob(clean_tweet(tweet)) 
    # set sentiment 
    if analysis.sentiment.subjectivity > 0.5: 
        return 'subjective'
    else: 
        return 'objective'

def get_ratio(tweet, file):
    ''' 
    Utility function to calculate the percentage of the tag
    provided
    '''
    return {
        'file': file.split('.')[0],
        'positive %': tweet[tweet == 'positive'].count()/tweet.count() * 100,
        'negative %': tweet[tweet == 'negative'].count()/tweet.count() * 100,
        'ratio [positive:negative] %': tweet[tweet == 'positive'].count()/tweet[tweet == 'negative'].count() * 100}

In [7]:
def clean_hashtag(tweet):
    lists = ["#plasticpollutes",
            "#plasticpollution",
            "#noplastic",
            "#sustainability",
            "#zerowaste",
            '#plastic',
            '#banplastic',
            '#ZeroWaste',
            '#FeedtheHungry',
            '# ecofriendly']
    for lis in lists:
        tweet = tweet.replace(f'{lis}', '')
    return tweet

In [8]:
import nltk
nltk.download('punkt')

def get_bigrams(text):
    nltk_tokens = nltk.word_tokenize(text)
    return (list(nltk.bigrams(nltk_tokens)))

def get_trigrams(text):
    nltk_tokens = nltk.word_tokenize(text)
    return (list(nltk.trigrams(nltk_tokens)))

def flatten_list(series):
    slist =[]
    for x in series:
        slist.extend(x)
    return slist

[nltk_data] Downloading package punkt to /Users/jeanette/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


### Loop through all the files, calculate the positive, negative percentage and ratio based on files
* Ngrams is the dataframe of all the file ngram results
* results is the dataframe of all the file sentiment analysis

In [9]:
files = ['cleaned_noplastic_updated.csv', 'cleaned_plasticpollutes_updated.csv', 'cleaned_plasticpollution1.csv', 
         'cleaned_plasticpollution2_updated.csv', 'cleaned_sustainability_updated.csv', 'cleaned_zerowaste_updated.csv' ]
results = pd.DataFrame()
ngrams = pd.DataFrame([_ for _ in range(50)])
for file in files:
    # read file
    print(f"------{file}------")
    df = pd.read_csv(f'./hashtags/{file}')
    
    # get full_text and clean up
    dff = df['full_text'].to_frame()
    dff['clean'] = dff['full_text'].apply(lambda x: (clean_tweet(x)))
    
    # do the analysis on sentiment
    dff['sentiment'] = dff['clean'].apply(lambda x: get_tweet_sentiment((x)))
    dff['subjectivity'] = dff['clean'].apply(lambda x: get_tweet_objective((x)))
    
    # calculate the ratio and keep the dataframe together
    series = dff['sentiment']
    result = get_ratio(series, file)
    results = results.append(result, ignore_index=True)
    
    # add n_grams and get the most_common words
    print("---- bigrams ----")
    dff['bi_grams'] = dff['clean'].apply(lambda x: get_bigrams(x))
    lis = flatten_list(dff['bi_grams'])
    c = collections.Counter(lis)
    a = pd.DataFrame(c.most_common()[:50], columns=[f'{file}_bigrams', f'{file}_bi_occurancy'])
    df = pd.concat([df, a], axis=1)
    
    print(c.most_common(10))
    print("---- trigrams ----")
    dff['tri_grams'] = dff['clean'].apply(lambda x: get_trigrams(x))
    lis = flatten_list(dff['tri_grams'])
    c = collections.Counter(lis)
    a = pd.DataFrame(c.most_common()[:50], columns=[f'{file}_trigrams', f'{file}_tri_occurancy'])
    ngrams = pd.concat([ngrams, a], axis=1)
    print(c.most_common(10))
    

    del df, dff

------cleaned_noplastic_updated.csv------
---- bigrams ----
[(('zerowaste', 'noplastic'), 9), (('noplastic', 'plasticfree'), 9), (('is', 'a'), 9), (('in', 'our'), 9), (('sustainableliving', 'noplastic'), 8), (('plasticfree', 'noplastic'), 8), (('ClimateAction', 'ClimateEmergency'), 7), (('Zero', 'Waste'), 7), (('of', 'the'), 6), (('plasticfree', 'zerowaste'), 6)]
---- trigrams ----
[(('sustainableliving', 'noplastic', 'ecology'), 6), (('environment', 'Dublin', 'Ireland'), 6), (('noplastic', 'ecology', 'choosetoreuse'), 5), (('ecology', 'choosetoreuse', 'environment'), 5), (('choosetoreuse', 'environment', 'Dublin'), 5), (('woodencutlery', 'recycle', 'reuse'), 5), (('ecofriendly', 'zerowaste', 'sustainable'), 5), (('zerowaste', 'sustainable', 'nature'), 5), (('sustainable', 'nature', 'green'), 5), (('nature', 'green', 'sustainability'), 5)]
------cleaned_plasticpollutes_updated.csv------
---- bigrams ----
[(('activist', 'activism'), 8), (('activism', 'plastic'), 8), (('plastic', 'endpla

In [10]:
results

Unnamed: 0,file,negative %,positive %,ratio [positive:negative] %
0,cleaned_noplastic_updated,18.285714,51.428571,281.25
1,cleaned_plasticpollutes_updated,16.129032,53.225806,330.0
2,cleaned_plasticpollution1,20.44335,54.285714,265.542169
3,cleaned_plasticpollution2_updated,16.611296,58.72093,353.5
4,cleaned_sustainability_updated,10.680674,60.494638,566.393443
5,cleaned_zerowaste_updated,18.181818,65.151515,358.333333


In [11]:
ngrams

Unnamed: 0,0,cleaned_noplastic_updated.csv_trigrams,cleaned_noplastic_updated.csv_tri_occurancy,cleaned_plasticpollutes_updated.csv_trigrams,cleaned_plasticpollutes_updated.csv_tri_occurancy,cleaned_plasticpollution1.csv_trigrams,cleaned_plasticpollution1.csv_tri_occurancy,cleaned_plasticpollution2_updated.csv_trigrams,cleaned_plasticpollution2_updated.csv_tri_occurancy,cleaned_sustainability_updated.csv_trigrams,cleaned_sustainability_updated.csv_tri_occurancy,cleaned_zerowaste_updated.csv_trigrams,cleaned_zerowaste_updated.csv_tri_occurancy
0,0,"(sustainableliving, noplastic, ecology)",6,"(activist, activism, plastic)",8,"(to, create, a)",284,"(create, a, new)",257,"(TriviaTuesday, BeInNature, JustBreathe)",102,"(100, Vegan, Use)",5
1,1,"(environment, Dublin, Ireland)",6,"(activism, plastic, endplasticpollution)",8,"(is, trying, to)",284,"(the, plastic, industry)",257,"(BeInNature, JustBreathe, Sustainability)",100,"(Vegan, Use, PROMO)",5
2,2,"(noplastic, ecology, choosetoreuse)",5,"(plastic, endplasticpollution, breakfreefrompl...",8,"(create, a, new)",283,"(Europeans, called, on)",256,"(JustBreathe, Sustainability, Quiz)",79,"(Use, PROMO, code)",5
3,3,"(ecology, choosetoreuse, environment)",5,"(endplasticpollution, breakfreefromplastic, pl...",8,"(a, new, law)",283,"(to, create, a)",256,"(B, Oceans, TriviaTuesday)",75,"(PROMO, code, TW25)",5
4,4,"(choosetoreuse, environment, Dublin)",5,"(breakfreefromplastic, plastickills, notdispos...",8,"(new, law, to)",283,"(a, new, law)",256,"(Oceans, TriviaTuesday, BeInNature)",75,"(code, TW25, for)",5
5,5,"(woodencutlery, recycle, reuse)",5,"(plastickills, notdisposable, justrecovery)",8,"(law, to, ban)",283,"(new, law, to)",256,"(Sustainability, Quiz, TriviaTuesday)",65,"(TW25, for, 25)",5
6,6,"(ecofriendly, zerowaste, sustainable)",5,"(notdisposable, justrecovery, plasticpollutes)",8,"(the, plastic, industry)",283,"(law, to, ban)",256,"(sustainability, AI, 5G)",62,"(for, 25, off)",5
7,7,"(zerowaste, sustainable, nature)",5,"(justrecovery, plasticpollutes, plasticisoil)",8,"(Europeans, called, on)",282,"(to, ban, SingleUsePlastics)",256,"(AI, 5G, cloud)",57,"(25, off, Show)",5
8,8,"(sustainable, nature, green)",5,"(plasticpollutes, plasticisoil, plasticisclima...",6,"(to, ban, SingleUsePlastics)",282,"(plastic, industry, is)",256,"(5G, cloud, edge)",56,"(off, Show, now)",5
9,9,"(nature, green, sustainability)",5,"(plasticisoil, plasticisclimatechange, plastic...",4,"(plastic, industry, is)",282,"(to, water, down)",256,"(IoT, sustainability, AI)",50,"(Show, now, plasticfreejuly)",5
