### Function needed for file analysis

In [211]:
import re


def clean_tweet(tweet): 
        ''' 
        Utility function to clean tweet text by removing links, special characters 
        using simple regex statements. 
        '''
        return ' '.join(re.sub("(@[A-Za-z0-9]+)|([^0-9A-Za-z \t])|(\w+:\/\/\S+)"," ",tweet).split())

def get_tweet_sentiment(tweet): 
    ''' 
    Utility function to classify sentiment of passed tweet 
    using textblob's sentiment method 
    '''
    # create TextBlob object of passed tweet text 
    analysis = TextBlob(clean_tweet(tweet)) 
    # set sentiment 
    if analysis.sentiment.polarity > 0: 
        return 'positive'
    elif analysis.sentiment.polarity == 0: 
        return 'neutral'
    else: 
        return 'negative'

def get_tweet_objective(tweet): 
    ''' 
    Utility function to classify sentiment of passed tweet 
    using textblob's subjectivity method 
    '''
    # create TextBlob object of passed tweet text 
    analysis = TextBlob(clean_tweet(tweet)) 
    # set sentiment 
    if analysis.sentiment.subjectivity > 0.5: 
        return 'subjective'
    else: 
        return 'objective'

def get_ratio(tweet, file):
    ''' 
    Utility function to calculate the percentage of the tag
    provided
    '''
    return {
        'file': file.split('.')[0],
        'positive %': (tweet == 'positive').mean() * 100,
        'netative %': (tweet == 'negative').mean() * 100,
        'ratie [positive:negative] %': (tweet == 'positive').sum()/(tweet == 'negative').sum() }

In [199]:
import nltk
nltk.download('punkt')

def get_bigrams(text):
    nltk_tokens = nltk.word_tokenize(text)
    return (list(nltk.bigrams(nltk_tokens)))

def get_trigrams(text):
    nltk_tokens = nltk.word_tokenize(text)
    return (list(nltk.trigrams(nltk_tokens)))

def flatten_list(series):
    slist =[]
    for x in series:
        slist.extend(x)
    return slist

[nltk_data] Downloading package punkt to /Users/chen_zeng/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


## Loop through all the files, calculate the positive, negative percentage and ratio based on files

- Ngrams is the dataframe of all the file ngram results
- results is the dataframe of all the file sentiment analysis

In [235]:
import pandas as pd
from textblob import TextBlob
import collections


files = ['noplastic.csv', 'plasticpollutes.csv', 'plasticpollution1.csv', 'plasticpollution2.csv',
        'sustainability.csv', 'zerowaste.csv']
results = pd.DataFrame()
ngrams = pd.DataFrame([_ for _ in range(50)])
for file in files:
    # read file
    print(f"------{file}------")
    df = pd.read_csv(f'./hashtags/{file}')
    
    # get full_text and clean up
    dff = df['full_text'].to_frame()
    dff['clean'] = dff['full_text'].apply(lambda x: (clean_tweet(x)))
    
    # do the analysis on sentiment
    dff['sentiment'] = dff['clean'].apply(lambda x: get_tweet_sentiment((x)))
    dff['subjectivity'] = dff['clean'].apply(lambda x: get_tweet_objective((x)))
    
    # calculate the ratio and keep the dataframe together
    series = dff['sentiment']
    result = get_ratio(series, file)
    results = results.append(result, ignore_index=True)
    
    # add n_grams and get the most_common words
    print("---- bigrams ----")
    dff['bi_grams'] = dff['clean'].apply(lambda x: get_bigrams(x))
    lis = flatten_list(dff['bi_grams'])
    c = collections.Counter(lis)
    a = pd.DataFrame(c.most_common()[:50], columns=[f'{file}_bigrams', f'{file}_bi_occurancy'])
    df = pd.concat([df, a], axis=1)
    
    print(c.most_common(10))
    print("---- trigrams ----")
    dff['tri_grams'] = dff['clean'].apply(lambda x: get_trigrams(x))
    lis = flatten_list(dff['tri_grams'])
    c = collections.Counter(lis)
    a = pd.DataFrame(c.most_common()[:50], columns=[f'{file}_trigrams', f'{file}_tri_occurancy'])
    ngrams = pd.concat([ngrams, a], axis=1)
    print(c.most_common(10))
    

    del df, dff

------noplastic.csv------
---- bigrams ----
[(('EstoNOtienequePARAR', 'SalimosDeEsta'), 39), (('SalimosDeEsta', 'Informacion'), 39), (('Informacion', 'zocoup'), 39), (('zocoup', 'hechoamano'), 39), (('hechoamano', 'artesanal'), 39), (('artesanal', 'noplastic'), 39), (('noplastic', 'economiasostenible'), 39), (('economiasostenible', 'sinplasticos'), 39), (('plasticfree', 'plasticpollution'), 30), (('ecofriendly', 'pollution'), 30)]
---- trigrams ----
[(('EstoNOtienequePARAR', 'SalimosDeEsta', 'Informacion'), 39), (('SalimosDeEsta', 'Informacion', 'zocoup'), 39), (('Informacion', 'zocoup', 'hechoamano'), 39), (('zocoup', 'hechoamano', 'artesanal'), 39), (('hechoamano', 'artesanal', 'noplastic'), 39), (('artesanal', 'noplastic', 'economiasostenible'), 39), (('noplastic', 'economiasostenible', 'sinplasticos'), 39), (('ecofriendly', 'eco', 'ecofriendlyliving'), 28), (('eco', 'ecofriendlyliving', 'pollutionfree'), 27), (('ecofriendlyliving', 'pollutionfree', 'plasticfree'), 27)]
------plasti

In [236]:
ngrams

Unnamed: 0,0,noplastic.csv_trigrams,noplastic.csv_tri_occurancy,plasticpollutes.csv_trigrams,plasticpollutes.csv_tri_occurancy,plasticpollution1.csv_trigrams,plasticpollution1.csv_tri_occurancy,plasticpollution2.csv_trigrams,plasticpollution2.csv_tri_occurancy,sustainability.csv_trigrams,sustainability.csv_tri_occurancy,zerowaste.csv_trigrams,zerowaste.csv_tri_occurancy
0,0,"(EstoNOtienequePARAR, SalimosDeEsta, Informacion)",39,"(activist, activism, plastic)",8,"(to, create, a)",284,"(create, a, new)",257,"(TriviaTuesday, BeInNature, JustBreathe)",116,"(wajibpakaikantongbelanjaramahlingkungan, jakartasehat, KBRL)",38
1,1,"(SalimosDeEsta, Informacion, zocoup)",39,"(activism, plastic, endplasticpollution)",8,"(is, trying, to)",284,"(the, plastic, industry)",257,"(BeInNature, JustBreathe, Sustainability)",114,"(jakartasehat, KBRL, kantongbelanjaramahlingkungan)",38
2,2,"(Informacion, zocoup, hechoamano)",39,"(plastic, endplasticpollution, breakfreefromplastic)",8,"(create, a, new)",283,"(Europeans, called, on)",256,"(JustBreathe, Sustainability, Quiz)",86,"(KBRL, kantongbelanjaramahlingkungan, budidayamaggot)",38
3,3,"(zocoup, hechoamano, artesanal)",39,"(endplasticpollution, breakfreefromplastic, plastickills)",8,"(a, new, law)",283,"(to, create, a)",256,"(B, Oceans, TriviaTuesday)",86,"(kantongbelanjaramahlingkungan, budidayamaggot, zerowaste)",38
4,4,"(hechoamano, artesanal, noplastic)",39,"(breakfreefromplastic, plastickills, notdisposable)",8,"(new, law, to)",283,"(a, new, law)",256,"(Oceans, TriviaTuesday, BeInNature)",86,"(budidayamaggot, zerowaste, sedekahsampah)",38
5,5,"(artesanal, noplastic, economiasostenible)",39,"(plastickills, notdisposable, justrecovery)",8,"(law, to, ban)",283,"(new, law, to)",256,"(Sustainability, Quiz, TriviaTuesday)",72,"(zerowaste, sedekahsampah, minyakjelantah)",38
6,6,"(noplastic, economiasostenible, sinplasticos)",39,"(notdisposable, justrecovery, plasticpollutes)",8,"(the, plastic, industry)",283,"(law, to, ban)",256,"(sustainability, AI, 5G)",62,"(sedekahsampah, minyakjelantah, ketahananpangan)",38
7,7,"(ecofriendly, eco, ecofriendlyliving)",28,"(justrecovery, plasticpollutes, plasticisoil)",8,"(Europeans, called, on)",282,"(to, ban, SingleUsePlastics)",256,"(AI, 5G, cloud)",57,"(minyakjelantah, ketahananpangan, dietplastik)",38
8,8,"(eco, ecofriendlyliving, pollutionfree)",27,"(plasticpollutes, plasticisoil, plasticisclimatechange)",6,"(to, ban, SingleUsePlastics)",282,"(plastic, industry, is)",256,"(5G, cloud, edge)",56,"(ketahananpangan, dietplastik, jakartatangguh)",38
9,9,"(ecofriendlyliving, pollutionfree, plasticfree)",27,"(plasticisoil, plasticisclimatechange, plasticisfracking)",4,"(plastic, industry, is)",282,"(to, water, down)",256,"(Quiz, TriviaTuesday, ContestAlert)",52,"(dietplastik, jakartatangguh, jakartabebassampahplastik)",38


In [237]:
results

Unnamed: 0,file,netative %,positive %,ratie [positive:negative] %
0,noplastic,12.615385,31.692308,2.512195
1,plasticpollutes,15.625,51.5625,3.3
2,plasticpollution1,17.914214,46.635828,2.603286
3,plasticpollution2,14.570007,50.390903,3.458537
4,sustainability,10.14,56.22,5.544379
5,zerowaste,13.907285,44.039735,3.166667
