Content
1. [Scraping a dataset from Nitter based on the hashtag #animalwelfare](#1.-Scraping-a-dataset-from-Nitter-based-on-the-hashtag-#animalwelfare)
2. [Finding unique tweeters and scraping their profile info](#2.-Finding-unique-tweeters-and-scraping-their-profile-info)
3. [Basic attributes about tweets and tweeters](#3.-Basic-attributes-about-tweets-and-tweeters)
4. [Analyzing manually annotated lists of users](#4.-Analyzing-manually-annotated-lists-of-users)
5. [Wordclouds](#5.-Wordclouds)
6. [Wordcount](#6.-Wordcount)
7. [Boxplots and dotplots](#7.-Boxplots-and-dotplots)
8. [Hashtags of selected accounts](#8.-Hashtags-of-selected-accounts)
9. [Text column cleanup](#9.-Text-column-cleanup)
10. [Clustering of tweets using BERT](#10.-Clustering-of-tweets-using-BERT)
11. [Automatic categorizing using BERT](#11.-Automatic-categorizing-using-BERT)

Main questions behind exploratory analysis:

How can the discussion on animal welfare on Twitter be analyzed through Nitter? How can the information regarding animal welfare on Twitter be structured? Do easy fixes exist: can hashtags be used to gain an overview of the animal welfare discussion on Twitter? Can subtopics be automatically identified? Can discussion participants be automatically classified? How diverse is the community in the information it generates?

In [None]:
### Overview of packages used

##

##1. Scraping a dataset from Nitter based on the hashtag #animalwelfare

In [None]:
### initialize the scraper

import pandas as pd
from ntscraper import Nitter

# Shared ntscraper client; every scraping cell in this notebook calls it.
# NOTE(review): skip_instance_check=False presumably makes ntscraper probe the
# available Nitter instances on start-up — confirm against the ntscraper docs.
scraper = Nitter(log_level=1, skip_instance_check=False)

In [None]:
### do the actual scraping

# Ask for up to 5000 tweets carrying the hashtag "animalwelfare", posted since 2023-10-01.
# The result is indexed with ['tweets'] by the extraction cell that follows.
aw_hash_tweets = scraper.get_tweets("animalwelfare", mode='hashtag', number=5000, since='2023-10-01')

In [None]:
### extract necessary info

# One row per scraped tweet: link, text, author, profile id, date and the four
# interaction counters, in the column order used for the DataFrame built next.
# A comprehension is used so that no loop variable named `data` is left behind
# in the notebook namespace (it used to overwrite the tweets DataFrame whenever
# this cell was re-run after the DataFrame had been created).
final_tweets = [
    [
        tweet['link'],
        tweet['text'],
        tweet['user']['username'],
        tweet['user']['profile_id'],
        tweet['date'],
        tweet['stats']['retweets'],
        tweet['stats']['likes'],
        tweet['stats']['comments'],
        tweet['stats']['quotes'],
    ]
    for tweet in aw_hash_tweets['tweets']
]


In [None]:
### create pandas dataframe with info

# Main tweets table: one row per tweet, columns named after the extracted fields.
data = pd.DataFrame(final_tweets, columns = ['link', 'text', 'username', 'profile_id', 'date', 'retweets', 'likes', 'comments', 'quotes'])
# Last expression of the cell: displays the DataFrame in the notebook.
data

In [None]:
# Total engagement per tweet = retweets + likes + comments + quotes.
data['total_interactions'] = data['retweets'] + data['likes'] + data['comments'] + data['quotes']

In [None]:
### clean up the tweet text column so the CSV is easier to read in Excel
# Every line break inside a tweet is replaced by a single space.

data['text'] = data['text'].str.replace(r'\n',' ', regex=True)

In [None]:
# Persist the tweets table (the DataFrame index is written as the first CSV column).
data.to_csv('aw_global_csv.csv')

In [None]:
### load data again

import pandas as pd

# Reload the previously saved tweets table so later cells can run without re-scraping.
data = pd.read_csv('aw_global_csv.csv')

##

##2. Finding unique tweeters and scraping their profile info

In [None]:
###find unique tweeters

import numpy as np
import matplotlib.pyplot as plt

# Per-user aggregates, both indexed by username:
#   data_sub1 -> summed interactions over all of a user's tweets
#   data_sub2 -> number of distinct tweet links per user
data_sub1 = data.groupby('username')['total_interactions'].sum().to_frame()
data_sub2 = data.groupby('username')['link'].nunique().to_frame()

# Combine both aggregates into one table (one row per username).
df_tweeters = data_sub2.merge(data_sub1, how='inner', on='username')

In [None]:
# Number of unique tweeting accounts.
len(df_tweeters)

In [None]:
# Keep the 400 accounts with the most interactions (ascending sort, so the
# strongest accounts end up at the bottom and are taken with tail()).
final_tweeters = df_tweeters.sort_values(by=['total_interactions']).tail(400)

In [None]:
# Scrape profile information for every selected account.
#
# Fixes compared with the earlier version of this cell:
#   * a failed look-up no longer falls through to the `profile` object left
#     over from the previous account (which stored the wrong user's data);
#   * the bare `except:` is narrowed to `Exception`, so KeyboardInterrupt can
#     still stop a long scraping run;
#   * the collected rows are turned into the `data2` DataFrame that the
#     save cell (`data2['bio']`, `data2.to_csv`) relies on.
profile_columns = ['username', 'bio', 'joined', 'tweets', 'following', 'followers', 'likes', 'media']

final_profiles = []

# final_tweeters is indexed by username, so the index values are the handles.
for ind in final_tweeters.index:
    try:
        profile = scraper.get_profile_info(ind)
        profile_row = [
            profile['username'],
            profile['bio'],
            profile['joined'],
            profile['stats']['tweets'],
            profile['stats']['following'],
            profile['stats']['followers'],
            profile['stats']['likes'],
            profile['stats']['media'],
        ]
    except Exception:
        # Scraper error, missing profile or missing field: keep a placeholder
        # row so the output still has one row per requested account.
        profile_row = ['NaN'] * len(profile_columns)
    final_profiles.append(profile_row)

data2 = pd.DataFrame(final_profiles, columns=profile_columns)


In [None]:
# Display the current value of data2 as the cell output.
data2

In [None]:
### save results

# Flatten line breaks in the bios, then write the users table to CSV.
# NOTE(review): this requires data2 to be a DataFrame with a 'bio' column.
data2['bio'] = data2['bio'].str.replace(r'\n',' ', regex=True)
data2.to_csv('aw_users_csv.csv')

In [None]:
### load results

import pandas as pd

# Reload the saved users table.
data2 = pd.read_csv('aw_users_csv.csv')

##

##3. Basic attributes about tweets and tweeters

In [None]:
# `plt` must be the pyplot interface: the top-level matplotlib package has no
# pie()/show() functions, which the plotting cells call through `plt`.
import matplotlib.pyplot as plt

In [None]:
### 20 posts with most interactions

# Sort ascending by interactions; the last 20 rows are therefore the top 20 posts.
df_topposts = data.sort_values(by=['total_interactions'])
df_topposts.tail(20).to_csv('aw_top20posts_csv.csv')

In [None]:
# show how many tweets had interactions

import numpy as np
import matplotlib.pyplot as plt

# A: tweets with zero interactions, B: all remaining tweets.
# B is derived from the actual number of rows instead of a hard-coded 5000,
# because the scraper can return fewer tweets than were requested.
A = data.total_interactions.isin([0]).sum()
B = len(data) - A

y = np.array([A, B])
mylabels = ["Tweets without interactions", "Tweets with interactions"]

plt.pie(y, labels = mylabels)
plt.show()

In [None]:
# how many tweets contained the word China, trade, dog, cat, farming

# Tweets with a missing text (NaN after the CSV round-trip) are skipped; testing
# `"China" in s` on a float would raise a TypeError.
tweet_texts = data.text.dropna().astype(str)

# Case-sensitive substring matches. Note that a substring test also counts
# longer words that contain the keyword (e.g. "cat" inside "education").
china = [s for s in tweet_texts if "China" in s]
trade = [s for s in tweet_texts if "trade" in s]
dog = [s for s in tweet_texts if "dog" in s]
cat = [s for s in tweet_texts if "cat" in s]
farming = [s for s in tweet_texts if "farming" in s]

print(len(china), len(trade), len(dog), len(cat), len(farming))

In [None]:
### select tweets with "#animalwelfare" and "China" or "@China"

# Export the tweet texts collected in the `china` list (texts containing "China").
China = pd.DataFrame(china, columns=['text'])
China.to_csv('chinatweets.csv')

In [None]:
### show interactions per tweet

# Histogram of total interactions per tweet, using 100 bins.
hist = data.total_interactions.hist(bins=100)

In [None]:
### show correlation between likes and retweets

# The DataFrame columns are lower-case ('retweets', 'likes'); the capitalised
# names used before raised a KeyError.
data.plot.scatter(x = 'retweets', y = 'likes', s = 100, c='blue')

In [None]:
### show how many unique users had interactions

import numpy as np
import matplotlib.pyplot as plt

# A: users whose summed interactions are zero, B: all remaining users.
# The total is the number of unique users (rows of data_sub1), not the 5000
# requested tweets, which inflated the "with interactions" share.
A = data_sub1.total_interactions.isin([0]).sum()
B = len(data_sub1) - A

y = np.array([A, B])
mylabels = ["Users without interactions", "Users with interactions"]

plt.pie(y, labels = mylabels)
plt.show()

In [None]:
### how often did single users tweet

# Histogram of the number of distinct tweets (links) per user, using 100 bins.
hist = data_sub2.hist(bins=100)

In [None]:
### who where the most liked users

# Ascending sort by summed interactions; tail(10) shows the ten strongest users.
temp1 = data_sub1.sort_values(by=['total_interactions'])
temp1.tail(10)

In [None]:
### How many interactions did the top 30 users receive

# Share of all interactions that went to the 30 users with the most interactions
# (temp1 is sorted ascending, so tail(30) selects the top 30).
C = temp1.total_interactions.tail(30).sum() / temp1.total_interactions.sum()

##

##4. Scraping tweets per user per month

In [None]:
# `plt` has to be pyplot: the cells below call plt.show(), which the top-level
# matplotlib package does not provide.
import matplotlib.pyplot as plt
import numpy as np

In [None]:
# get activity of users for a given time period with ntscrapper

###
# Collect every tweet each listed account posted in September 2023.
#
# The per-tweet row is appended directly instead of being stored in a variable
# called `data`, which used to overwrite the main tweets DataFrame. The bare
# `except:` is narrowed to `Exception` so the run can still be interrupted.
tweet_columns = ['link', 'text', 'username', 'profile_id', 'date', 'retweets', 'likes', 'comments', 'quotes']
final_september = []

for user_name in data2.username:
    try:
        tweets_september = scraper.get_tweets(user_name, mode='user', since='2023-09-01', until='2023-10-01')
        for tweet in tweets_september['tweets']:
            final_september.append([
                tweet['link'],
                tweet['text'],
                tweet['user']['username'],
                tweet['user']['profile_id'],
                tweet['date'],
                tweet['stats']['retweets'],
                tweet['stats']['likes'],
                tweet['stats']['comments'],
                tweet['stats']['quotes'],
            ])
    except Exception:
        # Failed scrape or malformed tweet: record one placeholder row.
        final_september.append(['NaN'] * len(tweet_columns))

september = pd.DataFrame(final_september, columns=tweet_columns)

In [None]:
### save

# Replace line breaks in the tweet text with spaces, then write the CSV.
september['text'] = september['text'].str.replace(r'\n',' ', regex=True)
september.to_csv('aw_september_csv.csv')

In [None]:
### load

# Reload the saved September tweets.
september = pd.read_csv('aw_september_csv.csv')

In [None]:
# how often did single users tweet

# Distinct tweet links per user in September, then a 100-bin histogram of those counts.
sep2 = september.groupby('username')['link'].nunique()
hist = sep2.hist(bins=100)

In [None]:
# Collect every tweet each listed account posted in October 2023.
#
# `final_october` is initialised here (it was never created before, so the
# first append raised a NameError). Rows are appended directly rather than via
# a variable called `data`, which used to overwrite the main tweets DataFrame.
tweet_columns = ['link', 'text', 'username', 'profile_id', 'date', 'retweets', 'likes', 'comments', 'quotes']
final_october = []

for user_name in data2.username:
    try:
        tweets_october = scraper.get_tweets(user_name, mode='user', since='2023-10-01', until='2023-11-01')
        for tweet in tweets_october['tweets']:
            final_october.append([
                tweet['link'],
                tweet['text'],
                tweet['user']['username'],
                tweet['user']['profile_id'],
                tweet['date'],
                tweet['stats']['retweets'],
                tweet['stats']['likes'],
                tweet['stats']['comments'],
                tweet['stats']['quotes'],
            ])
    except Exception:
        # Failed scrape or malformed tweet: record one placeholder row.
        final_october.append(['NaN'] * len(tweet_columns))

october = pd.DataFrame(final_october, columns=tweet_columns)


In [None]:
### save

# Flatten line breaks before saving, exactly as done for the September export,
# so multi-line tweets do not break the rows of the CSV in spreadsheet tools.
october['text'] = october['text'].str.replace(r'\n',' ', regex=True)
october.to_csv('aw_october_csv.csv')

In [None]:
# how often did single users tweet

# Distinct tweet links per user in October, then a 100-bin histogram of those counts.
oct2 = october.groupby('username')['link'].nunique()
hist = oct2.hist(bins=100)

In [None]:
# Make sure both monthly tweet counts are DataFrames (index: username, column: 'link').
sep2 = pd.DataFrame(sep2)
oct2 = pd.DataFrame(oct2)

# Attach the September counts to the users table, then the October counts;
# the generic 'link' column is renamed after each merge.
temp2 = pd.merge(data2, sep2, how='inner', on='username').rename(columns={'link': 'september_tweets'})
temp3 = pd.merge(temp2, oct2, how='inner', on='username').rename(columns={'link': 'october_tweets'})

# Ascending by interactions; temp5 holds the 20 accounts with the most interactions.
temp4 = temp3.sort_values('total_interactions')
temp5 = temp4.tail(20)

In [None]:
### scatter monthly tweeting activity vs total interactions

# Two scatter series on the same axes: September counts in red, October counts
# in green (the second call draws onto ax1).
ax1 = temp4.plot(kind='scatter', x='september_tweets', y='total_interactions', color='r', label="september")    
ax2 = temp4.plot(kind='scatter', x='october_tweets', y='total_interactions', color='g', label="october", ax=ax1) 

ax1.set_xlabel("monthly tweets")
ax1.set_ylabel("total interactions")
# NOTE: plt must refer to matplotlib.pyplot for show() to exist.
plt.show()

In [None]:
# Follower counts per user.
temp6 = data2[['username', 'followers']]

# temp4 is itself derived from data2, so it normally already carries a
# 'followers' column. Merging temp6 again would produce 'followers_x' and
# 'followers_y' and break the plot on x='followers'; merge only when the
# column is really missing.
if 'followers' in temp4.columns:
    temp7 = temp4.copy()
else:
    temp7 = pd.merge(temp4, temp6, how='left', on='username')

# Ascending by interactions; temp9 holds the 20 accounts with the most interactions.
temp8 = temp7.sort_values('total_interactions')
temp9 = temp8.tail(20)

In [None]:
# Display the 20 accounts with the most interactions.
temp9

In [None]:
### scatter total_interactions vs followers

# One blue point per account: follower count against total interactions.
ax1 = temp8.plot(kind='scatter', x='followers', y='total_interactions', color='b')    

ax1.set_xlabel("followers")
ax1.set_ylabel("total interactions")
# NOTE: plt must refer to matplotlib.pyplot for show() to exist.
plt.show()

##

##4. Analyzing manually annotated lists of users

In [None]:
import pandas as pd
# `plt` has to be pyplot: the plotting cell of this section calls plt.show(),
# which the top-level matplotlib package does not provide.
import matplotlib.pyplot as plt

In [None]:
# Load the manually annotated user list (semicolon-separated file).
annotated = pd.read_csv('annotated_users_interactions.csv', sep=';')

In [None]:
# Display the annotated user list.
annotated

In [None]:
# Add each annotated user's number of distinct tweets ('link', from data_sub2).
df_annotated = annotated.merge(data_sub2, how='inner', on='username')

# Average interactions per tweet.
df_annotated['ratio'] = df_annotated['total_interactions'].div(df_annotated['link'])

# Drop the "@" from the handles and keep them as a plain list for the scraper.
df_annotated['username'] = df_annotated['username'].str.replace('@', '')
col_list = df_annotated['username'].tolist()
print(col_list)

In [None]:
# Scrape the profile statistics of every annotated user.
#
# Fixes compared with the earlier version of this cell:
#   * a failed look-up no longer reuses the `profile` of the previous user;
#   * the bare `except:` is narrowed to `Exception`;
#   * the per-user row is no longer stored in `data2`, which overwrote the
#     users DataFrame of the same name.
stat_columns = ['tweets', 'following', 'followers', 'likes', 'media']

follnum_annotataed = []

for i in col_list:
    try:
        profile = scraper.get_profile_info(i)
        stats_row = [profile['stats'][column] for column in stat_columns]
    except Exception:
        # Scraper error, missing profile or missing field: placeholder row, so
        # the rows stay aligned with col_list.
        stats_row = ['NaN'] * len(stat_columns)
    follnum_annotataed.append(stats_row)

data4 = pd.DataFrame(follnum_annotataed, columns = stat_columns)

In [None]:
# Usernames in scraping order; data4 has one row per entry in the same order,
# so the two tables are combined by row position (shared default index).
temp10 = pd.DataFrame({'username': col_list})
df_annotated2 = temp10.join(data4, how='inner')

# Add the annotation columns and order by interactions-per-tweet ratio.
df_annotated3 = (
    df_annotated2
    .merge(df_annotated, how='inner', on='username')
    .sort_values(by=['ratio'])
)
print(df_annotated3[['username', 'ratio', 'followers', 'tweets']].tail(20))

In [None]:
# Ratio against follower count (red) and against tweet count (green), both
# drawn on the same axes.
ax1 = df_annotated3.plot(kind='scatter', x='followers', y='ratio', color='r', label="followers")    
ax2 = df_annotated3.plot(kind='scatter', x='tweets', y='ratio', color='g', label="tweets", ax=ax1) 

ax1.set_xlabel("number of tweets or of followers")
ax1.set_ylabel("ratio interactions / #animalwelfare tweets")
# NOTE: plt must refer to matplotlib.pyplot for show() to exist.
plt.show()

In [None]:
# Distribution of the interactions-per-tweet ratio (100 bins).
hist = df_annotated3.ratio.hist(bins=100)

In [None]:
# Distribution of the accounts' tweet counts from the scraped profile stats (100 bins).
hist = df_annotated3.tweets.hist(bins=100)

In [None]:
# Accounts with at most 3 total interactions, and the distribution of their follower counts.
temp11 = df_annotated3.loc[df_annotated3['total_interactions'] <= 3 ]
hist = temp11.followers.hist(bins=100)

In [None]:
# Mean follower count of the low-interaction accounts.
temp11[['followers']].mean()

In [None]:
# Median of the 'tweets' column over all annotated accounts.
df_annotated3[['tweets']].median()

In [None]:
# Scrape the bio of every annotated user.
#
# Fixes compared with the earlier version of this cell:
#   * a failed look-up no longer reuses the `profile` of the previous user;
#   * the bare `except:` is narrowed to `Exception`;
#   * the per-user row is no longer stored in `data2`, which overwrote the
#     users DataFrame of the same name.
follnum_annotated = []

for i in col_list:
    try:
        profile = scraper.get_profile_info(i)
        bio_row = [profile['bio']]
    except Exception:
        # Scraper error, missing profile or missing bio: placeholder row, so
        # the rows stay aligned with col_list.
        bio_row = ['NaN']
    follnum_annotated.append(bio_row)

data5 = pd.DataFrame(follnum_annotated, columns = ['bio'])

In [None]:
# Unique (username, profile_id) pairs from the tweets table.
temp12 = data[['username', 'profile_id']]
# Work on an explicit copy: assigning a column on the result of a selection
# triggers pandas' SettingWithCopyWarning and may not modify what is intended.
temp13 = temp12.drop_duplicates().copy()
# Drop the "@" so the handles match the cleaned usernames of the annotated tables.
temp13['username'] = temp13['username'].str.replace('@', '')
print(temp13)

In [None]:
#new50 = pd.DataFrame(col_list, columns = ['username'])
# Pair usernames with the scraped bios by row position (both share the default index).
df_annotated4 = pd.merge(temp10, data5, left_index=True, right_index=True)
# Add the profile_id of each annotated user.
df_annotated5 = pd.merge(df_annotated3, temp13, how='inner', on='username') 
# Add the bio column to the annotated table.
df_annotated6 = pd.merge(df_annotated5, df_annotated4, how='inner',on='username') 

In [None]:
# Persist the enriched annotated-users table.
df_annotated6.to_csv('annotated_with_info.csv')

##

##5. Wordclouds

In [None]:
from nltk.probability import FreqDist
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from wordcloud import WordCloud
import nltk
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')
import json
import string
import pandas as pd
# `plt` has to be pyplot: the word-cloud cell calls plt.figure(figsize=...),
# plt.imshow() and plt.show(); on the top-level package `figure` is a module.
import matplotlib.pyplot as plt

In [None]:
# Render the whole bio column as one string (without the index) to feed the word cloud.
single = df_annotated4['bio'].to_string(index=False)

# Length of that string in characters.
print(len(single))

In [None]:
### to use for category
#df_annotated8 = df_annotated7.loc[df_annotated7['Type'] == 'NGO']
#single = df_annotated8['bio'].to_string(index=False)

In [None]:
lemmatizer = WordNetLemmatizer()

# English stop words plus single punctuation characters.
stop_filters = stopwords.words('english') + list(string.punctuation)
# Set copy for fast membership tests inside the comprehension.
stop_lookup = set(stop_filters)

# Tokenise the bios, drop stop words/punctuation and lemmatise what remains.
# The stop-word test is done on the lower-cased token: the stop-word list used
# here is compared case-insensitively so that capitalised words such as "The"
# no longer slip through.
best_comf_tokens = [
    lemmatizer.lemmatize(token)
    for token in word_tokenize(single)
    if token.lower() not in stop_lookup
]

In [None]:
# Build the word-cloud input from the cleaned tokens. Previously the raw
# `single` string was used, so the stop-word filtering and lemmatisation had no
# effect, and the three unwanted words were removed as substrings (turning e.g.
# "database" into "data"). They are now dropped as whole tokens only.
unwanted_words = {'based', 'music', 'running'}
best_comf_words_filtered = ' '.join(
    token for token in best_comf_tokens if token.lower() not in unwanted_words
)

In [None]:
# Word cloud configuration: white background, at most 2000 words, 800x400 canvas.
wc = WordCloud(background_color="white", max_words=2000, width=800, height=400)
# generate word cloud
wc.generate(best_comf_words_filtered)
# NOTE: plt must refer to matplotlib.pyplot for figure()/imshow()/show() to work.
plt.figure(figsize=(12, 6))
plt.imshow(wc, interpolation='bilinear')
plt.axis("off")
plt.title("Most used Words in annotated users' Bios",fontsize= 20)
plt.show()

##

##6. Wordcount

In [None]:
from collections import Counter
from nltk.corpus import stopwords
import numpy as np
import pandas as pd

In [None]:
# Reload the enriched annotated-users table and the manual category file.
df_annotated6 = pd.read_csv('annotated_with_info.csv')
categories = pd.read_csv('categories.csv', sep=';')

# Category table without its own interactions column and with "@"-free handles.
categories2 = categories.drop(columns=['total_interactions']).assign(
    username=lambda frame: frame['username'].str.replace('@', '')
)

# Keep every annotated user; users without a category get missing values.
df_annotated7 = df_annotated6.merge(categories2, how='left', on='username')

In [None]:
# Word frequencies over all annotated users' bios.
#
# Fixes compared with the earlier version of this cell:
#   * missing bios become '' instead of the string 'nan', so no 'nan'
#     substring has to be stripped afterwards (that also damaged words such as
#     "finance");
#   * '&' and '-' are removed with `.str.replace`; the former chained
#     `Series.replace` only replaced cells that were exactly '&' or '-';
#   * stop words are matched case-insensitively (counting is lower-cased);
#   * the counts stay integers and the result is no longer routed through a
#     variable called `data`, which overwrote the tweets DataFrame.
stop = stopwords.words('english')
stop_lookup = set(stop)

df_annotated7['bio'] = df_annotated7['bio'].fillna('').astype(str)

df_annotated7['bio_without_stopwords'] = df_annotated7['bio'].apply(
    lambda x: ' '.join(word for word in x.split() if word.lower() not in stop_lookup)
)
df_annotated7['bio_without_stopwords'] = (
    df_annotated7['bio_without_stopwords']
    .str.replace('&', '', regex=False)
    .str.replace('-', '', regex=False)
)

# 100 most frequent lower-cased words as (word, count) pairs.
wl = Counter(" ".join(df_annotated7["bio_without_stopwords"].str.lower()).split()).most_common(100)

a = pd.DataFrame(wl, columns=['word', 'frequency'])
a.head(20)

In [None]:
# Word frequencies over the bios of one user category.
#
# NOTE(review): no active cell defines df_annotated8. The commented-out
# template in the word-cloud section builds it as the rows of df_annotated7
# with Type == 'NGO'; that filter is used as a fallback here — confirm the
# column name and the category that is wanted.
try:
    df_annotated8 = df_annotated8.copy()
except NameError:
    df_annotated8 = df_annotated7.loc[df_annotated7['Type'] == 'NGO'].copy()

# Same clean-up as for the all-users word count: missing bios become '',
# '&' and '-' are removed with `.str.replace` (the chained `Series.replace`
# only matched whole cells), stop words are matched case-insensitively, counts
# stay integers, and no variable called `data` is overwritten.
stop = stopwords.words('english')
stop_lookup = set(stop)

df_annotated8['bio'] = df_annotated8['bio'].fillna('').astype(str)

df_annotated8['bio_without_stopwords'] = df_annotated8['bio'].apply(
    lambda x: ' '.join(word for word in x.split() if word.lower() not in stop_lookup)
)
df_annotated8['bio_without_stopwords'] = (
    df_annotated8['bio_without_stopwords']
    .str.replace('&', '', regex=False)
    .str.replace('-', '', regex=False)
)

# 100 most frequent lower-cased words as (word, count) pairs.
wl = Counter(" ".join(df_annotated8["bio_without_stopwords"].str.lower()).split()).most_common(100)

a = pd.DataFrame(wl, columns=['word', 'frequency'])
a.head(20)

##

##7. Boxplots and dotplots

In [None]:
import pandas as pd

# Load the coded user table used for the dot plots and box plots (semicolon-separated).
df_cc = pd.read_csv('Boxplot.csv', sep = ';')

In [None]:
#df_cc.ratio.isna().sum()
# Display the coded user table.
df_cc

In [None]:
import matplotlib.pyplot as plt
import matplotlib.colors as mcolors
from matplotlib.colors import from_levels_and_colors
# Dot plot: total interactions per category code, one fixed colour per code (1..11).
colours = dict(enumerate(
    ['darkviolet', 'deepskyblue', 'yellow', 'darkgreen', 'hotpink', 'black',
     'orange', 'dimgray', 'red', 'paleturquoise', 'mediumspringgreen'],
    start=1,
))
col_list = df_cc.Coded.values.tolist()
# Translate every code into its colour name (unknown codes raise KeyError).
converted_colors = list(map(colours.__getitem__, col_list))
plt.scatter(df_cc.Coded, df_cc.total_interactions, c=converted_colors)

In [None]:
import matplotlib.pyplot as plt
import matplotlib.colors as mcolors
from matplotlib.colors import from_levels_and_colors
# Dot plot: interactions-per-tweet ratio per category code, one fixed colour per code (1..11).
colours = dict(enumerate(
    ['darkviolet', 'deepskyblue', 'yellow', 'darkgreen', 'hotpink', 'black',
     'orange', 'dimgray', 'red', 'paleturquoise', 'mediumspringgreen'],
    start=1,
))
col_list = df_cc.Coded.values.tolist()
# Translate every code into its colour name (unknown codes raise KeyError).
converted_colors = list(map(colours.__getitem__, col_list))
plt.scatter(df_cc.Coded, df_cc.ratio, c=converted_colors)

In [None]:
import matplotlib.pyplot as plt
import matplotlib.colors as mcolors
from matplotlib.colors import from_levels_and_colors
# Dot plot: follower count per category code, one fixed colour per code (1..11).
colours = dict(enumerate(
    ['darkviolet', 'deepskyblue', 'yellow', 'darkgreen', 'hotpink', 'black',
     'orange', 'dimgray', 'red', 'paleturquoise', 'mediumspringgreen'],
    start=1,
))
col_list = df_cc.Coded.values.tolist()
# Translate every code into its colour name (unknown codes raise KeyError).
converted_colors = list(map(colours.__getitem__, col_list))
plt.scatter(df_cc.Coded, df_cc.followers, c=converted_colors)

In [None]:
import matplotlib.pyplot as plt
import matplotlib.colors as mcolors
from matplotlib.colors import from_levels_and_colors
# Dot plot: tweet count per category code, one fixed colour per code (1..11).
colours = dict(enumerate(
    ['darkviolet', 'deepskyblue', 'yellow', 'darkgreen', 'hotpink', 'black',
     'orange', 'dimgray', 'red', 'paleturquoise', 'mediumspringgreen'],
    start=1,
))
col_list = df_cc.Coded.values.tolist()
# Translate every code into its colour name (unknown codes raise KeyError).
converted_colors = list(map(colours.__getitem__, col_list))
plt.scatter(df_cc.Coded, df_cc.tweets, c=converted_colors)

In [None]:
# Restrict to category codes 2, 3, 6 and 7 and draw one box of tweet counts per code.
df_box = df_cc.loc[df_cc['Coded'].isin([2, 3, 6, 7])]
bp = df_box.boxplot(column = 'tweets', by='Coded')

##

##8. Hashtags of selected accounts

In [None]:
# Scrape the tweets of the account "MetzTilly" since 2020-01-01 and store them.
MetzTilly = scraper.get_tweets("MetzTilly", mode='user', since='2020-01-01')

# One row per tweet. Built with a comprehension so that no loop variable named
# `data` is left behind to overwrite the main tweets DataFrame.
MetzTilly_tweets = [
    [
        tweet['link'],
        tweet['date'],
        tweet['stats']['retweets'],
        tweet['stats']['likes'],
        tweet['stats']['comments'],
        tweet['stats']['quotes'],
        tweet['text'],
    ]
    for tweet in MetzTilly['tweets']
]

data8 = pd.DataFrame(MetzTilly_tweets, columns = ['link', 'date', 'retweets', 'likes', 'comments', 'quotes', 'text'])
data8.to_csv('MetzTilly_tweets.csv')

In [None]:
# Scrape the tweets of the account "AWIntergroup" since 2020-01-01 and store them.
AWIntergroup = scraper.get_tweets("AWIntergroup", mode='user', since='2020-01-01')

# One row per tweet. Built with a comprehension so that no loop variable named
# `data` is left behind to overwrite the main tweets DataFrame.
AWIntergroup_tweets = [
    [
        tweet['link'],
        tweet['date'],
        tweet['stats']['retweets'],
        tweet['stats']['likes'],
        tweet['stats']['comments'],
        tweet['stats']['quotes'],
        tweet['text'],
    ]
    for tweet in AWIntergroup['tweets']
]

data9 = pd.DataFrame(AWIntergroup_tweets, columns = ['link', 'date', 'retweets', 'likes', 'comments', 'quotes', 'text'])
data9.to_csv('AWIntergroup_tweets.csv')

In [None]:
# Highest retweet count among the tweets in data8.
data8.retweets.max()

In [None]:
# Per tweet, collect every run that starts with "#" and extends to the next
# whitespace or the end of the text. Trailing punctuation is therefore part of
# the match at this stage.
data9['hashtags'] = data9.text.str.findall(r'#.*?(?=\s|$)')
data8['hashtags'] = data8.text.str.findall(r'#.*?(?=\s|$)')

In [None]:
### find tweets without hashtags
# An empty hashtag list is falsy, so negating astype(bool) keeps exactly those rows.

data9[~data9['hashtags'].astype(bool)]

In [None]:
# One row per (tweet, hashtag) pair; tweets without any hashtag yield a
# missing value, which is replaced by an empty string.
data10 = data8.explode('hashtags')
data10['hashtags'] = data10['hashtags'].fillna('')

data11 = data9.explode('hashtags')
data11['hashtags'] = data11['hashtags'].fillna('')

In [None]:
import string

# Convert the hashtag columns to pandas' string dtype and lower-case them.
data10['hashtags'] = data10['hashtags'].astype('string').str.lower()
data11['hashtags'] = data11['hashtags'].astype('string').str.lower()

def remove_punctuation(text):
    """Return *text* with every character of ``string.punctuation`` deleted."""
    # Mapping a code point to None makes str.translate drop that character.
    deletions = dict.fromkeys(map(ord, string.punctuation))
    return text.translate(deletions)

# Strip punctuation (including the leading "#") from every hashtag.
data10['hashtags'] = data10['hashtags'].apply(remove_punctuation)
data11['hashtags'] = data11['hashtags'].apply(remove_punctuation)
# Column dtypes of data11; the value is not used further in this cell.
result = data11.dtypes
print(data11.text)

In [None]:
# Bind `plt` to pyplot rather than to the top-level matplotlib package, so
# plotting calls made through `plt` keep working after this cell has run.
import matplotlib.pyplot as plt

# Number of distinct tweets per hashtag in data10, ascending; the 20 most used
# hashtags are printed.
MT = (
    data10.groupby('hashtags')['link']
    .nunique()
    .to_frame()
    .sort_values(by=['link'])
)
print(MT.tail(20))

In [None]:
# Number of distinct tweets per hashtag in data11, ascending; the 20 most used
# hashtags are printed.
AW = (
    data11.groupby('hashtags')['link']
    .nunique()
    .to_frame()
    .sort_values(by=['link'])
)
print(AW.tail(20))

In [None]:
from collections import Counter
from nltk.corpus import stopwords
import numpy as np

# Word frequencies over the tweet texts in data9.
#
# Fixes compared with the earlier version of this cell:
#   * data9['text'] is cleaned from data9 itself; it used to be overwritten
#     with the text column of data8 (the other account);
#   * missing texts become '' instead of the string 'nan';
#   * '&' and '-' are removed with `.str.replace`; the former chained
#     `Series.replace` only replaced cells that were exactly '&' or '-';
#   * stop words are matched case-insensitively (counting is lower-cased);
#   * counts stay integers and no variable called `data` is overwritten.
stop = stopwords.words('english')
stop_lookup = set(stop)

data9['text'] = data9['text'].fillna('').astype(str)

data9['text_without_stopwords'] = data9['text'].apply(
    lambda x: ' '.join(word for word in x.split() if word.lower() not in stop_lookup)
)
data9['text_without_stopwords'] = (
    data9['text_without_stopwords']
    .str.replace('&', '', regex=False)
    .str.replace('-', '', regex=False)
)

# 100 most frequent lower-cased words as (word, count) pairs.
wl = Counter(" ".join(data9["text_without_stopwords"].str.lower()).split()).most_common(100)

import numpy as np

a = pd.DataFrame(wl, columns=['word', 'frequency'])
a.head(20)


In [None]:
# Rows of each exploded table whose cleaned hashtag is exactly 'animalwelfare'.
data12 = data10.loc[data10['hashtags'] == 'animalwelfare']
data13 = data11.loc[data11['hashtags'] == 'animalwelfare']

In [None]:
# Number of rows in data12 (hashtag rows equal to 'animalwelfare').
len(data12)

##

##9. Text column cleanup

In [None]:
from nltk.corpus import stopwords
# English stop words plus a few extra filler tokens.
stop_words = stopwords.words('english') + ['from', 'subject', 're', 'edu', 'use']

In [None]:
### insert df here
#data8 = XX

In [None]:
import demoji
# Pass every tweet text (cast to str) through demoji.replace with '' as the
# replacement, i.e. strip the emoji that demoji recognises.
datanew = data8.text.astype(str).apply(lambda x: demoji.replace(x,''))

In [None]:
# Plain Python list of the cleaned texts; the regex clean-up cells work on this list.
data16 = datanew.tolist()

In [None]:
# `re` is imported here because this cell runs before the URL-removal cell,
# which was the only place importing it (NameError on a top-to-bottom run).
import re

# Remove e-mail addresses and @mentions: any whitespace-delimited token that
# contains "@", plus one following whitespace character.
# Raw strings avoid the invalid-escape warnings of '\S' / '\s' in plain literals.
data16 = [re.sub(r'\S*@\S*\s?', '', sent) for sent in data16]

# Collapse every run of whitespace (including line breaks) into one space
data16 = [re.sub(r'\s+', ' ', sent) for sent in data16]

# Remove distracting single quotes
data16 = [re.sub(r"'", "", sent) for sent in data16]

print(data16[:1])

In [None]:
# remove urls

import re

# scheme://host[.more.parts][/path/segments] -> removed from every text
url_pattern = re.compile(r'\w+:\/{2}[\d\w-]+(\.[\d\w-]+)*(?:(?:\/[^\s/]*))*')
data16 = [url_pattern.sub('', sent) for sent in data16]

print(data16[:1])

##

##10. Clustering of tweets using BERT

In [None]:
from sentence_transformers import SentenceTransformer

# Encode every cleaned text with the sentence-transformers model named below.
docs = data16
model = SentenceTransformer('Nhat1904/Final-32shots-Twitter-Skhead-Train-5epoch')
# NOTE(review): presumably one embedding row per document — the shape is printed below.
vectorized_docs = model.encode(np.asarray(docs))

print("Shape:", vectorized_docs.shape)

In [None]:
# Cleaned texts as a one-column DataFrame ('text_clean').
df = pd.DataFrame(data16, columns=['text_clean'])

In [None]:
### elbow analysis

from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
import matplotlib.pyplot as plt  
%matplotlib inline


def graw_elbow_graph(x: np.array, k1: int, k2: int, k3: int):
    """Fit KMeans for each k in ``range(k1, k2, k3)`` and plot inertia against k.

    x is the data handed to ``KMeans.fit``; k1, k2 and k3 are the start, the
    exclusive stop and the step of the k values that are tried.
    """
    k_values = list(range(k1, k2, k3))
    inertia_values = []
    for n_clusters in k_values:
        print("Processing:", n_clusters)
        fitted = KMeans(n_clusters=n_clusters).fit(x)
        inertia_values.append(fitted.inertia_)

    # One marker per tested k; the "elbow" is read off the plot by eye.
    plt.figure(figsize=(12,4))
    plt.plot(k_values, inertia_values, 'o')
    plt.title('Inertia for each K')
    plt.xlabel('K')
    plt.ylabel('Inertia')


# Try k = 2, 4, ..., 48 on the sentence embeddings.
graw_elbow_graph(vectorized_docs, 2, 50, 2)

In [None]:
## clustering -> needs k

from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score

# Cleaned texts as a one-column DataFrame; make_clustered_dataframe reads this global.
df = pd.DataFrame(data16, columns=['text_clean'])

def make_clustered_dataframe(x: np.array, k: int) -> pd.DataFrame:
    """Cluster *x* into *k* KMeans clusters and pair each text with its label.

    The texts are read from the module-level ``df["text_clean"]`` column. The
    silhouette coefficient and the inertia of the fit are printed. The result
    has the columns ``clean_text`` and ``cluster``.
    """
    texts = df["text_clean"].values

    model = KMeans(n_clusters=k).fit(x)
    labels = model.labels_
    s_score = silhouette_score(x, labels)
    print(f"K={k}: Silhouette coefficient {s_score:0.2f}, inertia:{model.inertia_}")

    # Use only as many texts as there are rows in x.
    n_rows = x.shape[0]
    return pd.DataFrame({
        "clean_text": texts[:n_rows],
        "cluster": labels,
    })


#def text_to_tokens(text: str) -> List[str]:
#    """ Generate tokens from the sentence """
#    # "this is text" => ['this', 'is' 'text']
#    tokens = word_tokenize(text)  # Get tokens from text
#    tokens = [t for t in tokens if len(t) > 1]  # Remove short tokens
#    return tokens


# Make clustered dataframe
# k is set by hand here (see the elbow plot above for how inertia varies with k).
k = 24
df_clusters = make_clustered_dataframe(vectorized_docs, k)
# Show the full text of every row instead of truncating long strings.
with pd.option_context('display.max_colwidth', None):
    display(df_clusters)

In [None]:
# Attach the cluster columns to data8 by index (left join keeps every data8 row).
# NOTE(review): assumes data8 has the same row order/index as the texts that were clustered — confirm.
newish = data8.join(df_clusters, how='left')

In [None]:
# Manual Yes/Maybe/No labels, with the columns renamed to match the tweet tables.
newish2 = pd.read_csv('Maybe-Yes-No.csv', sep=';').rename(
    columns={'Name': 'username', 'Status': 'label'}
)

In [None]:
# Add the manual label to every clustered row; rows without a label keep a missing value.
# NOTE(review): requires a 'username' column in newish — the data8 built in the
# hashtag section has none, so data8 is presumably replaced beforehand
# (see the "insert df here" cell); confirm.
newish3 = newish.merge(newish2, on='username', how="left")

In [None]:
# Split the labelled rows into one table per manual label.
label_column = newish3['label']
Yes = newish3.loc[label_column == 'Yes']
Maybe = newish3.loc[label_column == 'Maybe']
No = newish3.loc[label_column == 'No']

In [None]:
# Only the two columns needed for the label-vs-cluster cross table.
newish4 = newish3[['label', 'cluster']]

In [None]:
# One indicator column per cluster, summed per label: the result counts how
# many rows of each label fall into each cluster.
new_df = (
    pd.get_dummies(newish4, columns=['cluster'])
        .groupby('label', as_index=False)
        .sum()
)

In [None]:
# Index the counts by label, then flip the table: rows become clusters,
# columns become labels.
new_df = new_df.set_index('label')
df1_transposed = new_df.T
# Negate the 'No' counts so they extend to the left in the diverging bar chart.
df1_transposed['No'] = df1_transposed['No'].mul(-1)

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
import itertools

# Prepare Data
plt.rcParams["figure.figsize"] = (10, 8)

#Class
# Row labels of the transposed table (one per cluster column), used as the bar order.
AgeClass = df1_transposed.index.tolist()
#Chart
# Two horizontal bar series on the same axes: 'Yes' counts and the negated 'No' counts.
bar_plot = sns.barplot(x='Yes', y=df1_transposed.index, data=df1_transposed, order=AgeClass, orient='horizontal', dodge=False)
bar_plot = sns.barplot(x='No', y=df1_transposed.index, data=df1_transposed, order=AgeClass, orient='horizontal', dodge=False)
plt.title("Cluster Distribution vs Assigned Label", fontsize=22)
plt.xlabel("No/Yes")

##

##11. Automatic categorizing using BERT

In [None]:
from transformers import pipeline

# Local directory holding both the fine-tuned model and its tokenizer.
path_to_model = './1-BERTMODELS/category_bert_en_pt/'

# We are using the sentiment-analysis type (even though our model is not a sentiment analysis model)
pipe = pipeline('sentiment-analysis', model=path_to_model, tokenizer=path_to_model)

In [None]:
import pandas as pd

# Manual categories (semicolon-separated) and the enriched annotated-users table (comma-separated).
one = pd.read_csv('categories.csv', sep=';')
two = pd.read_csv('annotated_with_info-2.csv', sep=',')

In [None]:
# Strip the "@" from the category handles, then add the categories to the
# annotated users (left join keeps every annotated user).
one['username'] = one['username'].str.replace('@', '')
three = pd.merge(two, one, how='left', on='username')

In [None]:
# feed into 9. Text column cleanup

In [None]:
# Run the classification pipeline on every cleaned text, one call per text.
vectorized_docs2 = [pipe(ele) for ele in data16]
# Example of one pipeline result:
# [{'label': 'art', 'score': 0.9069588780403137}]

In [None]:
#df_modlab = pd.DataFrame.from_records(vectorized_docs2)
# 1-based row labels, one per classified text.
num = len(vectorized_docs2) + 1
lst1 = range(1, num)
# Each record is a pipeline result; it is stored in the single column 'Dic'.
df_modlab = pd.DataFrame.from_records(vectorized_docs2,index=lst1, columns=['Dic'])

In [None]:
# Expand every prediction dict into its own columns (e.g. 'label', 'score').
# The former `df_modlab['Dic'].astype(str).map(eval)` statement was removed:
# its result was discarded, so it only ran eval() on stringified data without
# changing anything.
df_modlab2 = df_modlab['Dic'].apply(pd.Series)

In [None]:
import matplotlib.pyplot as plt

# Pie chart (plots value counts in this case)
# Donut chart of how many texts received each predicted label.
actual_values = df_modlab2['label'].value_counts(dropna=True)

# The legend labels are taken from the index of the counts, so that they are
# in the same (frequency) order as the wedges. They used to come from
# unique(), which is in order of first appearance and mislabelled the wedges.
labels = actual_values.index

#choose your colors
#colors = ['#ff9999','#66b3ff','#99ff99','#ffcc99','#fffd55']

fig1, ax1 = plt.subplots()

# To denote actual values instead of percentages as labels in the pie chart, reformat autopct
values = actual_values
plt.pie(actual_values, autopct= lambda x: '{:.0f}'.format(x*values.sum()/100), startangle=90)


#draw circle (this example creates a donut)
centre_circle = plt.Circle((0,0),0.70,fc='white')
fig = plt.gcf()
fig.gca().add_artist(centre_circle)


# Equal aspect ratio ensures that pie is drawn as a circle
ax1.axis('equal')

# A separate legend with labels (drawn to the bottom left of the pie in this case)
plt.legend(labels, bbox_to_anchor = (0.1, .3))

plt.tight_layout()
plt.show()