In [None]:
# Import the required libraries
import snscrape.modules.twitter as sntwitter
import pandas as pd
import numpy as np
import seaborn as sns
import itertools
import advertools as adv
import matplotlib.pyplot as plt

In [None]:
# Raise pandas' display limits so wide/long frames are rendered without truncation.
for option_name, option_value in (
    ('display.max_rows', 500),
    ('display.max_columns', 500),
    ('display.width', 1000),
):
    pd.set_option(option_name, option_value)

# Disable the chained-assignment (SettingWithCopy) check for the whole session.
pd.set_option('mode.chained_assignment', None)

In [None]:
# Load the tweet export into `df`.
# NOTE(review): the path passed to read_excel is an empty string here — set it
# to the workbook location before running.
df = pd.read_excel('')

# Make the 'date' column timezone-naive: tz_localize(None) drops the timezone
# information while keeping the local wall-clock values.
df['date'] = df['date'].dt.tz_localize(None)

## Data Exploration

In [None]:
# Keep only the rows whose 'user' value satisfies `'x' in user` (a substring
# test when the values are strings).
# A boolean mask replaces the row-by-row iterrows() rebuild: it avoids the
# Python-level loop, keeps the original column dtypes, and still returns a
# frame with all columns when no row matches (rebuilding from an empty list
# of rows produced a frame with no columns at all).
user_matches = df['user'].map(lambda user: 'x' in user).astype(bool)
df = df[user_matches]

In [None]:
emoji_summary = adv.extract_emoji(df['rawContent'])
emoji_summary.keys()

In [None]:
emoji_summary['overview']

In [None]:
list(zip(emoji_summary['emoji_flat'][:10], emoji_summary['emoji_flat_text'][:10]))

In [None]:
emoji_summary['top_emoji'][:20]

In [None]:
# Horizontal bar chart of the first 20 entries of 'top_emoji_text'
# (each entry is indexed as [0] = label, [1] = bar length).
plt.figure(facecolor='#ebebeb', figsize=(8, 8))

# Reverse once up front so the first entry ends up as the top-most bar.
top_emoji_rows = emoji_summary['top_emoji_text'][:20][::-1]
emoji_names = [entry[0] for entry in top_emoji_rows]
emoji_counts = [entry[1] for entry in top_emoji_rows]

plt.barh(emoji_names, emoji_counts)
plt.title('Top Emoji')
plt.grid(alpha=0.5)
plt.gca().set_frame_on(False)

In [None]:
hashtag_summary = adv.extract_hashtags(df['rawContent'])
hashtag_summary.keys()

In [None]:
hashtag_summary['overview']

In [None]:
hashtag_summary['top_hashtags'][:30]

In [None]:
# Horizontal bar chart of the first 30 entries of 'top_hashtags'
# (each entry is indexed as [0] = label, [1] = bar length).
plt.figure(facecolor='#ebebeb', figsize=(8, 12))

# Reverse once up front so the first entry ends up as the top-most bar.
top_hashtag_rows = hashtag_summary['top_hashtags'][:30][::-1]
hashtag_names = [entry[0] for entry in top_hashtag_rows]
hashtag_counts = [entry[1] for entry in top_hashtag_rows]

plt.barh(hashtag_names, hashtag_counts)
plt.title('Top Hashtags')
plt.grid(alpha=0.5)

In [None]:
mention_summary = adv.extract_mentions(df['rawContent'])
mention_summary.keys()

In [None]:
mention_summary['overview']

In [None]:
mention_summary['top_mentions'][:21]

In [None]:
question_summary = adv.extract_questions(df['rawContent'])
question_summary.keys()

In [None]:
question_summary['overview']

In [None]:
[(i,x) for i, x in  enumerate(question_summary['question_text']) if x][:15]

#### Frequency counts

In [None]:
# Ensure 'date' is a datetime column, then count the rows in each month.
df['date'] = pd.to_datetime(df['date'])

# .size() yields the per-month row count this section is about; .sum()
# aggregated the values of the other columns instead of counting rows.
# The Grouper `axis` argument is dropped: 0 is its default, and passing it
# explicitly is deprecated in recent pandas 2.x releases.
df.groupby(pd.Grouper(key='date', freq='M')).size()


#### Find Followers and Followings

#### Perform Iterations of the word cloud

In [None]:
# METHOD 2: GENERATING WORD CLOUD AFTER TF-IDF
from sklearn.feature_extraction.text import TfidfVectorizer

# One document per row of the 'tweets_clean_lemmatized' column.
corpus = df['tweets_clean_lemmatized'].tolist()

# Learn the vocabulary/IDF weights and build the sparse TF-IDF matrix in a
# single call (equivalent to fit() followed by transform() on the same corpus).
vectorizer = TfidfVectorizer()
tdm = vectorizer.fit_transform(corpus)

# Feature names correspond to the columns of the TF-IDF matrix.
print("Feature Names:\n", vectorizer.get_feature_names_out())

# Dense copy of the matrix, kept in `tf_idf_matrix` and printed for inspection.
tf_idf_matrix = tdm.toarray()
print("TF-IDF Matrix:\n", tf_idf_matrix)

In [None]:
# Build a dictionary from (key, value) pairs
def Convert(tup, di):
    """Return a new dict built from `tup`, an iterable of (key, value) pairs.

    `di` is accepted but never read or modified; the result is always a
    fresh dictionary.
    """
    return dict(tup)
dictionary = {}
# Total TF-IDF weight per term. Sum every column of the sparse matrix once and
# look each term up by its column index; the previous tdm.getcol(idx).sum()
# re-sliced the sparse matrix once for every word in the vocabulary.
column_totals = np.asarray(tdm.sum(axis=0)).ravel()
freqs = [(word, column_totals[idx]) for word, idx in vectorizer.vocabulary_.items()]
d = Convert(freqs, dictionary)

In [None]:
# WordCloud, stop_words and fp were only defined in the entity word-cloud
# cells at the very end of the notebook, so this cell raised NameError on a
# fresh top-to-bottom run. Define them here with the same values used there.
from wordcloud import WordCloud, STOPWORDS

stop_words = ['amp'] + list(STOPWORDS)
fp = 'font/gargi.ttf'

# Render the word cloud from the term -> total TF-IDF weight mapping `d`.
# NOTE(review): the wordcloud documentation states that `stopwords` is ignored
# by generate_from_frequencies — confirm whether `d` should be filtered first.
w = WordCloud(mode='RGBA',
              stopwords=stop_words,
              background_color='white',
              max_words=1000,
              height=2000,
              width=4000,
              font_path=fp).generate_from_frequencies(d)
plt.figure(figsize=(16, 8))
plt.imshow(w)
plt.axis('off')
plt.show()

In [None]:
sorted(d.items(), key=lambda x: (-x[1], x[0]))

#### Look at the English-language (EN) tweets posted by the user

In [None]:
en_tweets = df[df['lang'] == 'en']
len(en_tweets)

In [None]:
# One document per English-language tweet.
en_corpus = en_tweets['tweets_clean_lemmatized'].tolist()

# Learn the vocabulary/IDF weights and build the sparse TF-IDF matrix in a
# single call (equivalent to fit() followed by transform() on the same corpus).
# Note that this rebinds `vectorizer` and `tdm` from the all-tweets analysis.
vectorizer = TfidfVectorizer()
tdm = vectorizer.fit_transform(en_corpus)

# Feature names correspond to the columns of the TF-IDF matrix.
print("Feature Names:\n", vectorizer.get_feature_names_out())

# Dense copy of the matrix, kept in `tf_idf_matrix` and printed for inspection.
tf_idf_matrix = tdm.toarray()
print("TF-IDF Matrix:\n", tf_idf_matrix)

In [None]:
en_dictionary = {}
# Total TF-IDF weight per term. Sum every column of the sparse matrix once and
# look each term up by its column index; the previous tdm.getcol(idx).sum()
# re-sliced the sparse matrix once for every word in the vocabulary.
column_totals = np.asarray(tdm.sum(axis=0)).ravel()
freqs = [(word, column_totals[idx]) for word, idx in vectorizer.vocabulary_.items()]
d = Convert(freqs, en_dictionary)

In [None]:
# Render a word cloud from `d`, the term -> total TF-IDF weight mapping built
# from the English tweets in the previous cell.
# NOTE(review): WordCloud, stop_words and fp are only defined in the entity
# word-cloud cells at the end of the notebook, so they must already exist in
# the kernel when this cell runs.
w = WordCloud(mode='RGBA',
              stopwords=stop_words,
              background_color='white',
              max_words=1000, 
              height = 2000, 
              width=4000, 
              font_path=fp).generate_from_frequencies(d)
# Show the rendered image without axes.
plt.figure(figsize = (16,8))
plt.imshow(w) 
plt.axis('off') 
plt.show()

References: 
- https://medium.com/geekculture/how-to-extract-reddit-posts-for-an-nlp-project-56d121b260b4
- https://praw.readthedocs.io/en/stable/tutorials/comments.html

## Network Analysis

In [None]:
df2 = pd.read_excel('', sheet_name='Sheet2')
df2.head()

#### Change the orientation of the dataframe

In [None]:
def _join_non_null(row):
    """Return the non-null values of `row`, converted to str and joined by single spaces."""
    return ' '.join(row.dropna().astype(str))

# 'Source' is built from the columns at positions 0-1 and 'Target' from the
# columns at positions 3-4. The second slice is taken after 'Source' has been
# added, exactly as before.
df2['Source'] = df2[df2.columns[0:2]].apply(_join_non_null, axis=1)
df2['Target'] = df2[df2.columns[3:5]].apply(_join_non_null, axis=1)

df2.head()

#### Plot the network for following/followers on twitter

#### Method 1: Pyvis Visualisation

In [None]:
import networkx as nx
G = nx.from_pandas_edgelist(df2, source='Source', target='Target')

In [None]:
from pyvis.network import Network

net = Network(notebook=True)
net.from_nx(G)
net.show('example.html')

#### Mentions map

In [None]:
# Count how often each item of the comma-separated 'mention' column occurs.
# (Series.explode takes no column name; the stray positional argument was
# being interpreted as `ignore_index`.)
mention_temp = df['mention'].str.split(',').explode().value_counts()

# Build the edge table straight from the counts' index and values. The former
# reset_index() + rename({'index': ..., 'mention': ...}) relied on the column
# names value_counts() produced before pandas 2.0; with the newer naming the
# 'target' column was never created and the handles were mislabelled as counts.
mention_temp_df = pd.DataFrame({
    'target': mention_temp.index,
    'mention_counts': mention_temp.to_numpy(),
})
# Every edge starts from the same (empty-string) source node.
mention_temp_df['source'] = ''
mention_temp_df

In [None]:
import networkx as nx
G_nx = nx.from_pandas_edgelist(mention_temp_df, source='source', target='target')

# saving graph created above in gexf format
#nx.write_gexf(G_nx, "mentions_map.gexf")

In [None]:
from pyvis.network import Network

net = Network(notebook=True)
net.from_nx(G_nx)
net.show('example_mentions.html')

#### Method 2: Plotly Dash

In [None]:
import dash
import visdcc
import dash_core_components as dcc
import dash_html_components as html
from dash.dependencies import Input, Output, State

In [None]:
# create app
app = dash.Dash()

# One node per distinct name appearing in either the Source or Target column.
node_list = list(set(df2['Source'].unique().tolist() + df2['Target'].unique().tolist()))
nodes = [{'id': node_name, 'label': node_name, 'shape': 'dot', 'size': 7}
         for node_name in node_list]

# create edges from df
# `edges` has to be a plain list of dicts for the network component. It used to
# be bound to the df2 DataFrame itself, and DataFrame.append (removed in
# pandas 2.0) returns a new frame that was discarded, so no edge dict was ever
# collected.
# NOTE(review): edge ids repeat if df2 contains duplicate Source/Target rows —
# confirm whether the rows need de-duplicating first.
edges = [
    {
        'id': row['Source'] + "_" + row['Target'],
        'from': row['Source'],
        'to': row['Target'],
        'width': 2
    }
    for row in df2.to_dict(orient='records')
]
    
# define layout
app.layout = html.Div([
    visdcc.Network(id='net',
                   data={'nodes': nodes, 'edges': edges},
                   options=dict(height='600px', width='100%')),
    # The initial `value` has to be one of the option values (not a label),
    # otherwise no radio button is pre-selected.
    dcc.RadioItems(id='color',
                   options=[{'label': 'Red', 'value': '#ff0000'},
                            {'label': 'Green', 'value': '#00ff00'},
                            {'label': 'Blue', 'value': '#0000ff'}],
                   value='#ff0000'),
])

# define callback
@app.callback(Output('net', 'options'),
              [Input('color', 'value')])
def myfunc(x):
    """Return the options for the 'net' component that set the node colour to `x`.

    `x` is the current value of the 'color' radio group.
    """
    return {'nodes': {'color': x}}

# define main calling
if __name__ == '__main__':
    app.run_server(debug=False)

## TOPIC MODELLING

#### METHOD 1: BERTopic

#### METHOD 2: TweetNLP


Refer to the Google Colab notebook for this method.

In [None]:
# Find the largest category for each row
cols = ['arts_&_culture', 'business_&_entrepreneurs', 'celebrity_&_pop_culture', 'diaries_&_daily_life', 'family', 'fashion_&_style', 'film_tv_&_video', 'fitness_&_health', 'food_&_dining', 'gaming', 'learning_&_educational', 'music', 'news_&_social_concern', 'other_hobbies', 'relationships', 'science_&_technology', 'sports', 'travel_&_adventure', 'youth_&_student_life']

df['Topic Classification'] = df[cols].idxmax(axis=1)

In [None]:
# Find the biggest sentiment for each row
sentiment_cols = ['negative', 'neutral', 'positive']
df['Sentiment Analysis'] = df[sentiment_cols].idxmax(axis=1)

In [None]:
# Irony detection for each row
irony_cols = ['non_irony', 'irony']
df['Irony Detection'] = df[irony_cols].idxmax(axis=1)

In [None]:
# Hate detection for each row
hate_cols = ['NOT-HATE', 'HATE']
df['Hate Detection'] = df[hate_cols].idxmax(axis=1)

In [None]:
df = pd.read_excel('')

In [None]:
df.iloc[:, 75:].columns

In [None]:
# Translated text, date and the nine numbered (type, entity, probability) triples.
cols = ['Tweets_Eng', 'date', 'type_1', 'entity_1', 'probability_1', 'type_2', 'entity_2', 'probability_2', 'type_3', 'entity_3', 'probability_3', 'type_4', 'entity_4', 'probability_4', 'type_5', 'entity_5', 'probability_5', 'type_6', 'entity_6', 'probability_6', 'type_7', 'entity_7', 'probability_7', 'type_8', 'entity_8', 'probability_8', 'type_9', 'entity_9', 'probability_9']

# Take an explicit copy before adding a column, so the assignment below writes
# to an independent frame instead of a selection of `df` (the case the
# chained-assignment warning, silenced at the top of the notebook, reports).
filtered_df = df[cols].copy()
filtered_df['id'] = filtered_df.index
filtered_df

In [None]:

# Reshape the numbered type_N / entity_N / probability_N columns to long form:
# the numeric suffix N goes into the new 'value' index level.
# NOTE(review): this reshapes `df`, not `filtered_df`, so `df` must already
# contain 'id' and 'renderedContent' columns — confirm that is intended.
x = pd.wide_to_long(
    df.reset_index(),
    i=["id", "renderedContent"],
    j="value",
    stubnames=["type", "entity", "probability"],
    sep="_",
    suffix=r"\d+",  # raw string: "\d" in a plain literal is an invalid escape sequence
)
x.reset_index()

#### METHOD 3: ZERO SHOT CLASSIFICATION

Part 1: English Language

In [None]:
df['Tweets_Eng'] = df['Tweets_Eng'].astype(str)
df['Tweets_Eng'].tail()

In [None]:
# 1. REMOVE PUNCTUATION
import string
def remove_punctuation(text):
    """Return `text` with every character listed in string.punctuation removed."""
    kept_chars = (char for char in text if char not in string.punctuation)
    return "".join(kept_chars)

# `re` is needed by replaceContraction below but was only imported by the
# later tokenization step, so import it before the first use.
import re

# 2. LOWER CASING
# Start from the raw text: contractions have to be expanded while the
# apostrophes and '&' they match on are still present. Punctuation used to be
# stripped first, so every pattern containing ' or & could never match.
df['Tweets_Eng_Clean'] = df['Tweets_Eng'].apply(lambda x: x.lower())

# 3. REPLACE CONTRACTIONS
# (regex, replacement) pairs, applied in this order; the specific forms
# (won't, can't, ...) come before the generic suffix rules. Replacement strings
# are raw so that "\g<1>" is not parsed as an (invalid) string escape.
# NOTE(review): the '&' rule now fires as well, so an HTML entity such as
# '&amp;' would end up as 'andamp' — confirm whether entities occur in Tweets_Eng.
contraction_patterns = [
    (r'won\'t', 'will not'),
    (r'can\'t', 'cannot'),
    (r'i\'m', 'i am'),
    (r'ain\'t', 'is not'),
    (r'(\w+)\'ll', r'\g<1> will'),
    (r'(\w+)n\'t', r'\g<1> not'),
    (r'(\w+)\'ve', r'\g<1> have'),
    (r'(\w+)\'s', r'\g<1> is'),
    (r'(\w+)\'re', r'\g<1> are'),
    (r'(\w+)\'d', r'\g<1> would'),
    (r'&', 'and'),
    (r'dammit', 'damn it'),
    (r'dont', 'do not'),
    (r'wont', 'will not'),
]

def replaceContraction(text):
    """Apply every (regex, replacement) pair of contraction_patterns to `text`, in order.

    Returns the text after all substitutions.
    """
    for regex, repl in contraction_patterns:
        text = re.sub(regex, repl, text)
    return text

df['Tweets_Eng_Clean'] = df['Tweets_Eng_Clean'].apply(replaceContraction)

# 1. REMOVE PUNCTUATION (applied last, once contractions have been expanded)
df['Tweets_Eng_Clean'] = df['Tweets_Eng_Clean'].apply(remove_punctuation)

# 3. TOKENIZATION
import re
def tokenization(text):
    """Split `text` into a list of word tokens.

    Tokens are separated by runs of non-word characters; empty strings that
    the split produces at the start or end of the text are dropped.
    """
    # r'\W+' is the "one or more non-word characters" class. The previous
    # pattern 'W+' matched runs of the literal capital letter W, so lower-cased
    # text came back as a single unsplit token.
    return [token for token in re.split(r'\W+', text) if token]

df['Tweets_Eng_Clean'] = df['Tweets_Eng_Clean'].apply(lambda x: tokenization(x))

# 4. REMOVE STOPWORDS
import nltk
stopwords = nltk.corpus.stopwords.words('english')
def remove_stopwords(text):
    """Return the items of `text` that are not in the module-level `stopwords`, in order."""
    return list(filter(lambda token: token not in stopwords, text))

df['Tweets_Eng_Clean'] = df['Tweets_Eng_Clean'].apply(lambda x:remove_stopwords(x))

# 5. LEMMATIZATION
from nltk.stem import WordNetLemmatizer
wordnet_lemmatizer = WordNetLemmatizer()

def lemmatizer(text):
    """Return a list with wordnet_lemmatizer.lemmatize applied to each item of `text`."""
    return list(map(wordnet_lemmatizer.lemmatize, text))

df['Tweets_Eng_Clean'] = df['Tweets_Eng_Clean'].apply(lambda x:lemmatizer(x))

In [None]:
df['Tweets_Eng_Clean']

In [None]:
# Keep only the rows whose 'user' value satisfies `'' in user`.
# NOTE(review): an empty string is contained in every string, so with string
# users this keeps every row — put the intended user substring here.
# A boolean mask replaces the row-by-row iterrows() rebuild: it avoids the
# Python-level loop, keeps the original column dtypes, and still returns a
# frame with all columns when no row matches (rebuilding from an empty list
# of rows produced a frame with no columns at all).
user_matches = df['user'].map(lambda user: '' in user).astype(bool)
df = df[user_matches]

In [None]:
from transformers import pipeline

# Build a pipeline from the "facebook/bart-large-mnli" checkpoint.
# NOTE(review): `pipe` is not referenced anywhere else in the visible notebook,
# and the zero-shot cell further down builds a second pipeline from the same
# checkpoint — confirm whether this one is still needed.
pipe = pipeline(model="facebook/bart-large-mnli")

In [None]:
# candidate_labels
labels = ["Military", "Healthcare", "Nation Building", "Politics", "Economy", "Food", "Volunteer", "Bilateral Meeting", "Education"]

In [None]:
# Initializing Zero-Shot Classifier
classifier = pipeline("zero-shot-classification", model="facebook/bart-large-mnli")

#classifier(df['Tweets_Eng_Clean'][0], labels, multi_label=True) 

Part 2: Native Language

In [None]:
classifier('ប្រសាសន៍ ឯកឧត្តម បណ្ឌិត ហ៊ុន ម៉ាណែត ថ្លែងក្នុងពិធីសម្ពោធដាក់ឱ្យប្រើប្រាស់ជាផ្លូវការស្ពានបេតុងចំនួន ៩', labels, multi_label=True)

#### METHOD 4: GSDMM 

In [None]:
# importing libraries
import pandas as pd
import numpy as np
import gensim
from gsdmm import MovieGroupProcess

# Documents to cluster: an iterable of strings (each one is .split() below).
# NOTE(review): `result` is not defined anywhere in the visible notebook, so
# it must already exist in the kernel — confirm where it comes from.
docs = result

# split each document into whitespace-separated tokens before building the Dictionary
dataset = [d.split() for d in docs]

# create dictionary of all words in all documents
dictionary = gensim.corpora.Dictionary(dataset)

# filter extreme cases out of dictionary
dictionary.filter_extremes(no_below=15, no_above=0.5, keep_n=100000)

# create variable containing length of dictionary/vocab
# (measured after filter_extremes has been applied)
vocab_length = len(dictionary)

# create BOW dictionary
# NOTE(review): bow_corpus is not used again in the visible code.
bow_corpus = [dictionary.doc2bow(doc) for doc in dataset]

# initialize GSDMM
gsdmm = MovieGroupProcess(K=15, alpha=0.1, beta=0.3, n_iters=15)

# fit GSDMM model
# NOTE(review): `dataset` still contains the tokens that filter_extremes
# removed from the dictionary, while vocab_length counts only the filtered
# vocabulary — confirm that this mismatch is intended.
y = gsdmm.fit(dataset, vocab_length)

In [None]:
# print number of documents per topic
doc_count = np.array(gsdmm.cluster_doc_count)
print('Number of documents per topic :', doc_count)

# Topics sorted by the number of document they are allocated to
top_index = doc_count.argsort()[-10:][::-1]
print('Most important clusters (by number of docs inside):', top_index)

# print the highest-count words of each selected cluster
def top_words(cluster_word_distribution, top_cluster, values):
    """Print the `values` highest-count (word, count) pairs for each cluster.

    cluster_word_distribution: indexable by cluster, giving a word -> count mapping.
    top_cluster: iterable of cluster indices to report, in print order.
    values: number of pairs to show per cluster.
    """
    def count_of(pair):
        return pair[1]

    for cluster in top_cluster:
        ranked_pairs = list(cluster_word_distribution[cluster].items())
        # Stable descending sort: ties keep their original relative order.
        ranked_pairs.sort(key=count_of, reverse=True)
        print("\nCluster %s : %s"%(cluster, ranked_pairs[:values]))

# get top words in topics
top_words(gsdmm.cluster_word_distribution, top_index, 10)

# VISUALISATION

Part 1: TweetNLP Categories

In [None]:
df.head()

In [None]:
df.columns.get_loc("youth_&_student_life")

In [None]:
tweetnlp_topic = df.iloc[:, 36:55]
cols = tweetnlp_topic.columns
cols

In [None]:
def return_max(row, cols):
    """Return the entry of `cols` at the position of the largest value in `row`.

    `row` must expose `.values` (e.g. a pandas Series); ties resolve to the
    first maximum, as np.argmax does.
    """
    best_position = np.argmax(row.values)
    return cols[best_position]

In [None]:
df['Topic Classification'] = tweetnlp_topic.apply(lambda x: return_max(x,cols), axis=1)
df.head()

In [None]:
# Number of rows per topic label, as a two-column frame ('Topic', 'count').
topic_frequencies = df['Topic Classification'].value_counts()
counts = topic_frequencies.rename_axis('Topic').reset_index(name='count')
counts['count'] = counts['count'].astype(int)

# Horizontal bars, one per topic, each annotated with its count.
ax = sns.barplot(data=counts, x='count', y='Topic', orient='h')
ax.bar_label(ax.containers[0])

In [None]:
sns.set(rc={'figure.figsize':(30,20)})
sns.histplot(data=df, y='Topic Classification', discrete=True, legend=True)

Part 2: Sentiment Analysis

In [None]:
# Number of rows per sentiment label, as a two-column frame ('Sentiments', 'count').
sentiment_frequencies = df['Sentiment Analysis'].value_counts()
counts = sentiment_frequencies.rename_axis('Sentiments').reset_index(name='count')
counts['count'] = counts['count'].astype(int)

# Horizontal bars, one per sentiment, each annotated with its count.
ax = sns.barplot(data=counts, x='count', y='Sentiments', orient='h')
ax.bar_label(ax.containers[0])

Generate a word cloud from the extracted entities

In [None]:
import spacy

In [None]:
# String form of every 'entity' value, stripped of surrounding whitespace;
# entries equal to 'nan' after stripping are left out.
entity_list = [
    entity
    for entity in (raw.strip() for raw in df['entity'].astype(str).tolist())
    if str(entity) != 'nan'
]

In [None]:
# import these modules
from nltk.stem import WordNetLemmatizer
from wordcloud import WordCloud, STOPWORDS

stop_words = ['amp'] + list(STOPWORDS)

In [None]:
import matplotlib.pyplot as plt

# Single space-separated string of all entities, fed to the word cloud.
words = " ".join(entity_list)
fp = 'font/gargi.ttf'

# Same rendering settings as before, gathered in one place.
cloud_options = dict(
    stopwords=stop_words,
    background_color='white',
    max_words=1000,
    height=2000,
    width=4000,
    font_path=fp,
)
wordcloud = WordCloud(**cloud_options).generate(words)

# Show the rendered image without axes.
plt.figure(figsize=(16, 8))
plt.imshow(wordcloud)
plt.axis('off')
plt.show()