---------------------------------------------------------------------
# Twitter tweets - download all hashtag data
---------------------------------------------------------------------

### Tweepy has limitations for scraping tweets.
### The standard API only returns tweets from roughly the last 7 days
### and is limited to 18,000 tweets per 15-minute window.


In [None]:
import os
import pickle
import re

import pandas as pd
from tqdm import tqdm
import tweepy as tw

#my_path = '/Users/Petra_Kummerova/Desktop/Python/NLP/Czechitas/'
#my_out_USE = "my_out_USE.csv"
#cd '/Users/Petra_Kummerova/Desktop/Python/NLP/Czechitas'

# Twitter API details
# Credentials are read from environment variables so that real secrets never
# have to be saved inside the notebook; the 'XXX' placeholders remain as the
# fallback when a variable is not set.
consumer_key = os.environ.get('TWITTER_CONSUMER_KEY', 'XXX')
consumer_secret = os.environ.get('TWITTER_CONSUMER_SECRET', 'XXX')
access_token = os.environ.get('TWITTER_ACCESS_TOKEN', 'XXX')
access_token_secret = os.environ.get('TWITTER_ACCESS_TOKEN_SECRET', 'XXX')

# Build the authenticated client used by every download below.
auth = tw.OAuthHandler(consumer_key, consumer_secret)
auth.set_access_token(access_token, access_token_secret)
# wait_on_rate_limit=True asks Tweepy to wait for the rate-limit window to
# reset instead of failing the request.
api = tw.API(auth, wait_on_rate_limit=True)






### Initial data cleaning 

------------------------------------------------------------
#### Remove http and https link (find https until first space)
------------------------------------------------------------



In [None]:
# clean tweets list for special characters
# Matches an http/https URL and stops at the first character outside the
# allowed set (for example whitespace, '#' or '~').  Written as a raw string
# so that `\(` is not an invalid escape sequence in a normal string literal.
_URL_PATTERN = re.compile(
    r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+'
)


def clean_tweets(tweets_list_all):
    """Remove http/https links from every tweet text.

    Args:
        tweets_list_all: Iterable of tweet texts (strings).

    Returns:
        A new list with the same number of entries, each with all matched
        URLs replaced by the empty string.  Surrounding whitespace is left
        untouched.
    """
    return [_URL_PATTERN.sub('', text) for text in tweets_list_all]


### Tweets Download

In [None]:
def tweets_dwnld(new_search, date_since, max_items=1000):
    """Download English tweets matching a query into a DataFrame.

    Args:
        new_search: Search query string sent to the Twitter search endpoint.
        date_since: Earliest date ("YYYY-MM-DD") passed as the ``since``
            search parameter.
        max_items: Maximum number of tweets to fetch (default 1000, the value
            that used to be hard-coded).

    Returns:
        DataFrame with the columns ``topic`` (the query), ``tweet`` (text with
        links removed), ``author`` (user name), ``geo`` (the tweet's ``place``
        attribute) and ``twdate`` (creation timestamp).
    """
    # Tweepy 4 renamed API.search to API.search_tweets; use whichever the
    # installed version provides.
    search_method = getattr(api, 'search_tweets', None) or api.search
    # NOTE(review): `since` is forwarded as-is to the search endpoint; confirm
    # it is still honoured by the installed Tweepy / Twitter API version.
    tweets = tw.Cursor(search_method,
                       q=new_search,
                       # geocode=geocodes,
                       lang="en",
                       since=date_since).items(max_items)
    tweets_list = []
    tweets_authors = []
    tweets_geo = []
    tweets_date = []

    # append() keeps the accumulation linear; `lst = lst + [x]` copies the
    # whole list on every tweet.
    for tweet in tweets:
        tweets_list.append(tweet.text)
        tweets_authors.append(tweet.user.name)
        tweets_geo.append(tweet.place)
        tweets_date.append(tweet.created_at)

    # clean tweets for special characters (links are removed)
    messages = clean_tweets(tweets_list)
    df_tweets = pd.DataFrame({
        'topic': new_search,
        'tweet': messages,
        'author': tweets_authors,
        'geo': tweets_geo,
        'twdate': tweets_date,
    })
    return df_tweets

def search_tweets(search_word_list, date_since):
    """Download tweets for each search word, excluding retweets.

    Args:
        search_word_list: Iterable of search terms (e.g. hashtags).
        date_since: Earliest date forwarded to ``tweets_dwnld``.

    Returns:
        List with one DataFrame per search word, in input order.
    """
    frames = []
    progress = tqdm(search_word_list)
    for search_word in progress:
        # '\r' keeps the status message on a single console line.
        print(f"Currently searching for {search_word}",end='\r')
        query = search_word + " -filter:retweets"
        frames.append(tweets_dwnld(query, date_since))
    return frames

In [None]:
# Define the search terms and the date_since date as variables
search_word_list = ["#Keboola", "#python", "#Tableau", "#Snowflake", "#R"]

# Prague: 50.0755° N, 14.4378° E
# The search endpoint expects "latitude,longitude,radius" as plain decimal
# numbers without spaces or degree signs.
geocodes = "50.0755,14.4378,500km"
# search_word = "#Keboola"
# new_search = search_word + " -filter:retweets"


date_since = "2021-08-20"  # the standard API only reaches back about 7 days anyway


In [None]:
# Run the download for every search term; the result is a list holding one
# DataFrame per entry of search_word_list, in the same order.
df_tweets_all = search_tweets(search_word_list, date_since = date_since)

#### Save into a pickle file.

In [None]:

                                                                                                                                                                                                                   
# Persist the list of per-topic DataFrames.  The context manager guarantees
# the file is flushed and closed, even if pickling raises.
with open("df_tweets_all.p", "wb") as pickle_file:
    pickle.dump(df_tweets_all, pickle_file)


#### Load the list of DataFrames back from the pickle file.

In [None]:

# Reload the list of per-topic DataFrames; the context manager closes the
# file handle once loading is done.
# NOTE: unpickling executes arbitrary code -- only load pickle files that you
# created yourself.
with open("df_tweets_all.p", "rb") as pickle_file:
    df_tweets_all_load = pickle.load(pickle_file)



#### Check the tweet categories one by one ("#Keboola", "#python", "#Tableau", "#Snowflake", "#R")

In [None]:

# Print the DataFrame of every topic in turn.
for topic_df in df_tweets_all_load:
    print(topic_df)

# Stack all per-topic frames into one.  Every frame must be inside the list
# passed to pd.concat; previously the last frame sat outside the brackets and
# was therefore passed as the positional `axis` argument.
df_tweets_all_load_joined = pd.concat(df_tweets_all_load, ignore_index=True)


In [None]:
# Display the joined DataFrame as the cell output.
df_tweets_all_load_joined

In [None]:
# Tweet texts of all topics combined (the 'tweet' column of the joined frame);
# these are the sentences that get embedded below.
messages = df_tweets_all_load_joined["tweet"]

# Sentence Embeddings using USE 

In [None]:
import tensorflow as tf
import tensorflow_hub as hub
import matplotlib.pyplot as plt
import numpy as np
import os
import pandas as pd
import re
import seaborn as sns





def generate_sentence_embeddings(sentences_list, module_url = "https://tfhub.dev/google/universal-sentence-encoder/4"):
    """Embed sentences with a TensorFlow Hub module.

    Args:
        sentences_list: The sentences to embed; passed unchanged to the
            loaded module.
        module_url: TF-Hub handle of the encoder to load (defaults to the
            Universal Sentence Encoder v4).

    Returns:
        Whatever the loaded module returns for ``sentences_list``.
    """
    # The module is loaded on every call.
    encoder = hub.load(module_url)
    return encoder(sentences_list)


### Generate embeddings for tweets

In [None]:
# Embed every tweet text with the default encoder module.
message_embeddings = generate_sentence_embeddings(messages)

In [None]:
# Inspect the raw embedding object returned by the encoder.
print(message_embeddings)

# Convert via .numpy() so the embeddings can be fed to pandas / scikit-learn
# (presumably a TensorFlow tensor at this point -- confirm).
message_embeddings = message_embeddings.numpy()

# Clustering the embeddings 



### K-means clustering 


In [None]:


import numpy as np

from sklearn.cluster import DBSCAN
from sklearn import metrics
# `sklearn.datasets.samples_generator` was removed from scikit-learn;
# `make_blobs` is importable from `sklearn.datasets` directly.
from sklearn.datasets import make_blobs
from sklearn.preprocessing import StandardScaler

import pandas as pd

from sklearn.cluster import KMeans
import collections

# Original tweet texts, kept next to the embeddings so that clusters can be
# read back as text.
myX_labels_orig = pd.DataFrame(messages)
myX_labels_orig.columns = ["tweets_orig"]

# One row per tweet, one column per embedding dimension.
myX = pd.DataFrame(message_embeddings)

X = myX
X = X.apply(pd.to_numeric)

# k-means clustering from text embeddings
# Aim for roughly 20 tweets per cluster, but never fewer than one cluster:
# for 10 or fewer tweets the rounded ratio is 0, which KMeans rejects.
n_clusters_proportionate = max(1, int(round(len(X)/20,0)))

kmeans_plus = KMeans(init='k-means++', n_clusters = n_clusters_proportionate, n_init=10)
kmeans_plus.fit(X)
cluster_labels = kmeans_plus.labels_
# print(kmeans_plus.labels_)
# Cluster sizes (only rendered when this is the last expression of a cell).
collections.Counter(kmeans_plus.labels_)

# join labels with predicted clusters
out = pd.DataFrame({
    'original_ID': myX_labels_orig['tweets_orig'].values,
    'cluster_ID': cluster_labels,
})





In [None]:
# save output to csv
# `my_path` and `my_out_USE` are only assigned in commented-out lines at the
# top of the notebook.  Reuse them when they were defined manually, otherwise
# fall back to the working directory and the original file name so that this
# cell does not fail with a NameError.
my_path = globals().get('my_path', '')
my_out_USE = globals().get('my_out_USE', 'my_out_USE.csv')

# ';' is the CSV separator below, so remove it from the tweet text first.
out['original_ID'] = out['original_ID'].str.replace(';', ' ', regex=False)
out.to_csv(my_path + my_out_USE, sep = ';')

# check some random clusters
def check_cluster(out_df, cluster):
    """Return the rows of ``out_df`` whose ``cluster_ID`` equals ``cluster``.

    The input frame is not modified; the original row index is preserved.
    """
    in_cluster = out_df['cluster_ID'] == cluster
    return out_df[in_cluster]

# Spot-check: print all tweets that were assigned to cluster 5.
test =  check_cluster(out,5)
print(test)



In [None]:
# Display the rows assigned to cluster 2 as the cell output.
out[out.cluster_ID == 2]

## TF-IDF to determine the top topics within each cluster

### Data preparation

In [None]:
def tfidf_preproc(text):
    """Normalise a tweet for TF-IDF.

    Dots become spaces, '#' is dropped, every remaining character that is
    neither a word character nor whitespace is removed, whitespace runs are
    collapsed to a single space and the result is lower-cased.  Leading and
    trailing whitespace is not stripped.
    """
    without_marks = text.replace('.',' ').replace('#','')
    word_chars_only = re.sub(r'[^\w \s]','',without_marks)
    single_spaced = re.sub(r'\s+',' ',word_chars_only)
    return single_spaced.lower()

In [None]:
# Add a 'cleaned' column holding the TF-IDF-ready version of every tweet.
out['cleaned'] = [tfidf_preproc(i) for i in out.original_ID]

In [None]:
# Display the frame (now including the 'cleaned' column) as the cell output.
out

#### Split the dataframe into clusters

In [None]:
# Group the rows of `out` by their assigned cluster.
gb = out.groupby('cluster_ID')
# Preview: one sub-DataFrame per cluster key (shown as the cell output).
[gb.get_group(x) for x in gb.groups]

In [None]:
   
# Keep the per-cluster sub-DataFrames in a list, in the order of gb.groups.
out_groups = [gb.get_group(x) for x in gb.groups]

In [None]:
# One document per cluster: concatenate the cleaned tweets of each group.
# Join with a space so that the last word of one tweet does not fuse with the
# first word of the next one and create bogus TF-IDF tokens.
final = [' '.join(group.cleaned) for group in out_groups]

#### TF-IDF Vectorization 

In [None]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
# Fit TF-IDF on the per-cluster documents: one row per cluster, one column
# per vocabulary term.
vectorizer = TfidfVectorizer()
vectors = vectorizer.fit_transform(final)
# get_feature_names() was removed from scikit-learn in favour of
# get_feature_names_out(); fall back to the old name on releases that do not
# have the new one yet.
if hasattr(vectorizer, 'get_feature_names_out'):
    names = list(vectorizer.get_feature_names_out())
else:
    names = vectorizer.get_feature_names()
data = vectors.toarray()
# Create a dataframe with the results
df = pd.DataFrame(data, columns=names)


In [None]:
# Display the TF-IDF matrix as the cell output.
df

### Remove all columns containing a stop word from the resultant dataframe. 

In [None]:
import nltk
from nltk.corpus import stopwords
# Fetch the NLTK stop-word corpus (needed before stopwords.words() works).
nltk.download('stopwords')
st = set(stopwords.words('english'))

# Keep only the term columns that are not English stop words.  Membership is
# tested against the set directly (no per-column list rebuild), and an
# explicit list of column names is passed to the indexer instead of a
# one-shot `filter` iterator.
df = df[[column for column in df.columns if column not in st]]

### Check top 10 topics within each cluster 

In [None]:
def add_top_X_topics(df, X_topics):
    """Collect the highest-scoring TF-IDF terms of every row (cluster).

    Args:
        df: TF-IDF matrix with one row per cluster and one column per term.
        X_topics: Number of top terms to keep per row.

    Returns:
        DataFrame with the columns ``tfidf`` (score), ``topics`` (term) and
        ``cluster_ID`` (the row label of ``df``), with ``X_topics`` rows per
        cluster ordered by descending score.  Empty (but with these columns)
        when ``df`` has no rows.
    """
    columns = ['tfidf', 'topics', 'cluster_ID']
    per_cluster = []
    for cluster_id, row in df.iterrows():
        top = row.sort_values(ascending=False).head(X_topics)
        print(top)
        per_cluster.append(pd.DataFrame({
            'tfidf': top.values,
            'topics': top.index.tolist(),
            'cluster_ID': cluster_id,
        }, columns=columns))

    # pd.concat() raises on an empty list, so handle "no clusters" explicitly.
    if not per_cluster:
        return pd.DataFrame(columns=columns)
    # Concatenate once at the end instead of growing a frame inside the loop.
    return pd.concat(per_cluster, ignore_index=True)


### Join with full data to export to Tableau

In [None]:

# Top 10 terms per cluster, joined back onto every tweet of that cluster.
out_cluster_topics = add_top_X_topics(df,10)
out_all_columns = pd.merge(out, out_cluster_topics, right_on = 'cluster_ID', left_on = 'cluster_ID')

# `my_path` is only assigned in a commented-out line and
# `my_out_USE_all_columns` is never assigned.  Reuse them when they were
# defined manually, otherwise fall back to the working directory and a
# default file name so that this cell does not fail with a NameError.
my_path = globals().get('my_path', '')
my_out_USE_all_columns = globals().get('my_out_USE_all_columns', 'my_out_USE_all_columns.csv')
out_all_columns.to_csv(my_path + my_out_USE_all_columns, sep = ';')



# Dimensionality reduction using PCA and plotting 

In [None]:
import numpy as np
from sklearn.decomposition import PCA


In [None]:
def generate_pca_coordinates(original_vectors_array, num_components = 2):
    """Project vectors onto their first principal components.

    Args:
        original_vectors_array: 2-D array-like of vectors, one row per item.
        num_components: Value passed to ``PCA(n_components=...)``.

    Returns:
        DataFrame with one column per resulting component.  Up to three
        components are named ``x``, ``y``, ``z`` (so the default still yields
        ``x`` and ``y``); more components are named ``pc1``, ``pc2``, ...
    """
    pca = PCA(n_components=num_components)
    pca_embeddings = pca.fit_transform(original_vectors_array)

    # Derive the column names from the actual output width; hard-coding
    # ['x', 'y'] fails for any num_components other than 2.
    n_columns = pca_embeddings.shape[1]
    axis_names = ['x', 'y', 'z']
    if n_columns <= len(axis_names):
        columns = axis_names[:n_columns]
    else:
        columns = [f'pc{i + 1}' for i in range(n_columns)]

    pca_coords = pd.DataFrame(pca_embeddings, columns=columns)
    return pca_coords

In [None]:
# Display the clustered frame once more before building the plot data.
out

In [None]:
# Work on a copy so that columns added for plotting do not end up in `out`.
out_plot = out.copy()

In [None]:
# Reduce the tweet embeddings to two PCA coordinates for a 2-D scatter plot.
pca_coords = generate_pca_coordinates(message_embeddings, num_components=2)

In [None]:
# Attach the 'x' coordinate column from pca_coords to the plot frame.
out_plot['x'] = pca_coords['x']

In [None]:
# Attach the 'y' coordinate column from pca_coords to the plot frame.
out_plot['y'] = pca_coords['y']

In [None]:

import plotly.express as px

# Scatter plot of the PCA coordinates, coloured by cluster_ID; hovering over
# a point also shows the cleaned tweet text.
fig = px.scatter(out_plot, x="x", y="y", color="cluster_ID",
                 hover_data=['cleaned'])
fig.show()

# TO DO hierarchical clustering

In [None]:
# hierarchical clustering
from scipy.cluster.hierarchy import dendrogram, linkage  
from matplotlib import pyplot as plt

#linked = linkage(np.array(X), 'single')
# Build the hierarchical-clustering linkage matrix with Ward's method.
linked = linkage(np.array(X), 'ward')
# dendrogram() uses labels[i] as the text of leaf i, so pass a flat list of
# tweet texts.  `.values` on the one-column frame is a 2-D (n, 1) array, which
# would make every label a one-element array instead of a string.
labelList = myX_labels_orig['tweets_orig'].tolist()






In [None]:
# Draw the dendrogram of the linkage matrix: root at the top, leaves labelled
# with the entries of labelList.
plt.figure(figsize=(10, 7))
dendrogram(linked,
            orientation='top',
            labels=labelList,
            distance_sort='descending',
            show_leaf_counts=True)
plt.show()

