In [1]:
# Import necessary packages

import time
import requests
import numpy as np                      
import pandas as pd
import elasticsearch

In [2]:
# Import ML packages

from sklearn.cluster import KMeans
from matplotlib import pyplot as plt
from sklearn.decomposition import PCA
from sklearn.feature_extraction.text import TfidfVectorizer

In [3]:
# Filter warnings
# Installs a blanket "ignore" filter, so no Python warning of any category is
# shown after this cell has run.
# NOTE(review): this also hides deprecation/future warnings raised by the
# libraries used below — consider narrowing the filter to specific categories.

import warnings
warnings.filterwarnings("ignore")

In [4]:
# Define helper methods
def modify_vectors(text_vectorizer,new_text):
    """Vectorise a single text and return its feature values as a list.

    Args:
        text_vectorizer: Object exposing ``transform(list_of_texts)`` whose
            result offers ``toarray()``.
        new_text: The text to vectorise; it is wrapped in a one-element list
            before being handed to the vectorizer.

    Returns:
        A ``list`` holding the elements of the first row of the dense array
        produced for ``new_text``.
    """
    dense_rows = text_vectorizer.transform([new_text]).toarray()
    first_row = dense_rows[0]
    return [value for value in first_row]
        
def predictor(text_vectorizer,km_model,new_text):
    """Return the cluster that ``km_model`` predicts for a single text.

    Args:
        text_vectorizer: Object exposing ``transform(list_of_texts)``.
        km_model: Object exposing ``predict(features)``.
        new_text: The text to classify; it is wrapped in a one-element list
            before being vectorised.

    Returns:
        The first element of the prediction made for the vectorised text.
    """
    features = text_vectorizer.transform([new_text])
    return km_model.predict(features)[0]


In [5]:
# Set global variables
# Shared state that the polling/clustering loop cell reads and updates.

# Elasticsearch client configured for localhost:9200 with SSL disabled.
# NOTE(review): 'ssl_verify' may not be an option this client version recognises — confirm.
elastic_search = elasticsearch.Elasticsearch([{'host': 'localhost', 'port': 9200, 'use_ssl' : False, 'ssl_verify' : False}])
# Plain HTTP GET against the same endpoint; `req` is not read anywhere in the visible cells.
# NOTE(review): presumably a connectivity check — confirm; `verify=False` should be irrelevant for an http:// URL.
req = requests.get("http://localhost:9200", verify=False)
# PCA reducer with two components; the loop uses its output as the x/y scatter coordinates.
pca = PCA(n_components=2)

# Processed tweet texts already collected, used by the loop to skip repeats.
unique_processed_tweets = set()
# Accumulates one {'id', 'processed_tweet'} dict per newly seen tweet.
data_list = []
# Loop counter; the loop cell runs while this is <= 20 and increments it each pass.
iterations = 1

# TF-IDF vectorizer using the built-in English stop-word list; re-fitted by the loop on every pass.
vectorizer = TfidfVectorizer(stop_words='english')
# Empty frame with the two base columns; the loop adds 'tweet_vector' and 'cluster_number'.
global_df = pd.DataFrame(columns=['id', 'processed_tweet'])


In [None]:
# Poll Elasticsearch, accumulate previously unseen tweets, then re-fit
# TF-IDF + KMeans + PCA on everything collected so far and plot the clusters.
# Reads/updates the globals defined in the "Set global variables" cell:
# elastic_search, unique_processed_tweets, data_list, iterations, vectorizer,
# pca and global_df.

MAX_ITERATIONS = 20          # number of polling rounds
POLL_INTERVAL_SECONDS = 30   # pause between two rounds
FETCH_SIZE = 1000            # maximum number of hits requested per search
N_CLUSTERS = 3               # number of KMeans clusters
RANDOM_SEED = 42             # fixed seed so the clustering is reproducible
SAVE_EVERY = 5               # save the plot on every SAVE_EVERY-th round

while iterations <= MAX_ITERATIONS:

    # Get data from the elastic search index.
    # NOTE(review): a plain match_all capped at FETCH_SIZE hits presumably
    # returns largely the same documents on every poll — confirm whether a
    # sort / search_after is needed to actually pick up newer tweets.
    elastic_search_result = elastic_search.search(index="ukraine_index", doc_type="_doc", body={
        'size': FETCH_SIZE,
        'query': {'match_all': {}}
    })

    # Keep only records whose processed text has not been collected before.
    for record in elastic_search_result['hits']['hits']:
        preprocessed_tweet = record['_source']['processed_tweet']
        if preprocessed_tweet not in unique_processed_tweets:
            unique_processed_tweets.add(preprocessed_tweet)
            data_list.append({'id': record['_id'], 'processed_tweet': preprocessed_tweet})

    # data_list is cumulative and already de-duplicated, so the frame is rebuilt
    # from it. Appending data_list to the previous frame on every round (as
    # before) re-added all earlier tweets each time, and DataFrame.append no
    # longer exists in pandas >= 2.0.
    global_df = pd.DataFrame(data_list, columns=['id', 'processed_tweet'])

    if len(global_df) < N_CLUSTERS:
        # Vectorising an empty corpus or asking KMeans for more clusters than
        # samples raises, so wait for more data instead of crashing the loop.
        print("Only", len(global_df), "unique tweets collected so far; skipping clustering for iteration", iterations)
    else:
        tweets_list = global_df.processed_tweet.tolist()

        X = vectorizer.fit_transform(tweets_list)
        model = KMeans(n_clusters=N_CLUSTERS, init='k-means++', max_iter=5000, random_state=RANDOM_SEED)
        model.fit(X)

        # Set data for the cluster & tweet_vector.
        # Row i of X is the vector of tweet i, so the fitted matrix is reused
        # instead of re-transforming every tweet one document at a time.
        dense_vectors = X.toarray()
        global_df['tweet_vector'] = dense_vectors.tolist()
        global_df['cluster_number'] = model.predict(X)

        # Project the vectors onto two components for plotting.
        x_pca = pca.fit_transform(dense_vectors)
        two_d_data = pd.DataFrame(x_pca, columns=['x', 'y'])
        full_data = global_df.join(two_d_data)

        # Obtain plots.
        # Legend handles follow the sorted cluster ids that actually occur, so
        # the labels are derived from those ids rather than from a fixed list.
        labels = ['Cluster' + str(cluster_id + 1) for cluster_id in np.unique(global_df.cluster_number)]
        fig, ax = plt.subplots(figsize=(15, 10))
        ax.set_xticks(np.arange(-0.5, 1, step=0.1))
        ax.set_yticks(np.arange(-0.5, 1, step=0.1))
        scatter = ax.scatter(full_data.x, full_data.y, c=global_df.cluster_number)
        ax.legend(handles=scatter.legend_elements()[0], labels=labels)

        # Save plots for every 5 iterations
        if iterations % SAVE_EVERY == 0:
            print("Size of data collected after ", iterations, " iterations: ", global_df.shape)
            fig.savefig('./Result-after-iteration'+str(iterations)+'.png')

        # Render now and release the figure so figures do not pile up in
        # memory over the course of the loop.
        plt.show()
        plt.close(fig)

    iterations += 1
    # No need to wait once the final round has completed.
    if iterations <= MAX_ITERATIONS:
        time.sleep(POLL_INTERVAL_SECONDS)
            