# Data description
In order to collect data in a natural way:
<br>- we registered a Twitter Developer account;
<br>- using the credentials from the Twitter Developer account, we ran a script that collected tweets by geolocation and saved them in MongoDB;
<br>
<br><b>As a result:</b>
<br>- we collected 332548 tweets (10Gb in MongoDB, ~100Mb in CSV) from the New York geolocation, from 30 May to 15 June;
<br>- we collected 6617029 tweets (~1.69Gb in CSV) from the USA geolocation, from 15 June up to now.

### Import all needed components

In [1]:
import findspark
findspark.init()

In [3]:
import pyspark
import operator
from pyspark import SparkContext
from pyspark.sql import SparkSession
from pyspark.sql import Row
from pyspark.sql.window import Window
from pyspark.sql.types import ArrayType, FloatType, StringType, IntegerType, StructField, StructType
from pyspark.sql.functions import udf, row_number,column

# processing
import re
from datetime import datetime

# text preprocessing
import re
import nltk
from nltk.stem import WordNetLemmatizer 
from pyspark.ml.feature import CountVectorizer,StopWordsRemover, HashingTF, IDF, Tokenizer

# stuff for LDA
from pyspark.mllib.clustering import LDA, LDAModel
from pyspark.mllib.linalg import Vector as oldVector, Vectors as oldVectors
from pyspark.ml.linalg import Vector as newVector, Vectors as newVectors

# import hardcoded variables
from variables import channels_not_to_consider

#for debug purpose only
import time

#pytrends - for acquiring google trends
from get_google_trends_data.pytrends.pytrends.request import TrendReq

ModuleNotFoundError: No module named 'get_google_trends_data'

In [None]:
# nltk.download('stopwords')
# nltk.download('wordnet')

### Datetype functions

In [None]:
# Sentinel datetime (year 2000) that str_tweet_to_datetime() returns for strings failing validate().
wrong_date = datetime.strptime("Mon Jun 03 00:00:00 +0000 2000", '%a %b %d %H:%M:%S %z %Y')

def validate(date_text):
    """Tell whether `date_text` is a canonical tweet timestamp.

    The text is accepted only if it parses with the tweet format
    ('%a %b %d %H:%M:%S %z %Y') and formatting the parsed value back yields
    exactly the same text (so e.g. non-padded days are rejected).

    Returns:
        True for a canonical timestamp, False otherwise.
    """
    tweet_format = '%a %b %d %H:%M:%S %z %Y'
    try:
        round_trip = datetime.strptime(date_text, tweet_format).strftime(tweet_format)
    except ValueError:
        # Not parseable with the tweet format at all.
        return False
    return date_text == round_trip

def str_tweet_to_datetime(frame_datetime):
    """Parse a tweet-formatted timestamp string into a datetime.

    Strings rejected by validate() are not parsed; the module-level
    `wrong_date` sentinel is returned for them instead.
    """
    if not validate(frame_datetime):
        return wrong_date
    return datetime.strptime(frame_datetime, '%a %b %d %H:%M:%S %z %Y')

def datetime_to_tweet_str(frame_datetime):
    """Format a datetime as a tweet timestamp string ('%a %b %d %H:%M:%S %z %Y')."""
    return datetime.strftime(frame_datetime, '%a %b %d %H:%M:%S %z %Y')

### Global variables definition

In [None]:
#final of league championship 
lc_final_start_datetime = "Sat Jun 01 00:00:00 +0000 2019"
lc_finish_finish_datetime = "Sat Jun 01 23:59:59 +0000 2019"

#Stanley cup final
stanley_final_start_datetime = "Wed Jun 12 00:00:00 +0000 2019"
stanley_finish_finish_datetime = "Wed Jun 12 23:59:59 +0000 2019"

#Draft NBA
nba_final_start_datetime = "Thu Jun 20 00:00:00 +0000 2019"
nba_finish_finish_datetime = "Sun Jun 23 23:59:59 +0000 2019"

**User-specific variables**  
Please feel free to tweak these variables as you wish. For example, you can set the number of last hours for which to get the hottest topics.

In [None]:
# if True locations from locations_to_consider will be used to filter
# If True, the places in locations_to_consider are used to filter tweets.
get_from_location = True

# Places used to select relevant tweets.
# Every entry needs its own trailing comma: without it Python silently
# concatenates adjacent string literals, which previously fused
# 'Staten Island, NY' and 'New York, USA' into one unusable entry.
locations_to_consider = [
    'Manhattan, NY',
    'Brooklyn, NY',
    'Queens, NY',
    'Bronx, NY',
    'Staten Island, NY',
    'New York, USA',
]

geo = "US-NY"  # US for USA

number_of_hours_to_get_topics = 2
num_of_top_interest = 15

# Time window of interest, parsed from the tweet-formatted event constants.
frame_start_datetime = str_tweet_to_datetime(stanley_final_start_datetime)
frame_finish_datetime = str_tweet_to_datetime(stanley_finish_finish_datetime)

assert (frame_finish_datetime - frame_start_datetime).days <= 3, "Date interval should not be bigger than 3 days"

**Technical variables**  
These variables are needed to connect to the database and for other technical settings.

In [None]:
# LDA params
num_of_topics_LDA = 10
max_iterations_LDA = 100
nomber_of_words_per_topic = 15  # number of words per topic

# path to CSV
historical_tweets_data = './get-tweets-by-geolocation/data/new_york_training_tweets_15_06.csv'
# historical_tweets_data = './get-tweets-by-geolocation/training_tweets.csv'

# MongoDB table
real_time_tweets_table = "usa_training_tweets_04_07.training_tweets_collection"

### Create spark session

In [None]:
# The same MongoDB URI serves as both the input and the output location.
mongo_uri = 'mongodb://localhost:27017/' + real_time_tweets_table

spark = (
    SparkSession.builder.appName("pipeline")
    .config('spark.mongodb.input.uri', mongo_uri)
    .config('spark.mongodb.output.uri', mongo_uri)
    .config('spark.jars.packages', 'org.mongodb.spark:mongo-spark-connector_2.11:2.3.1')
    .config('spark.mongodb.input.partitioner', 'MongoPaginateBySizePartitioner')
    .getOrCreate()
)
sc = spark.sparkContext

### Handy functions

**Text preprocessing and filtering**

In [None]:
def filter_tweet(tweet, channels_not_to_consider):
    """Decide whether a tweet should be kept.

    A tweet is kept when it is a string that splits on single spaces into at
    least three pieces. `channels_not_to_consider` is accepted for interface
    compatibility but is not consulted here.

    Returns:
        True to keep the tweet, False to drop it.
    """
    return isinstance(tweet, str) and len(tweet.split(' ')) >= 3
         
def process_tweet(tweet):
    """Clean a raw tweet and return its lemmatized, stop-word-free tokens.

    Relies on the module-level `tokenizer`, `lemmatizer` and `stop_word_list`.
    """
    # (pattern, replacement) pairs, applied strictly in this order.
    cleanup_steps = (
        (r'@\w+', ''),               # mentions
        (r'http://\S{,280}', ''),    # http links
        (r'https://\S{,280}', ''),   # https links
        (r'[^A-Za-z]', ' '),         # every non-letter becomes a space
        (r'\s{2,}', ' '),            # collapse multiple whitespaces
        (r'(.)\1{2,}', r'\1'),       # repeated chars (e.g. "greeeeat" -> "great")
    )

    text = tweet.lower()
    for pattern, replacement in cleanup_steps:
        text = re.sub(pattern, replacement, text)
    text = text.strip()

    # Tokenize, skip stop words, lemmatize what is left.
    tokens = []
    for token in tokenizer.tokenize(text):
        if token in stop_word_list:
            continue
        tokens.append(lemmatizer.lemmatize(token))
    return tokens

#### Google trends

In [None]:
#TODO: move this function to Handy function block 
def get_google_trends_by_geo(geo):
    """Pick the pre-loaded Google Trends frames for a geo code.

    Returns:
        A (search topics, search queries) pair for 'US' or 'US-NY';
        (None, None) for any other value.
    """
    if geo == 'US':
        topics, queries = google_trends_search_topics_us, google_trends_search_queries_us
    elif geo == 'US-NY':
        topics, queries = google_trends_search_topics_us_ny, google_trends_search_queries_us_ny
    else:
        topics, queries = None, None
    return topics, queries

# How to call this block with functions?

In [None]:
def tweet2google_timeframe(frame_start_datetime, frame_finish_datetime):
    """Turn a tweet-formatted time frame into a Google Trends timeframe string.

    The original body stopped at a dangling `tim` expression, so calling it
    raised NameError. It now returns the "YYYY-MM-DD YYYY-MM-DD" form that
    this notebook passes to pytrends' build_payload(timeframe=...).

    Args:
        frame_start_datetime: start of the frame as a tweet timestamp string.
        frame_finish_datetime: end of the frame as a tweet timestamp string.

    Returns:
        The timeframe string "<start date> <end date>".
    """
    start_date = str_tweet_to_datetime(frame_start_datetime)
    end_date = str_tweet_to_datetime(frame_finish_datetime)
    return f"{start_date.strftime('%Y-%m-%d')} {end_date.strftime('%Y-%m-%d')}"
    
def get_google_trends_by_geo(geo):
    """Return the (search topics, search queries) frames loaded for `geo`.

    Only 'US' and 'US-NY' are known; anything else yields (None, None).
    NOTE(review): this repeats a definition made in an earlier cell.
    """
    if geo == 'US':
        return (google_trends_search_topics_us,
                google_trends_search_queries_us)
    if geo == 'US-NY':
        return (google_trends_search_topics_us_ny,
                google_trends_search_queries_us_ny)
    return (None, None)

In [None]:
#TODO: move this function to utils
def str_rising_to_float(str):
    """Convert a Google Trends "rising" value such as '+1,250%' to a float.

    Args:
        str: the raw value. The parameter name shadows the builtin and is kept
            only so existing callers keep working.

    Returns:
        0.0 for None, '' and 'Breakout'; otherwise the number before the '%'
        sign, with a leading '+' dropped and commas treated as thousands
        separators ('+1,250%' -> 1250.0, '+1,234,567%' -> 1234567.0).

    Raises:
        ValueError: if the remaining text is not a number.
    """
    text = str
    if text is None or text == '' or text == 'Breakout':
        return 0.0

    number = text.split('%')[0]
    if '+' in number:
        number = number.split('+')[1]

    # Dropping the separators handles any number of comma groups and avoids
    # the rounding error of scaling a fractional value by 1000.
    return float(number.replace(',', ''))

In [None]:
#TODO: move this function to utils
def unique_google_trends_by_time_frame(df):
    """Merge multi-day Google Trends rows into one averaged ranking per term.

    Column positions of `df` are used as follows: 1 = rising term, 2 = top
    term, 3 = rising value (text, parsed with str_rising_to_float), 4 = top
    value, 5 = name of the last output column. The geo is read from the
    'geo' field of the first row.

    For each distinct term the values are averaged and rounded, both rankings
    are sorted by that average in descending order and then placed side by
    side; the shorter ranking is padded with '' / None.

    Returns:
        A new Spark DataFrame with the columns at positions 1-5 of `df`.
    """
    records = df.collect()
    geo = records[0]['geo']
    columns = df.columns

    def accumulate(totals, term, amount):
        # totals maps term -> [running sum, number of occurrences]
        if term in totals:
            totals[term][0] += amount
            totals[term][1] += 1
        else:
            totals[term] = [amount, 1]

    def ranked_averages(totals):
        averaged = {term: round(total / count) for term, (total, count) in totals.items()}
        return sorted(averaged.items(), key=operator.itemgetter(1), reverse=True)

    rising_totals = {}
    top_totals = {}
    for record in records:
        accumulate(rising_totals, record[columns[1]], str_rising_to_float(record[columns[3]]))
        accumulate(top_totals, record[columns[2]], float(record[columns[4]]))

    top_ranked = ranked_averages(top_totals)
    rising_ranked = ranked_averages(rising_totals)

    make_row = Row(columns[1], columns[2], columns[3], columns[4], columns[5])
    merged_rows = []
    for position in range(max(len(top_ranked), len(rising_ranked))):
        if position < len(rising_ranked):
            rising_term, rising_average = rising_ranked[position]
            rising_text = f"+{rising_average}%"
        else:
            rising_term, rising_text = '', None

        if position < len(top_ranked):
            top_term, top_average = top_ranked[position]
        else:
            top_term, top_average = '', None

        merged_rows.append(make_row(rising_term, top_term, rising_text, top_average, geo))

    return spark.createDataFrame(merged_rows)

In [None]:
def get_geo_name(geo):
    """Return the display name for a geo code, or "" when the code is unknown."""
    known_regions = (
        ("US-NY", "New York"),
        ("US", "United States"),
    )
    for code, label in known_regions:
        if geo == code:
            return label
    return ""

def print_google_trend_title(start_date, finish_date, name):
    """Print the heading for a Google Trends table.

    The region name is derived from the module-level `geo`. A single date is
    printed when both bounds fall on the same calendar day, otherwise a
    "start - finish" range. The bounds are compared by their formatted date
    because a one-day window (00:00:00 to 23:59:59) has different datetimes
    and would otherwise print the same date twice as a range.

    Args:
        start_date: start of the window (anything with strftime).
        finish_date: end of the window (anything with strftime).
        name: label of the table, e.g. "Search topics".
    """
    start_date_str = start_date.strftime("%Y-%m-%d")
    finish_date_str = finish_date.strftime("%Y-%m-%d")
    if start_date_str == finish_date_str:
        print(f"\nGoogle trends {name} in {get_geo_name(geo)} during {start_date_str}")
    else:
        print(f"\nGoogle trends {name} in {get_geo_name(geo)} during {start_date_str} - {finish_date_str}")

In [None]:
def convert_datetime_in_interesting_google(df):
    """Return the first six columns of `df` with "Date" rendered as 'YYYY-MM-DD' text.

    The "Date" field of every row is formatted with strftime and placed first;
    the columns at positions 1-5 are copied unchanged. Column names are kept.
    """
    columns = df.columns
    kept_names = [columns[0], columns[1], columns[2], columns[3], columns[4], columns[5]]

    def with_text_date(record):
        date_text = record["Date"].strftime("%Y-%m-%d")
        return (
            date_text,
            record[columns[1]],
            record[columns[2]],
            record[columns[3]],
            record[columns[4]],
            record[columns[5]],
        )

    return df.rdd.map(with_text_date).toDF(kept_names)

# Load the data


## Loading Google Trends data

In [None]:
def _load_google_trends_csv(path):
    """Read one Google Trends CSV export (header row, inferred schema)."""
    return spark.read.csv(path, inferSchema=True, header=True)


google_trends_search_queries_us = _load_google_trends_csv('data/google-trends/google-trends-search-queries-US.csv')
google_trends_search_topics_us = _load_google_trends_csv('data/google-trends/google-trends-search-topics-US.csv')
google_trends_search_queries_us_ny = _load_google_trends_csv('data/google-trends/google-trends-search-queries-US-NY.csv')
google_trends_search_topics_us_ny = _load_google_trends_csv('data/google-trends/google-trends-search-topics-US-NY.csv')

## Reading the historical data, it can take a while

In [None]:
# (start, finish) of the window that will be read from the historical CSV.
times = frame_start_datetime, frame_finish_datetime

print("Time range to be extracted from ", historical_tweets_data, *times)

In [None]:
def get_historical_df(historical_start_time, historical_finish_time):
    """Load the collected tweets from the CSV and keep those inside a time window.

    Rows without `created_at` are dropped, `created_at` is converted from the
    tweet string format to a timestamp with str_tweet_to_datetime, and only
    rows strictly between the two bounds are kept.

    Args:
        historical_start_time: initial date for data extraction (exclusive);
            a datetime, or a tweet-formatted string such as
            'Fri Jul 05 00:00:00 +0000 2019'.
        historical_finish_time: final date for data extraction (exclusive);
            same accepted forms.

    Returns:
        A Spark DataFrame with the selected tweets.
    """
    # These names are not imported at the top of the notebook.
    from pyspark.sql.functions import col, udf
    from pyspark.sql.types import TimestampType

    print("Range for collected data (history): ", historical_start_time, historical_finish_time)

    # The documented string form must be parsed first; previously a string
    # bound was compared against the timestamp column unparsed.
    if isinstance(historical_start_time, str):
        historical_start_time = str_tweet_to_datetime(historical_start_time)
    if isinstance(historical_finish_time, str):
        historical_finish_time = str_tweet_to_datetime(historical_finish_time)

    df = spark.read.csv(historical_tweets_data, inferSchema=True, header=True)
    # remove records with no date
    df = df.na.drop(subset=["created_at"])

    # convert string to desired date format
    to_timestamp = udf(str_tweet_to_datetime, TimestampType())
    df = df.withColumn('created_at', to_timestamp(col('created_at')))

    return df.filter((df.created_at > historical_start_time) & (df.created_at < historical_finish_time))

In [None]:
# Tweets inside the configured time window.
selected_df = get_historical_df(historical_start_time=times[0], historical_finish_time=times[1])

# None is checked by identity rather than with `!=`.
assert selected_df is not None, "Something goes wrong with selecting data from recent data/history data"

selected_df.count()

# Tweets preprocessing

Text cleaning is crucial for any text modelling process, especially for topic modelling. In our case it consists of the following steps:  
1) Lowercase all words  
2) Filter words with non-letters at the beginning (mainly for mentions, e.g. "@some_user")  
3) Filter http/https  
4) Filter all non-letters (crucial to remove emoji)  
5) Remove multiple whitespaces  
6) Remove repeated chars (e.g. "greeeeat" -> "great")

In [None]:
# Working alias: the preprocessing cells below rebind `df` step by step.
df = selected_df

In [None]:
# Module-level NLP helpers read by process_tweet().
tokenizer = nltk.WordPunctTokenizer()
lemmatizer = WordNetLemmatizer()
# NLTK's English stop words (the 'stopwords' and 'wordnet' corpora must already be downloaded).
stop_word_list = nltk.corpus.stopwords.words('english')

In [None]:
# Positional fields used below (presumably: 0 = tweet text, 1 = country code,
# 2 = place name, 4 = channel/source -- confirm against the CSV header).

# filter nans
df = df.rdd.filter(lambda x: x[0] is not None and x[1] is not None and x[2] is not None and x[4] is not None)

# filter out channels not to consider
df = df.filter(lambda x: x[4] not in channels_not_to_consider)

# filter by country
# Exact comparison: `x[1] in 'US'` was a substring test, so '', 'U' and 'S'
# were accepted as well.
df = df.filter(lambda x: x[1] == 'US')

# filter by precise location
if get_from_location:
    df = df.filter(lambda x: x[2] in locations_to_consider)

# filter tweet itself
df = df.filter(lambda x: filter_tweet(x[0], channels_not_to_consider=channels_not_to_consider))

# process tweet
df = df.map(lambda x: process_tweet(x[0]))

# final preprocessing: drop tweets left without any token
df = df.filter(lambda x: len(x) > 0)

# wrap every token list in a one-element row so it fits the schema below
df = df.map(lambda x: [x])

# schema for df
schema = StructType([StructField('tokens', ArrayType(StringType()), True)])
df = df.toDF(schema=schema)

In [None]:
# Preview the first 10 rows of token lists.
df.show(10)

In [None]:
# Number of tweets left after preprocessing.
df.count()

# Topic modeling/Latent Dirichlet allocation(LDA)

In [None]:
#### CountVectorizer

In [None]:
# Timestamps are printed before and after to see how long fitting takes.
print(time.strftime('%m%d%Y %H:%M:%S'))

# Fit a term-count vocabulary on "tokens": at most 10000 terms, each present in at least 2 documents.
cv = CountVectorizer(inputCol="tokens", outputCol="raw_features", vocabSize=10000, minDF=2.0)
cvmodel = cv.fit(df)

print(time.strftime('%m%d%Y %H:%M:%S'))

In [None]:
print(time.strftime('%m%d%Y %H:%M:%S'))
# Add the "raw_features" column (term counts) produced by the fitted CountVectorizer model.
df = cvmodel.transform(df)
print(time.strftime('%m%d%Y %H:%M:%S'))

In [None]:
#### TF-IDF

In [None]:
# Reweight the raw term counts with IDF; results go to "tf_idf_features".
idf = IDF(inputCol="raw_features", outputCol="tf_idf_features", minDocFreq=2)
idfModel = idf.fit(df)

df = idfModel.transform(df)


In [None]:
# Preview 10 rows with the feature columns (long values truncated).
df.show(10, True)

In [None]:
#### Adding id field

In [None]:
# Number the rows 1..N in "tokens" order to obtain a document id for LDA.
# NOTE(review): the window has no partitioning, so Spark likely processes it in a single partition -- confirm this is acceptable for the data size.
w = Window().orderBy(column("tokens"))
df = df.withColumn("id", row_number().over(w))

In [None]:
# Preview 10 rows including the new "id" column.
df.show(10, True)

In [None]:
# Build (id, vector) pairs for the mllib LDA API, converting ml vectors to mllib vectors.
# Given the cells above, the columns are presumably tokens, raw_features, tf_idf_features, id,
# so x[3] is the id and x[2] the TF-IDF vector -- confirm if the column order changes.
rs = df.rdd.map(lambda x: (x[3], oldVectors.fromML(x[2])))
rs_df = rs.toDF()

#rs_df.show(10, False)

In [None]:
#### Run the LDA Topic Modeler

In [None]:
# Note the time before and after is printed in order to find out how much time it takes to process x number of records

print(time.strftime('%m%d%Y %H:%M:%S'))
# Each row of rs_df is turned into a [id, vector] list before training.
lda_model = LDA.train(rs_df['_1', '_2'].rdd.map(list), k=num_of_topics_LDA, maxIterations=max_iterations_LDA)
print(time.strftime('%m%d%Y %H:%M:%S'))

In [None]:
print(time.strftime('%m%d%Y %H:%M:%S'))
# NOTE(review): `topics` is not read anywhere in the visible cells.
topics = lda_model.topicsMatrix()
# Vocabulary of the fitted CountVectorizer; topic_render() uses it to map term ids to words.
vocabArray = cvmodel.vocabulary

In [None]:
# The number of words per topic comes from number_of_words_per_topic.

topicIndices = sc.parallelize(lda_model.describeTopics(maxTermsPerTopic = number_of_words_per_topic))

def topic_render(topic):  # specify vector id of words to actual words
    """Render one described topic as a list of "<weight>  <word>" strings.

    Args:
        topic: a pair (term ids, term weights) as produced by describeTopics.

    Returns:
        One string per term: the weight rounded to 3 decimals, two spaces,
        and the word looked up in `vocabArray`.
    """
    terms = topic[0]
    prob = topic[1]

    # A topic can hold fewer terms than requested (e.g. a small vocabulary);
    # never index past what is actually available.
    limit = min(number_of_words_per_topic, len(terms), len(prob))
    return [str(round(prob[i], 3)) + "  " + vocabArray[terms[i]] for i in range(limit)]
print(time.strftime('%m%d%Y %H:%M:%S'))

In [None]:
print(time.strftime('%m%d%Y %H:%M:%S'))
# Render every topic into readable "<weight>  <word>" lines and bring the result to the driver.
topics_final = topicIndices.map(topic_render).collect()
print(time.strftime('%m%d%Y %H:%M:%S'))

## Topics

In [None]:
# based on the simple vectors(+number of words)

# Topics are numbered from 1; each one is followed by a blank gap.
for topic_number, rendered_terms in enumerate(topics_final, start=1):
    print(f"Topic #{topic_number}")
    for rendered_term in rendered_terms:
        print(rendered_term)
    print('\n')

### Hot topics in the USA from [Google trends](https://trends.google.com/trends/explore?geo=US)

In [None]:
# The frame bounds are already datetimes (parsed in the user-variables cell), so no conversion is needed here.
start_date = frame_start_datetime #str_tweet_to_datetime(frame_start_datetime)
finish_date = frame_finish_datetime #str_tweet_to_datetime(frame_finish_datetime)

In [None]:
# Pick the pre-loaded topics/queries frames for the configured geo; both are None for an unknown geo.
google_trends_topics, google_trends_queries = get_google_trends_by_geo(geo) 

##### Google trends search topics

In [None]:
# Keep the topic rows whose Date lies inside the window (both bounds inclusive).
interesting_google_topics = google_trends_topics.filter(
    (google_trends_topics.Date >= start_date) & (google_trends_topics.Date <= finish_date))

In [None]:
# Print the heading, render Date as 'YYYY-MM-DD' text, then show the rising and top topics per date.
print_google_trend_title(start_date, finish_date, "Search topics")
interest_google_topics = convert_datetime_in_interesting_google(interesting_google_topics)
interest_google_topics.select("Date","Search topics - rising", "Search topics - top").show(num_of_top_interest, False)

When the time frame is longer than 1 day, these Google Trends results need to be filtered correctly (aggregated per term across the days):

In [None]:
# interesing_google_topics_unique= unique_google_trends_by_time_frame(interesting_google_topics)
# print_google_trend_title(start_date, finish_date, "Search topics")
# interesing_google_topics_unique.select("Search topics - rising", "Search topics - top").show(num_of_top_interest, False)

##### Google trends search queries

In [None]:
# Keep the query rows whose Date lies inside the window (both bounds inclusive).
interesting_google_queries = google_trends_queries.filter(
    (google_trends_queries.Date >= start_date) & (google_trends_queries.Date <= finish_date))

In [None]:
# Average the per-day rows into one ranking per query, then print the heading and the table.
interesing_google_queries_unique= unique_google_trends_by_time_frame(interesting_google_queries)
print_google_trend_title(start_date, finish_date, "Search queries")
interesing_google_queries_unique.show(num_of_top_interest, False)

In [None]:
# print_google_trend_title(start_date, finish_date, "Search queries")
# interest_google_queries = convert_datetime_in_interesting_google(interesting_google_queries)
# interest_google_queries.select("Date", "Search queries - rising", "Search queries - top").show(num_of_top_interest, False)

#### Hot topics - google trends (directly) (probably this will be removed)

In [None]:
# Query Google Trends directly for the same window; the timeframe is passed as "YYYY-MM-DD YYYY-MM-DD".
start_date_str = start_date.strftime("%Y-%m-%d")
finish_date_str = finish_date.strftime("%Y-%m-%d")
pytrend = TrendReq()
# NOTE(review): the keyword list holds a single blank string -- presumably to request trends without a specific term; confirm.
pytrend.build_payload(kw_list=[' '], geo=geo, timeframe=f"{start_date_str} {finish_date_str}")

##### Search topics

In [None]:
# related_top_search_topics belongs to the bundled pytrends copy imported at the top of the notebook;
# it receives the Spark session and its result is used as a Spark DataFrame in the next cell.
topics_df = pytrend.related_top_search_topics(spark)

In [None]:
# Print the heading, then the rising and top search topics fetched directly from Google Trends.
print_google_trend_title(start_date, finish_date, "Search topics")
topics_df.select("Search topics - rising", "Search topics - top").show(num_of_top_interest, False)

##### Search queries

In [None]:
# related_top_search_queries belongs to the bundled pytrends copy imported at the top of the notebook;
# it receives the Spark session and its result is shown as a Spark DataFrame in the next cell.
queries_df = pytrend.related_top_search_queries(spark)

In [None]:
# Print the heading, then the search queries fetched directly from Google Trends.
print_google_trend_title(start_date, finish_date, "Search queries")
queries_df.show(num_of_top_interest, False)