
# Google Colab Search Engine UI 
---


In [25]:
import pandas as pd
import re
import string
import math

import matplotlib.pyplot as plt
import emoji

In [9]:
from textblob import TextBlob

---
### Installing and Starting Elasticsearch
---

[K     |████████████████████████████████| 185 kB 8.9 MB/s eta 0:00:01
[?25h  Building wheel for emoji (setup.py) ... [?25l[?25hdone


In [2]:
!pip install emoji -q
!wget https://artifacts.elastic.co/downloads/elasticsearch/elasticsearch-7.0.0-linux-x86_64.tar.gz -q
!tar -xzf elasticsearch-7.0.0-linux-x86_64.tar.gz
!chown -R daemon:daemon elasticsearch-7.0.0
!pip install elasticsearch -q

[K     |████████████████████████████████| 356 kB 6.9 MB/s eta 0:00:01
[?25h

In [4]:
# Launch the local Elasticsearch server and connect a client to it.
from elasticsearch import Elasticsearch
import os
from subprocess import Popen, PIPE, STDOUT
from time import sleep

# Start the server as a child process; preexec_fn switches the child to
# uid 1 before the binary runs (the install cell chowns the files to daemon).
es_server = Popen(['elasticsearch-7.0.0/bin/elasticsearch'],
                  stdout=PIPE, stderr=STDOUT,
                  preexec_fn=lambda: os.setuid(1)  # as daemon
                 )

# Client handle used by the index-creation, indexing and search cells.
# Without this assignment every later `es.` call raises NameError.
es = Elasticsearch('http://localhost:9200')

# The server needs some time to boot; poll until it answers instead of
# letting the next cell fail with a connection error.
for _ in range(60):
    if es.ping():
        break
    sleep(1)
else:
    raise RuntimeError('Elasticsearch did not become reachable within 60 seconds')

---
### Importing the Dataset
---

In [10]:
# Importing the reduced data
# `error_bad_lines` was deprecated in pandas 1.3 and removed in 2.0;
# `on_bad_lines='warn'` keeps the same behaviour (skip malformed rows, emit a warning).
df = pd.read_csv('stockerbot-export(Verified Tweets).csv', on_bad_lines='warn')
df.head()



Unnamed: 0,id,text,timestamp,source,symbols,company_names,url,verified
0,1.0197e+18,VIDEO: “I was in my office. I was minding my o...,Wed Jul 18 21:33:26 +0000 2018,GoldmanSachs,GS,The Goldman Sachs,https://twitter.com/i/web/status/1019696670777...,True
1,1.01971e+18,The price of lumber $LB_F is down 22% since hi...,Wed Jul 18 22:22:47 +0000 2018,StockTwits,M,Macy's,https://twitter.com/i/web/status/1019709091038...,True
2,1.01971e+18,Who says the American Dream is dead? https://t...,Wed Jul 18 22:32:01 +0000 2018,TheStreet,AIG,American,https://buff.ly/2L3kmc4,True
3,1.01972e+18,Barry Silbert is extremely optimistic on bitco...,Wed Jul 18 22:52:52 +0000 2018,MarketWatch,BTC,Bitcoin,https://twitter.com/i/web/status/1019716662587...,True
4,1.01972e+18,How satellites avoid attacks and space junk wh...,Wed Jul 18 23:00:01 +0000 2018,Forbes,ORCL,Oracle,http://on.forbes.com/6013DqDDU,True


In [12]:
# Work on a deep copy so the raw frame `df` stays untouched.
df2 = df.copy(deep=True)

# Ordered (pattern, replacement) substitutions applied to the lower-cased text.
# Order matters: URLs are stripped before the generic symbol/character passes.
token_cleanup_steps = [
    (r'https?:\/\/\S+', ''),                          # http(s) links
    ("www\.[a-z]?\.?(com)+|[a-z]+\.(com)", ''),       # bare www/.com fragments
    (r'{link}', ''),                                  # literal "{link}" placeholder
    (r"\[video\]", ''),                               # literal "[video]" marker
    (r'&[a-z]+;', ''),                                # HTML entities such as &amp;
    (r'[@$#]+', ''),                                  # mention / cashtag / hashtag symbols
    ("([^\x00-\x7F])+", " "),                         # runs of non-ASCII characters -> one space
]

cleaned_tokens = df2['text'].str.lower()
for pattern, replacement in token_cleanup_steps:
    cleaned_tokens = cleaned_tokens.apply(lambda text: re.sub(pattern, replacement, text))

df2['tokens'] = cleaned_tokens



In [13]:
# Move the cleaned tweets to a separate dataframe and split the raw
# `timestamp` string into a `date` column and a `time` column.
from datetime import datetime

# Parse the timestamp strings once and reuse the result for both columns.
parsed_timestamps = pd.to_datetime(df2['timestamp'])

# `assign` returns a new frame, so `df2` keeps its original string timestamps
# and this cell can be re-run safely. Overwriting `timestamp` in place keeps
# its column position; `date` is appended as the last column.
tweets = (
    df2
    .assign(timestamp=parsed_timestamps.dt.time, date=parsed_timestamps.dt.date)
    .rename(columns={"timestamp": "time"})
)
tweets

Unnamed: 0,id,text,time,source,symbols,company_names,url,verified,tokens,date
0,1.019700e+18,VIDEO: “I was in my office. I was minding my o...,21:33:26,GoldmanSachs,GS,The Goldman Sachs,https://twitter.com/i/web/status/1019696670777...,True,video: i was in my office. i was minding my o...,2018-07-18
1,1.019710e+18,The price of lumber $LB_F is down 22% since hi...,22:22:47,StockTwits,M,Macy's,https://twitter.com/i/web/status/1019709091038...,True,the price of lumber lb_f is down 22% since hit...,2018-07-18
2,1.019710e+18,Who says the American Dream is dead? https://t...,22:32:01,TheStreet,AIG,American,https://buff.ly/2L3kmc4,True,who says the american dream is dead?,2018-07-18
3,1.019720e+18,Barry Silbert is extremely optimistic on bitco...,22:52:52,MarketWatch,BTC,Bitcoin,https://twitter.com/i/web/status/1019716662587...,True,barry silbert is extremely optimistic on bitco...,2018-07-18
4,1.019720e+18,How satellites avoid attacks and space junk wh...,23:00:01,Forbes,ORCL,Oracle,http://on.forbes.com/6013DqDDU,True,how satellites avoid attacks and space junk wh...,2018-07-18
...,...,...,...,...,...,...,...,...,...,...
358,1.019730e+18,RT @LauraJKeller: See ya'll one more time toni...,23:26:46,HaidiLun,JPM,JPMorgan Chase & Co.,,True,rt laurajkeller: see ya'll one more time tonig...,2018-07-18
359,1.019730e+18,$SAN did not have a good quarter and it was su...,23:28:04,jimcramer,SAN,Banco Santander,https://twitter.com/Marineteam21/status/101971...,True,san did not have a good quarter and it was sur...,2018-07-18
360,1.019730e+18,$AMAT bulls score big overnight https://t.co/C...,23:37:10,petenajarian,AMAT,Applied Materials,http://zpr.io/6XWjS,True,amat bulls score big overnight via investitute,2018-07-18
361,1.019730e+18,$AMAT bulls score big overnight https://t.co/O...,23:38:58,jonnajarian,AMAT,Applied Materials,http://zpr.io/6XWjS,True,amat bulls score big overnight via investitute,2018-07-18


In [15]:
# get tweet subjectivity
def tweet_subjectivity(tweets):
    """Return the TextBlob subjectivity score of the given tweet text."""
    sentiment = TextBlob(tweets).sentiment
    return sentiment.subjectivity

# get tweet polarity
def tweet_polarity(tweets):
    """Return the TextBlob polarity score of the given tweet text."""
    sentiment = TextBlob(tweets).sentiment
    return sentiment.polarity

# Map a polarity score to a sentiment label
def getSentiment(score):
    """Return 'Negative' for scores below zero, 'Neutral' for exactly zero,
    and 'Positive' for everything else."""
    return 'Negative' if score < 0 else ('Neutral' if score == 0 else 'Positive')
    
# Score every cleaned tweet, then derive a label from the polarity score.
cleaned_text = tweets['tokens']
tweets['Subjectivity'] = cleaned_text.map(tweet_subjectivity)
tweets['Polarity'] = cleaned_text.map(tweet_polarity)
tweets['Sentiment'] = tweets['Polarity'].map(getSentiment)

print("\nNumber of tweets in the dataset: ", tweets.shape[0])
tweets.head()


Number of tweets in the dataset:  363


Unnamed: 0,id,text,time,source,symbols,company_names,url,verified,tokens,date,Subjectivity,Polarity,Sentiment
0,1.0197e+18,VIDEO: “I was in my office. I was minding my o...,21:33:26,GoldmanSachs,GS,The Goldman Sachs,https://twitter.com/i/web/status/1019696670777...,True,video: i was in my office. i was minding my o...,2018-07-18,1.0,0.6,Positive
1,1.01971e+18,The price of lumber $LB_F is down 22% since hi...,22:22:47,StockTwits,M,Macy's,https://twitter.com/i/web/status/1019709091038...,True,the price of lumber lb_f is down 22% since hit...,2018-07-18,0.288889,-0.155556,Negative
2,1.01971e+18,Who says the American Dream is dead? https://t...,22:32:01,TheStreet,AIG,American,https://buff.ly/2L3kmc4,True,who says the american dream is dead?,2018-07-18,0.2,-0.1,Negative
3,1.01972e+18,Barry Silbert is extremely optimistic on bitco...,22:52:52,MarketWatch,BTC,Bitcoin,https://twitter.com/i/web/status/1019716662587...,True,barry silbert is extremely optimistic on bitco...,2018-07-18,0.727273,0.005682,Positive
4,1.01972e+18,How satellites avoid attacks and space junk wh...,23:00:01,Forbes,ORCL,Oracle,http://on.forbes.com/6013DqDDU,True,how satellites avoid attacks and space junk wh...,2018-07-18,0.0,0.0,Neutral


# Query or Search

In [140]:
# One (doc_id, source, text, symbol, date, time) tuple per tweet; the frame
# index serves as the document id and the time is serialised as an ISO string.
l = [
    (index, row['source'], row['text'], row['symbols'], row['date'], row['time'].isoformat())
    for index, row in tweets.iterrows()
]

In [142]:
# mappings are used to define what kind of structure your data has. here explicit mapping is used: 
# https://www.elastic.co/guide/en/elasticsearch/reference/current/explicit-mapping.html

# The mapping is used when creating the index through the request body:

request_body = {
    'settings': {
        'number_of_shards': 1,
        'number_of_replicas': 1,
    },
    'mappings': {
          'properties': {
              'doc_id': {'type': 'integer'},
              'source': {'type': 'text'},
              'text': {'type': 'text'},
              'symbol': {'type': 'text'},
              'date': {'type': 'date'},
              # The documents carry the time as an "HH:MM:SS" string (from
              # time.isoformat()). The default date format cannot parse a
              # time-only value, so an explicit format is required.
              'time': {'type': 'date', 'format': 'strict_hour_minute_second'},
          }
    }
}

# Create the index with the explicit mapping unless it already exists.
index_name = 'test'
# Ask the server directly instead of relying on a bare `except:`, which
# treated every failure (connection errors, NameError, ...) as "index missing".
if es.indices.exists(index=index_name):
  print('index {} already exists'.format(index_name))
else:
  print('creating index {}'.format(index_name))
  es.indices.create(index=index_name, body=request_body)

index test already exists


In [124]:
# Index every tweet record. Each tuple in `l` has exactly six fields, so
# unpacking a seventh name (`company`) raised ValueError on the first row.
for doc_id, source, text, symbol, date, time in l:
  doc_body = {
      'doc_id': doc_id,
      'source': source,
      'text': text,
      'symbol': symbol,
      'date' : date,
      'time' : time,
  }
  # Use doc_id as the document id so re-running this cell overwrites the
  # existing documents instead of adding duplicates to the index.
  es.index(index=index_name, id=doc_id, body=doc_body)


In [169]:
def search(index_name, query_body):
  """Run `query_body` against `index_name`.

  Returns a pair: the raw response from `es.search`, and a list with one
  (doc_id, source, text, symbol, date, time) tuple per hit, taken from the
  hit's `_source`.
  """
  results = es.search(index=index_name, body=query_body, explain=False)
  fields = ('doc_id', 'source', 'text', 'symbol', 'date', 'time')
  plain_results = []
  for hit in results['hits']['hits']:
    document = hit['_source']
    plain_results.append(tuple(document[field] for field in fields))
  return results, plain_results

In [168]:
timestamp = "" #@param {type:"date"}
twitter = "GoldmanSachs" #@param {type: "string"} 
keyword = "" #@param {type: "string"} 

# One match clause per form field, in the order date, source, text.
match_clauses = [
    {"match": {field: value}}
    for field, value in (("date", timestamp), ("source", twitter), ("text", keyword))
]

# "should" matches on any clause; switch the key to "must" for a specific
# search where every clause has to match.
query_body = {"query": {"bool": {"should": match_clauses}}}

# Run the query and print the hits as a grid table
results, plain_results = search(index_name, query_body)
r = pd.DataFrame(plain_results)
if not plain_results:
  print("There are no results for these querys")
else:
  r.columns = ['doc_id', 'source', 'text', 'symbol', 'date', 'time']
  print(r.to_markdown(tablefmt='grid'))



+----+----------+--------------+----------------------------------------------------------------------------------------------------------------------------------------------+----------+------------+----------+
|    |   doc_id | source       | text                                                                                                                                         | symbol   | date       | time     |
|  0 |        0 | GoldmanSachs | VIDEO: “I was in my office. I was minding my own business...” –David Solomon tells $GS interns how he learned he wa… https://t.co/QClAITywXV | GS       | 2018-07-18 | 21:33:26 |
+----+----------+--------------+----------------------------------------------------------------------------------------------------------------------------------------------+----------+------------+----------+
|  1 |        0 | GoldmanSachs | VIDEO: “I was in my office. I was minding my own business...” –David Solomon tells $GS interns how he learned he wa… https: