# Import packages

In [1]:
import json
import os
import re
import time

import numpy as np
import pandas as pd
import requests

# Update local stock list

In [2]:
def update_stocklist_data():
    ''' Download the company list of each supported exchange into ./stock_list.
    Source: www.nasdaq.com

    One file per exchange (nasdaq, nyse, amex) is written under the
    `stock_list` folder of the current working directory; existing files
    are overwritten.

    Raises:
        requests.HTTPError: if the server answers with an error status, so
            an error page is never stored as if it were data.
        requests.Timeout: if the server does not answer within 30 seconds.

    NOTE(review): files are saved with an `.xlsx` suffix although a later
    cell parses the folder's files with `pd.read_csv` -- confirm the actual
    payload format.
    '''
    # create stock_list data folder; the trailing '' keeps the path separator
    # so that `folder + name` below is a valid path on every platform
    folder = os.path.join(os.getcwd(), 'stock_list', '')
    os.makedirs(folder, exist_ok=True)

    # source url
    url = 'https://www.nasdaq.com/screening/companies-by-name.aspx?letter=0&exchange=%s&render=download'

    # available exchanges
    exchange = ['nasdaq', 'nyse', 'amex']

    for exchg in exchange:
        # requests has no default timeout; without one a stalled server hangs the cell
        resp = requests.get(url%exchg, timeout=30)
        resp.raise_for_status()
        with open(folder + '%s.xlsx'%exchg, 'wb') as output:
            output.write(resp.content)

# Refresh the local stock_list files (performs network requests and overwrites existing files).
update_stocklist_data()

# Read local stock list and clean data

In [3]:
# data folder -- built with os.path.join so it works on any OS; the trailing ''
# keeps the final separator, so `folder + name` still yields a valid path
folder = os.path.join(os.getcwd(), 'stock_list', '')

# file names, sorted because os.listdir returns entries in arbitrary order and
# the load order decides which duplicate company row is kept later on
files = sorted(os.listdir( folder ))

files

['amex.xls', 'nasdaq.xls', 'nyse.xls']

In [4]:
# Load every exchange file into its own frame and print a short preview of each.
stolis_df_list = []
for f in files:
    df = pd.read_csv(folder + f)
    stolis_df_list += [df]

    # header line: file name, frame shape, separator, then the first company names
    print(
        f.upper(),
        df.shape,
        '\n==================================================\n',
        df[['Name']].head(),
    )
    print('==================================================\n')

AMEX.XLS (309, 9) 
                                                 Name
0                            22nd Century Group, Inc
1              Aberdeen Asia-Pacific Income Fund Inc
2                 Aberdeen Australia Equity Fund Inc
3  Aberdeen Emerging Markets Equity Income Fund, ...
4                  Aberdeen Global Income Fund, Inc.

NASDAQ.XLS (3450, 9) 
                                      Name
0                               111, Inc.
1  1347 Property Insurance Holdings, Inc.
2  1347 Property Insurance Holdings, Inc.
3                180 Degree Capital Corp.
4                 1-800 FLOWERS.COM, Inc.

NYSE.XLS (3104, 9) 
                      Name
0  3D Systems Corporation
1              3M Company
2         500.com Limited
3             58.com Inc.
4                 8x8 Inc



In [5]:
# stack the per-exchange frames on top of each other
stolis_df_ = pd.concat(stolis_df_list, axis=0)

# keep the first row seen for each company name, then renumber the rows
stolis_df = (
    stolis_df_
    .drop_duplicates(subset=['Name'])
    .reset_index(drop=True)
)

print('Total %s companies, unique %s companies.' % (len(stolis_df_), len(stolis_df)))

Total 6863 companies, unique 5770 companies.


# Prepare tweets data

In [6]:
# List the contents of the current working directory.
os.listdir()

['.ipynb_checkpoints',
 'Data handle.ipynb',
 'data.csv',
 'data.xlsx',
 'event_selecting_logic.md',
 'nasdaq.xls',
 'stock_list',
 'tweets.txt',
 'tweets_data.csv',
 'tweets_data.xlsx',
 'word frequency.csv',
 'words_database.xlsx',
 'word_list.csv',
 'word_lst.csv',
 '~$words_database.xlsx']

In [7]:
# filename
filename = 'tweets.txt'

# read txt file; the context manager closes the handle once the text is loaded
with open(filename) as fh:
    file = fh.read()

# Parse the JSON text into a dataframe. json.loads is used instead of eval():
# eval would execute whatever code the file contains, and rewriting
# 'true'/'false' across the whole text also altered those words inside tweets.
data = pd.DataFrame(json.loads(file))

# store tweets in excel
data.to_excel('data.xlsx')

In [8]:
def is_in(string, str_lst = ['data'], lower = True):
    ''' Detect whether words in *str_lst* occur in *string* as whole words.
    Input:

    -- string: text to examine
               formatted with '%s', so non-str values are stringified

    -- str_lst: a list of key words
                list of str
                default is *['data']*
                key words are matched literally (regex metacharacters are
                escaped); empty key words are ignored

    -- lower: determine whether capital letters are ignored or not,
              True -> ignore capital letters, the text is converted to lower
                      case before matching (and is returned lower-cased);
              False -> matching is case-sensitive, both in *string* and
                       *str_lst*.

    Output:

    -- if any key word is detected as a whole word:
           return the space-padded text with every occurrence replaced by
           ' {KEYWORD} ' (key word upper-cased)

       else:
           return numpy.nan
           (a key word that only appears inside a longer word, e.g. 'data'
           in 'database', does not count as detected)
    '''
    # Pad with spaces so a key word at the very start or end of the text still
    # has the non-word neighbour required by the look-arounds below.
    padded = ' %s ' % string
    if lower:
        padded = padded.lower()

    labelled = padded
    hits = 0
    for keyword in str_lst:
        if not keyword:
            # an empty pattern would match between any two non-word characters
            continue
        needle = keyword.lower() if lower else keyword
        label = ' {%s} ' % keyword.upper()
        # re.escape: key words such as 'c++' must be matched literally.
        # A function replacement keeps backslashes in the label from being
        # interpreted as regex template escapes.
        labelled, n = re.subn(r'(?<=\W)%s(?=\W)' % re.escape(needle),
                              lambda _match, label=label: label,
                              labelled)
        hits += n

    return labelled if hits else np.nan

In [9]:
def count_pattern(string, dict_):
    ''' Tally every brace-delimited label found in *string* into *dict_*.

    Input:
    -- string: text containing labels such as '{DATA}' (or '{ DATA }')
    -- dict_:  dict mapping label -> count; it is updated in place

    Output:
    -- the same *dict_* object, with each label's count (a float) increased
       by 1. for every occurrence in *string*
    '''
    # Whitespace just inside the braces is optional: the labelling helpers in
    # this notebook emit '{WORD}', which a pattern requiring '{ WORD }' never
    # matches. re.S lets a label span line breaks.
    pattern = re.compile(r'\{\s*(.*?)\s*\}', re.S)
    for label in pattern.findall(string):
        # explicit default instead of a catch-all except around `+=`
        dict_[label] = dict_.get(label, 0.) + 1.
    return dict_

In [10]:
# Show the column names of the tweets frame.
data.columns

Index(['created_at', 'favorite_count', 'id_str', 'is_retweet', 'retweet_count',
       'source', 'text'],
      dtype='object')

In [11]:
# Parse 'created_at' with pd.to_datetime, overwriting the original column.
data['created_at'] = pd.to_datetime( data['created_at'])

# Search STR_list

In [29]:
def search_str_lst(str_lst):
    ''' Print every tweet in which `is_in` detects a key word, and count the labels.

    Input:
    -- str_lst: list of key words passed on to `is_in`

    Output:
    -- dict mapping each label found by `count_pattern` to its number of
       occurrences over all printed tweets. (The tally used to be built and
       then thrown away; returning it makes it usable by the caller.)

    Raises:
    -- AssertionError if *str_lst* is not a list.

    Reads the global `data` frame (column 'text').
    '''
    assert isinstance(str_lst, list), 'Type of str_lst must be list!'
    dict_ = {}
    v_lst = data['text'].apply(is_in, str_lst= str_lst).dropna().values
    # enumerate from 1 so the footer reads "<position>/<total>"
    for i, v in enumerate(v_lst, start=1):
        dict_ = count_pattern(v, dict_)
        print(v)
        print('================================================%s/%s\n'%( i, len(v_lst) ))
    return dict_

In [30]:
# Run the key-word search over all tweets for the single key word 'data'.
search_str_lst(['data'])

 “outrageous, it’s the adam schiff problem. people abusing the access to classified  {DATA}  to then go out in public and make allegations that didn’t prove to be true. you look at a decision to essentially investigate a political rival. who made it?” james freeman, @wsj 

 cbs reports that in the roger stone indictment,  {DATA}  was “released during the 2016 election to damage hillary clinton.” oh really! what about the fake and unverified “dossier,” a total phony conjob, that was paid for by crooked hillary to damage me and the trump campaign? what... 

 many mostly democrat states refused to hand over  {DATA}  from the 2016 election to the commission on voter fraud. they fought hard that the commission not see their records or methods because they know that many people are voting illegally. system is rigged, must go to voter i.d. 



In [13]:
# Words to discard from company names (knock_out uses the set built from this list).
# The literal is normalised by removing every space and newline, lower-casing,
# and splitting on commas; repeated entries are therefore expected in the list.
drop_lst = '''Company,Laboratories,Inc,plc,corp,group,equities,pharmaceutical,technology,plc,coporation,co,
networks,green,texas,holdings,Inc,properties,holdings,energy,communications,limited,solutions,resources,brands,
hunt,companies,health,restrants,services,chemical,int,l,arts,resources,Holdings,Holding,Inc,Cos,Ltd,Corp,Co,plc,
red,space,under,Cos,Group,properties,Corporation,Incorporated,tree,business,city,Residential,Company,TOTAL,one,
line,gas,network,black,federal,union ,best,air,water,U,S,Trust,Arts,Communications,Chemical,Lifesciences,
Technologies,Systems,General,First,Street,Southern,Networks,Realty,Service,Class,A,Materials,Class,Cruise,Line,
Services,Financial,Resources,NATIONAL ,Foods,Scientific,Beauty,Realty,Communications,com,Automotive,Stores,
Technologies,International,WEST ,Markets,Machines,Sciences,Exchange,Tool,Works,Dynamics,Bank,Investment,limited,
Laboratories,NEWS ,technology,Resources,Resorts,equities,energy,health,Parts,brands,and,a,at,on,take,of,Church,
system,new,UNITED,Republic,OIL ,real,york,the,AMERICAN, america,state,C,Data,SECURITY ,companies,restrants
'''.replace(' ', '').replace('\n', '').lower().split(',')

In [14]:
# Collapse the list into a set: removes the repeated entries; knock_out takes it as its default drop list.
drop_set = {*drop_lst}

# Company name search

In [15]:
# Tokenise each company name into its runs of word characters.
name_df = stolis_df[['Name']].copy()

name_df['words'] = name_df['Name'].apply(lambda name: re.findall(r'(\w+)', name))
name_df.head()

Unnamed: 0,Name,words
0,"22nd Century Group, Inc","[22nd, Century, Group, Inc]"
1,Aberdeen Asia-Pacific Income Fund Inc,"[Aberdeen, Asia, Pacific, Income, Fund, Inc]"
2,Aberdeen Australia Equity Fund Inc,"[Aberdeen, Australia, Equity, Fund, Inc]"
3,"Aberdeen Emerging Markets Equity Income Fund, ...","[Aberdeen, Emerging, Markets, Equity, Income, ..."
4,"Aberdeen Global Income Fund, Inc.","[Aberdeen, Global, Income, Fund, Inc]"


In [16]:
def knock_out(lst, drop_lst = drop_set):
    ''' Remove uninformative words from a list of words.

    A word is discarded when its lower-cased form is in *drop_lst* or when
    it is shorter than two characters.

    Returns the surviving words as a list, or numpy.nan when none survive.
    '''
    survivors = [
        word for word in lst
        if not (word.lower() in drop_lst or len(word) < 2)
    ]
    return survivors if survivors else np.nan

# Filter every company's word list; names left without any word get NaN.
name_df['dropped'] = name_df['words'].apply(knock_out)

In [17]:
# Rows whose words were all filtered out carry NaN in 'dropped'.
no_words_left = name_df['dropped'].isnull()
dropped_companies = name_df[no_words_left]
print('%s companies are dropped out from stock list.'%len(dropped_companies))

58 companies are dropped out from stock list.


In [18]:
# Discard every row that has a missing value in any column.
name_df = name_df.dropna(axis=0, how='any')
print('%s companies are kept after dropping out.'%len(name_df))

5712 companies are kept after dropping out.


# Re-arrange key-words and companies

In [19]:
# Preview the frame after rows with missing values were removed.
name_df.head()

Unnamed: 0,Name,words,dropped
0,"22nd Century Group, Inc","[22nd, Century, Group, Inc]","[22nd, Century]"
1,Aberdeen Asia-Pacific Income Fund Inc,"[Aberdeen, Asia, Pacific, Income, Fund, Inc]","[Aberdeen, Asia, Pacific, Income, Fund]"
2,Aberdeen Australia Equity Fund Inc,"[Aberdeen, Australia, Equity, Fund, Inc]","[Aberdeen, Australia, Equity, Fund]"
3,"Aberdeen Emerging Markets Equity Income Fund, ...","[Aberdeen, Emerging, Markets, Equity, Income, ...","[Aberdeen, Emerging, Equity, Income, Fund]"
4,"Aberdeen Global Income Fund, Inc.","[Aberdeen, Global, Income, Fund, Inc]","[Aberdeen, Global, Income, Fund]"


In [20]:
# Save all words

In [23]:
def save_words_dict_data():
    ''' Export the word -> companies lookup to 'words_database.xlsx'.

    For every word in the 'dropped' column of the global `name_df`, the
    names of all companies containing that word are concatenated into one
    string (each name followed by ', '). The mapping is written with one
    row per word. Returns None.
    '''
    words_dict = {}
    for words, comp in zip(name_df['dropped'].values, name_df['Name'].values):
        for w in words:
            # explicit default instead of a catch-all except, so genuine
            # errors are not silently swallowed
            words_dict[w] = words_dict.get(w, '') + '%s, '%comp
    # a one-row frame transposed: words become the row index
    pd.DataFrame(words_dict, index = [0]).T.to_excel('words_database.xlsx')
# Write the word -> companies workbook to disk.
save_words_dict_data()

In [24]:
# word -> list of the companies whose name contains it (each entry is the
# company name followed by ', ')
words_dict = {}
for words, comp in zip(name_df['dropped'].values, name_df['Name'].values):
    for w in words:
        # setdefault creates the list on first sight of a word; no catch-all
        # except is needed to detect a missing key
        words_dict.setdefault(w, []).append('%s, '%comp)

In [25]:
# Inspect the word -> companies mapping (displays the whole dict).
words_dict

{'22nd': ['22nd Century Group, Inc, '],
 'Century': ['22nd Century Group, Inc, ',
  'Century Aluminum Company, ',
  'Century Bancorp, Inc., ',
  'Century Casinos, Inc., ',
  'Century Communities, Inc., '],
 'Aberdeen': ['Aberdeen Asia-Pacific Income Fund Inc, ',
  'Aberdeen Australia Equity Fund Inc, ',
  'Aberdeen Emerging Markets Equity Income Fund, Inc., ',
  'Aberdeen Global Income Fund, Inc., ',
  'Aberdeen Global Dynamic Dividend Fund, ',
  'Aberdeen Global Premier Properties Fund, ',
  'Aberdeen Income Credit Strategies Fund, ',
  'Aberdeen Japan Equity Fund, Inc. , ',
  'Aberdeen Total Dynamic Dividend Fund, ',
  'First Trust/Aberdeen Emerging Opportunity Fund, ',
  'First Trust/Aberdeen Global Opportunity Income Fund, '],
 'Asia': ['Aberdeen Asia-Pacific Income Fund Inc, ',
  'Asia Pacific Wire & Cable Corporation Limited, ',
  'First Trust Asia Pacific Ex-Japan AlphaDEX Fund, ',
  'First Trust RiverFront Dynamic Asia Pacific ETF, ',
  'iClick Interactive Asia Group Limited, '

In [26]:
# Progress-tracking state used by find_comp: number of calls so far and the start time.
i = 0
t = time.time()
def find_comp(string, words_dict = words_dict.copy(), lower = True,):
    ''' Find companies whose name words occur in a text.

    Input:
    -- string: text to scan (str); it is lower-cased before matching
    -- words_dict: dict mapping a word to the list of company strings that
                   contain it; defaults to a copy of the global `words_dict`
                   taken when this function is defined
    -- lower: kept for backward compatibility; the text is always
              lower-cased, so matching is case-insensitive either way

    Output:
    -- (labelled_text, companies) when at least one word is found as a whole
       word: the space-padded, lower-cased text with each found word replaced
       by ' {WORD} ' (upper-cased), and the matching company strings joined
       together, de-duplicated in first-seen order
    -- (numpy.nan, numpy.nan) otherwise

    Side effects: increments the global counter `i` and, every 10th call,
    prints elapsed / remaining / total time estimated from the global start
    time `t` and the number of rows in the global `data['text']`.
    '''
    #====================================================
    # pad so a word at the very start or end has a non-word neighbour
    string = ' %s '%string.lower()

    def whole_word(word):
        # literal match (escaped) delimited by non-word characters on both sides
        return r'(?<=\W)%s(?=\W)'%re.escape(word)

    def my_label(string, str_lst):
        for s in str_lst:
            label = ' {%s} '%s.upper()
            # function replacement: the label is inserted verbatim
            string = re.sub(whole_word(s.lower()), lambda _m, label=label: label, string )
        return string

    str_lst  = []
    comp_lst = []

    for k, comp in words_dict.items():
        needle = k.lower()
        # The substring test is a cheap prefilter: the regex can only match
        # when the word occurs in the text at all.
        if needle in string and re.search(whole_word(needle), string):
            str_lst.append(needle)
            comp_lst += comp

    global i
    i += 1
    T = time.time() - t
    v = T/i
    Tt = data['text'].shape[0] * v

    if i % 10 == 0:
        # '%s secs': the former '% secs' consumed the 's' as the conversion
        # character and printed e.g. '1.5ecs elapsed,'
        print('%s secs elapsed,'%T, '%s sec(s) left,'%(Tt - T), 'total %s sec(s)'%Tt )

    if len(str_lst) > 0:
        # dict.fromkeys de-duplicates while keeping first-seen order, so the
        # joined company string is the same on every run (a set is not ordered)
        return my_label(string, str_lst = list(dict.fromkeys(str_lst))), ''.join( dict.fromkeys(comp_lst) )

    else:
        return np.nan, np.nan

# Run find_comp on every tweet; wrapping the returned pair in a Series gives one column per element.
res = data['text'].apply(lambda x: pd.Series( find_comp(x) ) )