# Import packages

In [1]:
import json
import os
import re
import time

import numpy as np
import pandas as pd
import requests

# Update local stock list

In [2]:
def update_stocklist_data():
    ''' Download the current company list of every exchange.
    Source: www.nasdaq.com

    One file per exchange (nasdaq, nyse, amex) is written to the
    *stock_list* folder below the current working directory; the folder is
    created when missing. The response body is stored unchanged under the
    name '<exchange>.xlsx'.

    Raises:

    -- requests.HTTPError: when the server answers with an error status,
                           so that an error page is never saved as data
    '''
    # create stock_list data folder (os.path.join keeps this portable)
    folder = os.path.join(os.getcwd(), 'stock_list')
    os.makedirs(folder, exist_ok=True)

    # source url
    url = 'https://www.nasdaq.com/screening/companies-by-name.aspx?letter=0&exchange=%s&render=download'

    # available exchanges
    exchange = ['nasdaq', 'nyse', 'amex']

    for exchg in exchange:
        # a timeout keeps the notebook from hanging on a stalled connection
        resp = requests.get(url % exchg, timeout=30)
        # fail before the file is opened so that no broken file is left behind
        resp.raise_for_status()
        with open(os.path.join(folder, '%s.xlsx' % exchg), 'wb') as output:
            output.write(resp.content)

# update stock_list: runs the download right away, one HTTP request per exchange
update_stocklist_data()

# Read local stock list and clean data

In [3]:
# data folder; the empty last component makes os.path.join end the path with
# a separator, so `folder + filename` keeps working
folder = os.path.join(os.getcwd(), 'stock_list', '')

# file names, sorted so that the load order does not depend on the filesystem
files = sorted(os.listdir(folder))

files

['amex.xlsx', 'nasdaq.xlsx', 'nyse.xlsx']

In [4]:
# Load each exchange file into its own frame and preview the company names.
# The files carry an .xlsx name but are parsed as CSV text here.
stolis_df_list = []
for f in files:
    df = pd.read_csv(folder + f)
    stolis_df_list += [df]
    preview = df[['Name']].head()
    print(f.upper(), df.shape, '\n==================================================\n', preview)
    print('==================================================\n')

AMEX.XLSX (309, 9) 
                                                 Name
0                            22nd Century Group, Inc
1              Aberdeen Asia-Pacific Income Fund Inc
2                 Aberdeen Australia Equity Fund Inc
3  Aberdeen Emerging Markets Equity Income Fund, ...
4                  Aberdeen Global Income Fund, Inc.

NASDAQ.XLSX (3447, 9) 
                                      Name
0                               111, Inc.
1  1347 Property Insurance Holdings, Inc.
2  1347 Property Insurance Holdings, Inc.
3                180 Degree Capital Corp.
4                 1-800 FLOWERS.COM, Inc.

NYSE.XLSX (3108, 9) 
                      Name
0  3D Systems Corporation
1              3M Company
2         500.com Limited
3             58.com Inc.
4                 8x8 Inc



In [5]:
# concatenate companies from three exhanges
stolis_df_ = pd.concat(stolis_df_list, axis = 0)

# drop out fund
stolis_df_ = stolis_df_[stolis_df_['industry'] == stolis_df_['industry']]

# drop dupplicated company names
stolis_df = stolis_df_.drop_duplicates(['Name']).reset_index(drop = True)

print('Total %s companies, unique %s companies.' % (stolis_df_.shape[0], stolis_df.shape[0]))

Total 5304 companies, unique 4793 companies.


# Prepare tweets data

In [6]:
# List the working directory to see which data files are present
os.listdir()

['.ipynb_checkpoints',
 'Data handle.ipynb',
 'data.csv',
 'event_selecting_logic.md',
 'stock_list',
 'tweets.txt',
 'tweets_data.csv',
 'tweets_data.xlsx',
 'word frequency.csv',
 'word_list.csv',
 'word_lst.csv']

In [7]:
# filename
filename = 'tweets.txt'

# read txt file; the context manager closes the handle again
# NOTE(review): the platform default encoding is still used - confirm the
# encoding of tweets.txt before pinning one explicitly
with open(filename) as tweets_file:
    file = tweets_file.read()

# convert json format to dataframe
# json.loads understands true/false/null natively, never executes the file
# content (eval did) and leaves words such as "true" inside tweets untouched
data = pd.DataFrame(json.loads(file))

# store tweets in excel
data.to_excel('data.xlsx')

In [8]:
def is_in(string, str_lst = ['data'], lower = True):
    ''' Detect whether words in *str_lst* exist in *string* or not.
    Input:

    -- string: string to examine
               str format

    -- str_lst: a list of key words, matched literally (regex
                metacharacters in a key word have no special meaning)
                list of str
                default is *['data']*

    -- lower: determine whether capital letters are ignored or not,
              True -> ignore capital letters, *string* and the key words
                      are transformed to lower case before matching;
              False -> capital letters matter, both in *string* and *str_lst*.

    Output:

    -- if any key word is detected:
           return the string, padded with one space on each side (and
           lower-cased when *lower* is True), with every key word replaced
           by ' {KEYWORD} ' (key word in upper case)

       else:
           return numpy.nan
    '''
    # Pad with spaces so that a key word at the very start or end of the
    # text still has a non-word character on both sides.
    text = ' %s ' % string
    if lower:
        text = text.lower()

    labelled = text
    for s in str_lst:
        needle = s.lower() if lower else s
        tag = ' {%s} ' % s.upper()
        # re.escape: the key word is literal text, not a pattern.
        # A function replacement keeps backslashes in the tag literal as well.
        labelled = re.sub(r'(?<=\W)%s(?=\W)' % re.escape(needle),
                          lambda _match, tag=tag: tag,
                          labelled)

    if labelled != text:
        return labelled
    return np.nan

In [9]:
def count_pattern(string, dict_):
    ''' Count the labelled key words of *string* into *dict_*.

    Input:

    -- string: text in which key words are wrapped in curly braces,
               e.g. ' {AGREE} ' as produced by *is_in*; whitespace just
               inside the braces ('{ AGREE }') is tolerated

    -- dict_: running tally, key word -> count (float)
              it is updated in place

    Output:

    -- the same *dict_* object, with 1. added for every label found
    '''
    # The previous pattern '{ (.*?) }' demanded a space after '{' and before
    # '}', which the ' {WORD} ' labels never contain, so nothing was counted.
    for label in re.findall(r'\{\s*(.*?)\s*\}', string, re.S):
        dict_[label] = dict_.get(label, 0.) + 1.
    return dict_

In [10]:
# Inspect the column names of the tweets table
data.columns

Index(['created_at', 'favorite_count', 'id_str', 'is_retweet', 'retweet_count',
       'source', 'text'],
      dtype='object')

In [11]:
# Parse the creation timestamps into pandas datetimes
data['created_at'] = pd.to_datetime( data['created_at'])

In [12]:
# Remove https links from the tweet text. The pattern is a raw string so that
# \S reaches the regex engine instead of being an invalid string escape.
data['text'] = data['text'].apply(lambda x: re.sub(r'https://\S+', '', x))

# Search STR_list

In [13]:
def search_str_lst(str_lst, print_ = True):
    ''' Collect every tweet that contains at least one key word.

    Input:

    -- str_lst: key words to look for
                must be a list

    -- print_: True -> print each labelled tweet followed by a
                       'position/total' separator line

    Output:

    -- numpy array of the labelled tweets (see *is_in*); tweets without
       any key word are left out
    '''
    assert type(str_lst) is list, 'Type of str_lst must be list!'

    labelled = data['text'].apply(is_in, str_lst= str_lst)
    v_lst = labelled.dropna().values
    if not print_:
        return v_lst

    # the tally is filled while printing; it is not part of the return value
    dict_ = {}
    total = len(v_lst)
    for position, tweet in enumerate(v_lst, start=1):
        dict_ = count_pattern(tweet, dict_)
        print(tweet)
        print('================================================%s/%s\n'%( position, total ))
    return v_lst

In [14]:
# Example search: print every tweet containing the key word "agree"
# (is_in ignores capital letters by default)
v_lst = search_str_lst(['AGREE'])

 time magazine called to say that i was probably going to be named “man (person) of the year,” like last year, but i would have to  {AGREE}  to an interview and a major photo shoot. i said probably is no good and took a pass. thanks anyway! 

 i  {AGREE}  getting tax cuts approved  is important (we will also get healthcare), but perhaps no administration has done more in its first..... 

 thank you @geraldorivera @foxandfriends.  {AGREE} !  

 a great and important day at the united nations.met with leaders of many nations who  {AGREE}  with much (or all) of what i stated in my speech! 

 while all  {AGREE}  the u. s. president has the complete power to pardon, why think of that when only crime so far is leaks against us.fake news 

 we finally  {AGREE}  on something rosie.  

 "one of the most effective press conferences i've ever seen!" says rush limbaugh. many  {AGREE} .yet fake media  calls it differently! dishonest 

 interesting that certain middle-eastern countries  {AGREE}  wit

In [16]:
# Hand-written list of words that must not be used as company key words.
# The literal is normalised at the end of the statement: spaces and newlines
# are removed, everything is lower-cased and the text is split on commas
# (the ',,' inside the text therefore produces one empty-string entry).
drop_lst = '''Company,Laboratories,Inc,plc,corp,group,equities,pharmaceutical,technology,plc,coporation,co,Resource,
networks,green,texas,holdings,Inc,properties,holdings,energy,communications,limited,solutions,resources,brands,SUMMIT,
hunt,companies,health,restrants,services,chemical,int,l,arts,resources,Holdings,Holding,Inc,Cos,Ltd,Corp,Co,plc,PEOPLE,
red,space,under,Cos,Group,properties,Corporation,Incorporated,tree,business,city,Residential,Company,TOTAL,one,aid,up,
line,gas,network,black,federal,union ,best,air,water,U,S,Trust,Arts,Communications,Chemical,Lifesciences,JUST,usa,
Technologies,Systems,General,First,Street,Southern,Networks,Realty,Service,Class,A,Materials,Class,Cruise,Line,180,
Services,Financial,Resources,NATIONAL ,Foods,Scientific,Beauty,Realty,Communications,com,Automotive,Stores,Mueller,
Technologies,International,WEST ,Markets,Machines,Sciences,Exchange,Tool,Works,Dynamics,Bank,Investment,limited,Simply,
Laboratories,NEWS ,technology,Resources,Resorts,equities,energy,health,Parts,brands,and,a,at,on,take,of,Church,forward,
system,new,UNITED,Republic,OIL ,real,york,the,AMERICAN, america,state,C,Data,SECURITY ,companies,restrants,PLANS,can,
Industries,Gold,Management,Education,REIT,Acquisition,Partners,LP,China,Hospitality,Medical,Capital,Royalty,world,
Electronics,39,Enterprisesde,Investors,Industrial,Power,Products,Property,Insurance,Finance,Life,Worldwide,Electric,
if,now,all,care,nation,by,do,mexico,it,in,me,CHECK,great,sports,good,golf,big,AGREE,GROWTH,north,world,korea,forward,
Software,Pacific,Global,Bio,Entertainment,Media,Community,Estate,LLC,Hotels,for,Cool,first,second,third,fifth,fourth,
Strong,Healthcare,Cohen,Standard,Star,Opportunity,Level,Plus,HealthCare,Georgia,Place,States,Information,800,Wisconsin,
Source,Mark,Public,Way,Manufacturing,Funding,Better,South,Carolina,James,Beyond,Stock,Private,Career,European,Point,
Children,Citizens,Clean,Center,Consumer,County,Old,Country,Credit,Journal,Dollar,Victory,TRADE,Montana,East,Focus,
Ever,Live,Payments,Washington,Stay,Farmers,Choice,Indiana,Foundation,US,Five,Prime,Full,House,Fusion,Future,Times,Fix,
Troy,Henry,Home,Hope,Building,Infrastructure,Support,Money,Japan,Control,John,Kelly,Defense,End,Smart,Marine,Merit,
Con,Modern,Mr,My,NICE,Office,Ohio,Steel,Stop,Open,Re,Patrick,Virginia,Points,Popular,Positive,Progress,Safe,Safety,
Special,Games,Florida,Number,SMART,Missouri,Spirit,Market,support,Price,Two,Long,Joint,Meet,Trade,Top,TOP,Tuesday,de,
Wins,500,Interest,Communities,Pittsburgh,Lots,Cia,Family,Build,Canada,Cars,Fair,Israel,Clear,well,40,,Fire,Champion,
Dr,Government,Mississippi,Far,Fortune,Host,Game,Las,Argentina,Europe,Party,Pennsylvania,Post,RE,Ready,Robert,San,Six,
Joe,Team,France,Tennessee,AS,Met,Waters,White,Therapeutics,CALIFORNIA,strong,Recovery,Pharmaceuticals,Morning,Harvard
'''.replace(' ', '').replace('\n', '').lower().split(',')

# Extend the list with the index labels of the workbook rows whose 'drop'
# column equals 0.
# NOTE(review): the labels are appended as they are (not lower-cased), while
# knock_out compares lower-cased words - confirm that the index really holds
# lower-case words and not row numbers.
# NOTE(review): the workbook name suggests it was derived from
# words_database.xlsx, which is only written further down in this notebook -
# confirm the file is available on a fresh run.
df = pd.read_excel('words_databse(1st_cleaned).xlsx')
drop_lst += df[df['drop'] == 0].index.tolist()


In [17]:
# a set removes the duplicates and gives fast membership tests
drop_set = set(drop_lst)

# Company name search

In [18]:
# split stock names
name_df = stolis_df[['Name']].copy()

name_df['words'] = pd.DataFrame( name_df['Name'].apply(lambda x: \
                                                        re.findall(r'(\w+)', x) ) )
name_df.head()

Unnamed: 0,Name,words
0,"22nd Century Group, Inc","[22nd, Century, Group, Inc]"
1,Acme United Corporation.,"[Acme, United, Corporation]"
2,"Actinium Pharmaceuticals, Inc.","[Actinium, Pharmaceuticals, Inc]"
3,"Adams Resources & Energy, Inc.","[Adams, Resources, Energy, Inc]"
4,AeroCentury Corp.,"[AeroCentury, Corp]"


In [19]:
def knock_out(lst, drop_lst = drop_set):
    ''' Remove unusable words from a word list.

    A word is removed when its lower-case form is contained in *drop_lst*
    or when it is shorter than two characters.

    Output:

    -- list of the remaining words, in their original order
       numpy.nan when no word remains
    '''
    survivors = [word for word in lst
                 if not (word.lower() in drop_lst or len(word) < 2)]
    return survivors if survivors else np.nan

# Filter the words of every company; companies without any remaining word get NaN
name_df['dropped'] = name_df['words'].apply(knock_out)

In [20]:
# companies whose words were all filtered out carry NaN in 'dropped'
dropped_companies = name_df[name_df['dropped'].isna()]
print('%s companies are dropped out from stock list.' % len(dropped_companies))

214 companies are dropped out from stock list.


In [21]:
# discard every row that has a missing value in any column
name_df = name_df.dropna(axis=0, how='any')
print('%s companies are kept after dropping out.' % len(name_df))

4579 companies are kept after dropping out.


# Re-arrange key-words and companies

In [22]:
# Preview the remaining companies together with their filtered word lists
name_df.head()

Unnamed: 0,Name,words,dropped
0,"22nd Century Group, Inc","[22nd, Century, Group, Inc]",[Century]
1,Acme United Corporation.,"[Acme, United, Corporation]",[Acme]
2,"Actinium Pharmaceuticals, Inc.","[Actinium, Pharmaceuticals, Inc]",[Actinium]
3,"Adams Resources & Energy, Inc.","[Adams, Resources, Energy, Inc]",[Adams]
4,AeroCentury Corp.,"[AeroCentury, Corp]",[AeroCentury]


In [23]:
# Save all words

In [24]:
def save_words_dict_data():
    ''' Write the key word -> companies table to words_database.xlsx.

    Reads the global *name_df*: for every word in a company's 'dropped'
    list, the company name followed by ', ' is appended to the text stored
    for that word. The result is saved with one row per key word.
    '''
    words_dict = {}
    # walk over words and names in parallel instead of indexing by position
    for words, comp in zip(name_df['dropped'].values, name_df['Name'].values):
        for w in words:
            # dict.get supplies the empty start value for a new key word,
            # so no blanket except clause is needed
            words_dict[w] = words_dict.get(w, '') + '%s, '%comp
    pd.DataFrame(words_dict, index = [0]).T.to_excel('words_database.xlsx')
# Build the table and write words_database.xlsx right away
save_words_dict_data()

In [25]:
# key word -> list of 'Company name, ' strings of all companies whose
# filtered word list contains that key word
words_dict = {}
for words, comp in zip(name_df['dropped'].values, name_df['Name'].values):
    for w in words:
        # setdefault creates the list on first sight of a key word; this
        # replaces the blanket except clause used for initialisation
        words_dict.setdefault(w, []).append('%s, '%comp)

# Drop zero search frequency key words

In [26]:
# Remove every key word that does not occur in any tweet and report the
# elapsed / estimated remaining time after every 100 key words.
i = 0
t = time.time()
# iterate over the keys of a copy so that entries can be removed meanwhile
k_lst = words_dict.copy().keys()
for i, k in enumerate(k_lst, 1):
    hits = search_str_lst([k], False)
    if len(hits) == 0:
        del words_dict[k]

    T = time.time() - t
    v = T/i               # average seconds per key word so far
    Tt = len(k_lst) * v   # projected total run time

    if i % 100 != 0:
        continue
    print('%.4f sec(s) elapsed,'%T, '%.4f sec(s) left,'%(Tt - T), 'total %.4f sec(s)'%Tt )

4.1599 sec(s) elapsed, 191.1893 sec(s) left, total 195.3492 sec(s)
7.8869 sec(s) elapsed, 177.2983 sec(s) left, total 185.1852 sec(s)
11.7234 sec(s) elapsed, 171.7872 sec(s) left, total 183.5106 sec(s)
15.9262 sec(s) elapsed, 171.0472 sec(s) left, total 186.9733 sec(s)
19.6449 sec(s) elapsed, 164.8602 sec(s) left, total 184.5051 sec(s)
23.5924 sec(s) elapsed, 161.0572 sec(s) left, total 184.6496 sec(s)
27.5600 sec(s) elapsed, 157.3282 sec(s) left, total 184.8882 sec(s)
31.3700 sec(s) elapsed, 152.7717 sec(s) left, total 184.1416 sec(s)
35.1512 sec(s) elapsed, 148.2600 sec(s) left, total 183.4112 sec(s)
38.7487 sec(s) elapsed, 143.2151 sec(s) left, total 181.9638 sec(s)
42.5058 sec(s) elapsed, 138.9552 sec(s) left, total 181.4610 sec(s)
46.8651 sec(s) elapsed, 136.5338 sec(s) left, total 183.3989 sec(s)
50.9978 sec(s) elapsed, 133.2219 sec(s) left, total 184.2196 sec(s)
54.9952 sec(s) elapsed, 129.4743 sec(s) left, total 184.4695 sec(s)
58.6670 sec(s) elapsed, 124.9999 sec(s) left, tota

In [27]:
# Number of key words still present in words_dict
len(words_dict.keys())

426

# Find mentioned companies and mark key words out in tweets

In [28]:
# call counter and start time used by find_comp for its progress report
i = 0
t = time.time()
def find_comp(string, words_dict = words_dict.copy(), lower = True):
    ''' Find the companies whose key words occur in a tweet.

    Input:

    -- string: tweet text
               it is always lower-cased before matching

    -- words_dict: mapping key word -> list of company strings
                   default is a copy of the global *words_dict*, taken when
                   this definition is executed

    -- lower: kept for backward compatibility; the key words are always
              compared in lower case, the flag only decides whether the
              matched key words are stored lower-cased internally

    Output:

    -- if at least one key word occurs in the tweet:
           (the lower-cased tweet with every matched key word replaced by
            ' {KEYWORD} ', the company strings of all matched key words
            joined together, without duplicates, in the order found)

       else:
           (numpy.nan, numpy.nan)

    Side effect: increases the global counter *i* and, on every 100th call,
    prints elapsed / remaining / total time estimated from the global *t*
    and the number of rows in the global *data*.
    '''
    #====================================================
    string = ' %s '%string.lower()

    def key_pattern(word):
        # the key word is literal text: escape it before building the pattern
        return r'(?<=\W)%s(?=\W)' % re.escape(word.lower())

    def my_label(string, str_lst):
        for s in str_lst:
            tag = ' {%s} '%s.upper()
            string = re.sub(key_pattern(s), lambda _match, tag=tag: tag, string)
        return string

    str_lst  = []
    comp_lst = []

    for k, comp in words_dict.items():
        # a search is enough to detect a hit; labelling happens once at the end
        if re.search(key_pattern(k), string):
            str_lst.append(k.lower() if lower else k)
            comp_lst += comp

    global i
    i += 1
    T = time.time() - t
    v = T/i
    Tt = data['text'].shape[0] * v

    if i % 100 == 0:
        print('%.4f sec(s) elapsed,'%T, '%.4f sec(s) left,'%(Tt - T), 'total %.4f sec(s)'%Tt )

    if len(str_lst) > 0:
        # dict.fromkeys removes duplicates but, unlike a set, keeps the
        # first-seen order, so the output is the same on every run
        return my_label(string, str_lst = dict.fromkeys(str_lst)), ''.join( dict.fromkeys(comp_lst) )

    else:
        return np.nan, np.nan

In [29]:
# Run find_comp on every tweet; wrapping the returned pair in a Series turns
# it into one row with two columns
res = data['text'].apply(lambda x: pd.Series( find_comp(x) ) )

0.7344 sec(s) elapsed, 18.3522 sec(s) left, total 19.0866 sec(s)
1.3594 sec(s) elapsed, 16.3057 sec(s) left, total 17.6651 sec(s)
1.9531 sec(s) elapsed, 14.9674 sec(s) left, total 16.9206 sec(s)
2.4688 sec(s) elapsed, 13.5720 sec(s) left, total 16.0408 sec(s)
2.9531 sec(s) elapsed, 12.3972 sec(s) left, total 15.3504 sec(s)
3.4063 sec(s) elapsed, 11.3485 sec(s) left, total 14.7548 sec(s)
3.8750 sec(s) elapsed, 10.5123 sec(s) left, total 14.3873 sec(s)
4.3750 sec(s) elapsed, 9.8383 sec(s) left, total 14.2133 sec(s)
4.8594 sec(s) elapsed, 9.1734 sec(s) left, total 14.0328 sec(s)
5.3281 sec(s) elapsed, 8.5197 sec(s) left, total 13.8478 sec(s)
5.8281 sec(s) elapsed, 7.9421 sec(s) left, total 13.7703 sec(s)
6.2969 sec(s) elapsed, 7.3411 sec(s) left, total 13.6380 sec(s)
6.8125 sec(s) elapsed, 6.8073 sec(s) left, total 13.6198 sec(s)
7.3058 sec(s) elapsed, 6.2569 sec(s) left, total 13.5627 sec(s)
7.7745 sec(s) elapsed, 5.6961 sec(s) left, total 13.4707 sec(s)
8.2471 sec(s) elapsed, 5.1493 sec

In [30]:
# Name the two result columns: labelled tweet text and matched companies
res.columns = ['tweets', 'companies']

In [31]:
# Put the timestamps next to the results, drop the rows that contain NaN
# (find_comp returns NaN for tweets without a key word) and renumber the rows
result = pd.concat([res, data['created_at']], axis = 1).dropna().reset_index(drop = True)

In [32]:
# Save the labelled tweets with their companies and timestamps
result.to_excel('marked_tweets.xlsx')