# Import packages

In [1]:
import json
import os
import re
import time

import numpy as np
import pandas as pd
import requests

# Update local stock list

In [2]:
def update_stocklist_data(timeout = 30):
    '''Download the company list of every supported exchange into the
    stock_list folder of the current working directory.

    Source: www.nasdaq.com

    Input:

    -- timeout: seconds to wait for each HTTP request before giving up
                number, default is 30

    Output:

    -- None; one file named <exchange>.xlsx is written per exchange.

    Raises:

    -- requests.HTTPError if the server answers with an error status, so an
       error page is never stored in place of the data.
    '''
    # create stock_list data folder (no error if it is already there)
    folder = os.getcwd() + '\\stock_list\\'
    os.makedirs(folder, exist_ok = True)

    # source url; %s is filled with the exchange name
    url = 'https://www.nasdaq.com/screening/companies-by-name.aspx?letter=0&exchange=%s&render=download'

    # available exchanges
    exchange = ['nasdaq', 'nyse', 'amex']

    for exchg in exchange:
        resp = requests.get(url%exchg, timeout = timeout)
        # Fail before touching the local file, so a previous good download
        # is not overwritten by an error response.
        resp.raise_for_status()
        # NOTE: the file keeps its historical '.xlsx' name although the
        # notebook later parses it with pd.read_csv.
        with open(folder + '%s.xlsx'%exchg, 'wb') as output:
            output.write(resp.content)

# update stock_list
update_stocklist_data()

# Read local stock list and clean data

In [3]:
# data folder
folder = os.getcwd() + '\\stock_list\\'

# file names -- os.listdir returns entries in arbitrary order, so sort them to
# keep the later concatenation / de-duplication reproducible between runs
files = sorted(os.listdir(folder))

files

['amex.xlsx', 'nasdaq.xlsx', 'nyse.xlsx']

In [4]:
# Load every downloaded exchange file and print a short preview of each one.
stolis_df_list = []
for f in files:
    df = pd.read_csv(folder + f)
    stolis_df_list.append(df)

    # header line: file name in capitals plus (rows, columns)
    title = f.upper()
    first_names = df[['Name']].head()
    print(title, df.shape, '\n==================================================\n',
          first_names)
    print('==================================================\n')

AMEX.XLSX (309, 9) 
                                                 Name
0                            22nd Century Group, Inc
1              Aberdeen Asia-Pacific Income Fund Inc
2                 Aberdeen Australia Equity Fund Inc
3  Aberdeen Emerging Markets Equity Income Fund, ...
4                  Aberdeen Global Income Fund, Inc.

NASDAQ.XLSX (3447, 9) 
                                      Name
0                               111, Inc.
1  1347 Property Insurance Holdings, Inc.
2  1347 Property Insurance Holdings, Inc.
3                180 Degree Capital Corp.
4                 1-800 FLOWERS.COM, Inc.

NYSE.XLSX (3108, 9) 
                      Name
0  3D Systems Corporation
1              3M Company
2         500.com Limited
3             58.com Inc.
4                 8x8 Inc



In [5]:
# Stack the companies of the three exchanges and keep only the rows that
# have an 'industry' value (the rows without one are treated as funds here).
stolis_df_ = (
    pd.concat(stolis_df_list, axis = 0)
      .loc[lambda frame: frame['industry'].notna()]
)

# keep the first row of every duplicated company name
stolis_df = stolis_df_.drop_duplicates(subset = ['Name']).reset_index(drop = True)

print('Total %s companies, unique %s companies.' % (len(stolis_df_.index), len(stolis_df.index)))

Total 5304 companies, unique 4793 companies.


# Prepare tweets data

In [6]:
# List the working directory to see which input / output files are present.
os.listdir()

['.ipynb_checkpoints',
 'Data handle.ipynb',
 'data.xlsx',
 'event_selecting_logic.md',
 'marked_tweets.xlsx',
 'nasdaq.xls',
 'stock_list',
 'tweets.txt',
 'tweets_data.xlsx',
 'words_database.xlsx',
 'words_databse(1st_cleaned).xlsx',
 'word_list.csv',
 'yahoo_crawler.py']

In [7]:
# filename
filename = 'tweets.txt'

# read txt file (closed again as soon as it has been read)
with open(filename, encoding = 'utf-8') as tweets_file:
    file = tweets_file.read()

# convert json format to dataframe
# json.loads understands true / false natively, leaves the tweet text
# untouched and does not execute the file content the way eval() would.
data = pd.DataFrame(json.loads(file))

# store tweets in excel
data.to_excel('data.xlsx')

In [8]:
def is_in(string, str_lst = ['data'], lower = True):
    ''' Detect whether words in *str_lst* exist in *string* or not.
    Input:

    -- string: string to examine
               str format (any other value is formatted with '%s' first)

    -- str_lst: a list of key words
                list of str
                default is *['data']*
                Key words are matched literally (regex metacharacters are
                escaped) and only as whole words, i.e. with a non-word
                character on both sides.

    -- lower: determine whether capital letters are ignored or not,
              True -> ignore capital letters, transform *string* and the key
                      words to lower case before matching;
              False -> capital letters matter, both in *string* and *str_lst*.

    Output:

    -- if any key word is detected:
           return the string, padded with one space on each side (and lower
           cased when *lower* is True), with every hit replaced by
           ' {KEYWORD} ' (key word in upper case)

       else:
           return numpy.nan
    '''
    # Pad with spaces so the \W look-arounds also succeed at both ends.
    text = ' %s '%string
    if lower:
        text = text.lower()

    labelled = text
    for s in str_lst:
        needle = s.lower() if lower else s
        marker = ' {%s} '%s.upper()
        # re.escape keeps characters such as '.', '+' or '(' literal; the
        # function replacement keeps backslashes in the marker literal too.
        labelled = re.sub(r'(?<=\W)%s(?=\W)'%re.escape(needle),
                          lambda _match, marker = marker: marker,
                          labelled)

    # An unchanged text means that no key word was found.
    if labelled != text:
        return labelled
    return np.nan

In [9]:
def count_pattern(string, dict_):
    ''' Count the key words that are marked with braces in *string*.
    Input:

    -- string: text containing markers such as ' {WORD} ' (the form written
               by is_in); the older '{ WORD }' form is accepted as well

    -- dict_: dict mapping key word -> count; it is updated in place

    Output:

    -- the same *dict_*, with the count of every marked key word increased
       by 1. for each occurrence
    '''
    # Optional blanks inside the braces are skipped so both marker forms work.
    for word in re.findall(r'\{\s*(.*?)\s*\}', string, re.S):
        dict_[word] = dict_.get(word, 0.) + 1.
    return dict_

In [10]:
# Show the column labels of the tweets table.
data.columns

Index(['created_at', 'favorite_count', 'id_str', 'is_retweet', 'retweet_count',
       'source', 'text'],
      dtype='object')

In [11]:
# Convert the 'created_at' column with pd.to_datetime and store it back.
data['created_at'] = pd.to_datetime( data['created_at'])

In [12]:
# Remove links from the tweet text. The pattern is a raw string ('\S' is not a
# valid escape in a normal string literal) and also covers plain http:// links.
data['text'] = data['text'].str.replace(r'https?://\S+', '', regex = True)

# Search the tweets for a list of key words

In [13]:
def search_str_lst(str_lst, print_ = True):
    ''' Search all tweets in data['text'] for the key words in *str_lst*.
    Input:

    -- str_lst: key words handed to is_in
                list of str

    -- print_: True -> print every matching tweet followed by a
                       'position/total' separator line;
               False -> search silently.

    Output:

    -- numpy array with the labelled text of every tweet that contains at
       least one key word (tweets without a hit are dropped)
    '''
    assert isinstance(str_lst, list), 'Type of str_lst must be list!'
    # is_in returns NaN for tweets without a hit; dropna keeps the hits only.
    v_lst = data['text'].apply(is_in, str_lst= str_lst).dropna().values
    if print_:
        total = len(v_lst)
        for i, v in enumerate(v_lst, 1):
            print(v)
            print('================================================%s/%s\n'%( i, total ))
    return v_lst

In [14]:
# Print every tweet containing the word 'agree' (matching ignores case by
# default) and keep the labelled texts.
v_lst = search_str_lst(['AGREE'])

 i  {AGREE}  with kim jong un of north korea that our personal relationship remains very good, perhaps the term excellent would be even more accurate, and that a third summit would be good in that we fully understand where we each stand. north korea has tremendous potential for....... 

 more apprehensions (captures)
at the southern border than in many years. border patrol amazing! country is full! system has been broken for many years. democrats in congress must  {AGREE}  to fix loopholes - no open borders (crimes &amp; drugs). will close southern border if necessary... 

 “the lowest average jobs number for any president since 1951, 4.1%. economy doing great. if the democrats win, it is all over.” @varneyco  @foxandfriends  i  {AGREE} ! 

 wow! a suffolk/usa today poll, just out, states, “50% of americans  {AGREE}  that  robert mueller’s investigation is a witch hunt.” @msnbc  very few think it is legit! we will soon find out? 

 prominent legal scholars  {AGREE}  that our actions to

In [15]:
# Words to remove from company names before matching them against tweets
# (drop_lst is turned into drop_set and consumed by knock_out).
# The literal is normalised at the end: every blank and newline is removed,
# the text is lower-cased and then split on ','. Consequences of that:
#   * 'extended truck' (no comma in between) becomes one entry 'extendedtruck';
#   * the doubled comma in '40,,Fire' yields one empty-string entry.
drop_lst = '''Company,Laboratories,Inc,plc,corp,group,equities,pharmaceutical,technology,plc,coporation,co,Resource,
networks,green,texas,holdings,Inc,properties,holdings,energy,communications,limited,solutions,resources,brands,SUMMIT,
hunt,companies,health,restrants,services,chemical,int,l,arts,resources,Holdings,Holding,Inc,Cos,Ltd,Corp,Co,plc,PEOPLE,
red,space,under,Cos,Group,properties,Corporation,Incorporated,tree,business,city,Residential,Company,TOTAL,one,aid,up,
line,gas,network,black,federal,union ,best,air,water,U,S,Trust,Arts,Communications,Chemical,Lifesciences,JUST,usa,
Technologies,Systems,General,First,Street,Southern,Networks,Realty,Service,Class,A,Materials,Class,Cruise,Line,180,
Services,Financial,Resources,NATIONAL ,Foods,Scientific,Beauty,Realty,Communications,com,Automotive,Stores,Mueller,
Technologies,International,WEST ,Markets,Machines,Sciences,Exchange,Tool,Works,Dynamics,Bank,Investment,limited,Simply,
Laboratories,NEWS ,technology,Resources,Resorts,equities,energy,health,Parts,brands,and,a,at,on,take,of,Church,forward,
system,new,UNITED,Republic,OIL ,real,york,the,AMERICAN, america,state,C,Data,SECURITY ,companies,restrants,PLANS,can,
Industries,Gold,Management,Education,REIT,Acquisition,Partners,LP,China,Hospitality,Medical,Capital,Royalty,world,
Electronics,39,Enterprisesde,Investors,Industrial,Power,Products,Property,Insurance,Finance,Life,Worldwide,Electric,
if,now,all,care,nation,by,do,mexico,it,in,me,CHECK,great,sports,good,golf,big,AGREE,GROWTH,north,world,korea,forward,
Software,Pacific,Global,Bio,Entertainment,Media,Community,Estate,LLC,Hotels,for,Cool,first,second,third,fifth,fourth,
Strong,Healthcare,Cohen,Standard,Star,Opportunity,Level,Plus,HealthCare,Georgia,Place,States,Information,800,Wisconsin,
Source,Mark,Public,Way,Manufacturing,Funding,Better,South,Carolina,James,Beyond,Stock,Private,Career,European,Point,
Children,Citizens,Clean,Center,Consumer,County,Old,Country,Credit,Journal,Dollar,Victory,TRADE,Montana,East,Focus,
Ever,Live,Payments,Washington,Stay,Farmers,Choice,Indiana,Foundation,US,Five,Prime,Full,House,Fusion,Future,Times,Fix,
Troy,Henry,Home,Hope,Building,Infrastructure,Support,Money,Japan,Control,John,Kelly,Defense,End,Smart,Marine,Merit,
Con,Modern,Mr,My,NICE,Office,Ohio,Steel,Stop,Open,Re,Patrick,Virginia,Points,Popular,Positive,Progress,Safe,Safety,
Special,Games,Florida,Number,SMART,Missouri,Spirit,Market,support,Price,Two,Long,Joint,Meet,Trade,Top,TOP,Tuesday,de,
Wins,500,Interest,Communities,Pittsburgh,Lots,Cia,Family,Build,Canada,Cars,Fair,Israel,Clear,well,40,,Fire,Champion,
Dr,Government,Mississippi,Far,Fortune,Host,Game,Las,Argentina,Europe,Party,Pennsylvania,Post,RE,Ready,Robert,San,Six,
Joe,Team,France,Tennessee,AS,Met,Waters,White,Therapeutics,CALIFORNIA,strong,Recovery,Pharmaceuticals,Morning,Harvard,
flags, seven, strategic, foot, ca, institute, origin, commercial, waste, pro,Flow, paper, achieve, display,invitation,
art, island, grand, Chicago,Car, champions, associated, internet, super , blue, regional, purpose, Northeast ,Stone, 
age, box, midwest, yard,urban, ms, daily ,st, environment, independent,Events, zealand, inspired, ag, fixed, randny,
participation, struggles, gain ,block, science,Saving, pt, fat, overseas, revolution, brand, agricultural, exact, 
advanced, path, construction,Suburban, rapid, location, buy, senior, distance, project, el , color, self, display, 
rocket, Direct, initiative, prosperity, club, research, physical, tower, lifetime, product, food,Development, stage, 
vacation, forum, test, tech, sound, coast, handling, universe, contract,Fund, priority, British, concrete, emissions,
saving, four, natural, extended truck, light,Supply, forest, social, bar, magic, lake, pace, walker, achieve, pride, 
lines, dean, heritage, Music, digital, highway, shake, pan, ocean, nationwide, garden, grill, grid, customers, beach,
Integrated, reading, development, wave, ocean, match, central, age, restaurant, Yale, mid,Front, platform, goods, 
cross, ms, produce, turning, site, budget
'''.replace(' ', '').replace('\n', '').lower().split(',')

# Extend the list with the index labels of the rows whose 'drop' column is 0
# in the hand-cleaned workbook.
# NOTE(review): read_excel is called without index_col, so the index may be a
# plain row number instead of the word itself -- confirm against the workbook.
# NOTE(review): these labels are appended as-is (not lower-cased), while
# knock_out compares lower-cased words -- confirm they are already lower case.
df = pd.read_excel('words_databse(1st_cleaned).xlsx')
drop_lst += df[df['drop'] == 0].index.tolist()


In [16]:
# Unique drop words, stored as a set for fast membership tests.
drop_set = set(drop_lst)

# Company name search

In [17]:
# Break every company name into its word tokens (runs of word characters).
name_df = stolis_df[['Name']].copy()

name_df['words'] = name_df['Name'].apply(lambda company: re.findall(r'(\w+)', company))
name_df.head()

Unnamed: 0,Name,words
0,"22nd Century Group, Inc","[22nd, Century, Group, Inc]"
1,Acme United Corporation.,"[Acme, United, Corporation]"
2,"Actinium Pharmaceuticals, Inc.","[Actinium, Pharmaceuticals, Inc]"
3,"Adams Resources & Energy, Inc.","[Adams, Resources, Energy, Inc]"
4,AeroCentury Corp.,"[AeroCentury, Corp]"


In [18]:
def knock_out(lst, drop_lst = None):
    ''' Knock out the words of *lst* that are listed in *drop_lst*.
    Input:

    -- lst: words of one company name
            list of str

    -- drop_lst: collection of lower-case words to remove;
                 default (None) is the notebook-level *drop_set*, looked up
                 at call time so a rebuilt drop_set is picked up without
                 re-defining this function

    Output:

    -- list of the words that are kept (original spelling, original order);
       a word is removed when its lower-case form is in *drop_lst* or when
       it is shorter than 2 characters

       numpy.nan if no word is left
    '''
    if drop_lst is None:
        drop_lst = drop_set

    kept = [word for word in lst
            if len(word) >= 2 and word.lower() not in drop_lst]

    return kept if kept else np.nan

# Filter each company's word list; a company whose words are all removed
# gets NaN instead of a list.
name_df['dropped'] = name_df['words'].apply(knock_out)

In [19]:
# Companies that have no word left after filtering (their 'dropped' is NaN).
dropped_companies = name_df[name_df['dropped'].isna()]
print('%s companies are dropped out from stock list.'%len(dropped_companies.index))

279 companies are dropped out from stock list.


In [20]:
# Discard every row that contains a missing value.
name_df = name_df.dropna(axis = 0, how = 'any')
print('%s companies are kept after dropping out.'%len(name_df.index))

4514 companies are kept after dropping out.


# Re-arrange key-words and companies

In [21]:
# Preview the company names with their raw and filtered word lists.
name_df.head()

Unnamed: 0,Name,words,dropped
0,"22nd Century Group, Inc","[22nd, Century, Group, Inc]",[Century]
1,Acme United Corporation.,"[Acme, United, Corporation]",[Acme]
2,"Actinium Pharmaceuticals, Inc.","[Actinium, Pharmaceuticals, Inc]",[Actinium]
3,"Adams Resources & Energy, Inc.","[Adams, Resources, Energy, Inc]",[Adams]
4,AeroCentury Corp.,"[AeroCentury, Corp]",[AeroCentury]


In [22]:
# Save all words

In [23]:
def save_words_dict_data():
    ''' Save the key word -> companies table to 'words_database.xlsx'.

    Reads the notebook-level *name_df* ('dropped' holds the key words of a
    company, 'Name' its full name). Every key word becomes one row whose
    single cell lists all companies using that word, each one followed by
    ', '.
    '''
    words_dict = {}
    for words, comp in zip(name_df['dropped'].values, name_df['Name'].values):
        for w in words:
            # start from '' the first time a word is seen, then append
            words_dict[w] = words_dict.get(w, '') + '%s, '%comp

    # one row of scalars, transposed so that the key words become the index
    pd.DataFrame(words_dict, index = [0]).T.to_excel('words_database.xlsx')

save_words_dict_data()

In [24]:
# Map every key word to the list of companies whose name contains it.
words_dict = {}
for words, comp in zip(name_df['dropped'].values, name_df['Name'].values):
    for w in words:
        # setdefault creates the list on first sight of a word; this replaces
        # the former bare try/except, which would have hidden any error.
        words_dict.setdefault(w, []).append(comp)

# Drop key words that never appear in the tweets

In [25]:
# Remove every key word that is not found in any tweet, printing an
# elapsed / remaining time estimate after each 100 key words.
i = 0
t = time.time()
k_lst = list(words_dict)          # snapshot, because words_dict shrinks below
for i, k in enumerate(k_lst, start = 1):
    if not len(search_str_lst([k], False)):
        del words_dict[k]

    T = time.time() - t           # seconds spent so far
    v = T/i                       # average seconds per key word
    Tt = len(k_lst) * v           # projected total run time

    if i % 100 == 0:
        print('%.4f sec(s) elapsed,'%T, '%.4f sec(s) left,'%(Tt - T), 'total %.4f sec(s)'%Tt )

9.2366 sec(s) elapsed, 411.6763 sec(s) left, total 420.9129 sec(s)
18.0356 sec(s) elapsed, 392.9061 sec(s) left, total 410.9418 sec(s)
27.1570 sec(s) elapsed, 385.3574 sec(s) left, total 412.5144 sec(s)
35.9779 sec(s) elapsed, 373.9004 sec(s) left, total 409.8783 sec(s)
44.8189 sec(s) elapsed, 363.6606 sec(s) left, total 408.4795 sec(s)
54.7652 sec(s) elapsed, 361.1765 sec(s) left, total 415.9416 sec(s)
65.9072 sec(s) elapsed, 363.1485 sec(s) left, total 429.0557 sec(s)
76.1186 sec(s) elapsed, 357.4717 sec(s) left, total 433.5903 sec(s)
86.5496 sec(s) elapsed, 351.6800 sec(s) left, total 438.2296 sec(s)
97.4272 sec(s) elapsed, 346.5486 sec(s) left, total 443.9759 sec(s)
109.7375 sec(s) elapsed, 344.8749 sec(s) left, total 454.6124 sec(s)
119.7557 sec(s) elapsed, 335.0167 sec(s) left, total 454.7724 sec(s)
128.8656 sec(s) elapsed, 322.8578 sec(s) left, total 451.7234 sec(s)
138.1233 sec(s) elapsed, 311.4681 sec(s) left, total 449.5914 sec(s)
147.2521 sec(s) elapsed, 300.0998 sec(s) left

In [26]:
# Number of key words that are left.
len(words_dict)

481

# Find mentioned companies and mark key words out in tweets

In [27]:
i = 0
t = time.time()
def find_comp(string, words_dict = None, lower = True,):
    ''' Find the companies mentioned in *string* and mark their key words.
    Input:

    -- string: text of one tweet

    -- words_dict: dict mapping key word -> list of company names;
                   default (None) is the notebook-level *words_dict*,
                   looked up at call time

    -- lower: True -> ignore capital letters: the text and the key words are
                      lower-cased before matching;
              False -> capital letters matter, the text is left as it is
                       (same meaning as the *lower* flag of is_in).

    Output:

    -- if at least one key word occurs as a whole word:
           (text, companies) where text is the tweet padded with one space
           on each side with every hit replaced by ' {KEYWORD} ', and
           companies is the list of matched company names without
           duplicates, in the order in which they were first matched

       else:
           (numpy.nan, numpy.nan)

    Side effect: increases the notebook-level call counter *i* and prints a
    time estimate every 100 calls (based on *t* and the size of data['text']).
    '''
    global i

    if words_dict is None:
        # The parameter shadows the notebook-level name, hence globals().
        words_dict = globals()['words_dict']

    # Pad with spaces so the \W look-arounds also succeed at both ends.
    text = ' %s '%(string,)
    if lower:
        text = text.lower()

    def pattern_for(needle):
        # whole-word match; re.escape keeps regex metacharacters literal
        return r'(?<=\W)%s(?=\W)'%re.escape(needle)

    hits     = {}   # needle -> marker; a dict keeps first-seen order, no duplicates
    comp_lst = []

    for k, comp in words_dict.items():
        needle = k.lower() if lower else k
        if re.search(pattern_for(needle), text):
            hits[needle] = ' {%s} '%k.upper()
            comp_lst += comp

    i += 1
    if i % 100 == 0:
        # The estimate is only computed when it is actually printed.
        T = time.time() - t
        v = T/i
        Tt = data['text'].shape[0] * v
        print('%.4f sec(s) elapsed,'%T, '%.4f sec(s) left,'%(Tt - T), 'total %.4f sec(s)'%Tt )

    if not hits:
        return np.nan, np.nan

    labelled = text
    for needle, marker in hits.items():
        labelled = re.sub(pattern_for(needle),
                          lambda _match, marker = marker: marker,
                          labelled)

    # dict.fromkeys removes duplicates but, unlike a set, keeps a stable order
    return labelled, list(dict.fromkeys(comp_lst))

In [28]:
# Label every tweet. find_comp returns a 2-tuple; wrapping it in pd.Series
# makes apply spread it over two columns (0 and 1).
res = data['text'].apply(lambda x: pd.Series( find_comp(x) ) )

1.2208 sec(s) elapsed, 56.9743 sec(s) left, total 58.1951 sec(s)
2.5402 sec(s) elapsed, 58.0061 sec(s) left, total 60.5464 sec(s)
3.4214 sec(s) elapsed, 50.9444 sec(s) left, total 54.3658 sec(s)
4.1999 sec(s) elapsed, 45.8525 sec(s) left, total 50.0524 sec(s)
4.9210 sec(s) elapsed, 41.9956 sec(s) left, total 46.9165 sec(s)
5.6218 sec(s) elapsed, 39.0433 sec(s) left, total 44.6651 sec(s)
6.7167 sec(s) elapsed, 39.0238 sec(s) left, total 45.7405 sec(s)
7.5848 sec(s) elapsed, 37.6112 sec(s) left, total 45.1960 sec(s)
8.6314 sec(s) elapsed, 37.0864 sec(s) left, total 45.7178 sec(s)
9.6079 sec(s) elapsed, 36.1930 sec(s) left, total 45.8010 sec(s)
10.5765 sec(s) elapsed, 35.2581 sec(s) left, total 45.8346 sec(s)
11.3109 sec(s) elapsed, 33.6215 sec(s) left, total 44.9324 sec(s)
12.1479 sec(s) elapsed, 32.3974 sec(s) left, total 44.5453 sec(s)
12.9604 sec(s) elapsed, 31.1697 sec(s) left, total 44.1300 sec(s)
13.7843 sec(s) elapsed, 30.0223 sec(s) left, total 43.8066 sec(s)
14.5656 sec(s) elaps

In [29]:
# Name the two result columns: labelled tweet text and matched companies.
res.columns = ['tweets', 'companies']

In [30]:
# Put the tweet timestamps next to the results, drop every row containing a
# missing value (tweets without a match are NaN in both result columns) and
# renumber the remaining rows.
result = pd.concat([res, data['created_at']], axis = 1).dropna().reset_index(drop = True)

In [31]:
# Save the labelled tweets to an Excel workbook.
result.to_excel('marked_tweets.xlsx')