In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import re
import nltk
from nltk import PorterStemmer
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords

%matplotlib inline

In [2]:
data = pd.read_csv('iShares_IBB_descriptions.csv')
data= data.rename(columns={'Unnamed: 0': 'Ticker'})

In [3]:
data.head(3)

Unnamed: 0,Ticker,Name,url,Sector,Industry,Employees,Description,Executive1,Executive2,Executive3,Executive4,Executive5
0,ABUS,Arbutus Biopharma Corporation,http://www.tekmirapharm.com,Healthcare,Biotechnology,122.0,"Arbutus Biopharma Corporation, a biopharmaceut...","{'name': 'Dr. Mark Joseph Murray Ph.D.', 'titl...","{'name': 'Mr. Bruce G. Cousins C.A., CPA', 'ti...","{'name': 'Dr. Peter Lutwyche Ph.D.', 'title':...","{'name': 'Dr. Michael J. Sofia Ph.D.', 'title'...","{'name': 'Dr. Elizabeth Howard Ph.D., J.D.', ..."
1,ACAD,ACADIA Pharmaceuticals Inc.,http://www.acadia-pharm.com,,,,"ACADIA Pharmaceuticals Inc., a biopharmaceutic...","{'name': 'Mr. Stephen R. Davis J.D.', 'title':...","{'name': 'Mr. Todd S. Young', 'title': 'Chief ...","{'name': 'Mr. Glenn F. Baity', 'title': 'Exec....","{'name': 'Dr. Srdjan R. Stankovic M.D., M.S.P....","{'name': 'Mr. James A. Nash', 'title': 'Sr. VP..."
2,ACHN,"Achillion Pharmaceuticals, Inc.",http://www.achillion.com,Healthcare,Biotechnology,81.0,"Achillion Pharmaceuticals, Inc., a biopharmace...","{'name': 'Dr. Milind S. Deshpande Ph.D.', 'tit...","{'name': 'Ms. Mary Kay Fenton', 'title': 'Chie...","{'name': 'Mr. Joseph Truitt', 'title': 'Chief...","{'name': 'Dr. David Apelian', 'title': 'Forme...","{'name': 'Glenn Schulman', 'title': 'Exec. Di..."


In [4]:
def tokenize_text(txt):
    stop_words = set(stopwords.words('english'))
    try:
        tokens_tmp = np.array(word_tokenize(txt))
        tokens = [w.strip() for w in tokens_tmp 
                  if w.isalpha()
                  if not w in stop_words]
    except:
        tokens = None
    return tokens

In [5]:
tokenized_description = data[['Ticker', 'Description']]
tokenized_description['tokens'] = tokenized_description['Description']\
                                        .apply(lambda x: tokenize_text(x))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


In [8]:
tokenized_description.head()

Unnamed: 0,Ticker,Description,tokens
0,ABUS,"Arbutus Biopharma Corporation, a biopharmaceut...","[Arbutus, Biopharma, Corporation, biopharmaceu..."
1,ACAD,"ACADIA Pharmaceuticals Inc., a biopharmaceutic...","[ACADIA, Pharmaceuticals, biopharmaceutical, c..."
2,ACHN,"Achillion Pharmaceuticals, Inc., a biopharmace...","[Achillion, Pharmaceuticals, biopharmaceutical..."
3,ACOR,"Acorda Therapeutics, Inc., a biopharmaceutical...","[Acorda, Therapeutics, biopharmaceutical, comp..."
4,ACRS,"Aclaris Therapeutics, Inc., a clinical-stage b...","[Aclaris, Therapeutics, biotechnology, company..."


### Count number of companies whose description mentions some keyword

In [19]:
def has_keyword(keyword, tokens):
    if tokens is not None:
        immuno = [token for token in tokens
                  if keyword in token.lower()]
        if len(immuno) > 0:
            return True
        else:
            return False
    else:
        return False

In [26]:
num_immuno_companies = tokenized_description['tokens'].apply(lambda x: has_keyword('immuno',x)).sum()
num_neuro_companies = tokenized_description['tokens'].apply(lambda x: has_keyword('neuro',x)).sum()
num_altz_companies = tokenized_description['tokens'].apply(lambda x: has_keyword('alzheimer',x)).sum()
num_parkinson_companies = tokenized_description['tokens'].apply(lambda x: has_keyword('parkinson',x)).sum()
num_onco_companies = tokenized_description['tokens'].apply(lambda x: has_keyword('onco',x)).sum()
num_cancer_companies = tokenized_description['tokens'].apply(lambda x: has_keyword('cancer',x)).sum()
num_metabol_companies = tokenized_description['tokens'].apply(lambda x: has_keyword('metabol',x)).sum()
num_biological_companies = tokenized_description['tokens'].apply(lambda x: has_keyword('biological',x)).sum()

print("Among %d companies in the list..." % len(tokenized_description))
print("%d companies mention 'immuno-'" % num_immuno_companies)
print("%d companies mention 'neuro-'" % num_neuro_companies)
print("%d companies mention 'alzheimer'" % num_altz_companies)
print("%d companies mention 'parkinson'" % num_parkinson_companies)
print("%d companies mention 'onco-'" % num_onco_companies)
print("%d companies mention 'cancer'" % num_cancer_companies)
print("%d companies mention 'metabol-'" % num_metabol_companies)
print("%d companies mention 'biological'" % num_biological_companies)


Among 200 companies in the list...
38 companies mention 'immuno-'
35 companies mention 'neuro-'
4 companies mention 'alzheimer'
5 companies mention 'parkinson'
36 companies mention 'onco-'
83 companies mention 'cancer'
13 companies mention 'metabol-'
9 companies mention 'biological'
