In [2]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import RandomizedLogisticRegression, LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.feature_selection import SelectFromModel
import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning)
import pandas as pd
import numpy as np

### Load and process data

`data/combined_new.csv` contains:
  - company names (labelled `1`)
  - phrases of length (1-4) constructed randomly from english scrabble words (labelled `0`).


In [3]:
# load into pandas dataframe and separate
df = pd.read_csv('data/combined_new.csv', sep=',', names=['word','label'])
total_examples = len(df)
company_examples = len(df[df.label==1])
print(df.head())
print(' ')
print('we have {} company examples out of {} total examples'.format(company_examples, total_examples))

df_x=df['word'].values.astype('U')
df_y=df['label'].values.astype('U')
x_train, x_test, y_train, y_test = train_test_split(df_x, df_y, test_size=0.2, random_state=1110)
y_train = [int(y) for y in y_train]
y_test = [int(y) for y in y_test]

                              word  label
0  technology management solutions      1
1             unsublimed bergstedt      0
2                       sartor nyc      1
3                     invulnerably      0
4      harmonies spahn overteeming      0
 
we have 36265 company examples out of 78303 total examples


### Making character ngrams from the words

Now, I create a vectorizer that takes words and counts their character-level ngrams with length 3-6 characters (inclusive).  

With this vectorizer I transform my word inputs and get a lookup table for the corresponding ngram names. 


In [4]:
# define range of ngram length to look at
ngram_range = (1,5)

# transform the phrases into their ngram counts
vectorizer = CountVectorizer(ngram_range=ngram_range, analyzer='char')

X = vectorizer.fit_transform(df_x)
X_train = vectorizer.transform(x_train)
X_test  = vectorizer.transform(x_test)

names = vectorizer.get_feature_names()

### Learning the most relevant features

I use a randomized logistic regression to find the ngrams that are most relevant to classifying the elelemnts of my training set.

To do this, I fit the character ngram inputs to the corresponding labels with a selection threshold for retaining each ngram in the model.

Then, I can use the returned mask of important features to get the best ngrams.

In [8]:
# run a logistic regression to return ngrams that are above 'selection threshold' of significant in the model
selection_threshold = 0.5
randomized_logistic = RandomizedLogisticRegression(selection_threshold=selection_threshold)
randomized_logistic.fit(X_train, y_train)

RandomizedLogisticRegression(C=1, fit_intercept=True, memory=None, n_jobs=1,
               n_resampling=200, normalize=True, pre_dispatch='3*n_jobs',
               random_state=None, sample_fraction=0.75, scaling=0.5,
               selection_threshold=0.5, tol=0.001, verbose=False)

In [9]:
# get the final ngrams
mask = randomized_logistic.get_support()
features = np.array(names)[mask]
print('found {} ngrams:'.format(len([f for f in features])))
print([f for f in features])

found 265 ngrams:
[' ', ' and ', ' ange', ' app', ' ass', ' by ', ' capi', ' com', ' comm', ' deve', ' engi', ' esta', ' for ', ' fund', ' i', ' in', ' lab', ' ll', ' medi', ' of', ' of ', ' secu', ' sys', ' syst', ' tech', 'a', 'abl', 'ably', 'ae', 'agazi', 'ah', 'al ', 'ali', 'ameri', 'an', 'ange', 'anti', 'app', 'apps', 'are', 'asia', 'at', 'au', 'bever', 'box', 'brand', 'care', 'cc', 'che', 'click', 'club', 'code', 'cot', 'creat', 'crui', 'cz', 'd', 'dail', 'dat', 'data', 'dc', 'desk', 'dv', 'e', 'e ', 'ealt', 'easy', 'ed', 'edg', 'edi', 'edia', 'edu', 'eet', 'ei', 'em', 'emm', 'er', 'ergy', 'es', 'esear', 'ete', 'ett', 'ey', 'feed', 'ff', 'film', 'films', 'find', 'fit', 'fle', 'fly', 'food', 'for ', 'fund', 'g', 'game', 'games', 'geniu', 'gg', 'gree', 'group', 'h', 'ham', 'hl', 'hnolo', 'holdi', 'home', 'hub', 'hy', 'i', 'ias', 'ical', 'ics', 'ie', 'ier', 'iest', 'ight', 'imite', 'ing', 'insi', 'inst', 'is', 'ism', 'ix', 'k', 'l', 'l ', 'lab', 'labs', 'lass', 'lb', 'learn', 'lert'

In [10]:
pipe = Pipeline([
  ('feature_selection', RandomizedLogisticRegression(selection_threshold=selection_threshold)),
  ('classification', LogisticRegression())
])
pipe.fit(X_train, y_train)
print('training accuracy : {}'.format(pipe.score(X_train, y_train)))
print('testing accuracy : {}'.format(pipe.score(X_test, y_test)))

training accuracy : 0.8133999553015548
testing accuracy : 0.8072281463508078


In [11]:
# sort into positive and negative ngrams over some threshold

# minimum absolute value of the coefficient needed to save the ngram
threshold = 1

params = pipe.get_params()
logistic = params['classification']
coeffs = logistic.coef_[0]
coef_dict = {f:c for f,c in zip(features, coeffs)}

positive_features = [f for f, c in coef_dict.items() if abs(c) > threshold and c > 0]
negative_features = [f for f, c in coef_dict.items() if abs(c) > threshold and c < 0]

print('positive features : {}\n{}\n'.format(len(positive_features), positive_features))
print('negative features : {}\n{}'.format(len(negative_features), negative_features))
with open('./data/pos_ngrams.txt', 'w') as f:
    f.write(','.join([str(feat) for feat in positive_features]))
with open('./data/neg_ngrams.txt', 'w') as f:
    f.write(','.join([str(feat) for feat in negative_features]))

positive features : 141
[' ange', ' ass', ' by ', ' capi', ' com', ' comm', ' deve', ' engi', ' for ', ' i', ' in', ' lab', ' ll', ' medi', ' of', ' of ', ' secu', ' tech', 'agazi', 'al ', 'at', 'au', 'bever', 'care', 'club', 'code', 'creat', 'crui', 'cz', 'desk', 'e', 'e ', 'edia', 'ei', 'em', 'emm', 'er', 'ergy', 'esear', 'ete', 'ett', 'feed', 'film', 'films', 'find', 'fit', 'fle', 'fly', 'food', 'fund', 'g', 'game', 'games', 'geniu', 'ham', 'hnolo', 'hub', 'hy', 'i', 'ical', 'ics', 'ie', 'ing', 'is', 'ix', 'lab', 'labs', 'learn', 'lert', 'ley', 'life', 'ligen', 'link', 'littl', 'lla', 'llc', 'm', 'man', 'mi', 'mind', 'mol', 'my', 'n', 'nagem', 'nd ', 'new ', 'nnect', 'nture', 'nves', 'o', 'oe', 'oid', 'onlin', 'oph', 'os', 'pay', 'pel', 'ph', 'pharm', 'power', 'r', 'rban ', 'rer', 'rk', 'rket', 'rr', 'sale', 'sch', 'searc', 'senge', 'shar', 'share', 'sho', 'shop', 'sig', 'simpl', 'skill', 'smart', 'soci', 'softw', 'sport', 'sta', 'start', 'stor', 'stud', 'style', 'sulti', 'sys', 't 