Bugfix with filtering words against scrabble list #7

Merged
merged 4 commits on Jul 22, 2019
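
The core fix: `filter_list` previously iterated over the *characters* of each name when checking against the scrabble word list (`for word in s`), so multi-word names were tested letter by letter; the merged change splits each name on spaces first. A minimal sketch of the corrected check — the helper name and sample data below are mine, not from the repo:

```python
# Minimal sketch of the corrected word check in filter_list (sample data is invented).
scrabble_list = {"acme", "widgets", "blue", "river"}

def has_non_scrabble_word(name, scrabble_list):
    # Before the fix this comprehension looped over the characters of `name`;
    # splitting on spaces makes it test whole words against the list instead.
    words = name.split(" ")
    return any(word.lower() not in scrabble_list for word in words)

print(has_non_scrabble_word("Blue River", scrabble_list))     # False -> not added by filter_list
print(has_non_scrabble_word("Zyler Widgets", scrabble_list))  # True  -> kept by filter_list
```

The same file also fixes an off-by-one in how `parse_cmi` reads command-line paths; that change is sketched after its diff below.
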
40 changes: 26 additions & 14 deletions filter_lookup.py
@@ -9,55 +9,67 @@

 def open_data(filename):
     out_list = []
-    print('loading data from {}'.format(filename))
-    with open(filename, 'rt') as f:
+    print("loading data from {}".format(filename))
+    with open(filename, "rt") as f:
         reader = csv.reader(f)
         for row in reader:
             out_list += row
-    print('found {} elements'.format(len(out_list)))
+    print("found {} elements".format(len(out_list)))
     return out_list


 def write_data(filename, filtered_startups):
-    print('writing that to file at : {}'.format(filename))
+    print("writing that to file at : {}".format(filename))
     with open(filename, "w") as f:
-        writer = csv.writer(f, lineterminator='\n')
+        writer = csv.writer(f, lineterminator="\n")
         writer.writerows([filtered_startups])


 def filter_list(in_list, scrabble_list):
     filtered_list = set()
     for i, s in enumerate(in_list):
         if i % 100 == 0:
-            print('percent done: {} % \t elements removed: {}'.format(int(1000 * i / len(in_list)) / 10,
-                                                                      i - len(filtered_list)))
+            print(
+                "percent done: {} % \t elements removed: {}".format(
+                    int(1000 * i / len(in_list)) / 10, i - len(filtered_list)
+                )
+            )
         if s.lower() not in scrabble_list:
-            if any([word.lower() not in scrabble_list for word in s]):
+            words = s.split(" ")
+            if any([word.lower() not in scrabble_list for word in words]):
                 filtered_list.add(s)
     filtered_list = list(filtered_list)
-    print('now have {} elements, removed {}'.format(len(filtered_list), len(in_list) - len(filtered_list)))
+    print(
+        "now have {} elements, removed {}".format(
+            len(filtered_list), len(in_list) - len(filtered_list)
+        )
+    )
     return filtered_list


 def parse_cmi():
     argv = sys.argv

     # defaults
-    read_file = 'data/company/startups.csv'
-    scrabble_file = 'data/company/english_scrabble.txt'
-    write_file = 'data/company/startups_filtered.csv'
+    read_file = "data/company/startups.csv"
+    scrabble_file = "data/company/english_scrabble.txt"
+    write_file = "data/company/startups_filtered.csv"

     out_files = [read_file, scrabble_file, write_file]

     # read in command line args
     if len(argv) > 1:
         for i, filename in enumerate(argv):
-            out_files[i] = filename
+            # ignore the script name
+            if i == 0:
+                continue
+            out_files[i - 1] = filename

     return tuple(out_files)


-if __name__ == '__main__':
+if __name__ == "__main__":

     read_file, scrabble_file, write_file = parse_cmi()

     in_list = open_data(read_file)
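
The second change in filter_lookup.py fixes the command-line handling: `sys.argv[0]` is the script name, so copying `argv` entries straight into `out_files[i]` shifted every user-supplied path by one slot. A sketch mirroring the merged `parse_cmi` (default paths copied from the diff):

```python
import sys

def parse_cmi():
    # defaults, in the order: read file, scrabble list, output file
    out_files = [
        "data/company/startups.csv",
        "data/company/english_scrabble.txt",
        "data/company/startups_filtered.csv",
    ]
    for i, filename in enumerate(sys.argv):
        if i == 0:
            # argv[0] is the script name, not a data path
            continue
        out_files[i - 1] = filename
    return tuple(out_files)

# e.g. `python filter_lookup.py my_startups.csv` now overrides only the read file,
# leaving the scrabble list and output path at their defaults.
read_file, scrabble_file, write_file = parse_cmi()
```
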
109 changes: 61 additions & 48 deletions ngrams/create_ngrams.py
@@ -4,37 +4,42 @@
 from sklearn.pipeline import Pipeline
 from sklearn.feature_selection import SelectFromModel
 import warnings
+
 warnings.filterwarnings("ignore", category=DeprecationWarning)
 import pandas as pd
 import numpy as np
 import cloudpickle


-def load_file(fname='data/combined_new.csv'):
-    print('\nloading file {}...'.format(fname))
-    df = pd.read_csv(fname, sep=',', names=['word', 'label'])
+def load_file(fname="data/combined_new.csv"):
+    print("\nloading file {}...".format(fname))
+    df = pd.read_csv(fname, sep=",", names=["word", "label"])
     total_examples = len(df)
     street_examples = len(df[df.label == 1])
     print(df.head())
-    print('we have {} company examples out of {} total examples'.format(
-        street_examples, total_examples))
+    print(
+        "we have {} company examples out of {} total examples".format(
+            street_examples, total_examples
+        )
+    )
     return df


 def prep_data(df):
-    print('\nsplitting into test and train...')
-    df_x = df['word'].values.astype('U')
-    df_y = df['label'].values.astype('U')
+    print("\nsplitting into test and train...")
+    df_x = df["word"].values.astype("U")
+    df_y = df["label"].values.astype("U")
     x_train, x_test, y_train, y_test = train_test_split(
-        df_x, df_y, test_size=0.2, random_state=1110)
+        df_x, df_y, test_size=0.2, random_state=1110
+    )
     y_train = [int(y) for y in y_train]
     y_test = [int(y) for y in y_test]
     return x_train, x_test, y_train, y_test, df_x


 def transorm_ngrams(df_x, x_train, x_test):
-    print('\ntransforming inputs with ngrams...')
-    vectorizer = CountVectorizer(ngram_range=(1, 5), analyzer='char')
+    print("\ntransforming inputs with ngrams...")
+    vectorizer = CountVectorizer(ngram_range=(1, 5), analyzer="char")
     X = vectorizer.fit_transform(df_x)
     X_train = vectorizer.transform(x_train)
     X_test = vectorizer.transform(x_test)
@@ -43,60 +48,71 @@ def transorm_ngrams(df_x, x_train, x_test):


 def get_features(X_train, y_train, names, selection_threshold=0.2):
-    print('\ngetting features with randomized logistic regression...')
-    print('using a selection threshold of {}'.format(selection_threshold))
+    print("\ngetting features with randomized logistic regression...")
+    print("using a selection threshold of {}".format(selection_threshold))
     randomized_logistic = RandomizedLogisticRegression(
-        selection_threshold=selection_threshold)
+        selection_threshold=selection_threshold
+    )
     randomized_logistic.fit(X_train, y_train)
     mask = randomized_logistic.get_support()
     features = np.array(names)[mask]
-    print('found {} ngrams:'.format(len([f for f in features])))
+    print("found {} ngrams:".format(len([f for f in features])))
     print([f for f in features])
     return features


 def run_logreg(X_train, y_train, selection_threshold=0.2):
-    print('\nrunning logistic regression...')
-    print('using a selection threshold of {}'.format(selection_threshold))
-    pipe = Pipeline([
-        ('feature_selection', RandomizedLogisticRegression(
-            selection_threshold=selection_threshold)),
-        ('classification', LogisticRegression())
-    ])
+    print("\nrunning logistic regression...")
+    print("using a selection threshold of {}".format(selection_threshold))
+    pipe = Pipeline(
+        [
+            (
+                "feature_selection",
+                RandomizedLogisticRegression(selection_threshold=selection_threshold),
+            ),
+            ("classification", LogisticRegression()),
+        ]
+    )
     pipe.fit(X_train, y_train)
-    print('training accuracy : {}'.format(pipe.score(X_train, y_train)))
-    print('testing accuracy : {}'.format(pipe.score(X_test, y_test)))
+    print("training accuracy : {}".format(pipe.score(X_train, y_train)))
+    print("testing accuracy : {}".format(pipe.score(X_test, y_test)))
     return pipe


-def get_pos_neg(pipe, features, f_pos='./data/pos_ngrams.txt', f_neg='./data/neg_ngrams.txt', cutoff=0.5):
-    print('\nseparating into positive and negative ngrams...')
-    print('using a cutoff of {}'.format(cutoff))
+def get_pos_neg(
+    pipe,
+    features,
+    f_pos="./data/pos_ngrams.txt",
+    f_neg="./data/neg_ngrams.txt",
+    cutoff=0.5,
+):
+    print("\nseparating into positive and negative ngrams...")
+    print("using a cutoff of {}".format(cutoff))
     params = pipe.get_params()
-    logistic = params['classification']
+    logistic = params["classification"]
     coeffs = logistic.coef_[0]
     coef_dict = {f: c for f, c in zip(features, coeffs)}
     positive_features = [f for f, c in coef_dict.items() if abs(c) > cutoff and c > 0]
     negative_features = [f for f, c in coef_dict.items() if abs(c) > cutoff and c < 0]
-    print('positive ngrams : {}\n{}'.format(len(positive_features), positive_features))
-    print('')
-    print('negative ngrams : {}\n{}'.format(len(negative_features), negative_features))
-    print('writing to files {} and {}'.format(f_pos, f_neg))
-    with open(f_pos, 'w') as f:
-        f.write('\n'.join([str(feat) for feat in positive_features]))
-    with open(f_neg, 'w') as f:
-        f.write('\n'.join([str(feat) for feat in negative_features]))
+    print("positive ngrams : {}\n{}".format(len(positive_features), positive_features))
+    print("")
+    print("negative ngrams : {}\n{}".format(len(negative_features), negative_features))
+    print("writing to files {} and {}".format(f_pos, f_neg))
+    with open(f_pos, "w") as f:
+        f.write("\n".join([str(feat) for feat in positive_features]))
+    with open(f_neg, "w") as f:
+        f.write("\n".join([str(feat) for feat in negative_features]))


-def write_all_ngrams(features, fname_base='./data/ngram_'):
+def write_all_ngrams(features, fname_base="./data/ngram_"):
     for i, f in enumerate(features):
-        fname = fname_base + str(i) + '.txt'
-        with open(fname, 'w') as f:
-            f.write(str(f))
+        fname = fname_base + str(i) + ".txt"
+        with open(fname, "w") as f:
+            f.write(str(f))


-if __name__ == '__main__':
-    file_in = 'data/combined_new.csv'
+if __name__ == "__main__":
+    file_in = "data/combined_new.csv"
     ST = 0.4
     cutoff = 0.9
     expand = True
@@ -105,13 +121,10 @@ def write_all_ngrams(features, fname_base='./data/ngram_'):
     X_train, X_test, names = transorm_ngrams(df_x, x_train, x_test)
     features = get_features(X_train, y_train, names, selection_threshold=ST)
     if expand:
-        fname_base = './data/ngram_'
+        fname_base = "./data/ngram_"
         write_all_ngrams(features, fname_base=fname_base)
     else:
-        file_pos = '../phrase_match_test/regex_phrase_match_demo/company_data/data/pos_ngrams.txt'
-        file_neg = '../phrase_match_test/regex_phrase_match_demo/company_data/data/neg_ngrams.txt'
+        file_pos = "../phrase_match_test/regex_phrase_match_demo/company_data/data/pos_ngrams.txt"
+        file_neg = "../phrase_match_test/regex_phrase_match_demo/company_data/data/neg_ngrams.txt"
         pipe = run_logreg(X_train, y_train, selection_threshold=ST)
         get_pos_neg(pipe, features, cutoff=cutoff)
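
The changes to this file are formatting only (quote style and line wrapping); behaviour is unchanged. For readers following along, the interesting step is `get_pos_neg`, which splits the selected n-grams by the sign of their logistic-regression coefficient and keeps only those whose magnitude clears the cutoff. A self-contained toy illustration with invented values (no scikit-learn required); note also that `RandomizedLogisticRegression` has since been deprecated and removed from scikit-learn, so the full pipeline only runs on older releases:

```python
# Toy illustration of the coefficient split in get_pos_neg (all values invented).
features = ["inc", "corp", "tion", "ing", "xyz"]
coeffs = [1.3, 2.1, -0.2, -1.1, 0.4]
cutoff = 0.9

coef_dict = {f: c for f, c in zip(features, coeffs)}
positive_features = [f for f, c in coef_dict.items() if abs(c) > cutoff and c > 0]
negative_features = [f for f, c in coef_dict.items() if abs(c) > cutoff and c < 0]

print(positive_features)  # ['inc', 'corp'] -- n-grams pushing towards the company label
print(negative_features)  # ['ing']         -- n-grams pushing away from it
```
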



65 changes: 34 additions & 31 deletions ngrams/make_dataset.py
@@ -3,81 +3,84 @@
 from numpy.random import randint
 import numpy as np

+
 def load_dataset(fname):
     company_df = pd.read_csv(fname)
     num_open_addr = len(company_df.index)
-    print('{} addresses in dataset {}'.format(num_open_addr, fname))
+    print("{} addresses in dataset {}".format(num_open_addr, fname))
     return company_df


 if __name__ == "__main__":
-    # files containing data
-    company_fname = 'data/companies.csv'
-    english_fname = 'data/english_scrabble.txt'
-    names_fname = 'data/names.txt'
+    # files containing data
+    company_fname = "data/companies.csv"
+    english_fname = "data/english_scrabble.txt"
+    names_fname = "data/names.txt"

     # load dataframes from files
-    company_df = load_dataset(company_fname)[['Company Name']].copy()
+    company_df = load_dataset(company_fname)[["Company Name"]].copy()
     english_df = load_dataset(english_fname)
     names_df = load_dataset(names_fname)

     # rename words to element
-    company_df = company_df.rename(index=str, columns={'Company Name':'element'})
-    english_df = english_df.rename(index=str, columns={'dripstone':'element'})
-    names_df = names_df.rename(index=str, columns={'mcleese':'element'})
+    company_df = company_df.rename(index=str, columns={"Company Name": "element"})
+    english_df = english_df.rename(index=str, columns={"dripstone": "element"})
+    names_df = names_df.rename(index=str, columns={"mcleese": "element"})

     num_words = 4
     other_df = pd.concat([english_df, names_df])
     num_elements = len(other_df.index)
     other_df = other_df.sample(frac=1).reset_index(drop=True)

     other_df_copy = other_df.copy()
-    other_df = pd.DataFrame(columns=['element','label'])
+    other_df = pd.DataFrame(columns=["element", "label"])

-    print('creating new dataset with multiple words')
+    print("creating new dataset with multiple words")
     count = 0
     while count < num_elements:
-        ws = randint(1,num_words)
+        ws = randint(1, num_words)
         new_words = []
-        for i in range(count,count+ws):
-            new_words.append(str(other_df_copy.at[i,'element']))
+        for i in range(count, count + ws):
+            new_words.append(str(other_df_copy.at[i, "element"]))
         count += ws
-        other_df = other_df.append({'element' : ' '.join(new_words)}, ignore_index=True)
+        other_df = other_df.append({"element": " ".join(new_words)}, ignore_index=True)
         if count % 1000 == 0:
-            print('wrote {} of {} examples'.format(count, num_elements))
+            print("wrote {} of {} examples".format(count, num_elements))

     # label elements
-    company_df['label'] = 1
-    other_df['label'] = 0
+    company_df["label"] = 1
+    other_df["label"] = 0

     # concatenate all datasets
-    print('combining datasets')
-    print('\tother dataset has {} examples'.format(len(other_df.index)))
-    print('\tcompany dataset has {} examples'.format(len(company_df.index)))
-    company_df = company_df.sample(frac=1/3)
-    print('\twhich was reduced to {} examples'.format(len(company_df.index)))
+    print("combining datasets")
+    print("\tother dataset has {} examples".format(len(other_df.index)))
+    print("\tcompany dataset has {} examples".format(len(company_df.index)))
+    company_df = company_df.sample(frac=1 / 3)
+    print("\twhich was reduced to {} examples".format(len(company_df.index)))

     combined_df = pd.concat([company_df, other_df])

     # shuffle the datasets
     combined_df = combined_df.sample(frac=1).reset_index(drop=True)

     # send to lower case
-    combined_df['element'] = combined_df['element'].str.lower()
+    combined_df["element"] = combined_df["element"].str.lower()

     # get rid of spaces
-    #combined_df['element'] = combined_df['element'].str.replace(' ','')
+    # combined_df['element'] = combined_df['element'].str.replace(' ','')

     # get rid of company names that are too long
-    #combined_df = combined_df.loc[combined_df['element'].str.len() < 15]
+    # combined_df = combined_df.loc[combined_df['element'].str.len() < 15]

     # get rid of company names with numbers
-    combined_df = combined_df.loc[combined_df['element'].str.replace(' ','').str.isalpha()]
+    combined_df = combined_df.loc[
+        combined_df["element"].str.replace(" ", "").str.isalpha()
+    ]

     # reset index
-    combined_df.reset_index(drop = True, inplace=True)
+    combined_df.reset_index(drop=True, inplace=True)

-    # write to file
-    print('writing to file')
+    # write to file
+    print("writing to file")
     print(combined_df.head())
-    combined_df.to_csv('data/combined.csv', header=False, index=False)
+    combined_df.to_csv("data/combined.csv", header=False, index=False)
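
make_dataset.py is likewise only reformatted. Its core trick is the while loop that glues one to three random dictionary words or names together (`randint(1, num_words)` with `num_words = 4`; numpy's `randint` excludes the upper bound) to build multi-word negative examples. A rough sketch of the same idea on invented data; note that `DataFrame.append`, used in the diff, has since been removed from pandas, so the sketch collects rows in a list first:

```python
import pandas as pd
from numpy.random import randint

# Invented stand-ins for the scrabble words and person names in the real dataset.
pool = ["dripstone", "river", "mcleese", "harbor", "quartz", "meadow", "larkin"]

rows, count = [], 0
while count < len(pool):
    ws = randint(1, 4)  # 1 to 3 words per synthetic example (upper bound exclusive)
    rows.append({"element": " ".join(pool[count:count + ws]), "label": 0})
    count += ws

other_df = pd.DataFrame(rows, columns=["element", "label"])
print(other_df)  # label 0 marks these as non-company examples, as in the script
```
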