Bugfix with filtering words against scrabble list #7

Merged
merged 4 commits on Jul 22, 2019
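
The core fix: `filter_list` previously iterated over the *characters* of each name when checking against the scrabble word list (`for word in s`), so multi-word names were tested letter by letter; the merged change splits each name on spaces first. A minimal sketch of the corrected check — the helper name and sample data below are mine, not from the repo:

```python
# Minimal sketch of the corrected word check in filter_list (sample data is invented).
scrabble_list = {"acme", "widgets", "blue", "river"}

def has_non_scrabble_word(name, scrabble_list):
    # Before the fix this comprehension looped over the characters of `name`;
    # splitting on spaces makes it test whole words against the list instead.
    words = name.split(" ")
    return any(word.lower() not in scrabble_list for word in words)

print(has_non_scrabble_word("Blue River", scrabble_list))     # False -> not added by filter_list
print(has_non_scrabble_word("Zyler Widgets", scrabble_list))  # True  -> kept by filter_list
```

The same file also fixes an off-by-one in how `parse_cmi` reads command-line paths; that change is sketched after its diff below.
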
40 changes: 26 additions & 14 deletions filter_lookup.py
@@ -9,55 +9,67 @@

 def open_data(filename):
     out_list = []
-    print('loading data from {}'.format(filename))
-    with open(filename, 'rt') as f:
+    print("loading data from {}".format(filename))
+    with open(filename, "rt") as f:
         reader = csv.reader(f)
         for row in reader:
             out_list += row
-    print('found {} elements'.format(len(out_list)))
+    print("found {} elements".format(len(out_list)))
     return out_list


 def write_data(filename, filtered_startups):
-    print('writing that to file at : {}'.format(filename))
+    print("writing that to file at : {}".format(filename))
     with open(filename, "w") as f:
-        writer = csv.writer(f, lineterminator='\n')
+        writer = csv.writer(f, lineterminator="\n")
         writer.writerows([filtered_startups])


 def filter_list(in_list, scrabble_list):
     filtered_list = set()
     for i, s in enumerate(in_list):
         if i % 100 == 0:
-            print('percent done: {} % \t elements removed: {}'.format(int(1000 * i / len(in_list)) / 10,
-                                                                      i - len(filtered_list)))
+            print(
+                "percent done: {} % \t elements removed: {}".format(
+                    int(1000 * i / len(in_list)) / 10, i - len(filtered_list)
+                )
+            )
         if s.lower() not in scrabble_list:
-            if any([word.lower() not in scrabble_list for word in s]):
+            words = s.split(" ")
+            if any([word.lower() not in scrabble_list for word in words]):
                 filtered_list.add(s)
     filtered_list = list(filtered_list)
-    print('now have {} elements, removed {}'.format(len(filtered_list), len(in_list) - len(filtered_list)))
+    print(
+        "now have {} elements, removed {}".format(
+            len(filtered_list), len(in_list) - len(filtered_list)
+        )
+    )
     return filtered_list


 def parse_cmi():
     argv = sys.argv

     # defaults
-    read_file = 'data/company/startups.csv'
-    scrabble_file = 'data/company/english_scrabble.txt'
-    write_file = 'data/company/startups_filtered.csv'
+    read_file = "data/company/startups.csv"
+    scrabble_file = "data/company/english_scrabble.txt"
+    write_file = "data/company/startups_filtered.csv"

     out_files = [read_file, scrabble_file, write_file]

     # read in command line args
     if len(argv) > 1:
         for i, filename in enumerate(argv):
-            out_files[i] = filename
+            # ignore the script name
+            if i == 0:
+                continue
+            out_files[i - 1] = filename

     return tuple(out_files)


-if __name__ == '__main__':
+if __name__ == "__main__":

     read_file, scrabble_file, write_file = parse_cmi()

     in_list = open_data(read_file)
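
The second change in filter_lookup.py fixes the command-line handling: `sys.argv[0]` is the script name, so copying `argv` entries straight into `out_files[i]` shifted every user-supplied path by one slot. A sketch mirroring the merged `parse_cmi` (default paths copied from the diff):

```python
import sys

def parse_cmi():
    # defaults, in the order: read file, scrabble list, output file
    out_files = [
        "data/company/startups.csv",
        "data/company/english_scrabble.txt",
        "data/company/startups_filtered.csv",
    ]
    for i, filename in enumerate(sys.argv):
        if i == 0:
            # argv[0] is the script name, not a data path
            continue
        out_files[i - 1] = filename
    return tuple(out_files)

# e.g. `python filter_lookup.py my_startups.csv` now overrides only the read file,
# leaving the scrabble list and output path at their defaults.
read_file, scrabble_file, write_file = parse_cmi()
```
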
109 changes: 61 additions & 48 deletions ngrams/create_ngrams.py
@@ -4,37 +4,42 @@
 from sklearn.pipeline import Pipeline
 from sklearn.feature_selection import SelectFromModel
 import warnings
+
 warnings.filterwarnings("ignore", category=DeprecationWarning)
 import pandas as pd
 import numpy as np
 import cloudpickle


-def load_file(fname='data/combined_new.csv'):
-    print('\nloading file {}...'.format(fname))
-    df = pd.read_csv(fname, sep=',', names=['word', 'label'])
+def load_file(fname="data/combined_new.csv"):
+    print("\nloading file {}...".format(fname))
+    df = pd.read_csv(fname, sep=",", names=["word", "label"])
     total_examples = len(df)
     street_examples = len(df[df.label == 1])
     print(df.head())
-    print('we have {} company examples out of {} total examples'.format(
-        street_examples, total_examples))
+    print(
+        "we have {} company examples out of {} total examples".format(
+            street_examples, total_examples
+        )
+    )
     return df


 def prep_data(df):
-    print('\nsplitting into test and train...')
-    df_x = df['word'].values.astype('U')
-    df_y = df['label'].values.astype('U')
+    print("\nsplitting into test and train...")
+    df_x = df["word"].values.astype("U")
+    df_y = df["label"].values.astype("U")
     x_train, x_test, y_train, y_test = train_test_split(
-        df_x, df_y, test_size=0.2, random_state=1110)
+        df_x, df_y, test_size=0.2, random_state=1110
+    )
     y_train = [int(y) for y in y_train]
     y_test = [int(y) for y in y_test]
     return x_train, x_test, y_train, y_test, df_x


 def transorm_ngrams(df_x, x_train, x_test):
-    print('\ntransforming inputs with ngrams...')
-    vectorizer = CountVectorizer(ngram_range=(1, 5), analyzer='char')
+    print("\ntransforming inputs with ngrams...")
+    vectorizer = CountVectorizer(ngram_range=(1, 5), analyzer="char")
     X = vectorizer.fit_transform(df_x)
     X_train = vectorizer.transform(x_train)
     X_test = vectorizer.transform(x_test)
@@ -43,60 +48,71 @@ def transorm_ngrams(df_x, x_train, x_test):


 def get_features(X_train, y_train, names, selection_threshold=0.2):
-    print('\ngetting features with randomized logistic regression...')
-    print('using a selection threshold of {}'.format(selection_threshold))
+    print("\ngetting features with randomized logistic regression...")
+    print("using a selection threshold of {}".format(selection_threshold))
     randomized_logistic = RandomizedLogisticRegression(
-        selection_threshold=selection_threshold)
+        selection_threshold=selection_threshold
+    )
     randomized_logistic.fit(X_train, y_train)
     mask = randomized_logistic.get_support()
     features = np.array(names)[mask]
-    print('found {} ngrams:'.format(len([f for f in features])))
+    print("found {} ngrams:".format(len([f for f in features])))
     print([f for f in features])
     return features


 def run_logreg(X_train, y_train, selection_threshold=0.2):
-    print('\nrunning logistic regression...')
-    print('using a selection threshold of {}'.format(selection_threshold))
-    pipe = Pipeline([
-        ('feature_selection', RandomizedLogisticRegression(
-            selection_threshold=selection_threshold)),
-        ('classification', LogisticRegression())
-    ])
+    print("\nrunning logistic regression...")
+    print("using a selection threshold of {}".format(selection_threshold))
+    pipe = Pipeline(
+        [
+            (
+                "feature_selection",
+                RandomizedLogisticRegression(selection_threshold=selection_threshold),
+            ),
+            ("classification", LogisticRegression()),
+        ]
+    )
     pipe.fit(X_train, y_train)
-    print('training accuracy : {}'.format(pipe.score(X_train, y_train)))
-    print('testing accuracy : {}'.format(pipe.score(X_test, y_test)))
+    print("training accuracy : {}".format(pipe.score(X_train, y_train)))
+    print("testing accuracy : {}".format(pipe.score(X_test, y_test)))
     return pipe


-def get_pos_neg(pipe, features, f_pos='./data/pos_ngrams.txt', f_neg='./data/neg_ngrams.txt', cutoff=0.5):
-    print('\nseparating into positive and negative ngrams...')
-    print('using a cutoff of {}'.format(cutoff))
+def get_pos_neg(
+    pipe,
+    features,
+    f_pos="./data/pos_ngrams.txt",
+    f_neg="./data/neg_ngrams.txt",
+    cutoff=0.5,
+):
+    print("\nseparating into positive and negative ngrams...")
+    print("using a cutoff of {}".format(cutoff))
     params = pipe.get_params()
-    logistic = params['classification']
+    logistic = params["classification"]
     coeffs = logistic.coef_[0]
     coef_dict = {f: c for f, c in zip(features, coeffs)}
     positive_features = [f for f, c in coef_dict.items() if abs(c) > cutoff and c > 0]
     negative_features = [f for f, c in coef_dict.items() if abs(c) > cutoff and c < 0]
-    print('positive ngrams : {}\n{}'.format(len(positive_features), positive_features))
-    print('')
-    print('negative ngrams : {}\n{}'.format(len(negative_features), negative_features))
-    print('writing to files {} and {}'.format(f_pos, f_neg))
-    with open(f_pos, 'w') as f:
-        f.write('\n'.join([str(feat) for feat in positive_features]))
-    with open(f_neg, 'w') as f:
-        f.write('\n'.join([str(feat) for feat in negative_features]))
+    print("positive ngrams : {}\n{}".format(len(positive_features), positive_features))
+    print("")
+    print("negative ngrams : {}\n{}".format(len(negative_features), negative_features))
+    print("writing to files {} and {}".format(f_pos, f_neg))
+    with open(f_pos, "w") as f:
+        f.write("\n".join([str(feat) for feat in positive_features]))
+    with open(f_neg, "w") as f:
+        f.write("\n".join([str(feat) for feat in negative_features]))


-def write_all_ngrams(features, fname_base='./data/ngram_'):
+def write_all_ngrams(features, fname_base="./data/ngram_"):
     for i, f in enumerate(features):
-        fname = fname_base + str(i) + '.txt'
-        with open(fname, 'w') as f:
-            f.write(str(f))
+        fname = fname_base + str(i) + ".txt"
+        with open(fname, "w") as f:
+            f.write(str(f))


-if __name__ == '__main__':
-    file_in = 'data/combined_new.csv'
+if __name__ == "__main__":
+    file_in = "data/combined_new.csv"
     ST = 0.4
     cutoff = 0.9
     expand = True
@@ -105,13 +121,10 @@ def write_all_ngrams(features, fname_base='./data/ngram_'):
     X_train, X_test, names = transorm_ngrams(df_x, x_train, x_test)
     features = get_features(X_train, y_train, names, selection_threshold=ST)
     if expand:
-        fname_base = './data/ngram_'
+        fname_base = "./data/ngram_"
         write_all_ngrams(features, fname_base=fname_base)
     else:
-        file_pos = '../phrase_match_test/regex_phrase_match_demo/company_data/data/pos_ngrams.txt'
-        file_neg = '../phrase_match_test/regex_phrase_match_demo/company_data/data/neg_ngrams.txt'
+        file_pos = "../phrase_match_test/regex_phrase_match_demo/company_data/data/pos_ngrams.txt"
+        file_neg = "../phrase_match_test/regex_phrase_match_demo/company_data/data/neg_ngrams.txt"
         pipe = run_logreg(X_train, y_train, selection_threshold=ST)
         get_pos_neg(pipe, features, cutoff=cutoff)
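
The changes to this file are formatting only (quote style and line wrapping); behaviour is unchanged. For readers following along, the interesting step is `get_pos_neg`, which splits the selected n-grams by the sign of their logistic-regression coefficient and keeps only those whose magnitude clears the cutoff. A self-contained toy illustration with invented values (no scikit-learn required); note also that `RandomizedLogisticRegression` has since been deprecated and removed from scikit-learn, so the full pipeline only runs on older releases:

```python
# Toy illustration of the coefficient split in get_pos_neg (all values invented).
features = ["inc", "corp", "tion", "ing", "xyz"]
coeffs = [1.3, 2.1, -0.2, -1.1, 0.4]
cutoff = 0.9

coef_dict = {f: c for f, c in zip(features, coeffs)}
positive_features = [f for f, c in coef_dict.items() if abs(c) > cutoff and c > 0]
negative_features = [f for f, c in coef_dict.items() if abs(c) > cutoff and c < 0]

print(positive_features)  # ['inc', 'corp'] -- n-grams pushing towards the company label
print(negative_features)  # ['ing']         -- n-grams pushing away from it
```
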



65 changes: 34 additions & 31 deletions ngrams/make_dataset.py
@@ -3,81 +3,84 @@
 from numpy.random import randint
 import numpy as np

+
 def load_dataset(fname):
     company_df = pd.read_csv(fname)
     num_open_addr = len(company_df.index)
-    print('{} addresses in dataset {}'.format(num_open_addr, fname))
+    print("{} addresses in dataset {}".format(num_open_addr, fname))
     return company_df


 if __name__ == "__main__":
-    # files containing data
-    company_fname = 'data/companies.csv'
-    english_fname = 'data/english_scrabble.txt'
-    names_fname = 'data/names.txt'
+    # files containing data
+    company_fname = "data/companies.csv"
+    english_fname = "data/english_scrabble.txt"
+    names_fname = "data/names.txt"

     # load dataframes from files
-    company_df = load_dataset(company_fname)[['Company Name']].copy()
+    company_df = load_dataset(company_fname)[["Company Name"]].copy()
     english_df = load_dataset(english_fname)
     names_df = load_dataset(names_fname)

     # rename words to element
-    company_df = company_df.rename(index=str, columns={'Company Name':'element'})
-    english_df = english_df.rename(index=str, columns={'dripstone':'element'})
-    names_df = names_df.rename(index=str, columns={'mcleese':'element'})
+    company_df = company_df.rename(index=str, columns={"Company Name": "element"})
+    english_df = english_df.rename(index=str, columns={"dripstone": "element"})
+    names_df = names_df.rename(index=str, columns={"mcleese": "element"})

     num_words = 4
     other_df = pd.concat([english_df, names_df])
     num_elements = len(other_df.index)
     other_df = other_df.sample(frac=1).reset_index(drop=True)

     other_df_copy = other_df.copy()
-    other_df = pd.DataFrame(columns=['element','label'])
+    other_df = pd.DataFrame(columns=["element", "label"])

-    print('creating new dataset with multiple words')
+    print("creating new dataset with multiple words")
     count = 0
     while count < num_elements:
-        ws = randint(1,num_words)
+        ws = randint(1, num_words)
         new_words = []
-        for i in range(count,count+ws):
-            new_words.append(str(other_df_copy.at[i,'element']))
+        for i in range(count, count + ws):
+            new_words.append(str(other_df_copy.at[i, "element"]))
         count += ws
-        other_df = other_df.append({'element' : ' '.join(new_words)}, ignore_index=True)
+        other_df = other_df.append({"element": " ".join(new_words)}, ignore_index=True)
         if count % 1000 == 0:
-            print('wrote {} of {} examples'.format(count, num_elements))
+            print("wrote {} of {} examples".format(count, num_elements))

     # label elements
-    company_df['label'] = 1
-    other_df['label'] = 0
+    company_df["label"] = 1
+    other_df["label"] = 0

     # concatenate all datasets
-    print('combining datasets')
-    print('\tother dataset has {} examples'.format(len(other_df.index)))
-    print('\tcompany dataset has {} examples'.format(len(company_df.index)))
-    company_df = company_df.sample(frac=1/3)
-    print('\twhich was reduced to {} examples'.format(len(company_df.index)))
+    print("combining datasets")
+    print("\tother dataset has {} examples".format(len(other_df.index)))
+    print("\tcompany dataset has {} examples".format(len(company_df.index)))
+    company_df = company_df.sample(frac=1 / 3)
+    print("\twhich was reduced to {} examples".format(len(company_df.index)))

     combined_df = pd.concat([company_df, other_df])

     # shuffle the datasets
     combined_df = combined_df.sample(frac=1).reset_index(drop=True)

     # send to lower case
-    combined_df['element'] = combined_df['element'].str.lower()
+    combined_df["element"] = combined_df["element"].str.lower()

     # get rid of spaces
-    #combined_df['element'] = combined_df['element'].str.replace(' ','')
+    # combined_df['element'] = combined_df['element'].str.replace(' ','')

     # get rid of company names that are too long
-    #combined_df = combined_df.loc[combined_df['element'].str.len() < 15]
+    # combined_df = combined_df.loc[combined_df['element'].str.len() < 15]

     # get rid of company names with numbers
-    combined_df = combined_df.loc[combined_df['element'].str.replace(' ','').str.isalpha()]
+    combined_df = combined_df.loc[
+        combined_df["element"].str.replace(" ", "").str.isalpha()
+    ]

     # reset index
-    combined_df.reset_index(drop = True, inplace=True)
+    combined_df.reset_index(drop=True, inplace=True)

-    # write to file
-    print('writing to file')
+    # write to file
+    print("writing to file")
     print(combined_df.head())
-    combined_df.to_csv('data/combined.csv', header=False, index=False)
+    combined_df.to_csv("data/combined.csv", header=False, index=False)
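
make_dataset.py is likewise only reformatted. Its core trick is the while loop that glues one to three random dictionary words or names together (`randint(1, num_words)` with `num_words = 4`; numpy's `randint` excludes the upper bound) to build multi-word negative examples. A rough sketch of the same idea on invented data; note that `DataFrame.append`, used in the diff, has since been removed from pandas, so the sketch collects rows in a list first:

```python
import pandas as pd
from numpy.random import randint

# Invented stand-ins for the scrabble words and person names in the real dataset.
pool = ["dripstone", "river", "mcleese", "harbor", "quartz", "meadow", "larkin"]

rows, count = [], 0
while count < len(pool):
    ws = randint(1, 4)  # 1 to 3 words per synthetic example (upper bound exclusive)
    rows.append({"element": " ".join(pool[count:count + ws]), "label": 0})
    count += ws

other_df = pd.DataFrame(rows, columns=["element", "label"])
print(other_df)  # label 0 marks these as non-company examples, as in the script
```
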