In [1]:
import pandas as pd
import hygia as hg
import pickle
import time

# Class instantiations

NOTE: Please check if the model_path matches your path

In [2]:
# Single place to change the country handed to the hygia helpers that accept one.
COUNTRY = "MEXICO"

pre_process_data = hg.PreProcessData(country=COUNTRY)
augment_data = hg.AugmentData(country=COUNTRY)
feature_engineering = hg.FeatureEngineering(country=COUNTRY)
annotate_data = hg.AnnotateData()  # constructed without a country argument
new_rf_model = hg.RandomForestModel()  # trained further down in this notebook

[33mrunning feature engineering with configs below...[37m
[1mlanguage -> [22mes
[1mdimensions -> [22m25


# Load Data

NOTE: Please check if the file_path matches your data

In [3]:
# Raw address export; adjust the path to wherever the file lives locally.
file_path = '../data/tmp/AI_LATA_ADDRESS_MEX_modificado.csv'

# The file is delimited by the '¨' character and parsed with the Python engine.
# nrows=None reads every row; set an integer here to load only a sample.
df = pd.read_csv(
    file_path,
    sep='¨',
    nrows=None,
    engine='python',
)

# Add new columns

1. Concatenate address
2. All feature columns:
    - Key Smash
    - Regex
    - Word Embedding

NOTE: Please check that the column names match your data

In [4]:
# Name of the combined address column that every later cell refers to.
concatened_column_name = 'concat_STREET_ADDRESS_1_STREET_ADDRESS_2'
address_columns = ['STREET_ADDRESS_1', 'STREET_ADDRESS_2']

# Step 1: pre-process the two address columns into the combined column.
df = pre_process_data.pre_process_data(df, address_columns, concatened_column_name)
# Step 2: extract the feature columns from the combined column.
df = feature_engineering.extract_features(df, concatened_column_name)

aliases indified: [1mconcat_STREET_ADDRESS_1_STREET_ADDRESS_2 -> [22m['STREET_ADDRESS_1', 'STREET_ADDRESS_2']
handle null values in the column [1mconcat_STREET_ADDRESS_1_STREET_ADDRESS_2[22m
extract features from -> concat_STREET_ADDRESS_1_STREET_ADDRESS_2


# Check new columns names

In [5]:
# Feature columns are recognised by prefix: key smash (ks), word embedding (we), regex (re).
feature_prefixes = ('feature_ks', 'feature_we', 'feature_re')
all_features_columns = [name for name in df.columns if name.startswith(feature_prefixes)]
all_features_columns

['feature_ks_count_sequence_squared_vowels_concat_STREET_ADDRESS_1_STREET_ADDRESS_2',
 'feature_ks_count_sequence_squared_consonants_concat_STREET_ADDRESS_1_STREET_ADDRESS_2',
 'feature_ks_count_sequence_squared_special_characters_concat_STREET_ADDRESS_1_STREET_ADDRESS_2',
 'feature_ks_ratio_of_numeric_digits_squared_concat_STREET_ADDRESS_1_STREET_ADDRESS_2',
 'feature_ks_average_of_char_count_squared_concat_STREET_ADDRESS_1_STREET_ADDRESS_2',
 'feature_we_0_concat_STREET_ADDRESS_1_STREET_ADDRESS_2',
 'feature_we_1_concat_STREET_ADDRESS_1_STREET_ADDRESS_2',
 'feature_we_2_concat_STREET_ADDRESS_1_STREET_ADDRESS_2',
 'feature_we_3_concat_STREET_ADDRESS_1_STREET_ADDRESS_2',
 'feature_we_4_concat_STREET_ADDRESS_1_STREET_ADDRESS_2',
 'feature_we_5_concat_STREET_ADDRESS_1_STREET_ADDRESS_2',
 'feature_we_6_concat_STREET_ADDRESS_1_STREET_ADDRESS_2',
 'feature_we_7_concat_STREET_ADDRESS_1_STREET_ADDRESS_2',
 'feature_we_8_concat_STREET_ADDRESS_1_STREET_ADDRESS_2',
 'feature_we_9_concat_STREET_A

# Select Features
- remove word embeddings
- remove key smash feature: ratio_of_numeric_digits_squared

In [6]:
# Discard every word-embedding column and the numeric-digit-ratio key-smash column;
# everything else in all_features_columns is kept, in its original order.
selected_features = [
    name
    for name in all_features_columns
    if not (name.startswith('feature_we') or 'ratio_of_numeric_digits_squared' in name)
]
selected_features

['feature_ks_count_sequence_squared_vowels_concat_STREET_ADDRESS_1_STREET_ADDRESS_2',
 'feature_ks_count_sequence_squared_consonants_concat_STREET_ADDRESS_1_STREET_ADDRESS_2',
 'feature_ks_count_sequence_squared_special_characters_concat_STREET_ADDRESS_1_STREET_ADDRESS_2',
 'feature_ks_average_of_char_count_squared_concat_STREET_ADDRESS_1_STREET_ADDRESS_2',
 'feature_re_contains_context_invalid_words_concat_STREET_ADDRESS_1_STREET_ADDRESS_2',
 'feature_re_contains_exactly_the_word_dell_concat_STREET_ADDRESS_1_STREET_ADDRESS_2',
 'feature_re_contains_exactly_the_word_test_concat_STREET_ADDRESS_1_STREET_ADDRESS_2',
 'feature_re_only_numbers_concat_STREET_ADDRESS_1_STREET_ADDRESS_2',
 'feature_re_only_special_characters_concat_STREET_ADDRESS_1_STREET_ADDRESS_2',
 'feature_re_contains_email_concat_STREET_ADDRESS_1_STREET_ADDRESS_2',
 'feature_re_contains_url_concat_STREET_ADDRESS_1_STREET_ADDRESS_2',
 'feature_re_contains_date_concat_STREET_ADDRESS_1_STREET_ADDRESS_2',
 'feature_re_contain

# Annotate data

In [7]:
# Thresholds passed to the annotator, keyed by key-smash feature name.
# 'ratio_of_numeric_digits_squared' (previously 2.9) is intentionally not included.
key_smash_thresholds = dict(
    count_sequence_squared_vowels=1.00,
    count_sequence_squared_consonants=1.999,
    count_sequence_squared_special_characters=2.2499,
    average_of_char_count_squared=2.78,
)

df = annotate_data.annotate_data(df, concatened_column_name, key_smash_thresholds)

# Distribution of the 'target' label, counting each distinct combined address once.
(
    df.drop_duplicates(subset=[concatened_column_name])['target']
    .value_counts()
)

[33mrunning annotate data with configs below...[37m
[1mthresholds -> [22m{'count_sequence_squared_vowels': 1.0, 'count_sequence_squared_consonants': 1.999, 'count_sequence_squared_special_characters': 2.2499, 'average_of_char_count_squared': 2.78}
column -> concat_STREET_ADDRESS_1_STREET_ADDRESS_2


valid                             1344254
key_smash                             657
contains_email                        569
contains_exactly_the_word_test        177
only_special_characters               144
contains_context_invalid_words        128
contains_exactly_the_word_dell        125
only_numbers                          106
only_one_char                          14
contains_exactly_invalid_words         10
is_substring_of_column_name             3
contains_date                           1
empty                                   1
Name: target, dtype: int64

In [8]:
# Distribution of the 'target' label over every row, duplicated addresses included.
target_labels = df['target']
target_labels.value_counts()

valid                             2511527
contains_context_invalid_words       3079
key_smash                            1494
only_special_characters              1291
contains_email                       1048
contains_exactly_the_word_test        667
contains_exactly_the_word_dell        553
only_one_char                         287
only_numbers                          239
empty                                  71
contains_exactly_invalid_words         26
is_substring_of_column_name            12
contains_date                           2
Name: target, dtype: int64

# Experiment: retrain model

In [9]:
# Train the random forest on the annotated frame, restricted to the selected
# feature columns; whatever the call returns is kept in `scores`.
scores = new_rf_model.train_and_get_scores(
    df,
    concatened_column_name,
    selected_features,
)

[33mtranning model...[37m
[32mdone[37m
[33mget model score...[37m
[1maccuracy -> [22m0.9987146529562982
[1mprecision -> [22m0.9946524064171123
[1mrecall -> [22m1.0
[1mf1 -> [22m0.9973190348525469


# Predict using the retrained model

In [10]:
# Store one prediction per row, computed from the selected feature columns.
df['prediction'] = new_rf_model.predict(
    df[selected_features],
    concatened_column_name,
)

# Prediction counts, taking each distinct combined address once.
(
    df.drop_duplicates(subset=[concatened_column_name])['prediction']
    .value_counts()
)

[33mrunning model...[37m


0.0    1345413
1.0        776
Name: prediction, dtype: int64

# Save model and predicted data

In [11]:
# Output locations handed to export_model: the pickled model first, then the
# normalization absolutes CSV.
model_output_path = '../data/models/RandomForest_Ksmash_Regex_Enrichments_Normalization.pkl'
normalization_output_path = '../data/models/normalization_absolutes.csv'

new_rf_model.export_model(model_output_path, normalization_output_path)

[33mexporting model and normalization absolutes...[37m


In [12]:
# Write the rows predicted as 1 (one row per distinct combined address) to a
# CSV whose file name starts with the current timestamp.
report_columns = [concatened_column_name, 'target', 'prediction']

(
    df.loc[df['prediction'] == 1, report_columns]
    .drop_duplicates(subset=[concatened_column_name])
    .to_csv(f'../data/tmp/{time.strftime("%Y%m%d-%H%M%S")}prediction_rf_ks_regex_enrich_normal.csv')
)