# Imports and class instantiations

In [1]:
import pandas as pd
import hygia as hg

# Every pipeline stage is configured for the same country, so the value is
# named once and shared by all three constructors.
COUNTRY = "MEXICO"

# Files consumed by the random-forest wrapper.
# NOTE(review): the .pkl is presumably unpickled inside hygia -- only point
# this at model files from a trusted source.
MODEL_PATH = '../data/models/RandomForest_Ksmash_Regex_Enrichments_Normalization.pkl'
NORMALIZATION_ABSOLUTES_PATH = '../data/models/normalization_absolutes.csv'

pre_process_data = hg.PreProcessData(country=COUNTRY)
augment_data = hg.AugmentData(country=COUNTRY)
feature_engineering = hg.FeatureEngineering(country=COUNTRY)
rf_model = hg.RandomForestModel(
    MODEL_PATH,
    normalization_absolutes_file=NORMALIZATION_ABSOLUTES_PATH,
)

[33mrunning feature engineering with configs below...[37m
[1mlanguage -> [22mes
[1mdimensions -> [22m25


# Load Data

In [2]:
# Raw address extract; fields are delimited by the diaeresis character '¨'.
file_path = '../data/tmp/AI_LATA_ADDRESS_MEX_modificado.csv'
df = pd.read_csv(
    file_path,
    sep='¨',
    nrows=None,  # None loads every row; an integer here would load only a sample
    # NOTE(review): presumably the python engine is requested because of the
    # non-ASCII separator -- confirm before switching parsers.
    engine='python',
)

# Augment Data with context validations

In [3]:
# Run the context-validation augmentation, passing 'ZIP_CODE_L' as the
# zip-code column; the result replaces the working frame.
df = augment_data.augment_data(
    df,
    zipcode_column_name='ZIP_CODE_L',
)

# Add new columns

1. Concatenate address
2. Add all feature columns:
    - Key Smash
    - Regex
    - Word Embedding

In [4]:
# Name of the column that receives the merged street-address text. Later
# cells read this variable, so its (misspelled) name is kept unchanged.
concatened_column_name = 'concat_STREET_ADDRESS_1_STREET_ADDRESS_2'
address_columns = ['STREET_ADDRESS_1', 'STREET_ADDRESS_2']

# Step 1: pre-process the two address columns into the concatenated column.
df = pre_process_data.pre_process_data(df, address_columns, concatened_column_name)

# Step 2: extract the feature columns from the concatenated address.
df = feature_engineering.extract_features(df, concatened_column_name)

aliases indified: [1mconcat_STREET_ADDRESS_1_STREET_ADDRESS_2 -> [22m['STREET_ADDRESS_1', 'STREET_ADDRESS_2']
handle null values in the column [1mconcat_STREET_ADDRESS_1_STREET_ADDRESS_2[22m
extract features from -> concat_STREET_ADDRESS_1_STREET_ADDRESS_2


# Check feature column names

In [5]:
# Prefixes that mark generated feature columns
# (presumably ks = key smash, we = word embedding, re = regex).
feature_prefixes = ('feature_ks', 'feature_we', 'feature_re')

# Iterating a DataFrame yields its column labels; str.startswith accepts a
# tuple, so one call covers all three prefixes.
all_features_columns = [col for col in df if col.startswith(feature_prefixes)]

# Columns fed to the model: everything above except the 'feature_we' columns
# and any column whose name contains 'ratio_of_numeric_digits_squared'.
model_features_columns = [
    col
    for col in all_features_columns
    if not (col.startswith('feature_we') or 'ratio_of_numeric_digits_squared' in col)
]
model_features_columns

['feature_ks_count_sequence_squared_vowels_concat_STREET_ADDRESS_1_STREET_ADDRESS_2',
 'feature_ks_count_sequence_squared_consonants_concat_STREET_ADDRESS_1_STREET_ADDRESS_2',
 'feature_ks_count_sequence_squared_special_characters_concat_STREET_ADDRESS_1_STREET_ADDRESS_2',
 'feature_ks_average_of_char_count_squared_concat_STREET_ADDRESS_1_STREET_ADDRESS_2',
 'feature_re_contains_context_invalid_words_concat_STREET_ADDRESS_1_STREET_ADDRESS_2',
 'feature_re_contains_exactly_the_word_dell_concat_STREET_ADDRESS_1_STREET_ADDRESS_2',
 'feature_re_contains_exactly_the_word_test_concat_STREET_ADDRESS_1_STREET_ADDRESS_2',
 'feature_re_only_numbers_concat_STREET_ADDRESS_1_STREET_ADDRESS_2',
 'feature_re_only_special_characters_concat_STREET_ADDRESS_1_STREET_ADDRESS_2',
 'feature_re_contains_email_concat_STREET_ADDRESS_1_STREET_ADDRESS_2',
 'feature_re_contains_url_concat_STREET_ADDRESS_1_STREET_ADDRESS_2',
 'feature_re_contains_date_concat_STREET_ADDRESS_1_STREET_ADDRESS_2',
 'feature_re_contain

# Predict using pre-trained model

In [6]:
# Score every row with the pre-trained model, using only the selected
# feature columns, and store the result as a new column.
model_input = df[model_features_columns]
df['prediction_is_key_smash'] = rf_model.predict(model_input, concatened_column_name)

# Show how many rows fall into each predicted class.
df['prediction_is_key_smash'].value_counts()

[33mrunning model...[37m


0.0    2518008
1.0       2288
Name: prediction_is_key_smash, dtype: int64

# Save predicted data

In [7]:
# Export the addresses predicted as key smash (prediction == 1), one row per
# distinct concatenated address.
is_key_smash = df['prediction_is_key_smash'] == 1
(
    # A single .loc call selects rows and columns together instead of
    # chaining two indexing operations.
    df.loc[is_key_smash, [concatened_column_name, 'prediction_is_key_smash']]
    .drop_duplicates(subset=[concatened_column_name])
    # Plain string literal: the path has no placeholders, so the f-prefix was
    # dropped. `index` keeps its default, so the frame index is written too.
    .to_csv('../data/tmp/prediction_rf_ks_we_regex_enrich_normal.csv')
)