In [1]:
import pandas as pd
import hygia as hg
import pickle

# Class instantiation

NOTE: Please check if the model_path matches your path

In [3]:
# Create one hygia helper object per pipeline stage; later cells call them by these names.
# Builds the concatenated address column (see the "Add new columns" cell).
pre_process_data = hg.PreProcessData()
# Adds the `feature_*` columns via `extract_features`.
feature_engineering = hg.FeatureEngineering()
# Produces the `target` label column via `annotate_data`.
annotate_data = hg.AnnotateData()
# Random-forest wrapper used below for both training and prediction.
new_rf_model = hg.RandomForestModel()

# Load Data

NOTE: Please check if the file_path matches your data

In [4]:
# Raw address export to analyse; adjust this path to point at your own data.
file_path = '../data/tmp/AI_LATA_ADDRESS_MEX_modificado.csv'

# The file is delimited by the diaeresis character rather than a comma.
# NOTE(review): engine='python' is presumably needed because of this
# non-ASCII separator - confirm before removing it.
df = pd.read_csv(
    file_path,
    sep='¨',
    nrows=None,  # None loads every row; use an integer to read only a sample
    engine='python',
)

# Add new columns

1. Concatenate address
2. All features columns:
    - Key Smash
    - Regex
    - Word Embedding

NOTE: Please check that the column names match your data

In [5]:
# Column that will hold the two street-address fields joined together.
concatened_column_name = 'concat_STREET_ADDRESS_1_STREET_ADDRESS_2'
# Source columns to concatenate; rename these to match your data.
address_columns = ['STREET_ADDRESS_1', 'STREET_ADDRESS_2']

# Step 1: build the concatenated address column.
df = pre_process_data.pre_process_data(df, address_columns, concatened_column_name)
# Step 2: derive the key-smash, regex and word-embedding features from it.
df = feature_engineering.extract_features(df, concatened_column_name)

# Check new column names

In [6]:
# Prefixes of the generated feature columns: key smash (ks), word embedding (we), regex (re).
feature_prefixes = ('feature_ks', 'feature_we', 'feature_re')

# str.startswith accepts a tuple, so one call covers all three prefixes.
all_features_columns = [name for name in df.columns if name.startswith(feature_prefixes)]
all_features_columns

['feature_ks_count_sequence_squared_vowels_concat_STREET_ADDRESS_1_STREET_ADDRESS_2',
 'feature_ks_count_sequence_squared_consonants_concat_STREET_ADDRESS_1_STREET_ADDRESS_2',
 'feature_ks_count_sequence_squared_special_characters_concat_STREET_ADDRESS_1_STREET_ADDRESS_2',
 'feature_ks_ratio_of_numeric_digits_squared_concat_STREET_ADDRESS_1_STREET_ADDRESS_2',
 'feature_ks_average_of_char_count_squared_concat_STREET_ADDRESS_1_STREET_ADDRESS_2',
 'feature_we_0_concat_STREET_ADDRESS_1_STREET_ADDRESS_2',
 'feature_we_1_concat_STREET_ADDRESS_1_STREET_ADDRESS_2',
 'feature_we_2_concat_STREET_ADDRESS_1_STREET_ADDRESS_2',
 'feature_we_3_concat_STREET_ADDRESS_1_STREET_ADDRESS_2',
 'feature_we_4_concat_STREET_ADDRESS_1_STREET_ADDRESS_2',
 'feature_we_5_concat_STREET_ADDRESS_1_STREET_ADDRESS_2',
 'feature_we_6_concat_STREET_ADDRESS_1_STREET_ADDRESS_2',
 'feature_we_7_concat_STREET_ADDRESS_1_STREET_ADDRESS_2',
 'feature_we_8_concat_STREET_ADDRESS_1_STREET_ADDRESS_2',
 'feature_we_9_concat_STREET_A

# Annotate data

In [7]:
# One threshold per key-smash feature, keyed by the feature's base name.
# NOTE(review): presumably a feature value above its threshold marks the
# address as a key smash - confirm against AnnotateData.
key_smash_thresholds = dict(
    count_sequence_squared_vowels=1.00,
    count_sequence_squared_consonants=1.999,
    count_sequence_squared_special_characters=2.2499,
    ratio_of_numeric_digits_squared=2.9,
    average_of_char_count_squared=2.78,
)

# Label every row; the result carries the `target` column inspected in the next cell.
df = annotate_data.annotate_data(
    df,
    concatened_column_name,
    key_smash_thresholds,
)
df

valid                             1363810
key_smash                             904
contains_email                        412
contains_exactly_the_word_test        182
only_special_characters               151
contains_exactly_the_word_dell        126
only_one_char                          16
contains_invalid_words                 10
only_white_spaces                       2
Name: target, dtype: int64

In [8]:
# Distribution of the annotation labels: how many rows fall in each `target` class.
df['target'].value_counts()

valid                             2514911
key_smash                            1770
only_special_characters              1291
contains_email                        720
contains_exactly_the_word_test        667
contains_exactly_the_word_dell        553
only_one_char                         287
only_white_spaces                      71
contains_invalid_words                 26
Name: target, dtype: int64

# Experiment: retrain model

In [9]:
# Retrain the random forest on the annotated frame using the generated feature columns.
# `clf` is the object pickled in the "Save model" cell; `scores` holds the evaluation metrics.
clf, scores = new_rf_model.train_and_get_scores(df, concatened_column_name, all_features_columns)
# Display the metrics as the cell output.
scores

{'accuracy': 0.998769987699877,
 'precision': 0.9965156794425087,
 'recall': 1.0,
 'f1': 0.9982547993019197}

# Predict using pre-trained model

In [10]:
# Score every row from its feature columns only.
feature_matrix = df[all_features_columns].values
df['prediction'] = new_rf_model.predict(feature_matrix)

# Count predictions once per distinct address so repeated addresses do not skew the totals.
unique_addresses = df.drop_duplicates(subset=[concatened_column_name])
unique_addresses['prediction'].value_counts()



0.0    1362585
1.0       3028
Name: prediction, dtype: int64

# Save model and predicted data

In [14]:
# Persist the classifier returned by the training cell.
# The context manager closes (and therefore flushes) the file even if pickling
# fails; a bare open(...) left the handle to be cleaned up by the garbage collector.
with open('../data/models/RandomForest_Ksmash_WordEmbedding_Regex.pkl', 'wb') as model_file:
    pickle.dump(clf, model_file)

In [11]:
# Keep only the rows the model flagged (prediction == 1).
flagged_rows = df[df['prediction'] == 1]
# Export just the address text and its prediction, one line per distinct address.
flagged_addresses = flagged_rows[[concatened_column_name, 'prediction']]
flagged_addresses.drop_duplicates(subset=[concatened_column_name]).to_csv('../data/tmp/prediction.csv')

In [12]:
# Compare the model's output with the rule-based key-smash annotation and
# export the addresses on which they disagree.
#
# The model output must be copied into `prediction_model` BEFORE `prediction`
# is overwritten with the rule-based label. Doing it the other way round makes
# both columns identical, so the exported file is always empty.
# NOTE: this cell expects `prediction` to still hold the model output, so
# re-run the prediction cell before re-running this one.
df['prediction_model'] = df['prediction']
df['prediction'] = df['target'] == 'key_smash'

# Rows where the model and the rule-based annotation disagree.
disagreements = df[df['prediction_model'] != df['prediction']]
disagreements[[concatened_column_name, 'target', 'prediction_model', 'prediction']] \
    .drop_duplicates(subset=[concatened_column_name]) \
    .to_csv('../data/tmp/prediction_better_with_model.csv')

# Using YAML

In [2]:
# Run the whole pipeline from a YAML configuration file instead of calling each stage by hand.
# NOTE(review): the recorded output of this cell ends with
# "TypeError: expected string or bytes-like object" during feature engineering
# on STREET_ADDRESS_1 - investigate before relying on `results`.
config_path = '../config/default_config.yaml'
results = hg.run_with_config(config_path)
results

[35m------ HYGIA ------[37m
------------------------------
[47m[30mRunning PRE PROCESSING...[40m[37m
aliases indified: [1mstreet -> [22m['STREET_ADDRESS_1', 'STREET_ADDRESS_2']
------------------------------
[47m[30mRunning FEATURE ENGINEERING...[40m[37m
[33mrunning feature engineering with configs below...[37m
[1mlanguage -> [22mes
[1mdimensions -> [22m25
extract features from -> street
[33mrunning feature engineering with configs below...[37m
[1mlanguage -> [22mes
[1mdimensions -> [22m25
extract features from -> STREET_ADDRESS_1


TypeError: expected string or bytes-like object