# Import libraries and data to add features.

In [1]:
import pandas as pd
import numpy as np
import re

df = pd.read_csv('data/ags_data.csv', index_col=0)
df_clients = pd.read_csv('data/cliente_tabla.csv', index_col=0)
df = df.iloc[:, :-4]

# Name of product decomposition to important data

There are important features in the column 'NombreProducto' that can be decomposed into important data like amount of pieces per product, brand name and weight.

In [2]:
#Function to get just the product name from the column NombreProducto

#def get_product_name(df):
    #Buga's regex

Amount of pieces given product name

In [3]:
def df_add_piece_amount(df):
    pieces = df['NombreProducto'].str.extract(' (\d+)(p|pct)', expand=True)[0]
    pieces.fillna(1, inplace=True)
    df['PieceAmount'] = pieces.astype(int)
    return df

df = df_add_piece_amount(df)

Amount of weight given product name

In [4]:
# Function to get the weight of the product from the column NombreProducto

def df_add_weight_grams(df):
    weights = df['NombreProducto'].str.extract(r' (\d+)(ml|g|Kg|kg)', expand=True)
    #Tostada Ondulada Tubo is the only product with no weight in the name, searched for it and found it was 360g
    weights[0].fillna(360,inplace=True)
    weights[1].fillna('g',inplace=True)
    weights[0] = weights[0].astype(int)
    weights[1] = weights[1].astype(str)
    weights[1] = weights[1].str.lower()
    weights[0] = np.where(weights[1] == 'kg', weights[0]*1000, weights[0])

    df['WeightGrams'] = weights[0]
    return df

df = df_add_weight_grams(df)

Save new table with added features to a csv file.

In [5]:
df.to_csv('data/ags_data_processed.csv')

# Client grouping based on name and size.
Clients that are only in the state of importance.

In [6]:
# Clean cliente_tabla, only keep clients from aguascalientes, drop duplicates.
def get_clients_ags(df, df_clients):
    df_clients.reset_index(inplace=True)
    unique_clients_list = df['Cliente_ID'].unique()
    df_clients = df_clients[df_clients['Cliente_ID'].isin(unique_clients_list)]
    df_clients.drop_duplicates(subset='Cliente_ID', inplace=True, keep='first')
    return df_clients

df_clients = get_clients_ags(df, df_clients)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_clients.drop_duplicates(subset='Cliente_ID', inplace=True, keep='first')


Client type grouping based on name of client.

In [7]:
# Separate clients into categories given their names.
def add_client_type(df_clients):
    df_clients['ClientType'] = np.nan
    
    df_clients.loc[(df_clients.NombreCliente.str.contains('.*NO IDENTIFICADO.*', case=False)) & (df_clients['ClientType'].isna()), 'ClientType']\
                                                            = 'NI'
    df_clients.loc[(df_clients.NombreCliente.str.contains('.*WAL MART.*|.*SORIANA.*|.*LA COMER.*|.*SUPERAMA.*|.*AURRERA.*|.*CHEDRAUI.*|.*SUPERCENTER.*|.*COMERCIAL MEXICANA.*|.*COSTCO.*|.*SAMS.*|.*MI BODEGA.*', case=False))  & (df_clients['ClientType'].isna()), 'ClientType']\
                                                            = 'Big Store'
    df_clients.loc[(df_clients.NombreCliente.str.contains('.*ABARROT.*|.*ABTS.*|.*CREMERIA.*|.*MINI SUPER.*|.*FRUTERIA.*|.*CARNICERIA.*|.*VINOS.*|.*TIENDITA.*|.*PAPELERIA.*|.*LA FLOR.*', case=False)) & (df_clients['ClientType'].isna()),'ClientType']\
                                                            = 'Small Store'
    df_clients.loc[(df_clients.NombreCliente.str.contains('.*BODEGA.*|.*MERCADO.*|.*PLAZA.*|.*SUPER.*|.*MODELORAMA.*',case=False))  & (df_clients['ClientType'].isna()), 'ClientType']\
                                                            = 'Medium Store'
    df_clients.loc[(df_clients.NombreCliente.str.contains('.*COLEG.*|.*UNIV.*|.*ESCUELA.*|.*INSTI.*|.*PREPAR.*|.*SECUNDARIA.*|.*CBTIS.*', case=False))  & (df_clients['ClientType'].isna()), 'ClientType']\
                                                            = 'School'
    df_clients.loc[(df_clients.NombreCliente.str.contains('.*FARMACIA.*') & (df_clients['ClientType'].isna())), 'ClientType']\
                                                            = 'Pharmacy'
    df_clients.loc[(df_clients.NombreCliente.str.contains('.*CARLS JR.*|.*CAFE.*|.*BURGER.*|.*BURGUER.*|.*HAMBUR.*|.*PIZZA.*|.*LONCHES.*|.*GORDITAS.*|.*CARNITAS.*|.*BURRITOS.*|.*TACOS.*|.*TAQUERIA.*|.*LONCHERIA.*|.*LA MICHOACANA.*|.*JUGOS.*|.*LICUADOS.*|.*CHOCOS.*') & (df_clients['ClientType'].isna())), 'ClientType']\
                                                            = 'Restaurant'
    df_clients.loc[(df_clients.NombreCliente.str.contains('.*OXXO.*') & (df_clients['ClientType'].isna())), 'ClientType']\
                                                            = 'OXXO Store'
    df_clients.loc[(df_clients.NombreCliente.str.contains('.*CERESO.*') & (df_clients['ClientType'].isna())), 'ClientType']\
                                                            = 'Government'
    df_clients.loc[(df_clients.NombreCliente.str.contains('.*JATCO.*|.*PRIMERA PLUS.*') & (df_clients['ClientType'].isna())), 'ClientType']\
                                                            = 'Business'
    df_clients.loc[(df_clients.NombreCliente.str.contains('.*EXPENDIO.*|.*BIMBO.*') & (df_clients['ClientType'].isna())), 'ClientType']\
                                                            = 'Business'
    df_clients['ClientType'].fillna('Individual', inplace=True)
    
    return df_clients

df_clients = add_client_type(df_clients)
df_clients.head(5)

Unnamed: 0,Cliente_ID,NombreCliente,ClientType
2661,9200,CECILIA,Individual
2990,10433,ABARROTES ELIZABETH,Small Store
3021,10513,ABTS GONZALEZ,Small Store
3032,10533,ABARROTES LA PESADITA,Small Store
9418,26298,ABARROTES CHOLE,Small Store


Save new table with grouped features in a csv file.

In [8]:
df_clients.to_csv('data/clients_ags_processed.csv', index=False)