In [1]:
import pandas as pd
import numpy as np
import math
import os
import sys
import random
import string
import re
import nltk
import warnings
from tqdm import tqdm_notebook as tqdm
from tqdm.auto import tqdm as progress
from pandarallel import pandarallel
from matplotlib import pyplot as plt
from dstools.spark import init_spark2
import psutil

Model parameters:

In [2]:
# Number of characters a TAC code must have to be kept by the length filter.
tac_length = 8
# Tokens at least this long are de-duplicated by remove_tokens; shorter ones are always kept.
token_min = 2
# When True, Python warnings are silenced for the rest of the notebook.
rem_warnings = True

Resources management:

In [3]:
# Fraction of the currently available RAM offered to pandarallel as shared memory.
memory_usage = 0.1
# Fraction of the CPU count used as pandarallel workers.
kernels_usage = 0.1

Rules:

In [4]:
# Each switch enables one cleaning step further down; a step runs only when its flag `is True`.
remove_memory = True        # strip '<digits>gb' / '<digits>гб' from the cleaned model name
test_tac_length = True      # keep only rows whose TAC has exactly tac_length characters
tac_sims_rule = True        # keep only rows where the TAC count per model <= SIM count
rem_brack_content = True    # run remove_brackets on the model name
rem_repeats = True          # run remove_tokens (repeated-token removal)
seperate_digits = True      # insert a space between adjacent letters and digits
remove_vendors = True       # strip the vendor name from the cleaned model name

Suppress warnings:

In [5]:
# Install a blanket "ignore" filter only when the rem_warnings switch is exactly True.
if rem_warnings is True:
    warnings.filterwarnings(action='ignore')

Pandas progress:

In [6]:
# Hook tqdm into pandas; the `progress_apply` call used later in this notebook relies on it.
progress.pandas()

Remove brackets content:

In [7]:
def remove_brackets(item):
    """Drop every parenthesised group of `item` that contains an enumeration.

    A group is any '(...)' span without a nested ')'. A group counts as an
    enumeration when it contains a comma. Each group is judged on its own, so
    a comma-free group such as '(V987)' is kept even when another group in the
    same string is removed.

    Parameters
    ----------
    item : str
        Model name to clean.

    Returns
    -------
    str
        `item` with the comma-containing groups removed. Surrounding
        whitespace is left untouched.
    """

    def _drop_if_enumeration(match):
        group = match.group()
        # The parentheses themselves never contain a comma, so testing the
        # whole match is equivalent to testing its content.
        return '' if ',' in group else group

    return re.sub(r'\([^)]*\)', _drop_if_enumeration, item)

Remove vendor:

In [8]:
def remove_vendor(x, vendor=None):
    """Remove stand-alone occurrences of a vendor name from a model name.

    Parameters
    ----------
    x : str
        Cleaned model name.
    vendor : str, optional
        Vendor name to strip. When omitted, the vendor is taken from the first
        row of the global `df` whose 'model_d_clean' equals `x` (the original
        behaviour). NOTE(review): that lookup scans the whole frame on every
        call and picks an arbitrary vendor when several vendors share the same
        cleaned model name; pass `vendor` explicitly where the caller knows it.

    Returns
    -------
    str
        `x` with every occurrence of the vendor removed where it is not
        directly adjacent to another word character. Whitespace left behind
        is not collapsed.
    """
    if vendor is None:
        vendor = df.loc[df['model_d_clean'] == x, 'ven_d'].values[0]

    # The vendor is data, not a pattern: escape it so names containing regex
    # metacharacters ('+', '.', '(', ...) neither raise nor match unrelated text.
    # Lookarounds replace '\b' because '\b' cannot match next to a non-word
    # character, e.g. at the end of 'c++'.
    pattern = r'(?<!\w){}(?!\w)'.format(re.escape(vendor))
    return re.sub(pattern, '', x)

Remove repeating tokens:

In [9]:
def remove_tokens(x, min_len=None):
    """Remove repeated tokens from a whitespace-separated string.

    Tokens with at least `min_len` characters are kept only at their first
    occurrence; shorter tokens are always kept, repeats included. Token order
    is preserved and the result is joined with single spaces.

    Parameters
    ----------
    x : str
        Text to de-duplicate.
    min_len : int, optional
        Minimum token length subject to de-duplication. Defaults to the
        notebook-level `token_min` setting.

    Returns
    -------
    str
        The de-duplicated text.
    """
    if min_len is None:
        min_len = token_min

    seen = set()  # long tokens already emitted; set lookup avoids a quadratic list scan
    kept = []
    for token in x.split():
        if len(token) < min_len:
            kept.append(token)
        elif token not in seen:
            seen.add(token)
            kept.append(token)

    return ' '.join(kept)

Read the device catalog; the features used are 'TAC', 'MODEL_NAME', 'VENDOR_NAME', 'SIM_COUNT' and 'TERMINAL_TYPE':

In [10]:
# Source columns that the rest of the notebook works with.
dev_features = ['TAC', 'MODEL_NAME', 'VENDOR_NAME', 'SIM_COUNT', 'TERMINAL_TYPE']
# Load only those columns. TAC is an identifier, not a quantity: reading it as
# text keeps leading zeros (and avoids a float rendering such as '1234567.0'),
# which would otherwise make valid codes fail the TAC length check.
df = pd.read_csv('data/devices_dirty.csv', usecols=dev_features, dtype={'TAC': str})

Rename columns:

In [11]:
# Short working names for the catalog columns.
column_voc = dict(
    TAC='tac_d',
    MODEL_NAME='model_d',
    VENDOR_NAME='ven_d',
    SIM_COUNT='sim_d',
    TERMINAL_TYPE='class_d',
)
df = df.rename(columns=column_voc)

Change columns order:

In [12]:
# Keep exactly these five columns, in this order.
working_columns = ['class_d', 'ven_d', 'model_d', 'tac_d', 'sim_d']
df = df[working_columns]

Remove records with missing features:

In [13]:
# A row is dropped when any of these columns is missing; 'sim_d' may stay empty.
required_columns = ['class_d', 'ven_d', 'model_d', 'tac_d']
df = df.dropna(axis=0, how='any', subset=required_columns)

Convert types:

In [14]:
# The descriptive columns are handled as text from here on.
df = df.astype({'class_d': 'str', 'ven_d': 'str', 'model_d': 'str', 'tac_d': 'str'})
# Round-trip through int64 with -1 as a temporary stand-in for missing values,
# then turn the stand-in back into NaN. The placeholder has to be the integer -1:
# the string '-1' never matches an int64 column, which left -1 in the data.
df['sim_d'] = df['sim_d'].fillna(-1).astype('int64').replace(-1, np.nan)

Create new model feature model_d_clean:

In [15]:
# Start the cleaned name as a copy of the raw model name; later steps edit it.
df = df.assign(model_d_clean=df['model_d'])

If the bracketed part of a model name contains an enumeration (a comma-separated list), the bracketed part is removed:

In [16]:
# Optional step: run remove_brackets over every cleaned model name.
if rem_brack_content is True:
    df['model_d_clean'] = df['model_d_clean'].map(remove_brackets)

Remove punctuation from the model name:

In [17]:
# Build the character class from escaped punctuation so that ']', '\', '^' and '-'
# are literal members of the set (unescaped, the backslash was silently excluded).
punctuation_class = '[{}]'.format(re.escape(string.punctuation))
# regex=True is spelled out: pandas >= 2.0 treats the pattern as a literal string
# by default, which would leave all punctuation in place.
df['model_d_clean'] = df['model_d_clean'].str.replace(punctuation_class, ' ', regex=True)

Remove multiple spaces and reduce to lowercase:

In [18]:
def space_remove(x):
    """Collapse each run of spaces in `x` into a single space."""
    return re.sub(' +', ' ', x)


def lower_case(x):
    """Strip surrounding whitespace from `x` and lower-case it."""
    return x.strip().lower()


# Normalise spacing first, then trim and lower-case, cell by cell.
text_columns = ['class_d', 'ven_d', 'model_d_clean', 'tac_d']
df[text_columns] = df[text_columns].applymap(space_remove).applymap(lower_case)

Drop duplicates of the ('class_d', 'ven_d', 'model_d_clean', 'tac_d') vector:

In [19]:
# Rows that agree on all four key columns are duplicates; the first one is kept.
duplicate_key = ['class_d', 'ven_d', 'model_d_clean', 'tac_d']
df = df.drop_duplicates(subset=duplicate_key, keep='first')

Remove the memory size ('digits + GB'), since it does not influence the TAC:

In [20]:
if remove_memory is True:
    # Latin spelling first, then the Cyrillic one; each pass deletes "<digits><unit>".
    for memory_rx in (re.compile(r'\d+gb'), re.compile(r'\d+гб')):
        df['model_d_clean'] = df['model_d_clean'].apply(lambda name, rx=memory_rx: rx.sub('', name))

Separate numbers from non-numbers:

In [21]:
if seperate_digits is True:
    # Pass 1 splits letter->digit boundaries, pass 2 splits digit->letter boundaries;
    # both insert a single space between the two captured characters.
    boundary_patterns = (
        r'([A-Za-zА-Яа-я])(\d)',
        r'(\d)([A-Za-zА-Яа-я])',
    )
    for boundary in boundary_patterns:
        df['model_d_clean'] = df['model_d_clean'].apply(lambda name, rx=boundary: re.sub(rx, r'\1 \2', name))

Remove vendors from model names:

In [22]:
# Size the pandarallel pool from the configured fractions: at least one worker
# and at least 2048 MB of shared memory.
workers = max(round(kernels_usage*psutil.cpu_count()), 1)
memory = max(round(memory_usage*psutil.virtual_memory().available/(1024**2)), 2048)
pandarallel.initialize(progress_bar=True, verbose=0, shm_size_mb=memory, nb_workers=workers)


def _strip_own_vendor(row):
    """Return the row's 'model_d_clean' with stand-alone occurrences of its own 'ven_d' removed."""
    # The vendor is escaped because it is data, not a pattern; lookarounds are used
    # instead of '\b' so vendors that start or end with a non-word character still match.
    pattern = r'(?<!\w){}(?!\w)'.format(re.escape(row['ven_d']))
    return re.sub(pattern, '', row['model_d_clean'])


if remove_vendors is True:
    # Work row-wise so each model name is paired with the vendor of that same row.
    # Looking the vendor up again by model name costs a full scan of the frame per row
    # and returns the wrong vendor when two vendors share a cleaned model name.
    df['model_d_clean'] = df[['ven_d', 'model_d_clean']].parallel_apply(_strip_own_vendor, axis=1)



VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=15085), Label(value='0 / 15085')))…

Remove repeating tokens:

In [23]:
if rem_repeats is True:
    # De-duplicate the *cleaned* name, like every other cleaning step; the raw
    # 'model_d' column is left as read from the catalog. Because remove_tokens
    # re-joins tokens with single spaces, this also tidies the gaps left behind
    # by the memory-size and vendor removal steps.
    # NOTE(review): this step previously rewrote 'model_d' - confirm that the
    # cleaned column is the intended target.
    df['model_d_clean'] = df['model_d_clean'].progress_apply(remove_tokens)

HBox(children=(IntProgress(value=0, max=165928), HTML(value='')))




Keep only the entries with the correct TAC length:

In [24]:
if test_tac_length is True:
    # Boolean mask: True where the TAC string has exactly tac_length characters.
    has_expected_length = df['tac_d'].map(len) == tac_length
    df = df[has_expected_length]

Create a unique attribute 'ven_model_d':

In [25]:
# Vendor and cleaned model name joined by one space.
df['ven_model_d'] = df['ven_d'].str.cat(df['model_d_clean'], sep=' ')

Add number of tac numbers per model to the data frame:

In [26]:
# Count the TAC entries recorded for every vendor+model key ...
tacs_per_model = df[['ven_model_d', 'tac_d']].groupby('ven_model_d', as_index=False).count()
df_tac_num = tacs_per_model.rename(columns={'tac_d': 'tac_num_d'})
# ... and attach that count to each row of the main frame.
df = df.merge(df_tac_num, how='inner', on=['ven_model_d'])
df['tac_num_d'] = df['tac_num_d'].astype('int64')

Rule for programming tac code: n (tacs) <= n (sims). We exclude entries that violate this rule:

In [27]:
if tac_sims_rule is True:
    # Keep a row only when its model has no more TACs than SIM slots.
    within_sim_limit = df['tac_num_d'].le(df['sim_d'])
    df = df[within_sim_limit]

Drop duplicates:

In [28]:
# Second de-duplication pass on the same four-column key; the first match wins.
duplicate_key = ['class_d', 'ven_d', 'model_d_clean', 'tac_d']
df = df.drop_duplicates(subset=duplicate_key, keep='first')

Drop auxiliary features:

In [29]:
# These helper columns were only needed for the filtering steps.
df = df.drop(['sim_d', 'tac_num_d', 'ven_model_d'], axis='columns')

Change the order of features:

In [30]:
# Column order of the exported table.
export_order = ['class_d', 'ven_d', 'tac_d', 'model_d', 'model_d_clean']
df = df[export_order]

Rename columns:

In [31]:
# Working names -> names used in the exported file.
column_voc = dict(
    tac_d='model_tac_devices',
    model_d='model_name_devices',
    model_d_clean='model_name_devices_clean',
    ven_d='vendor_devices',
    class_d='category_devices',
)
df = df.rename(columns=column_voc)

Show header:

In [32]:
# Preview the first five rows.
df.head(n=5)

Unnamed: 0,category_devices,vendor_devices,model_tac_devices,model_name_devices,model_name_devices_clean
298,plain,huawei,35495401,U550,u 550
747,smartphone,cat,35369909,S60,s 60
748,smartphone,cat,35813707,S60,s 60
773,smartphone,zte,86433801,Grand X Quad (V987),grand x quad v 987
784,smartphone,infocus,35512306,M330,m 330


Show random sample:

In [33]:
# A fixed seed makes the preview identical on every "Restart & Run All"; the
# size is capped so a frame with fewer than five rows does not raise.
df.sample(n=min(5, len(df)), random_state=42)

Unnamed: 0,category_devices,vendor_devices,model_tac_devices,model_name_devices,model_name_devices_clean
78017,smartphone,sharp,35658405,Aquos Phone Xx (SoftBank 302SH),aquos phone xx softbank 302 sh
90678,smartphone,samsung,35217106,Galaxy S5 Duos (SM-G9009D),galaxy s 5 duos sm g 9009 d
99900,smartphone,blu,35293507,Life X8,life x 8
139035,smartphone,karbonn,91146475,A90 3G,a 90 3 g
73131,plain,spice,91113560,QT53,qt 53


Save the data to CSV:

In [34]:
# Write the table without the index column.
output_path = 'data/devices_clean.csv'
df.to_csv(output_path, index=False)