In [3]:
import pandas as pd
import numpy as np
import math
import matplotlib.pyplot as plt
import os
import sys
import random
import string
import re
from collections import OrderedDict
from tqdm import tqdm_notebook as tqdm

Parameters:

In [4]:
tac_length = 8

Read the catalog of devices with features: 'tac', 'model_name', 'vendor_name', 'sim_count', 'release_date':

In [5]:
dev_features = ['TAC', 'MODEL_NAME', 'VENDOR_NAME', 'SIM_COUNT']
df = pd.read_csv('data/devices_dirty.csv', usecols=dev_features)

Number of records before cleaning:

In [6]:
dim_before = int(df.shape[0])
print('Number of records:', int(df.shape[0]))

Number of records: 165963


Rename features:

In [7]:
column_voc = {'TAC': 'tac_d', 'VENDOR_NAME': 'ven_d', 'MODEL_NAME': 'model_d', 'SIM_COUNT': 'sim_d'}
df = df.rename(columns=column_voc)

Change order of features:

In [8]:
df = df[['ven_d', 'model_d', 'tac_d', 'sim_d']]

Show nulls for all features - relative values:

In [9]:
print('Show nulls for all features - relative values:', dict(round(df.isnull().sum()/len(df), 2)))

Show nulls for all features - relative values: {'ven_d': 0.0, 'model_d': 0.0, 'tac_d': 0.0, 'sim_d': 0.65}


Show nulls for all features - absolute values:

In [10]:
print('Show nulls for all features - absolute values:', dict(df.isnull().sum()))

Show nulls for all features - absolute values: {'ven_d': 0, 'model_d': 0, 'tac_d': 0, 'sim_d': 108350}


Drop records with missed records:

In [11]:
df.dropna(subset=['ven_d', 'model_d', 'tac_d'], how='any', axis=0, inplace=True)

Type conversion:

In [12]:
df = df.astype({'ven_d': 'str', 'model_d': 'str', 'tac_d': 'str'})

Removing punctuation from string attributes:

In [13]:
for item in tqdm(list(string.punctuation)):
    df['model_d'] = df['model_d'].apply(lambda x: x.replace(item, ' '))

HBox(children=(IntProgress(value=0, max=32), HTML(value='')))




Remove multiple spaces and reduce to lowercase:

In [14]:
df = df.applymap(lambda x: re.sub(' +', ' ', x) if isinstance(x, str) else x)
df = df.applymap(lambda x: x.strip().lower() if isinstance(x, str) else x)

Drop_duplicates:

In [15]:
df.drop_duplicates(inplace=True, keep='first')

Delete duplicate words in the model name:

In [16]:
df['model_d'] = df['model_d'].apply(lambda x: ' '.join(OrderedDict.fromkeys(x.split())))

We remove manufacturers from model names:

In [17]:
for item in tqdm(list(set(df['ven_d']))):
    df['model_d'] = df['model_d'].apply(lambda x: x.replace(item, ''))

HBox(children=(IntProgress(value=0, max=6887), HTML(value='')))




We delete the characteristic of the memory size 'number + GB': it does not affect the tac number:

In [18]:
df['model_d'] = df['model_d'].apply(lambda x: re.sub(r'\d+gb', '', x))

Separate numbers from non-numbers:

In [19]:
df['model_d'] = df['model_d'].apply(lambda x: re.sub(r'(\D)(\d)', r'\1 \2', x))
df['model_d'] = df['model_d'].apply(lambda x: re.sub(r'(\d)(\D)', r'\1 \2', x))

Leave records with correct tac:

In [20]:
df = df[df['tac_d'].apply(len) == tac_length]

Delete entries with missing values:

In [21]:
df = df.apply(lambda x: x.str.strip() if isinstance(x, str) else x).replace('', np.nan)
df.dropna(subset=['model_d'], how='any', inplace=True)

Create new feature 'ven_model_d':  

In [22]:
df['ven_model_d'] = df['ven_d'] + ' ' + df['model_d']

Add feature with the number of tac numbers per model:

In [23]:
df_tac_num = df[['ven_model_d', 'tac_d']].groupby('ven_model_d', as_index=False).count().rename(columns={'tac_d': 'tac_num_d'})
df = pd.merge(df, df_tac_num, on=['ven_model_d'], how='inner')

Rule for tac code programming: n(tacs)<=n(sims). We exclude entries that violate this rule:

In [24]:
df = df[df['tac_num_d'] <= df['sim_d']]

Ambiguity tac => model:

In [25]:
df_tac = df[['ven_model_d', 'tac_d']].groupby('tac_d').count()
print('One tac corresponds to max', df_tac['ven_model_d'].max(), 'device(s)')

One tac corresponds to max 1 device(s)


Ambiguity model => tac:

In [26]:
df_mod = df[['ven_model_d', 'tac_d']].groupby('ven_model_d').count()
print('One device corresponds to max', round(df_mod['tac_d'].max()), 'tac')

One device corresponds to max 4 tac


Duplicate reset:

In [27]:
df.drop_duplicates(inplace=True, keep='first', subset=['model_d', 'tac_d'])

Drop auxiliary features:

In [28]:
df = df.drop(columns=['sim_d', 'tac_num_d', 'ven_model_d'])

Statistics on the distribution of the number of words in the model name:

In [29]:
print('Average number of words in model:', round(df['model_d'].str.split().apply(len).mean()))
print('Std number of words in model:', round(df['model_d'].str.split().apply(len).std()))

Average number of words in model: 5
Std number of words in model: 3


Number of records:

In [30]:
dim_after = int(df.shape[0])
print('Number of records:', int(df.shape[0]))

Number of records: 10156


Compression by the number of records:

In [31]:
print('Compression by the number of records:', round(dim_before/dim_after))

Compression by the number of records: 16


Frame description:

In [32]:
df.describe()[1:2][:]

Unnamed: 0,ven_d,model_d,tac_d
unique,237,9109,10156


Output a random sample:

In [33]:
df.sample(n=10)

Unnamed: 0,ven_d,model_d,tac_d
129678,karbonn,k 1,91000902
104808,intex,aqua extreme,91141570
91026,wiko,fever 4 g,35504807
8927,meizu,m 5 meilan 5 m 611 h,86305903
69422,huawei,e 5220 s 6,86947101
65111,zte,945,86448500
135478,huawei,p 10 plus vky al 00 1,86609103
140886,egltel,v 100 multiple v 300 v 400 700,35258804
47215,lava,a 48,91151330
146893,nec,525,35190700


Write csv to the directory:

In [34]:
df.to_csv('data/devices_clean.csv', index=False)