In [1]:
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings("ignore")
pd.set_option('display.max_columns', None)

# Load the prepared dataset and discard the 'Unnamed: 0' column
# (presumably an index saved by an earlier to_csv -- TODO confirm).
df1 = pd.read_csv('./Working Code/Datasets/dataframe_finale.csv').drop(columns='Unnamed: 0')


In [2]:
# Keep only the rows whose 'QTA' is different from zero, then drop every row
# that still contains at least one null value.
df1 = df1[df1['QTA'].ne(0)].dropna()

In [3]:
df1['PRODUCT_GROUP'].unique()

array(['CORE WEARABLES', 'WASHINGMACHINES FREESTANDING',
       'MOBILE COMPUTING', 'SMARTPHONES', 'PTV/FLAT'], dtype=object)

In [4]:
# Split the cleaned frame into one frame per product group.
product_group = df1['PRODUCT_GROUP']

core_wear = df1.loc[product_group.eq('CORE WEARABLES')]
wash = df1.loc[product_group.eq('WASHINGMACHINES FREESTANDING')]
pc = df1.loc[product_group.eq('MOBILE COMPUTING')]
smartphones = df1.loc[product_group.eq('SMARTPHONES')]
tv = df1.loc[product_group.eq('PTV/FLAT')]

In [5]:
# Load the GfK product characteristics.
# NOTE(review): dropna() with its defaults removes every row that contains at
# least one null, not only the fully empty rows; if only fully empty rows
# should go, how='all' would be needed -- confirm the intent.
gfk = (
    pd.read_csv('./Working Code/Datasets/gfk_caratteristiche_training.csv')
    .dropna()
    .replace('ND', np.nan)              # 'ND' placeholders become real nulls, handled per column later
    .drop(columns='CARATTERISTICA_08')  # per the original note, this column holds only null values
)

In [6]:
gfk.isnull().sum()

ITEM_ID                 0
CARATTERISTICA_01       0
CARATTERISTICA_02       0
CARATTERISTICA_05      77
CARATTERISTICA_04       1
CARATTERISTICA_03       1
CARATTERISTICA_06     235
CARATTERISTICA_07    1561
dtype: int64

# SMARTPHONES

In [8]:
# Rebuild the smartphone subset from df1, attach the GfK characteristics by
# 'ITEM_ID' (left join keeps every promo row) and add the mean 'SCONTO_PERC'
# of each flyer ('CODICE_VOLANTINO') as 'SCONTO_PERC_MEDIO_VOLANTINO'.
smartphones = df1[df1['PRODUCT_GROUP']=='SMARTPHONES'].merge(gfk, on='ITEM_ID', how='left')
discount_by_flyer = smartphones.groupby('CODICE_VOLANTINO')['SCONTO_PERC']
smartphones['SCONTO_PERC_MEDIO_VOLANTINO'] = discount_by_flyer.transform('mean')

## CAR_01

In [9]:
smartphones['CARATTERISTICA_01'].unique()

array(['DISPLAY SIZE 6.1', 'DISPLAY SIZE 6.5', 'DISPLAY SIZE 5.8',
       'DISPLAY SIZE 5.4', 'DISPLAY SIZE 6.7', nan, 'DISPLAY SIZE 4.7',
       'DISPLAY SIZE 5.5', 'DISPLAY SIZE 6.3', 'DISPLAY SIZE 5.2',
       'DISPLAY SIZE 6.2', 'DISPLAY SIZE 5.9', 'DISPLAY SIZE 5',
       'DISPLAY SIZE 5.99', 'DISPLAY SIZE 5.7', 'DISPLAY SIZE 5.3',
       'DISPLAY SIZE 6.55', 'DISPLAY SIZE 6.6', 'DISPLAY SIZE 6.53',
       'DISPLAY SIZE 6.52', 'DISPLAY SIZE 5.84', 'DISPLAY SIZE 6.21',
       'DISPLAY SIZE 5.93', 'DISPLAY SIZE 5.65', 'DISPLAY SIZE 6.59',
       'DISPLAY SIZE 6.67', 'DISPLAY SIZE 6.56', 'DISPLAY SIZE 6.74',
       'DISPLAY SIZE 6', 'DISPLAY SIZE 6.39', 'DISPLAY SIZE 5.1',
       'DISPLAY SIZE 6.15', 'DISPLAY SIZE 6.47', 'DISPLAY SIZE 6.4',
       'DISPLAY SIZE 5.71', 'DISPLAY SIZE 5.45', 'DISPLAY SIZE 6.09',
       'DISPLAY SIZE 6.26', 'DISPLAY SIZE 6.28', 'DISPLAY SIZE 4',
       'DISPLAY SIZE 6.71', 'DISPLAY SIZE 6.43', 'DISPLAY SIZE 6.72',
       'DISPLAY SIZE 6.57', 'DISPLAY SIZ

In [10]:
# Drop the rows labelled 'TYPE GRAPHIC-DEVICE': the value is not a display
# size, so it is treated as a data error. Rows with a missing value are kept.
is_graphic_device = smartphones['CARATTERISTICA_01'].eq('TYPE GRAPHIC-DEVICE')
smartphones = smartphones[~is_graphic_device]

In [11]:
# For smartphones CARATTERISTICA_01 holds 'DISPLAY SIZE ...' labels: give it a descriptive name.
smartphones = smartphones.rename({'CARATTERISTICA_01': 'DISPLAY_SIZE'}, axis='columns')

In [12]:
# Strip the 'DISPLAY SIZE ' label and keep the number as float
# (e.g. 'DISPLAY SIZE 6.22' -> 6.22); missing values stay NaN.
display_size_text = smartphones['DISPLAY_SIZE'].str.replace('DISPLAY SIZE ', '', regex=False)
smartphones['DISPLAY_SIZE'] = display_size_text.astype(float)

In [13]:
# Fill the missing display sizes with the most frequent value.
# The result is assigned back to the column: an inplace call on
# smartphones['DISPLAY_SIZE'] is a chained assignment and does not update the
# frame when pandas Copy-on-Write is active.
most_frequent_display_size = smartphones['DISPLAY_SIZE'].value_counts().index[0]
smartphones['DISPLAY_SIZE'] = smartphones['DISPLAY_SIZE'].fillna(most_frequent_display_size)

## CAR_02

In [14]:
smartphones['CARATTERISTICA_02'].unique()

array(['OPERATOR SIM FREE', nan, 'OPERATOR WINDTRE', 'OPERATOR VODAFONE',
       'OPERATOR TIM', 'OPERATOR WIND'], dtype=object)

In [15]:
# For smartphones CARATTERISTICA_02 holds 'OPERATOR ...' labels: give it a descriptive name.
smartphones = smartphones.rename({'CARATTERISTICA_02': 'OPERATOR'}, axis='columns')

In [16]:
# Remove the leading 'OPERATOR ' label (e.g. 'OPERATOR TIM' -> 'TIM').
# The column is deliberately NOT cast with astype(str): that cast turns missing
# values into the literal string 'nan', which a later fillna() can no longer see.
smartphones['OPERATOR'] = smartphones['OPERATOR'].str.replace('OPERATOR ', '', regex=False)

In [17]:
# Missing operators default to 'SIM FREE'; 'WIND' is merged into 'WINDTRE'
# because both labels denote the same operator.
# The result is assigned back to the column: inplace calls on
# smartphones['OPERATOR'] are chained assignments and do not update the frame
# when pandas Copy-on-Write is active.
smartphones['OPERATOR'] = (
    smartphones['OPERATOR']
    .fillna('SIM FREE')
    .replace('WIND', 'WINDTRE')
)

## CAR_03

In [21]:
smartphones['CARATTERISTICA_03'].unique()

array(['GENERATION TOTAL* 4.X G', 'GENERATION TOTAL* 5.X G', nan,
       'CAPACITY 2716', 'CAPACITY 4100', 'CAPACITY 4130', 'CAPACITY 3000',
       'CAPACITY 3300', 'CAPACITY 2650', 'CAPACITY 5000',
       'GENERATION TOTAL* 3.X G', 'CAPACITY 2800', 'CAPACITY 2500',
       'CAPACITY 3340', 'CAPACITY 4000', 'CAPACITY 3200', 'CAPACITY 3750',
       'CAPACITY 3800', 'CAPACITY 3500', 'CAPACITY 3174', 'CAPACITY 2600',
       'CAPACITY 3600', 'CAPACITY 2700', 'CAPACITY 1624', 'CAPACITY 2658',
       'CAPACITY 3020', 'CAPACITY 2900', 'CAPACITY 1810', 'CAPACITY 3080',
       'CAPACITY 2675', 'CAPACITY 2620'], dtype=object)

Some rows are misaligned: their 'CAPACITY' values appear in 'CARATTERISTICA_03' but belong in the next column. For every row whose 'CARATTERISTICA_03' contains the word 'CAPACITY', we:

- move the values from the 'CARATTERISTICA_04' column to the 'CARATTERISTICA_05' column;

- move the values from the 'CARATTERISTICA_03' column to the 'CARATTERISTICA_04' column.

In [22]:
# Rows that carry a 'CAPACITY ...' value in CARATTERISTICA_03 are shifted one
# column to the left; move their values one column to the right.
# Statement order matters: column 04 must be copied to 05 before it is overwritten.
# Column-level replaces are assigned back instead of using inplace=True, which is a
# chained assignment and does not update the frame under pandas Copy-on-Write.
smartphones['CARATTERISTICA_03'] = smartphones['CARATTERISTICA_03'].fillna('')  # lets str.contains return plain booleans
righe_errate = smartphones[smartphones['CARATTERISTICA_03'].str.contains('CAPACITY')]
shifted_rows = righe_errate.index

# 1) shifted rows: CARATTERISTICA_04 -> CARATTERISTICA_05
smartphones.loc[shifted_rows, 'CARATTERISTICA_05'] = smartphones.loc[shifted_rows, 'CARATTERISTICA_04']
# 2) blank out every value containing 'SIM' that is left in CARATTERISTICA_04 (applies to all rows)
smartphones['CARATTERISTICA_04'] = smartphones['CARATTERISTICA_04'].replace(r'.*SIM.*', np.nan, regex=True)
# 3) shifted rows: CARATTERISTICA_03 -> CARATTERISTICA_04
smartphones.loc[shifted_rows, 'CARATTERISTICA_04'] = smartphones.loc[shifted_rows, 'CARATTERISTICA_03']
# 4) CARATTERISTICA_03: drop the moved 'CAPACITY' values and turn the '' placeholders back into nulls
smartphones['CARATTERISTICA_03'] = (
    smartphones['CARATTERISTICA_03']
    .replace(r'.*CAPACITY.*', np.nan, regex=True)
    .replace('', np.nan)
)

In [23]:
smartphones['CARATTERISTICA_03'].unique()

array(['GENERATION TOTAL* 4.X G', 'GENERATION TOTAL* 5.X G', nan,
       'GENERATION TOTAL* 3.X G'], dtype=object)

In [24]:
# After the realignment CARATTERISTICA_03 only holds 'GENERATION TOTAL* ...' labels.
smartphones = smartphones.rename({'CARATTERISTICA_03': 'GENERATION'}, axis='columns')

In [25]:
# WE DEFINED A FUNCTION TO EXTRACT ONLY THE NUMBER (EX. 'GENERATION TOTAL* 4.X G' --> 4)

import re

def estrai_numero(testo):
    """Return the first number found in ``testo`` as a float.

    Example: 'GENERATION TOTAL* 4.X G' -> 4.0.

    Parameters
    ----------
    testo : object
        Value to parse; anything that is not a ``str`` yields NaN.

    Returns
    -------
    float
        The first integer or decimal number in the text, or ``np.nan`` when
        the text contains no digit.
    """
    if isinstance(testo, str):
        # Digits with an optional '.<digits>' part. A dot not followed by a digit
        # (as in '4.X') or a repeated dot is left out of the match, so float()
        # always receives a valid literal.
        match = re.search(r'\d+(?:\.\d+)?', testo)
        if match:
            return float(match.group())
    return np.nan

# Convert every 'GENERATION' label to its number, element by element.
generation_number = smartphones['GENERATION'].map(estrai_numero)
smartphones['GENERATION'] = generation_number


In [26]:
# Fill the missing generations with the most frequent value.
# Assigned back to the column because an inplace call on smartphones['GENERATION']
# is a chained assignment and does not update the frame under pandas Copy-on-Write.
most_frequent_generation = smartphones['GENERATION'].value_counts().index[0]
smartphones['GENERATION'] = smartphones['GENERATION'].fillna(most_frequent_generation)

## CAR_04

In [27]:
smartphones['CARATTERISTICA_04'].unique()

array(['CAPACITY 3110', 'CAPACITY 3046', 'CAPACITY 3969', 'CAPACITY 3190',
       'CAPACITY 2815', 'CAPACITY 2227', 'CAPACITY 3687', 'CAPACITY 3240',
       'CAPACITY 2438', 'CAPACITY 4352', nan, 'CAPACITY 3095',
       'CAPACITY 3279', 'CAPACITY 4323', 'CAPACITY 3200', 'CAPACITY 3349',
       'CAPACITY 4383', 'CAPACITY 2018', 'CAPACITY 1821', 'CAPACITY 2716',
       'CAPACITY 4000', 'CAPACITY 5000', 'CAPACITY 4100', 'CAPACITY 4130',
       'CAPACITY 3000', 'CAPACITY 3300', 'CAPACITY 2650', 'CAPACITY 2000',
       'CAPACITY 2800', 'CAPACITY 2200', 'CAPACITY 4385', 'CAPACITY 4575',
       'CAPACITY 5050', 'CAPACITY 2500', 'CAPACITY 3500', 'CAPACITY 3400',
       'CAPACITY 3100', 'CAPACITY 3700', 'CAPACITY 4500', 'CAPACITY 3340',
       'CAPACITY 5100', 'CAPACITY 5200', 'CAPACITY 4200', 'CAPACITY 4300',
       'CAPACITY 3750', 'CAPACITY 3650', 'CAPACITY 3020', 'CAPACITY 4020',
       'CAPACITY 3800', 'CAPACITY 2942', 'CAPACITY 3174', 'CAPACITY 4820',
       'CAPACITY 5260', 'CAPACITY 502

In [28]:
# For smartphones CARATTERISTICA_04 holds 'CAPACITY ...' labels: give it a descriptive name.
smartphones = smartphones.rename({'CARATTERISTICA_04': 'CAPACITY'}, axis='columns')

In [29]:
# Strip the 'CAPACITY ' label and keep the number
# (e.g. 'CAPACITY 4310' -> 4310.0); stored as float so missing values stay NaN.
capacity_text = smartphones['CAPACITY'].str.replace('CAPACITY ', '', regex=False)
smartphones['CAPACITY'] = capacity_text.astype(float)

In [30]:
# Fill the missing capacities with the most frequent value.
# Assigned back to the column because an inplace call on smartphones['CAPACITY']
# is a chained assignment and does not update the frame under pandas Copy-on-Write.
most_frequent_capacity = smartphones['CAPACITY'].value_counts().index[0]
smartphones['CAPACITY'] = smartphones['CAPACITY'].fillna(most_frequent_capacity)

## CAR_05

In [31]:
smartphones['CARATTERISTICA_05'].unique()

array(['SIM CARD 1SIM SLOT +ESIM', nan, 'SIM CARD 1SIM SLOT',
       'SIM CARD 2SIM SLOTS', 'SIM CARD 2SIM SLOTS+ESIM'], dtype=object)

In [32]:
# For smartphones CARATTERISTICA_05 holds 'SIM CARD ...' labels: give it a descriptive name.
smartphones = smartphones.rename({'CARATTERISTICA_05': 'NUMBER_OF_SIM'}, axis='columns')

In [33]:
# Replace each label with the total number of SIMs it describes (physical slots + eSIM).
# One dict-based replace assigned back to the column: inplace calls on
# smartphones['NUMBER_OF_SIM'] are chained assignments and do not update the frame
# under pandas Copy-on-Write. infer_objects() converts the resulting object
# column to a numeric dtype once only numbers and nulls are left.
sim_count_by_label = {
    'SIM CARD 1SIM SLOT +ESIM': 2,
    'SIM CARD 1SIM SLOT': 1,
    'SIM CARD 2SIM SLOTS': 2,
    'SIM CARD 2SIM SLOTS+ESIM': 3,
}
smartphones['NUMBER_OF_SIM'] = smartphones['NUMBER_OF_SIM'].replace(sim_count_by_label).infer_objects()

In [34]:
# Fill the missing SIM counts with the most frequent value.
# Assigned back to the column because an inplace call on smartphones['NUMBER_OF_SIM']
# is a chained assignment and does not update the frame under pandas Copy-on-Write.
most_frequent_sim_count = smartphones['NUMBER_OF_SIM'].value_counts().index[0]
smartphones['NUMBER_OF_SIM'] = smartphones['NUMBER_OF_SIM'].fillna(most_frequent_sim_count)

## CAR_06

In [35]:
smartphones['CARATTERISTICA_06'].unique()

array(['OPERATING SYST. IOS', nan, 'OPERATING SYST. ANDROID'],
      dtype=object)

In [36]:
# For smartphones CARATTERISTICA_06 holds 'OPERATING SYST. ...' labels: give it a descriptive name.
smartphones = smartphones.rename({'CARATTERISTICA_06': 'OPERATING_SYST'}, axis='columns')

In [37]:
# Remove the 'OPERATING SYST. ' label so that only 'IOS' or 'ANDROID' is left.
# regex=False makes the match literal on every pandas version; otherwise the '.'
# in the label is a regex wildcard wherever str.replace defaults to regex=True.
smartphones['OPERATING_SYST'] = smartphones['OPERATING_SYST'].str.replace('OPERATING SYST. ', '', regex=False)

In [38]:
# FILL NULL VALUES WITH 'IOS' IF 'ART_COD' STARTS WITH 'APL', OTHERWHISE 'ANDROID'

def sostituisci_nan(row):
    """Return the operating system of ``row``, guessing it when it is missing.

    ``row`` must expose 'OPERATING_SYST' and 'ART_COD' by key. A non-null
    'OPERATING_SYST' is returned unchanged; otherwise the result is 'IOS' when
    'ART_COD' starts with 'APL' and 'ANDROID' in every other case.
    """
    current_os = row['OPERATING_SYST']
    if not pd.isnull(current_os):
        return current_os
    return 'IOS' if row['ART_COD'].startswith('APL') else 'ANDROID'


# Evaluate the rule row by row and store the completed operating system.
operating_syst_filled = smartphones.apply(sostituisci_nan, axis='columns')
smartphones['OPERATING_SYST'] = operating_syst_filled

## CAR_07

In [39]:
# Per the original note, CARATTERISTICA_07 is empty for smartphones: remove it.
smartphones = smartphones.drop(columns='CARATTERISTICA_07')

In [484]:
#smartphones.to_csv('/Users/vincenzocamerlengo/Desktop/Data Science in Action/Unieuro Dataset-20240305/dati_luiss_2/smartphones.csv')

# CORE WEAR

In [40]:
# Rebuild the wearables subset from df1, attach the GfK characteristics by
# 'ITEM_ID' (left join keeps every promo row) and add the mean 'SCONTO_PERC'
# of each flyer ('CODICE_VOLANTINO') as 'SCONTO_PERC_MEDIO_VOLANTINO'.
core_wear = df1[df1['PRODUCT_GROUP']=='CORE WEARABLES'].merge(gfk, on='ITEM_ID', how='left')
discount_by_flyer = core_wear.groupby('CODICE_VOLANTINO')['SCONTO_PERC']
core_wear['SCONTO_PERC_MEDIO_VOLANTINO'] = discount_by_flyer.transform('mean')

## CAR_01

In [41]:
core_wear['CARATTERISTICA_01'].unique()

array(['CATEGORY SMARTWATCHES NO SIM', 'CATEGORY WRIST SPORT COMPUTER',
       nan, 'CATEGORY SMARTWATCHES SIM',
       'CATEGORY HEALTH AND FITNESS TRACKER'], dtype=object)

In [42]:
# For wearables CARATTERISTICA_01 holds 'CATEGORY ...' labels: give it a descriptive name.
core_wear = core_wear.rename({'CARATTERISTICA_01': 'CATEGORY'}, axis='columns')

In [43]:
# Remove the 'CATEGORY ' label (e.g. 'CATEGORY SMARTWATCHES SIM' -> 'SMARTWATCHES SIM').
category_text = core_wear['CATEGORY'].str.replace('CATEGORY ', '', regex=False)
core_wear['CATEGORY'] = category_text

In [44]:
# Missing categories become their own class, 'NOT DEFINED'.
# Assigned back to the column because an inplace call on core_wear['CATEGORY']
# is a chained assignment and does not update the frame under pandas Copy-on-Write.
core_wear['CATEGORY'] = core_wear['CATEGORY'].fillna('NOT DEFINED')

## CAR_02

In [45]:
core_wear['CARATTERISTICA_02'].unique()

array(['DISPLAY TECHNOL OLED-DISPLAY', 'DISPLAY TECHNOL LCD-DISPLAY', nan,
       'DISPLAY TECHNOL ELECTRONIC INK'], dtype=object)

In [46]:
# For wearables CARATTERISTICA_02 holds 'DISPLAY TECHNOL ...' labels: give it a descriptive name.
core_wear = core_wear.rename({'CARATTERISTICA_02': 'DISPLAY_QUALITY'}, axis='columns')

In [47]:
# Remove the 'DISPLAY TECHNOL ' label (e.g. 'DISPLAY TECHNOL OLED-DISPLAY' -> 'OLED-DISPLAY').
display_technology = core_wear['DISPLAY_QUALITY'].str.replace('DISPLAY TECHNOL ', '', regex=False)
core_wear['DISPLAY_QUALITY'] = display_technology

In [48]:
# Missing display technologies become their own class, 'NOT DEFINED'.
# Assigned back to the column because an inplace call on core_wear['DISPLAY_QUALITY']
# is a chained assignment and does not update the frame under pandas Copy-on-Write.
core_wear['DISPLAY_QUALITY'] = core_wear['DISPLAY_QUALITY'].fillna('NOT DEFINED')

## CAR_03

In [49]:
core_wear['CARATTERISTICA_03'].unique()

array(['DISPLAY SIZE 1.47', 'DISPLAY SIZE 1.69', 'DISPLAY SIZE 1.28',
       'DISPLAY SIZE 1.43', 'DISPLAY SIZE 1.39', nan, 'DISPLAY SIZE 1.45',
       'DISPLAY SIZE 1.2', 'DISPLAY SIZE 1.65', 'DISPLAY SIZE 1.55',
       'DISPLAY SIZE 1.75', 'DISPLAY SIZE 1.3', 'DISPLAY SIZE 1.57',
       'DISPLAY SIZE 1.78', 'DISPLAY SIZE 1.5', 'DISPLAY SIZE 1.9',
       'DISPLAY SIZE 1.92', 'DISPLAY SIZE UNKNOWN', 'DISPLAY SIZE 1.54',
       'DISPLAY SIZE 1.04', 'DISPLAY SIZE 1.58', 'DISPLAY SIZE 1.32',
       'DISPLAY SIZE 1.38', 'DISPLAY SIZE 0.73', 'DISPLAY SIZE 0.95',
       'DISPLAY SIZE 1.74', 'DISPLAY SIZE 1.64', 'DISPLAY SIZE 1.1',
       'DISPLAY SIZE 1.83', 'DISPLAY SIZE 1.4', 'DISPLAY SIZE 1.19',
       'DISPLAY SIZE 1.62', 'DISPLAY SIZE 0.9', 'DISPLAY SIZE 1.41',
       'DISPLAY SIZE 0.74', 'DISPLAY SIZE 0.42', 'DISPLAY SIZE 0.78',
       'DISPLAY SIZE 1.56'], dtype=object)

In [50]:
# For wearables CARATTERISTICA_03 holds 'DISPLAY SIZE ...' labels: give it a descriptive name.
core_wear = core_wear.rename({'CARATTERISTICA_03': 'DISPLAY_SIZE'}, axis='columns')

In [51]:
# Treat 'DISPLAY SIZE UNKNOWN' as a missing value so the column can become numeric.
# Assigned back to the column because an inplace call on core_wear['DISPLAY_SIZE']
# is a chained assignment and does not update the frame under pandas Copy-on-Write.
core_wear['DISPLAY_SIZE'] = core_wear['DISPLAY_SIZE'].replace('DISPLAY SIZE UNKNOWN', np.nan)

In [52]:
# Strip the 'DISPLAY SIZE ' label and keep the number as float
# (e.g. 'DISPLAY SIZE 1.74' -> 1.74); missing values stay NaN.
display_size_text = core_wear['DISPLAY_SIZE'].str.replace('DISPLAY SIZE ', '', regex=False)
core_wear['DISPLAY_SIZE'] = display_size_text.astype(float)

In [53]:
# Fill the missing display sizes with the most frequent value.
# Assigned back to the column because an inplace call on core_wear['DISPLAY_SIZE']
# is a chained assignment and does not update the frame under pandas Copy-on-Write.
most_frequent_display_size = core_wear['DISPLAY_SIZE'].value_counts().index[0]
core_wear['DISPLAY_SIZE'] = core_wear['DISPLAY_SIZE'].fillna(most_frequent_display_size)

## CAR_04

In [54]:
core_wear['CARATTERISTICA_04'].unique()

array(['WIFI NO', nan, 'WIFI YES'], dtype=object)

In [55]:
# For wearables CARATTERISTICA_04 holds 'WIFI NO' / 'WIFI YES': give it a descriptive name.
core_wear = core_wear.rename({'CARATTERISTICA_04': 'WIFI'}, axis='columns')

In [56]:
# Encode Wi-Fi as a flag: 1 for 'WIFI YES', 0 for 'WIFI NO' and for missing values.
# Assigned back to the column because inplace calls on core_wear['WIFI'] are
# chained assignments and do not update the frame under pandas Copy-on-Write.
# infer_objects() converts the resulting all-numeric object column to a numeric dtype.
core_wear['WIFI'] = (
    core_wear['WIFI']
    .fillna(0)
    .replace({'WIFI NO': 0, 'WIFI YES': 1})
    .infer_objects()
)

## CAR_05

In [57]:
core_wear['CARATTERISTICA_05'].unique()

array(['BLUETOOTH WITH BLUETOOTH', nan], dtype=object)

In [58]:
# For wearables CARATTERISTICA_05 holds 'BLUETOOTH WITH BLUETOOTH' or nothing: give it a descriptive name.
core_wear = core_wear.rename({'CARATTERISTICA_05': 'BLUETOOTH'}, axis='columns')

In [59]:
# Encode Bluetooth as a flag: 1 for 'BLUETOOTH WITH BLUETOOTH', 0 for missing values.
# Assigned back to the column because inplace calls on core_wear['BLUETOOTH'] are
# chained assignments and do not update the frame under pandas Copy-on-Write.
# infer_objects() converts the resulting all-numeric object column to a numeric dtype.
core_wear['BLUETOOTH'] = (
    core_wear['BLUETOOTH']
    .fillna(0)
    .replace({'BLUETOOTH WITH BLUETOOTH': 1})
    .infer_objects()
)

## CAR_06 & CAR_07

In [60]:
# Per the original note, these two columns are empty for wearables: remove them.
unused_characteristics = ['CARATTERISTICA_06', 'CARATTERISTICA_07']
core_wear = core_wear.drop(columns=unused_characteristics)

In [507]:
#core_wear.to_csv('/Users/vincenzocamerlengo/Desktop/Data Science in Action/Unieuro Dataset-20240305/dati_luiss_2/core_wear.csv')

# WASH

In [61]:
# Rebuild the washing-machine subset from df1, attach the GfK characteristics by
# 'ITEM_ID' (left join keeps every promo row) and add the mean 'SCONTO_PERC'
# of each flyer ('CODICE_VOLANTINO') as 'SCONTO_PERC_MEDIO_VOLANTINO'.
wash = df1[df1['PRODUCT_GROUP']=='WASHINGMACHINES FREESTANDING'].merge(gfk, on='ITEM_ID', how='left')
discount_by_flyer = wash.groupby('CODICE_VOLANTINO')['SCONTO_PERC']
wash['SCONTO_PERC_MEDIO_VOLANTINO'] = discount_by_flyer.transform('mean')

## CAR_01

In [62]:
wash['CARATTERISTICA_01'].unique()

array(['MAIN TYPES FULLAUTOMATIC', nan, 'MAIN TYPES WASHDRYER',
       'CONSTR.2 BUILT IN/UNDER', 'TYPE OF HOOD CHIMNEY/DECORAT'],
      dtype=object)

In [63]:
# Drop the rows whose CARATTERISTICA_01 is 'TYPE OF HOOD CHIMNEY/DECORAT' or
# 'CONSTR.2 BUILT IN/UNDER': they are treated as data errors.
# Rows with a missing value are kept.
wrong_type_labels = ['TYPE OF HOOD CHIMNEY/DECORAT', 'CONSTR.2 BUILT IN/UNDER']
wash = wash[~wash['CARATTERISTICA_01'].isin(wrong_type_labels)]

In [64]:
# For washing machines CARATTERISTICA_01 holds 'MAIN TYPES ...' labels: give it a descriptive name.
wash = wash.rename({'CARATTERISTICA_01': 'TYPE'}, axis='columns')

In [65]:
# Encode the machine type as a flag (1 = 'MAIN TYPES FULLAUTOMATIC',
# 0 = 'MAIN TYPES WASHDRYER'), then fill the missing values with the most frequent flag.
# Assigned back to the column because inplace calls on wash['TYPE'] are chained
# assignments and do not update the frame under pandas Copy-on-Write.
# infer_objects() converts the encoded object column to a numeric dtype.
type_flag = wash['TYPE'].replace({'MAIN TYPES FULLAUTOMATIC': 1, 'MAIN TYPES WASHDRYER': 0}).infer_objects()
wash['TYPE'] = type_flag.fillna(type_flag.value_counts().index[0])

## CAR_02

In [66]:
wash['CARATTERISTICA_02'].unique()

array(['LOADING FRONTLOADING', nan, 'LOADING TOPLOADING'], dtype=object)

In [67]:
# For washing machines CARATTERISTICA_02 holds 'LOADING FRONTLOADING' / 'LOADING TOPLOADING'.
wash = wash.rename({'CARATTERISTICA_02': 'TOPLOADING'}, axis='columns')

In [68]:
# Encode the loading side as a flag (1 = 'LOADING TOPLOADING',
# 0 = 'LOADING FRONTLOADING'), then fill the missing values with the most frequent flag.
# Assigned back to the column because inplace calls on wash['TOPLOADING'] are
# chained assignments and do not update the frame under pandas Copy-on-Write.
# infer_objects() converts the encoded object column to a numeric dtype.
toploading_flag = wash['TOPLOADING'].replace({'LOADING TOPLOADING': 1, 'LOADING FRONTLOADING': 0}).infer_objects()
wash['TOPLOADING'] = toploading_flag.fillna(toploading_flag.value_counts().index[0])

## CAR_03

In [69]:
wash['CARATTERISTICA_03'].unique()

array(['DEPTH IN CM STANDARD > 48 CM', nan, 'DEPTH IN CM SLIM <= 48 CM'],
      dtype=object)

In [70]:
# For washing machines CARATTERISTICA_03 holds the depth class (standard > 48 cm vs slim <= 48 cm).
wash = wash.rename({'CARATTERISTICA_03': 'DEPTH_CM>48'}, axis='columns')

In [71]:
# Encode the depth class as a flag (1 = 'DEPTH IN CM STANDARD > 48 CM',
# 0 = 'DEPTH IN CM SLIM <= 48 CM'), then fill the missing values with the most frequent flag.
# Assigned back to the column because inplace calls on wash['DEPTH_CM>48'] are
# chained assignments and do not update the frame under pandas Copy-on-Write.
# infer_objects() converts the encoded object column to a numeric dtype.
depth_flag = wash['DEPTH_CM>48'].replace(
    {'DEPTH IN CM STANDARD > 48 CM': 1, 'DEPTH IN CM SLIM <= 48 CM': 0}
).infer_objects()
wash['DEPTH_CM>48'] = depth_flag.fillna(depth_flag.value_counts().index[0])

## CAR_04

In [72]:
wash['CARATTERISTICA_04'].unique()

array(['21EN.EFF.CLASS A', '21EN.EFF.CLASS D', nan, '21EN.EFF.CLASS N.A.',
       '21EN.EFF.CLASS C', 'ENERGY LABEL EU A+++',
       '21EN.EFF.CLASS UNKNOWN', '21EN.EFF.CLASS B', '21EN.EFF.CLASS E',
       'ENERGY LABEL EU A', '21EN.EFF.CLASS F', 'ENERGY LABEL EU A++'],
      dtype=object)

In [73]:
# For washing machines CARATTERISTICA_04 holds the energy-label strings: give it a descriptive name.
wash = wash.rename({'CARATTERISTICA_04': 'ENERGY_CLASS'}, axis='columns')

In [74]:
# '21EN.EFF.CLASS N.A.' and '21EN.EFF.CLASS UNKNOWN' carry no information: treat them as missing.
# Assigned back to the column because inplace calls on wash['ENERGY_CLASS'] are
# chained assignments and do not update the frame under pandas Copy-on-Write.
uninformative_labels = ['21EN.EFF.CLASS N.A.', '21EN.EFF.CLASS UNKNOWN']
wash['ENERGY_CLASS'] = wash['ENERGY_CLASS'].replace(uninformative_labels, np.nan)

In [75]:
wash['ENERGY_CLASS'].unique()

array(['21EN.EFF.CLASS A', '21EN.EFF.CLASS D', nan, '21EN.EFF.CLASS C',
       'ENERGY LABEL EU A+++', '21EN.EFF.CLASS B', '21EN.EFF.CLASS E',
       'ENERGY LABEL EU A', '21EN.EFF.CLASS F', 'ENERGY LABEL EU A++'],
      dtype=object)

In [76]:
# Replace each energy label with an ordinal score (higher = more efficient).
# '21EN.EFF.CLASS' labels run from F = 1 up to A = 6; the 'ENERGY LABEL EU'
# labels are placed on the same scale as A = 3, A++ = 5, A+++ = 6.
# One dict-based replace assigned back to the column: inplace calls on
# wash['ENERGY_CLASS'] are chained assignments and do not update the frame under
# pandas Copy-on-Write. infer_objects() converts the encoded object column to a numeric dtype.
energy_score_by_label = {
    '21EN.EFF.CLASS F': 1,
    '21EN.EFF.CLASS E': 2,
    '21EN.EFF.CLASS D': 3,
    '21EN.EFF.CLASS C': 4,
    '21EN.EFF.CLASS B': 5,
    '21EN.EFF.CLASS A': 6,
    'ENERGY LABEL EU A': 3,
    'ENERGY LABEL EU A++': 5,
    'ENERGY LABEL EU A+++': 6,
}
wash['ENERGY_CLASS'] = wash['ENERGY_CLASS'].replace(energy_score_by_label).infer_objects()

In [77]:
# Fill the missing energy classes with the most frequent value.
# Assigned back to the column because an inplace call on wash['ENERGY_CLASS']
# is a chained assignment and does not update the frame under pandas Copy-on-Write.
most_frequent_energy_class = wash['ENERGY_CLASS'].value_counts().index[0]
wash['ENERGY_CLASS'] = wash['ENERGY_CLASS'].fillna(most_frequent_energy_class)

## CAR_05

In [78]:
wash['CARATTERISTICA_05'].unique()

array(['LOADING KG 10', 'LOADING KG 8', nan, 'LOADING KG 9',
       'LOADING KG 7', 'LOADING KG 6', 'LOADING KG 8.5', 'LOADING KG 12',
       'LOADING KG 10.5', 'LOADING KG 11', 'LOADING KG 6.5',
       'LOADING KG 14', 'LOADING KG 5'], dtype=object)

In [79]:
# For washing machines CARATTERISTICA_05 holds 'LOADING KG ...' labels: give it a descriptive name.
wash = wash.rename({'CARATTERISTICA_05': 'LOADING_KG'}, axis='columns')

In [80]:
# Strip the 'LOADING KG ' label and keep the number as float
# (e.g. 'LOADING KG 8.5' -> 8.5); missing values stay NaN.
loading_kg_text = wash['LOADING_KG'].str.replace('LOADING KG ', '', regex=False)
wash['LOADING_KG'] = loading_kg_text.astype(float)

In [81]:
# Fill the missing load capacities with the most frequent value.
# Assigned back to the column because an inplace call on wash['LOADING_KG']
# is a chained assignment and does not update the frame under pandas Copy-on-Write.
most_frequent_loading_kg = wash['LOADING_KG'].value_counts().index[0]
wash['LOADING_KG'] = wash['LOADING_KG'].fillna(most_frequent_loading_kg)

## CAR_06

In [82]:
wash['CARATTERISTICA_06'].unique()

array(['SMART CONNECT NO', nan, 'SMART CONNECT SMART CONNECT'],
      dtype=object)

In [83]:
# For washing machines CARATTERISTICA_06 holds the 'SMART CONNECT ...' labels: give it a descriptive name.
wash = wash.rename({'CARATTERISTICA_06': 'SMART_CONNECT'}, axis='columns')

In [84]:
# Encode smart connectivity as a flag: 1 for 'SMART CONNECT SMART CONNECT',
# 0 for 'SMART CONNECT NO' and for missing values.
# Assigned back to the column because inplace calls on wash['SMART_CONNECT'] are
# chained assignments and do not update the frame under pandas Copy-on-Write.
# infer_objects() converts the resulting all-numeric object column to a numeric dtype.
wash['SMART_CONNECT'] = (
    wash['SMART_CONNECT']
    .fillna(0)
    .replace({'SMART CONNECT NO': 0, 'SMART CONNECT SMART CONNECT': 1})
    .infer_objects()
)

## CAR_07

In [85]:
# Per the original note, CARATTERISTICA_07 is empty for washing machines: remove it.
wash = wash.drop(columns='CARATTERISTICA_07')

In [533]:
#wash.to_csv('/Users/vincenzocamerlengo/Desktop/Data Science in Action/Unieuro Dataset-20240305/dati_luiss_2/wash.csv')

# PC

In [7]:
# Rebuild the mobile-computing subset from df1, attach the GfK characteristics by
# 'ITEM_ID' (left join keeps every promo row) and add the mean 'SCONTO_PERC'
# of each flyer ('CODICE_VOLANTINO') as 'SCONTO_PERC_MEDIO_VOLANTINO'.
pc = df1[df1['PRODUCT_GROUP']=='MOBILE COMPUTING'].merge(gfk, on='ITEM_ID', how='left')
discount_by_flyer = pc.groupby('CODICE_VOLANTINO')['SCONTO_PERC']
pc['SCONTO_PERC_MEDIO_VOLANTINO'] = discount_by_flyer.transform('mean')

## CAR_01

In [8]:
pc['CARATTERISTICA_01'].unique()

array(['SEGMENTS NOTEBOOK', 'GPU MODEL INTEL UHD GPU',
       'GPU MODEL RAD R5 APU', 'GPU MODEL RAD 610M',
       'GPU MODEL UHD GRAPH. 605', 'GPU MODEL RAD VEGA 8',
       'GPU MODEL RAD VEGA 3', 'GPU MODEL GEF MX130',
       'GPU MODEL UHD GRAPH. 620', 'GPU MODEL HD GRAPH. 620',
       'GPU MODEL INTEL IRIS PLUS', 'GPU MODEL RAD',
       'GPU MODEL INTEL IRIS XE', 'GPU MODEL GEF MX450',
       'GPU MODEL GEF GTX1650', 'GPU MODEL GEF RTX3050',
       'GPU MODEL GEF RTX2060', 'GPU MODEL GEF RTX3060',
       'GPU MODEL UHD GRAPH. 600', nan, 'GPU MODEL APPLE 7CORE GPU',
       'GPU MODEL APPLE 8CORE GPU', 'GPU MODEL APPLE 10COREGPU',
       'GPU MODEL IRIS PLUS 655', 'GPU MODEL HD GRAPH. 6000',
       'GPU MODEL UHD GRAPH. 617', 'GPU MODEL IRIS PLUS 645',
       'GPU MODEL RAD PRO 5300M', 'GPU MODEL HD GRAPH. 500',
       'GPU MODEL HD GRAPH. 400', 'GPU MODEL GEF MX230',
       'GPU MODEL GEF MX110', 'GPU MODEL GEF MX330',
       'GPU MODEL RAD R4 APU', 'GPU MODEL HD GRAPH. 510',
      

In [9]:
# Keep the rows whose CARATTERISTICA_01 contains the word 'GPU'; rows with a
# missing value are kept as well (na=True). Then renumber the rows from 0 so
# that label-based and position-based access agree.
mentions_gpu = pc['CARATTERISTICA_01'].str.contains('GPU', na=True)
pc = pc[mentions_gpu].reset_index(drop=True)

In [10]:
# Remove the 'GPU MODEL ' label (e.g. 'GPU MODEL GEF MX130' -> 'GEF MX130').
gpu_text = pc['CARATTERISTICA_01'].str.replace('GPU MODEL ', '', regex=False)
pc['CARATTERISTICA_01'] = gpu_text

In [11]:
# Missing GPU models become their own class, 'NOT DEFINED'.
# Assigned back to the column because an inplace call on pc['CARATTERISTICA_01']
# is a chained assignment and does not update the frame under pandas Copy-on-Write.
pc['CARATTERISTICA_01'] = pc['CARATTERISTICA_01'].fillna('NOT DEFINED')

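For this encoding we group the following GPU families under INTEL: UHD, HD, IRIS, MALI-G72, ADRENO506. (Note: Mali and Adreno are ARM and Qualcomm GPU designs, not Intel products; they are grouped with INTEL here only as a simplification. TODO: confirm this is intended.)

With the following loop, we add the word 'INTEL' in front of the GPU models that belong to these families.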

In [12]:
# Prefix 'INTEL ' to every GPU label that contains one of the families below and
# does not already contain 'INTEL'.
# The column is addressed by name instead of by position (it was read as
# pc.iloc[i, 23]), so the cell keeps working if columns are added or reordered,
# and the whole column is processed at once instead of row by row.
intel_families = ('UHD', 'HD', 'IRIS', 'MALI-G72', 'ADRENO506')
gpu_label = pc['CARATTERISTICA_01']

in_intel_family = pd.Series(False, index=pc.index)
for family in intel_families:
    # Literal, case-sensitive substring test; missing labels count as "no match".
    in_intel_family |= gpu_label.str.contains(family, regex=False, na=False)

needs_prefix = in_intel_family & ~gpu_label.str.contains('INTEL', regex=False, na=False)
pc.loc[needs_prefix, 'CARATTERISTICA_01'] = 'INTEL ' + gpu_label[needs_prefix]

In [13]:
# For mobile computing CARATTERISTICA_01 now holds the cleaned GPU label: give it a descriptive name.
pc = pc.rename({'CARATTERISTICA_01': 'GPU_MODEL'}, axis='columns')

In [14]:
pc['GPU_MODEL'].value_counts()

GPU_MODEL
INTEL IRIS XE          225
INTEL UHD GPU          203
RAD                    133
APPLE 7CORE GPU        132
APPLE 8CORE GPU         97
                      ... 
INTEL ADRENO506          1
INTEL HD GRAPH. 505      1
GEF 940MX                1
RAD VEGA 6               1
INTEL HD GRAPH. 510      1
Name: count, Length: 73, dtype: int64

In [15]:
pc['GPU_MODEL'].unique()

array(['INTEL UHD GPU', 'RAD R5 APU', 'RAD 610M', 'INTEL UHD GRAPH. 605',
       'RAD VEGA 8', 'RAD VEGA 3', 'GEF MX130', 'INTEL UHD GRAPH. 620',
       'INTEL HD GRAPH. 620', 'INTEL IRIS PLUS', 'RAD', 'INTEL IRIS XE',
       'GEF MX450', 'GEF GTX1650', 'GEF RTX3050', 'GEF RTX2060',
       'GEF RTX3060', 'INTEL UHD GRAPH. 600', 'NOT DEFINED',
       'APPLE 7CORE GPU', 'APPLE 8CORE GPU', 'APPLE 10COREGPU',
       'INTEL IRIS PLUS 655', 'INTEL HD GRAPH. 6000',
       'INTEL UHD GRAPH. 617', 'INTEL IRIS PLUS 645', 'RAD PRO 5300M',
       'INTEL HD GRAPH. 500', 'INTEL HD GRAPH. 400', 'GEF MX230',
       'GEF MX110', 'GEF MX330', 'RAD R4 APU', 'INTEL HD GRAPH. 510',
       'RAD R7 APU', 'GEF GTX1050TI', 'GEF GTX1050', 'GEF MX350',
       'INTEL HD GRAPH. 610', 'RAD R5 M420', 'GEF 930MX',
       'INTEL ADRENO506', 'GEF MX150', 'GEF GTX1650TI', 'RAD 520',
       'RAD 530', 'GEF 940MX', 'GEF MX250', 'RAD VEGA 6', 'GEF GTX1660TI',
       'GEF GTX1660', 'GEF RTX2070 S.M', 'RAD RX VEGA 10', 'RAD 

In [16]:
# One-hot encode the GPU vendor into five VERSION_* columns.
# Each model gets the FIRST label that matches, in this order: exact
# 'NOT DEFINED', then a name containing 'INTEL', 'APPLE', 'RAD' or 'GEF'.
# The column is addressed by name instead of by position (it was read as
# pc.iloc[i, 23]). A model that matches none of the labels gets 0 in all five
# columns; the row-by-row version appended nothing for such a model, which made
# the flag lists shorter than the frame and the column assignment fail.
gpu_vendor_labels = ['NOT_DEFINED', 'INTEL', 'APPLE', 'RAD', 'GEF']
gpu_model = pc['GPU_MODEL']

# np.select returns, for every row, the label of the first condition that is True.
gpu_vendor = np.select(
    [
        gpu_model.eq('NOT DEFINED'),
        gpu_model.str.contains('INTEL', regex=False, na=False),
        gpu_model.str.contains('APPLE', regex=False, na=False),
        gpu_model.str.contains('RAD', regex=False, na=False),
        gpu_model.str.contains('GEF', regex=False, na=False),
    ],
    gpu_vendor_labels,
    default='',
)

# The flag lists keep their original names in case later cells read them.
NOT_DEFINED, INTEL, APPLE, RAD, GEF = (
    (gpu_vendor == label).astype('int64').tolist() for label in gpu_vendor_labels
)

pc['VERSION_NOT_DEFINED'] = NOT_DEFINED
pc['VERSION_INTEL'] = INTEL
pc['VERSION_APPLE'] = APPLE
pc['VERSION_RAD'] = RAD
pc['VERSION_GEF'] = GEF

In [17]:
pc.head()

Unnamed: 0,CODICE_VOLANTINO,DATA_INIZIO,DATA_FINE,NOME_CAMPAGNA,ART_COD,PREZZO_PROMO,PREZZO_LISTINO,STOCK_PZ,TIPOLOGIA_PRODOTTO,SCONTO_PERC,BRAND,ITEM_ID,KPI_1,KPI_2,KPI_3,KPI_4,KPI_5,PRODUCT_GROUP,QTA,FATTURATO,QTA_storico,FATTURATO_storico,DURATA_VOLANTINO_IN_GIORNI,GPU_MODEL,CARATTERISTICA_02,CARATTERISTICA_05,CARATTERISTICA_04,CARATTERISTICA_03,CARATTERISTICA_06,CARATTERISTICA_07,SCONTO_PERC_MEDIO_VOLANTINO,VERSION_NOT_DEFINED,VERSION_INTEL,VERSION_APPLE,VERSION_RAD,VERSION_GEF
0,22GC,2022-07-11,2022-07-21,UN MARE DI SCONTI A TASSO ZERO,ACEA11532C64E,249.9,359.9,2028.0,Principale,31.0,ACER,172229207.0,0.656697,0.0,0.0,0.32269,0.0,MOBILE COMPUTING,188.0,38403.3,0,0.0,10,INTEL UHD GPU,PROCESSOR CELERON N,OS VERSION WIN11 HOME S,STORAGE IN GB 128,RAM IN GB 4,DISPLAY SIZE 15.6,CONVERTIBLE NO,25.37037,0,1,0,0,0
1,23DB,2023-04-28,2023-05-11,PASSIONE CASA,ACEA11532C64E,269.9,349.9,1217.0,Principale,22.86,ACER,172229207.0,0.528941,0.489399,0.633822,0.322895,0.123499,MOBILE COMPUTING,203.0,44754.8,854,190473.36,13,INTEL UHD GPU,PROCESSOR CELERON N,OS VERSION WIN11 HOME S,STORAGE IN GB 128,RAM IN GB 4,DISPLAY SIZE 15.6,CONVERTIBLE NO,25.475714,0,1,0,0,0
2,23LB,2023-10-16,2023-10-22,BASKET,ACEA11532C64E,269.9,349.9,1917.0,Principale,22.86,ACER,172229207.0,0.914982,0.547895,0.213179,0.207078,1.0,MOBILE COMPUTING,251.0,55181.62,119,27877.12,6,INTEL UHD GPU,PROCESSOR CELERON N,OS VERSION WIN11 HOME S,STORAGE IN GB 128,RAM IN GB 4,DISPLAY SIZE 15.6,CONVERTIBLE NO,25.476111,0,1,0,0,0
3,23CB,2023-03-17,2023-03-30,BASKET MARZO,ACEA11532C64E,249.9,349.9,438.0,Principale,28.58,ACER,172229207.0,0.610293,0.443426,0.345346,0.230252,1.0,MOBILE COMPUTING,642.0,143062.52,180,44349.29,13,INTEL UHD GPU,PROCESSOR CELERON N,OS VERSION WIN11 HOME S,STORAGE IN GB 128,RAM IN GB 4,DISPLAY SIZE 15.6,CONVERTIBLE NO,27.353,0,1,0,0,0
4,23CD,2023-03-31,2023-04-13,BASKET SCONTI APRILE,ACEA11532C64E,269.9,349.9,2175.0,Principale,22.86,ACER,172229207.0,0.611496,0.489399,0.295215,0.322895,0.123499,MOBILE COMPUTING,393.0,87141.3,117,29500.73,13,INTEL UHD GPU,PROCESSOR CELERON N,OS VERSION WIN11 HOME S,STORAGE IN GB 128,RAM IN GB 4,DISPLAY SIZE 15.6,CONVERTIBLE NO,22.851667,0,1,0,0,0


FOCUS ON INTEL

high_quality_models = ['IRIS', 'ADRENO']                                                -->3

medium_quality_models = ['UHD', 'GRAPH. 6']                                             -->2

low_quality_models = ['HD', 'GRAPH. 5', 'GRAPH. 4', 'ADRENO506', 'MALI-G72']            -->1

In [18]:
# Distinct GPU models whose name contains 'INTEL'.
pc.loc[pc['GPU_MODEL'].str.contains('INTEL', na=False), 'GPU_MODEL'].unique()

array(['INTEL UHD GPU', 'INTEL UHD GRAPH. 605', 'INTEL UHD GRAPH. 620',
       'INTEL HD GRAPH. 620', 'INTEL IRIS PLUS', 'INTEL IRIS XE',
       'INTEL UHD GRAPH. 600', 'INTEL IRIS PLUS 655',
       'INTEL HD GRAPH. 6000', 'INTEL UHD GRAPH. 617',
       'INTEL IRIS PLUS 645', 'INTEL HD GRAPH. 500',
       'INTEL HD GRAPH. 400', 'INTEL HD GRAPH. 510',
       'INTEL HD GRAPH. 610', 'INTEL ADRENO506', 'INTEL HD GRAPH. 505',
       'INTEL UHD GRAPH. 610', 'INTEL MALI-G72 MP3',
       'INTEL HD GRAPH. 615', 'INTEL HD GRAPH. 520', 'INTEL HD GPU'],
      dtype=object)

FOCUS ON APPLE

high_quality_models = ['16', '14']                                                -->3

medium_quality_models = ['10']                                             -->2

low_quality_models = ['8', '7']            -->1

In [19]:
# Distinct GPU models whose name contains 'APPLE'.
pc.loc[pc['GPU_MODEL'].str.contains('APPLE', na=False), 'GPU_MODEL'].unique()

array(['APPLE 7CORE GPU', 'APPLE 8CORE GPU', 'APPLE 10COREGPU',
       'APPLE 16COREGPU', 'APPLE 14COREGPU'], dtype=object)

FOCUS ON RAD

high_quality_models = ['VEGA']                                                -->3

medium_quality_models = ['PRO', '610', 'R7', 'R5']                                             -->2

low_quality_models = ['R3', 'R2']            -->1

In [20]:
# Distinct GPU models whose name contains 'RAD'.
pc.loc[pc['GPU_MODEL'].str.contains('RAD', na=False), 'GPU_MODEL'].unique()

array(['RAD R5 APU', 'RAD 610M', 'RAD VEGA 8', 'RAD VEGA 3', 'RAD',
       'RAD PRO 5300M', 'RAD R4 APU', 'RAD R7 APU', 'RAD R5 M420',
       'RAD 520', 'RAD 530', 'RAD VEGA 6', 'RAD RX VEGA 10', 'RAD R3 APU',
       'RAD 535', 'RAD VEGA', 'RAD R2 APU'], dtype=object)

FOCUS ON GEF

high_quality_models = ['RTX']                                                -->3

medium_quality_models = ['GTX']                                             -->2

low_quality_models = ['MX']            -->1

In [21]:
# Distinct GPU models whose name contains 'GEF'.
pc.loc[pc['GPU_MODEL'].str.contains('GEF', na=False), 'GPU_MODEL'].unique()

array(['GEF MX130', 'GEF MX450', 'GEF GTX1650', 'GEF RTX3050',
       'GEF RTX2060', 'GEF RTX3060', 'GEF MX230', 'GEF MX110',
       'GEF MX330', 'GEF GTX1050TI', 'GEF GTX1050', 'GEF MX350',
       'GEF 930MX', 'GEF MX150', 'GEF GTX1650TI', 'GEF 940MX',
       'GEF MX250', 'GEF GTX1660TI', 'GEF GTX1660', 'GEF RTX2070 S.M',
       'GEF GTX1660TI M', 'GEF RTX3070', 'GEF RTX3050 TI',
       'GEF GTX1650MAXQ', 'GEF GTX1060', 'GEF RTX3070 TI', 'GEF RTX4050',
       'GEF RTX4060'], dtype=object)

In [22]:
# Create the column 'QUALITY_VERSION': a score from 1 (low) to 3 (high) derived
# from the GPU model name.
# The column is addressed by name instead of by position (it was read as
# pc.iloc[i, 23]), and the per-vendor rules live in one table instead of
# four copies of the same if/elif ladder.

# vendor keyword -> (substrings that mean score 3, substrings that mean score 2).
# Vendors are tested in this order and only the first matching vendor is used.
_GPU_QUALITY_RULES = {
    'INTEL': (('IRIS', 'ADRENO'), ('UHD', 'GRAPH. 6')),
    'APPLE': (('16', '14'), ('10',)),
    'RAD': (('VEGA',), ('PRO', '610', 'R7', 'R5')),
    'GEF': (('RTX',), ('GTX',)),
}


def _gpu_quality(model):
    """Return the 1-3 quality score of one GPU model name.

    The high-quality substrings are checked before the medium ones. A model of
    a known vendor that matches neither list scores 1, and so does a model of
    no known vendor (for example 'NOT DEFINED').
    """
    for vendor, (high_markers, medium_markers) in _GPU_QUALITY_RULES.items():
        if vendor in model:
            if any(marker in model for marker in high_markers):
                return 3
            if any(marker in model for marker in medium_markers):
                return 2
            return 1
    return 1  # the value is 'NOT DEFINED' (or an unknown vendor)


QUALITY_VERSION = [_gpu_quality(model) for model in pc['GPU_MODEL']]

pc['QUALITY_VERSION'] = QUALITY_VERSION
    


In [23]:
# GPU_MODEL is now represented by the VERSION_* flags and QUALITY_VERSION: drop the raw label.
pc = pc.drop(columns='GPU_MODEL')

In [24]:
pc.head()

Unnamed: 0,CODICE_VOLANTINO,DATA_INIZIO,DATA_FINE,NOME_CAMPAGNA,ART_COD,PREZZO_PROMO,PREZZO_LISTINO,STOCK_PZ,TIPOLOGIA_PRODOTTO,SCONTO_PERC,BRAND,ITEM_ID,KPI_1,KPI_2,KPI_3,KPI_4,KPI_5,PRODUCT_GROUP,QTA,FATTURATO,QTA_storico,FATTURATO_storico,DURATA_VOLANTINO_IN_GIORNI,CARATTERISTICA_02,CARATTERISTICA_05,CARATTERISTICA_04,CARATTERISTICA_03,CARATTERISTICA_06,CARATTERISTICA_07,SCONTO_PERC_MEDIO_VOLANTINO,VERSION_NOT_DEFINED,VERSION_INTEL,VERSION_APPLE,VERSION_RAD,VERSION_GEF,QUALITY_VERSION
0,22GC,2022-07-11,2022-07-21,UN MARE DI SCONTI A TASSO ZERO,ACEA11532C64E,249.9,359.9,2028.0,Principale,31.0,ACER,172229207.0,0.656697,0.0,0.0,0.32269,0.0,MOBILE COMPUTING,188.0,38403.3,0,0.0,10,PROCESSOR CELERON N,OS VERSION WIN11 HOME S,STORAGE IN GB 128,RAM IN GB 4,DISPLAY SIZE 15.6,CONVERTIBLE NO,25.37037,0,1,0,0,0,2
1,23DB,2023-04-28,2023-05-11,PASSIONE CASA,ACEA11532C64E,269.9,349.9,1217.0,Principale,22.86,ACER,172229207.0,0.528941,0.489399,0.633822,0.322895,0.123499,MOBILE COMPUTING,203.0,44754.8,854,190473.36,13,PROCESSOR CELERON N,OS VERSION WIN11 HOME S,STORAGE IN GB 128,RAM IN GB 4,DISPLAY SIZE 15.6,CONVERTIBLE NO,25.475714,0,1,0,0,0,2
2,23LB,2023-10-16,2023-10-22,BASKET,ACEA11532C64E,269.9,349.9,1917.0,Principale,22.86,ACER,172229207.0,0.914982,0.547895,0.213179,0.207078,1.0,MOBILE COMPUTING,251.0,55181.62,119,27877.12,6,PROCESSOR CELERON N,OS VERSION WIN11 HOME S,STORAGE IN GB 128,RAM IN GB 4,DISPLAY SIZE 15.6,CONVERTIBLE NO,25.476111,0,1,0,0,0,2
3,23CB,2023-03-17,2023-03-30,BASKET MARZO,ACEA11532C64E,249.9,349.9,438.0,Principale,28.58,ACER,172229207.0,0.610293,0.443426,0.345346,0.230252,1.0,MOBILE COMPUTING,642.0,143062.52,180,44349.29,13,PROCESSOR CELERON N,OS VERSION WIN11 HOME S,STORAGE IN GB 128,RAM IN GB 4,DISPLAY SIZE 15.6,CONVERTIBLE NO,27.353,0,1,0,0,0,2
4,23CD,2023-03-31,2023-04-13,BASKET SCONTI APRILE,ACEA11532C64E,269.9,349.9,2175.0,Principale,22.86,ACER,172229207.0,0.611496,0.489399,0.295215,0.322895,0.123499,MOBILE COMPUTING,393.0,87141.3,117,29500.73,13,PROCESSOR CELERON N,OS VERSION WIN11 HOME S,STORAGE IN GB 128,RAM IN GB 4,DISPLAY SIZE 15.6,CONVERTIBLE NO,22.851667,0,1,0,0,0,2


## CAR_02

In [25]:
# Inspect the distinct raw values of 'CARATTERISTICA_02' before encoding them.
pc['CARATTERISTICA_02'].unique()

array(['PROCESSOR CELERON N', 'PROCESSOR A9-SERIES', 'PROCESSOR RYZEN 5',
       'PROCESSOR PENTIUM SILVER', 'PROCESSOR RYZEN 3',
       'PROCESSOR CORE I5', 'PROCESSOR CORE I3', 'PROCESSOR CORE I7',
       'PROCESSOR RYZEN 7', nan, 'PROCESSOR APPLE M-SERIES',
       'PROCESSOR A6-SERIES', 'PROCESSOR PENTIUM', 'PROCESSOR A12-SERIES',
       'PROCESSOR PENTIUM GOLD', 'PROCESSOR SNAPDRAGON',
       'PROCESSOR 3000 SERIES', 'PROCESSOR HELIO P', 'PROCESSOR ATHLON',
       'PROCESSOR E2-SERIES'], dtype=object)

In [26]:
# GIVE 'CARATTERISTICA_02' A DESCRIPTIVE NAME

pc = pc.rename({'CARATTERISTICA_02': 'PROCESSOR'}, axis='columns')

In [27]:
# DELETE THE WORD 'PROCESSOR' AND FILL THE NULL VALUES WITH A NEW CLASS, CALLED 'NOT DEFINED'
# The result is assigned back to the column: fillna(..., inplace=True) on pc['PROCESSOR'] acts on a
# temporary Series and does not update pc when pandas Copy-on-Write is active.

pc['PROCESSOR'] = (
    pc['PROCESSOR']
    .str.replace('PROCESSOR ', '', regex=False)  # literal prefix, no regex needed
    .fillna('NOT DEFINED')
)

WE PERFORMED ORDINAL ENCODING ACCORDING TO THE QUALITY OF THE PROCESSORS:

quality_high = ['CORE I7', 'RYZEN 7', 'CORE I5', 'RYZEN 5'] -->3

quality_medium = ['CORE I3', 'RYZEN 3', 'A9-SERIES', 'A12-SERIES', 'PENTIUM GOLD', 'PENTIUM SILVER', 'APPLE'] -->2

quality_low = others  -->1

In [28]:
def _processor_quality(processor):
    """Return the ordinal quality tier of a processor name (1 = low, 2 = medium, 3 = high).

    A name is matched by substring against the high-tier keywords first, then
    against the medium-tier keywords; anything else is tier 1.
    """
    high_keywords = ('CORE I7', 'RYZEN 7', 'CORE I5', 'RYZEN 5')
    medium_keywords = ('CORE I3', 'RYZEN 3', 'A9-SERIES', 'A12-SERIES',
                       'PENTIUM GOLD', 'PENTIUM SILVER', 'APPLE')

    if any(keyword in processor for keyword in high_keywords):
        return 3
    if any(keyword in processor for keyword in medium_keywords):
        return 2
    return 1


# Read the processor by column name instead of by position (iloc[i, 23]): the positional
# index only pointed at 'PROCESSOR' because another column had been dropped just before.
QUALITY_PROCESSOR = [_processor_quality(processor) for processor in pc['PROCESSOR']]

pc['QUALITY_PROCESSOR'] = QUALITY_PROCESSOR

In [29]:
# 'PROCESSOR' IS REPLACED BY THE NEW 'QUALITY_PROCESSOR' COLUMN, SO THE RAW COLUMN IS REMOVED

pc.drop(columns=['PROCESSOR'], inplace=True)

## CAR_03

In [30]:
# Inspect the distinct raw values of 'CARATTERISTICA_03' before encoding them.
pc['CARATTERISTICA_03'].unique()

array(['RAM IN GB 4', 'RAM IN GB 8', 'RAM IN GB 16', nan, 'RAM IN GB 12',
       'RAM IN GB 32'], dtype=object)

In [31]:
# GIVE 'CARATTERISTICA_03' A DESCRIPTIVE NAME

pc = pc.rename({'CARATTERISTICA_03': 'RAM_GB'}, axis='columns')

In [32]:
# TAKE JUST THE FLOAT PART OF THE VALUES (EX. 'RAM IN GB 8' --> 8.00) AND THEN FILL NULL VALUES WITH THE MOST FREQUENT CLASS
# The filled Series is assigned back: fillna(..., inplace=True) on pc['RAM_GB'] acts on a
# temporary Series and does not update pc when pandas Copy-on-Write is active.

pc['RAM_GB'] = pc['RAM_GB'].str.replace('RAM IN GB ', '', regex=False).astype(float)
# value_counts() is sorted by frequency, so its first index label is the most frequent value.
pc['RAM_GB'] = pc['RAM_GB'].fillna(pc['RAM_GB'].value_counts().index[0])

## CAR_04

In [33]:
# Inspect the distinct raw values of 'CARATTERISTICA_04' before encoding them.
pc['CARATTERISTICA_04'].unique()

array(['STORAGE IN GB 128', 'STORAGE IN GB 1000', 'STORAGE IN GB 512',
       'STORAGE IN GB 256', 'STORAGE IN GB 1256', 'STORAGE IN GB 64', nan,
       'STORAGE IN GB 500', 'STORAGE IN GB 1128', 'STORAGE IN GB 32',
       'STORAGE IN GB 2000', 'STORAGE IN GB 120'], dtype=object)

In [34]:
# GIVE 'CARATTERISTICA_04' A DESCRIPTIVE NAME

pc = pc.rename({'CARATTERISTICA_04': 'STORAGE_GB'}, axis='columns')

In [35]:
# TAKE JUST THE FLOAT PART OF THE VALUES (EX. 'STORAGE IN GB 512' --> 512.00) AND THEN FILL NULL VALUES WITH THE MOST FREQUENT CLASS
# The filled Series is assigned back: fillna(..., inplace=True) on pc['STORAGE_GB'] acts on a
# temporary Series and does not update pc when pandas Copy-on-Write is active.

pc['STORAGE_GB'] = pc['STORAGE_GB'].str.replace('STORAGE IN GB ', '', regex=False).astype(float)
# value_counts() is sorted by frequency, so its first index label is the most frequent value.
pc['STORAGE_GB'] = pc['STORAGE_GB'].fillna(pc['STORAGE_GB'].value_counts().index[0])

## CAR_05

In [36]:
# Inspect the distinct raw values of 'CARATTERISTICA_05' before encoding them.
pc['CARATTERISTICA_05'].unique()

array(['OS VERSION WIN11 HOME S', 'OS VERSION WIN10 HOME',
       'OS VERSION WIN11 HOME', 'OS VERSION WIN10 HOME S',
       'OS VERSION CHROME OS', 'OS VERSION WIN11 S', nan,
       'OS VERSION WIN10 PRO', 'OS VERSION MACOS BIG SUR',
       'OS VERSION MACOS MONTEREY', 'OS VERSION MACOS VENTURA',
       'OS VERSION MACOS HIGH SIER', 'OS VERSION OS X YOSEMITE',
       'OS VERSION MACOS MOJAVE', 'OS VERSION OS X MOUNT.LION',
       'OS VERSION MACOS CATALINA', 'OS VERSION WIN10 S',
       'OS VERSION WIN11 PRO'], dtype=object)

In [37]:
# GIVE 'CARATTERISTICA_05' A DESCRIPTIVE NAME

pc = pc.rename({'CARATTERISTICA_05': 'OS_VERSION'}, axis='columns')

In [38]:
# KEEP ONLY THE FIRST WORD THAT FOLLOWS 'OS VERSION ' (EX. 'OS VERSION MACOS MONTEREY' --> 'MACOS')

# Step 1: take what comes after the 'OS VERSION ' marker.
pc['OS_VERSION'] = pc['OS_VERSION'].str.split('OS VERSION ').str[1]
# Step 2: split the remainder on whitespace and keep its first token.
pc['OS_VERSION'] = pc['OS_VERSION'].str.split().str[0]

In [39]:
# FILL THE NULL VALUES WITH A NEW CLASS, CALLED 'NOT DEFINED'
# The filled Series is assigned back: fillna(..., inplace=True) on pc['OS_VERSION'] acts on a
# temporary Series and does not update pc when pandas Copy-on-Write is active.

pc['OS_VERSION'] = pc['OS_VERSION'].fillna('NOT DEFINED')

## CAR_06

In [40]:
# Inspect the distinct raw values of 'CARATTERISTICA_06' before encoding them.
pc['CARATTERISTICA_06'].unique()

array(['DISPLAY SIZE 15.6', 'DISPLAY SIZE 14', nan, 'DISPLAY SIZE 13.3',
       'DISPLAY SIZE 13.6', 'DISPLAY SIZE 15.3', 'DISPLAY SIZE 16',
       'DISPLAY SIZE 13.5', 'DISPLAY SIZE 16.1', 'DISPLAY SIZE 17.3',
       'DISPLAY SIZE 13', 'DISPLAY SIZE 11.6', 'DISPLAY SIZE 14.5',
       'DISPLAY SIZE 10.1', 'DISPLAY SIZE 12.4', 'DISPLAY SIZE 12.3',
       'DISPLAY SIZE 10', 'DISPLAY SIZE 14.1', 'DISPLAY SIZE 13.9',
       'DISPLAY SIZE 16.2', 'DISPLAY SIZE 14.2'], dtype=object)

In [41]:
# GIVE 'CARATTERISTICA_06' A DESCRIPTIVE NAME

pc = pc.rename({'CARATTERISTICA_06': 'DISPLAY_SIZE'}, axis='columns')

In [42]:
# TAKE JUST THE FLOAT PART OF THE VALUES (EX. 'DISPLAY SIZE 16.1' --> 16.1) AND THEN FILL NULL VALUES WITH THE MOST FREQUENT CLASS
# The filled Series is assigned back: fillna(..., inplace=True) on pc['DISPLAY_SIZE'] acts on a
# temporary Series and does not update pc when pandas Copy-on-Write is active.

pc['DISPLAY_SIZE'] = pc['DISPLAY_SIZE'].str.replace('DISPLAY SIZE ', '', regex=False).astype(float)
# value_counts() is sorted by frequency, so its first index label is the most frequent value.
pc['DISPLAY_SIZE'] = pc['DISPLAY_SIZE'].fillna(pc['DISPLAY_SIZE'].value_counts().index[0])

## CAR_07

In [43]:
# Inspect the distinct raw values of 'CARATTERISTICA_07' before encoding them.
pc['CARATTERISTICA_07'].unique()

array(['CONVERTIBLE NO', nan, 'CONVERTIBLE YES'], dtype=object)

In [44]:
# GIVE 'CARATTERISTICA_07' A DESCRIPTIVE NAME

pc = pc.rename({'CARATTERISTICA_07': 'CONVERTIBLE'}, axis='columns')

In [45]:
# REPLACE 'CONVERTIBLE NO' AND NULL VALUES WITH 0 AND 'CONVERTIBLE YES' WITH 1
# The encoded Series is assigned back: replace(..., inplace=True) on pc['CONVERTIBLE'] acts on a
# temporary Series and does not update pc when pandas Copy-on-Write is active.

pc['CONVERTIBLE'] = (
    pc['CONVERTIBLE']
    .fillna(0)  # missing information is treated as "not convertible"
    .replace({'CONVERTIBLE NO': 0, 'CONVERTIBLE YES': 1})
)

In [715]:
#pc.to_csv('/Users/vincenzocamerlengo/Desktop/Data Science in Action/Unieuro Dataset-20240305/dati_luiss_2/pc.csv')

# TV

In [46]:
# KEEP THE 'PTV/FLAT' ROWS OF df1, LEFT-JOIN THE 'gfk' FEATURES ON 'ITEM_ID'
# AND ADD 'SCONTO_PERC_MEDIO_VOLANTINO' (MEAN 'SCONTO_PERC' OF EACH 'CODICE_VOLANTINO')

tv = df1.loc[df1['PRODUCT_GROUP'] == 'PTV/FLAT'].merge(gfk, how='left', on='ITEM_ID')
tv['SCONTO_PERC_MEDIO_VOLANTINO'] = (
    tv['SCONTO_PERC'].groupby(tv['CODICE_VOLANTINO']).transform('mean')
)

## CAR_01

In [47]:
# Inspect the distinct raw values of 'CARATTERISTICA_01' before encoding them.
tv['CARATTERISTICA_01'].unique()

array(['DISPLAY SIZE 24', 'DISPLAY SIZE 23.6', 'DISPLAY SIZE 28',
       'DISPLAY SIZE 32', 'DISPLAY SIZE 43', 'DISPLAY SIZE 48',
       'DISPLAY SIZE 49', 'DISPLAY SIZE 50', 'DISPLAY SIZE 55', nan,
       'DISPLAY SIZE 60', 'DISPLAY SIZE 65', 'DISPLAY SIZE 70',
       'DISPLAY SIZE 75', 'DISPLAY SIZE 77', 'DISPLAY SIZE 31.5',
       'DISPLAY SIZE 40', 'DISPLAY SIZE 85', 'DISPLAY SIZE 39',
       'DISPLAY SIZE 42', 'DISPLAY SIZE 58', 'DISPLAY SIZE 27.5',
       'DISPLAY SIZE 54.6', 'DISPLAY SIZE 22', 'DISPLAY SIZE 42.5',
       'DISPLAY SIZE 64.5'], dtype=object)

In [48]:
# GIVE 'CARATTERISTICA_01' A DESCRIPTIVE NAME AND KEEP ONLY ITS NUMERIC PART (EX. 'DISPLAY SIZE 65' --> 65.00)

tv = tv.rename({'CARATTERISTICA_01': 'DISPLAY_SIZE'}, axis='columns')
tv['DISPLAY_SIZE'] = (
    tv['DISPLAY_SIZE']
    .str.replace('DISPLAY SIZE ', '')
    .astype('float64')
)

In [49]:
# FILL NULL VALUES WITH THE MOST FREQUENT CLASS
# The filled Series is assigned back: fillna(..., inplace=True) on tv['DISPLAY_SIZE'] acts on a
# temporary Series and does not update tv when pandas Copy-on-Write is active.

# value_counts() is sorted by frequency, so its first index label is the most frequent value.
tv['DISPLAY_SIZE'] = tv['DISPLAY_SIZE'].fillna(tv['DISPLAY_SIZE'].value_counts().index[0])

## CAR_02

In [50]:
# Inspect the distinct raw values of 'CARATTERISTICA_02' before encoding them.
tv['CARATTERISTICA_02'].unique()

array(['HD TYPES HD READY', 'HD TYPES FULL HD',
       'HD TYPES ULTRA HD/4K (3840X2160P)', nan,
       'HD TYPES ULTRA HD/8K (7680X4320P)'], dtype=object)

In [51]:
# GIVE 'CARATTERISTICA_02' A DESCRIPTIVE NAME

tv = tv.rename({'CARATTERISTICA_02': 'HD_QUALITY'}, axis='columns')

In [52]:
# PERFORM AN ORDINAL ENCODING ACCORDING TO THE HD QUALITY (1 = HD READY ... 4 = ULTRA HD/8K)
# The encoded Series is assigned back: replace(..., inplace=True) on tv['HD_QUALITY'] acts on a
# temporary Series and does not update tv when pandas Copy-on-Write is active.

tv['HD_QUALITY'] = tv['HD_QUALITY'].replace({
    'HD TYPES HD READY': 1,
    'HD TYPES FULL HD': 2,
    'HD TYPES ULTRA HD/4K (3840X2160P)': 3,
    'HD TYPES ULTRA HD/8K (7680X4320P)': 4,
})

In [53]:
# FILL NULL VALUES WITH THE MOST FREQUENT CLASS
# The filled Series is assigned back: fillna(..., inplace=True) on tv['HD_QUALITY'] acts on a
# temporary Series and does not update tv when pandas Copy-on-Write is active.

# value_counts() is sorted by frequency, so its first index label is the most frequent value.
tv['HD_QUALITY'] = tv['HD_QUALITY'].fillna(tv['HD_QUALITY'].value_counts().index[0])

## CAR_03

In [54]:
# Inspect the distinct raw values of 'CARATTERISTICA_03' before encoding them.
tv['CARATTERISTICA_03'].unique()

array(['MFM MFM', 'MFM NO MFM', nan], dtype=object)

In [55]:
# GIVE 'CARATTERISTICA_03' A DESCRIPTIVE NAME

tv = tv.rename({'CARATTERISTICA_03': 'MFM'}, axis='columns')

In [56]:
# REPLACE 'MFM NO MFM' AND NULL VALUES WITH 0 AND 'MFM MFM' WITH 1
# The encoded Series is assigned back: replace/fillna(..., inplace=True) on tv['MFM'] act on a
# temporary Series and do not update tv when pandas Copy-on-Write is active.

tv['MFM'] = (
    tv['MFM']
    .replace({'MFM MFM': 1, 'MFM NO MFM': 0})
    .fillna(0)  # missing information is treated as "no MFM"
)

## CAR_04

In [60]:
# Inspect the distinct raw values of 'CARATTERISTICA_04' before encoding them.
tv['CARATTERISTICA_04'].unique()

array(['SMART TV SMART TV', 'CURVED NOT CURVED', 'SMART TV NO SMART TV',
       nan, 'CURVED CURVED'], dtype=object)

In [61]:
# Count how many rows carry each raw value of 'CARATTERISTICA_04'.
tv['CARATTERISTICA_04'].value_counts()

CARATTERISTICA_04
SMART TV SMART TV       2535
SMART TV NO SMART TV     204
CURVED NOT CURVED        156
CURVED CURVED             11
Name: count, dtype: int64

The rows where 'CURVED CURVED' or 'CURVED NOT CURVED' is present have the information about 'smart' in the next column.

Since the information about 'curved' is available for just a few observations, we decided to ignore it.

So, for these rows, we shifted all the values to the previous column, similarly to what we did for 'CARATTERISTICA_03' of the smartphones.

In [62]:
# Rows whose 'CARATTERISTICA_04' holds a 'CURVED ...' value instead of the smart-TV flag.
selezione = tv['CARATTERISTICA_04'].isin(['CURVED CURVED', 'CURVED NOT CURVED'])
# Shift those rows one column to the left: 05 -> 04, then 06 -> 05.
tv.loc[selezione, 'CARATTERISTICA_04'] = tv.loc[selezione, 'CARATTERISTICA_05']
tv.loc[selezione, 'CARATTERISTICA_05'] = tv.loc[selezione, 'CARATTERISTICA_06']
# Use '' as a placeholder for nulls so that str.contains below yields a boolean for every row.
tv['CARATTERISTICA_04'] = tv['CARATTERISTICA_04'].fillna('')
# Rows that still contain 'CURVED' after the first shift are shifted once more.
righe_errate = tv[tv['CARATTERISTICA_04'].str.contains('CURVED')]
tv.loc[righe_errate.index, 'CARATTERISTICA_04'] = tv.loc[righe_errate.index, 'CARATTERISTICA_05']
# Smart-TV values left in column 05 are blanked out.
# Assigned back instead of replace(..., inplace=True): the in-place call on a selected column
# acts on a temporary Series and does not update tv when pandas Copy-on-Write is active.
tv['CARATTERISTICA_05'] = tv['CARATTERISTICA_05'].replace(r'.*SMART.*', np.nan, regex=True)
tv.loc[righe_errate.index, 'CARATTERISTICA_05'] = tv.loc[righe_errate.index, 'CARATTERISTICA_06']
# Display-technology values left in column 06 (already copied into 05 above) are blanked out.
tv['CARATTERISTICA_06'] = tv['CARATTERISTICA_06'].replace(r'.*DISPLAY.*', np.nan, regex=True)
# The '' placeholder (originally null) becomes 0.
tv['CARATTERISTICA_04'] = tv['CARATTERISTICA_04'].replace('', 0)

In [63]:
# GIVE 'CARATTERISTICA_04' A DESCRIPTIVE NAME

tv = tv.rename({'CARATTERISTICA_04': 'SMART'}, axis='columns')

In [64]:
# REPLACE 'SMART TV NO SMART TV' WITH 0 AND 'SMART TV SMART TV' WITH 1
# The encoded Series is assigned back: replace(..., inplace=True) on tv['SMART'] acts on a
# temporary Series and does not update tv when pandas Copy-on-Write is active.

tv['SMART'] = tv['SMART'].replace({'SMART TV SMART TV': 1, 'SMART TV NO SMART TV': 0})

## CAR_05

In [65]:
# Inspect the distinct raw values of 'CARATTERISTICA_05' before encoding them.
tv['CARATTERISTICA_05'].unique()

array(['DISPLAY TECHNOL LCD-DISPLAY', nan, 'DISPLAY TECHNOL OLED-DISPLAY'],
      dtype=object)

In [66]:
# GIVE 'CARATTERISTICA_05' A DESCRIPTIVE NAME

tv = tv.rename({'CARATTERISTICA_05': 'OLED'}, axis='columns')

In [67]:
# REPLACE 'DISPLAY TECHNOL LCD-DISPLAY' WITH 0 AND 'DISPLAY TECHNOL OLED-DISPLAY' WITH 1
# FILL NULL VALUES WITH THE MOST FREQUENT CLASS
# Results are assigned back: replace/fillna(..., inplace=True) on tv['OLED'] act on a
# temporary Series and do not update tv when pandas Copy-on-Write is active.

tv['OLED'] = tv['OLED'].replace({
    'DISPLAY TECHNOL LCD-DISPLAY': 0,
    'DISPLAY TECHNOL OLED-DISPLAY': 1,
})
# value_counts() is sorted by frequency, so its first index label is the most frequent value.
tv['OLED'] = tv['OLED'].fillna(tv['OLED'].value_counts().index[0])

## CAR_06

In [68]:
# Inspect the distinct raw values of 'CARATTERISTICA_06' before encoding them.
tv['CARATTERISTICA_06'].unique()

array(['WCG NO WCG', nan, 'WCG VARIOUS WCG TEC', 'WCG QUANTUM DOT'],
      dtype=object)

In [69]:
# GIVE 'CARATTERISTICA_06' A DESCRIPTIVE NAME

tv = tv.rename({'CARATTERISTICA_06': 'WCG'}, axis='columns')

In [70]:
# PERFORM AN ORDINAL ENCODING OF THE "Wide Color Gamut" (0 = NO WCG, 1 = VARIOUS WCG TEC, 2 = QUANTUM DOT)
# FILL NULL VALUES WITH THE MOST FREQUENT CLASS
# Results are assigned back: replace/fillna(..., inplace=True) on tv['WCG'] act on a
# temporary Series and do not update tv when pandas Copy-on-Write is active.

tv['WCG'] = tv['WCG'].replace({
    'WCG NO WCG': 0,
    'WCG VARIOUS WCG TEC': 1,
    'WCG QUANTUM DOT': 2,
})
# value_counts() is sorted by frequency, so its first index label is the most frequent value.
tv['WCG'] = tv['WCG'].fillna(tv['WCG'].value_counts().index[0])

## CAR_07

In [71]:
# Inspect the distinct raw values of 'CARATTERISTICA_07' before encoding them.
tv['CARATTERISTICA_07'].unique()

array(['MINILED BACKLIT NO MINILED', nan, 'MINILED BACKLIT MINILED'],
      dtype=object)

In [72]:
# GIVE 'CARATTERISTICA_07' A DESCRIPTIVE NAME

tv = tv.rename({'CARATTERISTICA_07': 'MINILED'}, axis='columns')

In [73]:
# REPLACE 'MINILED BACKLIT NO MINILED' WITH 0 AND 'MINILED BACKLIT MINILED' WITH 1
# FILL NULL VALUES WITH THE MOST FREQUENT CLASS
# Results are assigned back: replace/fillna(..., inplace=True) on tv['MINILED'] act on a
# temporary Series and do not update tv when pandas Copy-on-Write is active.

tv['MINILED'] = tv['MINILED'].replace({
    'MINILED BACKLIT NO MINILED': 0,
    'MINILED BACKLIT MINILED': 1,
})
# value_counts() is sorted by frequency, so its first index label is the most frequent value.
tv['MINILED'] = tv['MINILED'].fillna(tv['MINILED'].value_counts().index[0])

In [587]:
#tv.to_csv('/Users/vincenzocamerlengo/Desktop/Data Science in Action/Unieuro Dataset-20240305/dati_luiss_2/tv.csv')