# Importaciones

In [39]:
import pandas as pd
import re
import requests
from bs4 import BeautifulSoup as bs
from googlesearch import search
import time
import random


In [40]:
df_train_a = pd.read_csv('train_a.csv')


# Observación inicial

In [41]:
pd.set_option('display.max_columns', 500)

df_train_a.head()


Unnamed: 0,laptop_ID,Company,Product,TypeName,Inches,ScreenResolution,Cpu,Ram,Memory,Gpu,OpSys,Weight,Price_euros
0,597,Asus,TP501UA-CJ131T (i5-7200U/8GB/1TB/W10),2 in 1 Convertible,15.6,Touchscreen 1366x768,Intel Core i5 7200U 2.5GHz,8GB,1TB HDD,Intel HD Graphics 520,Windows 10,2.2kg,739.0
1,1173,HP,15-bw002nv (A6-9220/4GB/256GB/Radeon,Notebook,15.6,Full HD 1920x1080,AMD A6-Series A6-9220 2.5GHz,4GB,256GB SSD,AMD Radeon 520,Windows 10,1.91kg,478.89
2,1274,Asus,Rog G752VT-GC073T,Gaming,17.3,IPS Panel Full HD 1920x1080,Intel Core i7 6700HQ 2.6GHz,16GB,128GB SSD + 1TB HDD,Nvidia GeForce GTX 970M,Windows 10,4.0kg,1900.0
3,252,Asus,Rog G701VIK-BA060T,Gaming,17.3,Full HD 1920x1080,Intel Core i7 7820HK 2.9GHz,16GB,256GB SSD,Nvidia GeForce GTX 1080,Windows 10,3.6kg,2999.0
4,15,Apple,"MacBook 12""",Ultrabook,12.0,IPS Panel Retina Display 2304x1440,Intel Core M m3 1.2GHz,8GB,256GB SSD,Intel HD Graphics 615,macOS,0.92kg,1262.4


In [42]:
df_train_a.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 912 entries, 0 to 911
Data columns (total 13 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   laptop_ID         912 non-null    int64  
 1   Company           912 non-null    object 
 2   Product           912 non-null    object 
 3   TypeName          912 non-null    object 
 4   Inches            912 non-null    float64
 5   ScreenResolution  912 non-null    object 
 6   Cpu               912 non-null    object 
 7   Ram               912 non-null    object 
 8   Memory            912 non-null    object 
 9   Gpu               912 non-null    object 
 10  OpSys             912 non-null    object 
 11  Weight            912 non-null    object 
 12  Price_euros       912 non-null    float64
dtypes: float64(2), int64(1), object(10)
memory usage: 92.8+ KB


# Limpieza

## Columna 0: laptop_ID

In [43]:
# No nos interesa convertir esta a dummies, así que la ignoramos
# Nos será útil si luego hay que localizar algun producto en concreto
df_train_a['laptop_ID'].unique()


array([ 597, 1173, 1274,  252,   15,  719, 1214,  601,  303,  446, 1083,
       1308, 1177,  354, 1039,  260,  520,  444,  724,  800,  108,  620,
        499,  869,  268, 1236,  209,  767,   28,  752,  332,   64,  776,
         73,  862,  666,  356,  910,  588,  671, 1051, 1222,  227,  317,
        194,  525, 1283,  785,  780,  518,  393,  876,   84,  500,  494,
       1212,  256,  405,  947,  378,  919,   48, 1255,  674,  249,  578,
        971, 1161,  343, 1239,  879,  515,  441, 1227,  548,   98,  681,
       1104,  797,  803,  881, 1045,  649,  535,  963,  536,  152,  210,
       1249,  678,  676,  410,  246, 1303,  346,  783,  150, 1029, 1209,
        922,  837,  961,  575,  611,  636,  433, 1250,  348,  807,  434,
       1223, 1003,  528,  413,  484,  297,  917, 1038, 1099, 1112,  662,
        874,  801,  854,  931,  813,  938,  144,  951,  341, 1124,  607,
        633,   61, 1125, 1320,  918,  836,  143,  262,  426,  380,  549,
        658,  960,   19,  651,  181,  275,  170,  7

## Columna 1: Company

In [44]:
df_train_a['Company'].unique()


array(['Asus', 'HP', 'Apple', 'Dell', 'Lenovo', 'Google', 'Vero', 'Acer',
       'Toshiba', 'MSI', 'Samsung', 'Razer', 'Microsoft', 'LG',
       'Mediacom', 'Fujitsu', 'Xiaomi', 'Chuwi'], dtype=object)

In [45]:
# REPASAR EN MÚLTIPLES COLUMNAS
# Distingimos las empresas en función de su país de origen
# Los países donde la tecnología es más barata, de los que aparecen en el dataset, son los siguientes: Taiwán < Korea < China < Reino Unido < Estados Unidos < Japón
# Fuente: https://www.linio.com.mx/sp/technology-price-index-2016
# Así, los numeramos de más barato a más caro
countries = {0: ['Asus', 'Acer', 'Vero'], 
            1: ['Samsung', 'LG'],
            2: ['Lenovo', 'Xiaomi', 'Chuwi'],
            3: ['Mediacom'],
            4: ['HP', 'Apple', 'Dell', 'Google', 'Razer', 'Microsoft'],
            5: ['Toshiba', 'MSI', 'Fujitsu']
            }

for country, brands in countries.items():
    for brand in brands:
        df_train_a.loc[df_train_a['Company'] == brand, 'Company'] = country

df_train_a['Company'] = pd.to_numeric(df_train_a['Company'])


In [46]:
df_train_a['Company'].unique()


array([0, 4, 2, 5, 1, 3], dtype=int64)

## Columna 2: Product

In [47]:
# Borramos esta columna, ya que todos los valores son únicos, y no podemos expresarla en dummies y sacarle partido
df_train_a['Product'].unique()


array(['TP501UA-CJ131T (i5-7200U/8GB/1TB/W10)',
       '15-bw002nv (A6-9220/4GB/256GB/Radeon', 'Rog G752VT-GC073T',
       'Rog G701VIK-BA060T', 'MacBook 12"', 'Latitude 5480',
       'V110-15ISK (i3-6006U/4GB/1TB/Radeon',
       'V320-17ISK (i3-6006U/4GB/500GB/FHD/No', 'IdeaPad 310-15ABR',
       'VivoBook Max', 'Alienware 15', 'Inspiron 3567', 'Spectre Pro',
       'VivoBook Flip', 'Elitebook 820', 'Vostro 5370', 'Inspiron 5570',
       'Pixelbook (Core', 'Latitude 5580', 'K146 (N3350/4GB/32GB/W10)',
       'Pavilion 14-BK001nv', 'VivoBook Pro', 'Inspiron 5368',
       'Vostro 3568', 'IdeaPad Y700-15ISK', 'Legion Y520-15IKBN',
       'Alienware 17', 'Thinkpad P51s', 'Aspire E5-575',
       'UX410UA-GV350T (i5-8250U/8GB/256GB/FHD/W10)', 'Inspiron 7559',
       'Inspiron 5370', 'Rog GL502VM-DS74', 'XPS 15',
       '15-ay047nv (i3-6006U/6GB/1TB/Radeon', 'Tecra Z40-C-136',
       'FX502VM-DM105T (i7-6700HQ/8GB/1TB/GeForce', 'Portege Z30-C-16L',
       'GE63VR 7RF', 'Thinkpad Yoga', 'Prob

In [48]:
df_train_a = df_train_a.drop(columns = 'Product')


## Columna 3: TypeName

In [49]:
df_train_a['TypeName'].unique()


array(['2 in 1 Convertible', 'Notebook', 'Gaming', 'Ultrabook',
       'Workstation', 'Netbook'], dtype=object)

In [50]:
# Los clasificamos en función del coste general de cada categoría
df_train_a['TypeName'] = df_train_a['TypeName'].apply(lambda x: 0 if x == 'Netbook' else 1 if x == 'Notebook' else 2 if x == '2 in 1 Convertible' else 3 if x == 'Ultrabook' else 4 if x == 'Workstation' else 5)


In [51]:
df_train_a['TypeName'].unique()


array([2, 1, 5, 3, 4, 0], dtype=int64)

## Columna 4: Inches

In [52]:
# Ya es numérica, así que la dejamos igual
df_train_a['Inches'].unique()


array([15.6, 17.3, 12. , 14. , 13.3, 11.6, 12.5, 12.3, 15. , 13.9, 17. ,
       13.5, 18.4, 10.1, 14.1, 15.4])

## Columna 5: ScreenResolution

In [53]:
df_train_a['ScreenResolution'].unique()


array(['Touchscreen 1366x768', 'Full HD 1920x1080',
       'IPS Panel Full HD 1920x1080',
       'IPS Panel Retina Display 2304x1440', '1366x768',
       '4K Ultra HD 3840x2160', 'Touchscreen 2560x1440',
       'Touchscreen 2400x1600', '1920x1080',
       'Full HD / Touchscreen 1920x1080',
       'IPS Panel 4K Ultra HD 3840x2160',
       'IPS Panel Touchscreen / 4K Ultra HD 3840x2160',
       '4K Ultra HD / Touchscreen 3840x2160',
       'IPS Panel Full HD / Touchscreen 1920x1080',
       'IPS Panel Quad HD+ / Touchscreen 3200x1800', '1600x900',
       'Touchscreen 2256x1504', 'IPS Panel Touchscreen 2400x1600',
       'IPS Panel 1366x768', 'Quad HD+ / Touchscreen 3200x1800',
       'IPS Panel Quad HD+ 3200x1800',
       'IPS Panel Retina Display 2560x1600',
       'IPS Panel Touchscreen 1920x1200',
       'IPS Panel 4K Ultra HD / Touchscreen 3840x2160',
       'IPS Panel 2560x1440', '1440x900',
       'IPS Panel Touchscreen 1366x768', 'Quad HD+ 3200x1800',
       'Touchscreen / Quad HD

In [54]:
def dummie_col_conditioned(df, initial_col, word):
    '''Creates an aditional numeric column (0 and 1) based on the existence of the provided value in the chosen column'''
    '''Requires a dataframe, a column name (string) and a word (string)'''
    df[word] = df[initial_col].copy().apply(lambda x: 1 if word in x else 0)


In [55]:
# Los dividimos entre los que tienen pantalla táctil y los que no
# Hacemos lo propio con los que tienen IPS Panel y los de Retina Display
for characteristic in ('Touchscreen', 'IPS Panel', 'Retina Display'):
    dummie_col_conditioned(df_train_a, 'ScreenResolution', characteristic)


In [56]:
df_train_a['Touchscreen'].unique()


array([1, 0], dtype=int64)

In [57]:
df_train_a['IPS Panel'].unique()


array([0, 1], dtype=int64)

In [58]:
df_train_a['Retina Display'].unique()


array([0, 1], dtype=int64)

In [59]:
def normalize_res(number):
    '''Assigns a bigger number to a string according to how high is the resolution it contains'''
    '''Requires a string'''
    number = re.search(r'(\w+$)', number)
    number = list(map(int, number.group().split('x')))
    if number[0] < 1920 or number[1] < 1080:
        number = 0
    elif number[0] == 1920 and number[1] == 1080:
        number = 1
    elif number[0] < 2560 or number[1] < 1440:
        number = 2
    elif number[0] < 3840 or number[1] < 2160:
        number = 3
    else:
        number = 4
    return number


In [60]:
# Aplicamos una función que nos clasifica las resoluciones numéricamente en función de su nivel (No HD < HD < QHD < QHD+ < UHD)
df_train_a['ScreenResolution'] = df_train_a['ScreenResolution'].apply(lambda x: normalize_res(x))


In [61]:
df_train_a['ScreenResolution'].unique()


array([0, 1, 2, 4, 3], dtype=int64)

## Columna 6: Cpu

In [62]:
df_train_a['Cpu'].unique()


array(['Intel Core i5 7200U 2.5GHz', 'AMD A6-Series A6-9220 2.5GHz',
       'Intel Core i7 6700HQ 2.6GHz', 'Intel Core i7 7820HK 2.9GHz',
       'Intel Core M m3 1.2GHz', 'Intel Core i5 7440HQ 2.8GHz',
       'Intel Core i3 6006U 2GHz', 'AMD A10-Series 9600P 2.4GHz',
       'Intel Core i3 7100U 2.4GHz', 'Intel Core i7 6600U 2.6GHz',
       'Intel Celeron Dual Core N3350 1.1GHz',
       'Intel Core i5 8250U 1.6GHz', 'Intel Core i7 8550U 1.8GHz',
       'Intel Core i7 7Y75 1.3GHz', 'Intel Core i5 7300U 2.6GHz',
       'Intel Core i3 6006U 2.0GHz', 'Intel Core i7 7700HQ 2.8GHz',
       'Intel Core i5 6200U 2.3GHz', 'Intel Core i7 7600U 2.8GHz',
       'Intel Core i7 6500U 2.5GHz', 'Intel Core i7 7500U 2.7GHz',
       'Intel Core i7 6820HQ 2.7GHz', 'Intel Core i5 6440HQ 2.6GHz',
       'AMD A10-Series 9620P 2.5GHz', 'Intel Core i7 6560U 2.2GHz',
       'AMD Ryzen 1700 3GHz', 'Intel Celeron Dual Core 3205U 1.5GHz',
       'Intel Pentium Quad Core N3710 1.6GHz',
       'Intel Pentium Quad Co

In [63]:
# Limpiamos para dejar solo los GHz
df_train_a['Cpu'] = df_train_a['Cpu'].str.split()
df_train_a['Cpu'] = df_train_a['Cpu'].apply(lambda x: x[-1][:-3])

df_train_a['Cpu'] = pd.to_numeric(df_train_a['Cpu'])


In [64]:
df_train_a['Cpu'].unique()


array([2.5 , 2.6 , 2.9 , 1.2 , 2.8 , 2.  , 2.4 , 1.1 , 1.6 , 1.8 , 1.3 ,
       2.3 , 2.7 , 2.2 , 3.  , 1.5 , 1.44, 2.1 , 3.6 , 3.1 , 1.9 , 0.9 ,
       1.  , 1.92])

## Columna 7: Ram

In [65]:
df_train_a['Ram'].unique()


array(['8GB', '4GB', '16GB', '6GB', '2GB', '32GB', '12GB', '24GB', '64GB'],
      dtype=object)

In [66]:
# Lo convertimos a números
df_train_a['Ram'] = pd.to_numeric(df_train_a['Ram'].map(lambda x: x.strip('GB')))


In [67]:
df_train_a['Ram'].unique()


array([ 8,  4, 16,  6,  2, 32, 12, 24, 64], dtype=int64)

## Columna 8: Memory

In [68]:
df_train_a['Memory'].unique()


array(['1TB HDD', '256GB SSD', '128GB SSD +  1TB HDD', '500GB HDD',
       '256GB SSD +  1TB HDD', '512GB SSD', '32GB Flash Storage',
       '256GB SSD +  2TB HDD', '1TB SSD', '16GB Flash Storage', '2TB HDD',
       '128GB SSD', '512GB Flash Storage', '180GB SSD',
       '512GB SSD +  1TB HDD', '64GB Flash Storage',
       '128GB Flash Storage', '1.0TB Hybrid', '32GB SSD',
       '128GB SSD +  2TB HDD', '256GB Flash Storage',
       '64GB Flash Storage +  1TB HDD', '1TB HDD +  1TB HDD',
       '256GB SSD +  500GB HDD', '16GB SSD', '512GB SSD +  256GB SSD',
       '508GB Hybrid', '32GB HDD', '1.0TB HDD', '512GB SSD +  512GB SSD',
       '256GB SSD +  1.0TB Hybrid', '256GB SSD +  256GB SSD',
       '512GB SSD +  2TB HDD', '128GB HDD', '8GB SSD', '240GB SSD'],
      dtype=object)

In [69]:
columns = []
types = ['Flash Storage', 'SSD', 'HDD', 'Hybrid']
for type in types:
    column = type + ' memory (GB)'
    columns.append(column)
    searching = rf'([0-9]+\.?[0-9]?[A-Z]+)\s{type}'
    df_train_a[column] = df_train_a['Memory'].apply(lambda x: re.search(searching, x).groups()[0] if type in x else '0GB')

df_train_a = df_train_a.drop(columns = 'Memory')


In [70]:
for column in columns:
    df_train_a[column] = df_train_a[column].apply(lambda x: int(float(x[:-2])) if 'GB' in x else int(float(x[:-2]) * 1000))


In [71]:
df_train_a['SSD memory (GB)'].unique()


array([   0,  256,  128,  512, 1000,  180,   32,   16,    8,  240],
      dtype=int64)

In [72]:
df_train_a['HDD memory (GB)'].unique()


array([1000,    0,  500, 2000,   32,  128], dtype=int64)

In [73]:
df_train_a['Hybrid memory (GB)'].unique()


array([   0, 1000,  508], dtype=int64)

In [74]:
df_train_a['Flash Storage memory (GB)'].unique()


array([  0,  32,  16, 512,  64, 128, 256], dtype=int64)

## Columna 9: Gpu

In [75]:
df_train_a['Gpu'].unique()


array(['Intel HD Graphics 520', 'AMD Radeon 520',
       'Nvidia GeForce GTX 970M', 'Nvidia GeForce GTX 1080',
       'Intel HD Graphics 615', 'Nvidia GeForce 930MX',
       'AMD Radeon R5 M430', 'AMD Radeon R5 430', 'Intel HD Graphics 620',
       'Intel HD Graphics 500', 'Intel UHD Graphics 620',
       'AMD Radeon 530', 'Nvidia GeForce 940MX',
       'Nvidia GeForce GTX 1050', 'Nvidia GeForce GTX 960<U+039C>',
       'Nvidia GeForce GTX 1060', 'Nvidia GeForce GTX 1070',
       'Nvidia Quadro M520M', 'Nvidia GeForce GTX 960M',
       'Nvidia GeForce GTX 960', 'AMD Radeon R5 M420X',
       'Nvidia Quadro M2000M', 'AMD Radeon 540', 'AMD FirePro W5130M',
       'AMD Radeon RX 540', 'Nvidia GeForce GTX 980M',
       'Nvidia Quadro M1200', 'Intel Iris Graphics 540',
       'AMD Radeon RX 580', 'Intel HD Graphics', 'Intel HD Graphics 405',
       'Intel HD Graphics 505', 'Nvidia Quadro M1000M',
       'Nvidia Quadro M2200M', 'AMD Radeon R2 Graphics', 'AMD Radeon R5',
       'Intel HD Graph

In [76]:
# Para clasificar una gpu en función de su potencia, podemos hacer un request a https://www.notebookcheck.org/
# Sin embargo, si queremos sacar la potencia de todas, el buscador de la propia página no vale para encontrarlas. Además, se reparten por dos dominios distintos (.org y .net)
# Por tanto, recurrimos a Google. El proceso es lento, ya que, para evitar que nos detecte como un robot, hay que simular un comportamiento humano
for gpu in df_train_a['Gpu'].unique():
    print(f'String to search in Google: {gpu}')
    try:
        for url in search(gpu + ' ' + 'notebookcheck.org', num_results=7):
            if 'https://www.notebookcheck.org' in url or 'https://www.notebookcheck.net' in url:
                print(f'Searching in url: {url}')
                try:
                    r = requests.get(url, headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_5)\
                    AppleWebKit/537.36 (KHTML, like Gecko) Cafari/537.36'})
                    soup = bs(r.text, 'lxml')
                    title = '>' + ' '.join(soup.find(class_="tx-nbc2fe-pi1")('h1')[0].text.split()[1:]) + '<'
                    title = ' '.join(['Mobile<' if word == '(Desktop)<' else word for word in title.split()])
                    title = ''.join(['\\' + character if character == '(' or character == ')' else character for character in title])
                    print(f'String to search in website: {title}')
                    for line in soup.find(class_='gpu-even')('tr'):
                        correct_gpu = re.search(title, str(line))
                        if correct_gpu:
                            print(f'Line to search: {line}')
                            print(f'Match: {correct_gpu}')
                            ghz_found = re.search(r'([0-9]?\.?[0-9]?[0-9]?\s-?\s?[0-9]+\.?[0-9]?[0-9]?)\sGHz', str(line))
                            ghz = ghz_found.groups()[0].split()[0]
                            print(f'GHz: {ghz}')
                            df_train_a.loc[df_train_a['Gpu'] == gpu, 'Gpu'] = float(ghz)
                    break
                except Exception:
                    continue
        time.sleep(random.randint(60, 120))
    except Exception as e:
        print(e)
        break
    print('\n')


String to search in Google: Intel HD Graphics 520
Searching in url: https://www.notebookcheck.org/Intel-HD-Graphics-520.156171.0.html
String to search in website: >HD Graphics 520<
Line to search: <tr><td style="font-weight:bold;">HD Graphics 520</td><td style="color:#666;"> 24  @ 0.3 - 1.05 GHz</td><td style="color:#666;">64/128 Bit</td></tr>
Match: <re.Match object; span=(33, 50), match='>HD Graphics 520<'>
GHz: 0.3


String to search in Google: AMD Radeon 520
Searching in url: https://www.notebookcheck.org/AMD-Radeon-520.278998.0.html
String to search in website: >Radeon 520<
Line to search: <tr><td style="font-weight:bold;">Radeon 520</td><td style="color:#666;"> 320  @ 1.03 GHz</td><td style="color:#666;">64 Bit @ 2250 MHz</td></tr>
Match: <re.Match object; span=(33, 45), match='>Radeon 520<'>
GHz: 1.03


String to search in Google: Nvidia GeForce GTX 970M
Searching in url: https://www.notebookcheck.org/NVIDIA-GeForce-GTX-970M.131114.0.html
String to search in website: >GeForce GT

In [78]:
df_train_a['Gpu'].unique()


array([0.3, 1.03, 0.92, 1.57, 0.95, 0.96, 1.02, 1.12, 1.35, 1.13, 1.4,
       1.44, 0.76, 1.1, 0.78, 1.04, 1.22, 0.93, 0.99, 'AMD Radeon RX 580',
       0.35, 0.32, 0.69, 0.8, 0.1, 1.47, 0.7, 0.97, 1.49, 1.29, 0.98,
       1.07, 1.05, 0.9, 0.83, 0.91, 0.72, 0.45, 0.86, 0.2,
       'AMD Radeon R7 M465'], dtype=object)

In [82]:
# Hay dos valores que no se han encontrado. Los cambiamos a mano
for tupla in [('AMD Radeon R7 M465', 0.83), ('AMD Radeon RX 580', 1)]:
    df_train_a.loc[df_train_a['Gpu'] == tupla[0], 'Gpu'] = tupla[1]

df_train_a['Gpu'] = pd.to_numeric(df_train_a['Gpu'])


In [83]:
df_train_a['Gpu'].unique()


array([0.3 , 1.03, 0.92, 1.57, 0.95, 0.96, 1.02, 1.12, 1.35, 1.13, 1.4 ,
       1.44, 0.76, 1.1 , 0.78, 1.04, 1.22, 0.93, 0.99, 1.  , 0.35, 0.32,
       0.69, 0.8 , 0.1 , 1.47, 0.7 , 0.97, 1.49, 1.29, 0.98, 1.07, 1.05,
       0.9 , 0.83, 0.91, 0.72, 0.45, 0.86, 0.2 ])

## Columna 9: OpSys

In [84]:
opsys_columns = df_train_a['OpSys'].unique()
opsys_columns


array(['Windows 10', 'macOS', 'Linux', 'No OS', 'Chrome OS', 'Windows 7',
       'Mac OS X', 'Windows 10 S', 'Android'], dtype=object)

In [85]:
df_train_a = pd.concat([df_train_a, pd.get_dummies(df_train_a['OpSys'])], axis=1)

df_train_a = df_train_a.drop(columns = 'OpSys')


In [86]:
for column in opsys_columns:
    print(df_train_a[column].unique())


[1 0]
[0 1]
[0 1]
[0 1]
[0 1]
[0 1]
[0 1]
[0 1]
[0 1]


## Columna 10: Weight

In [87]:
df_train_a['Weight'].unique()


array(['2.2kg', '1.91kg', '4.0kg', '3.6kg', '0.92kg', '1.64kg', '1.9kg',
       '2.8kg', '2.4kg', '2kg', '3.21kg', '2.3kg', '1.48kg', '1.5kg',
       '1.26kg', '1.41kg', '1.1kg', '1.22kg', '1.58kg', '2.25kg',
       '1.99kg', '1.62kg', '2.18kg', '2.6kg', '4.42kg', '2.5kg', '2.23kg',
       '1.4kg', '2.72kg', '2.1kg', '3.31kg', '2.06kg', '2.04kg', '1.47kg',
       '1.2kg', '1.42kg', '1.71kg', '2.07kg', '1.63kg', '1.6kg', '4.3kg',
       '1.78kg', '1.3kg', '3.25kg', '2.20kg', '3.2kg', '2.24kg', '1.38kg',
       '1.94kg', '1.45kg', '2.59kg', '1.84kg', '1.56kg', '1.34kg',
       '4.2kg', '0.920kg', '2.38kg', '1.68kg', '1.95kg', '1.252kg',
       '2.0kg', '1.15kg', '0.98kg', '1.35kg', '2.02kg', '2.36kg',
       '1.32kg', '1.7kg', '2.54kg', '1.08kg', '2.09kg', '1.86kg',
       '1.37kg', '1.54kg', '2.7kg', '1.75kg', '1.16kg', '1.76kg', '3kg',
       '2.9kg', '3.8kg', '1.36kg', '4.4kg', '1.87kg', '1.83kg', '0.69kg',
       '1.29kg', '4.14kg', '1.23kg', '2.65kg', '2.05kg', '2.08kg',
       '2.5

In [88]:
# Convertimos la string a numérica
df_train_a['Weight'] = pd.to_numeric(df_train_a['Weight'].map(lambda x: x.strip('kg')))


In [89]:
df_train_a['Weight'].unique()


array([2.2  , 1.91 , 4.   , 3.6  , 0.92 , 1.64 , 1.9  , 2.8  , 2.4  ,
       2.   , 3.21 , 2.3  , 1.48 , 1.5  , 1.26 , 1.41 , 1.1  , 1.22 ,
       1.58 , 2.25 , 1.99 , 1.62 , 2.18 , 2.6  , 4.42 , 2.5  , 2.23 ,
       1.4  , 2.72 , 2.1  , 3.31 , 2.06 , 2.04 , 1.47 , 1.2  , 1.42 ,
       1.71 , 2.07 , 1.63 , 1.6  , 4.3  , 1.78 , 1.3  , 3.25 , 3.2  ,
       2.24 , 1.38 , 1.94 , 1.45 , 2.59 , 1.84 , 1.56 , 1.34 , 4.2  ,
       2.38 , 1.68 , 1.95 , 1.252, 1.15 , 0.98 , 1.35 , 2.02 , 2.36 ,
       1.32 , 1.7  , 2.54 , 1.08 , 2.09 , 1.86 , 1.37 , 1.54 , 2.7  ,
       1.75 , 1.16 , 1.76 , 3.   , 2.9  , 3.8  , 1.36 , 4.4  , 1.87 ,
       1.83 , 0.69 , 1.29 , 4.14 , 1.23 , 2.65 , 2.05 , 2.08 , 2.56 ,
       2.94 , 2.71 , 3.35 , 1.85 , 1.44 , 2.16 , 2.63 , 2.77 , 2.31 ,
       1.43 , 1.25 , 1.65 , 1.21 , 1.05 , 2.19 , 1.28 , 1.49 , 1.98 ,
       1.93 , 2.29 , 1.11 , 1.13 , 1.24 , 2.32 , 1.09 , 1.8  , 3.4  ,
       2.62 , 1.88 , 2.43 , 1.17 , 4.33 , 2.34 , 1.31 , 1.27 , 2.17 ,
       1.39 , 2.14 ,

## Columna 11: Price_euros

In [90]:
# No hace falta cambiar nada, pues ya es numérica. Este es el target
df_train_a['Price_euros'].unique()


array([ 739.  ,  478.89, 1900.  , 2999.  , 1262.4 , 1089.  ,  368.  ,
        529.  ,  499.  ,  559.  , 1329.  ,  459.  , 2041.  ,  375.  ,
       1559.  ,  949.  , 1219.24, 2199.  , 1179.  ,  202.9 ,  659.  ,
        359.  , 1350.  ,  649.  ,  657.  , 1272.  , 1398.  , 2868.99,
        800.  , 3299.  ,  549.  ,  941.  , 1099.  ,  955.  , 1899.  ,
       1196.  , 2027.42,  614.  , 1008.52,  539.  , 1725.  , 1169.  ,
       1363.  , 2099.  , 2824.  ,  889.  ,  899.  , 2370.  , 1799.  ,
        639.  , 1369.  ,  910.  ,  860.  ,  579.  , 1199.  , 2712.  ,
       1299.  ,  737.  , 1079.  ,  389.  ,  775.  , 2229.  , 1144.  ,
       1476.11,  902.  , 1323.  , 2449.  ,  735.87,  599.9 ,  349.  ,
        387.  , 2299.  , 1165.  , 1055.  ,  409.  ,  520.9 ,  585.  ,
       1377.  ,  447.  ,  439.  , 3499.  , 1867.85, 1258.  ,  879.  ,
       1145.  , 1499.  ,  274.9 , 1244.  ,  344.  , 1210.  ,  355.  ,
        685.  ,  465.  , 1820.  , 1142.4 ,  469.01,  716.  , 1349.  ,
       1119.  , 1399

# Observación final

In [91]:
# Modificamos los nombres para que se ajusten más al contenido de las columnas y se lean mejor
df_train_a = df_train_a.rename(columns={'laptop_ID': 'Laptop (ID)',
                                        'Company': 'Tech price by country',
                                        'TypeName': 'Laptop type price',
                                        'ScreenResolution': 'Resolution quality',
                                        'Cpu': 'CPU (GHz)',
                                        'Ram': 'Ram (GB)',
                                        'Gpu': 'GPU (GHz)',
                                        'Weight': 'Weight (kg)',
                                        'Price_euros': 'Price (euros)',
                                        'Inches': 'Sceen size (inches)' 
                                        }
                                )


In [92]:
# Las pasamos a numérico para optimizar memoria y unificar
for column in df_train_a:
    df_train_a[column] = pd.to_numeric(df_train_a[column], errors='ignore', downcast='integer')


In [93]:
df_train_a.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 912 entries, 0 to 911
Data columns (total 26 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   Laptop (ID)                912 non-null    int16  
 1   Tech price by country      912 non-null    int8   
 2   Laptop type price          912 non-null    int8   
 3   Sceen size (inches)        912 non-null    float64
 4   Resolution quality         912 non-null    int8   
 5   CPU (GHz)                  912 non-null    float64
 6   Ram (GB)                   912 non-null    int8   
 7   GPU (GHz)                  912 non-null    float64
 8   Weight (kg)                912 non-null    float64
 9   Price (euros)              912 non-null    float64
 10  Touchscreen                912 non-null    int8   
 11  IPS Panel                  912 non-null    int8   
 12  Retina Display             912 non-null    int8   
 13  Flash Storage memory (GB)  912 non-null    int16  

In [94]:
df_train_a.head()


Unnamed: 0,Laptop (ID),Tech price by country,Laptop type price,Sceen size (inches),Resolution quality,CPU (GHz),Ram (GB),GPU (GHz),Weight (kg),Price (euros),Touchscreen,IPS Panel,Retina Display,Flash Storage memory (GB),SSD memory (GB),HDD memory (GB),Hybrid memory (GB),Android,Chrome OS,Linux,Mac OS X,No OS,Windows 10,Windows 10 S,Windows 7,macOS
0,597,0,2,15.6,0,2.5,8,0.3,2.2,739.0,1,0,0,0,0,1000,0,0,0,0,0,0,1,0,0,0
1,1173,4,1,15.6,1,2.5,4,1.03,1.91,478.89,0,0,0,0,256,0,0,0,0,0,0,0,1,0,0,0
2,1274,0,5,17.3,1,2.6,16,0.92,4.0,1900.0,0,1,0,0,128,1000,0,0,0,0,0,0,1,0,0,0
3,252,0,5,17.3,1,2.9,16,1.57,3.6,2999.0,0,0,0,0,256,0,0,0,0,0,0,0,1,0,0,0
4,15,4,3,12.0,2,1.2,8,0.3,0.92,1262.4,0,1,1,0,256,0,0,0,0,0,0,0,0,0,0,1


In [95]:
df_train_a.to_csv('df_train_a_cleaned.csv', index=False)
