# Tu mejor portatil

## Competición Kaggle

Limpiar el test para que esté en el mismo formato que el train.

### Importaciones

In [46]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet
from sklearn.preprocessing import PolynomialFeatures, StandardScaler, OneHotEncoder
from sklearn.metrics import r2_score,mean_absolute_error,mean_absolute_percentage_error, mean_squared_error


## Funciones


In [47]:
def _capacidad_gb(memory, etiqueta):
    """Sum, in GB, the capacity of every drive of type ``etiqueta`` in ``memory``.

    Shared implementation behind the four public ``extraer_*_gb`` functions,
    which differ only in the label they look for.

    Parameters
    ----------
    memory : str or any
        Raw storage description such as ``"128GB SSD +  1TB HDD"``. Drives are
        separated by ``"+"``; each one is ``<number><GB|TB> <label>``.
    etiqueta : str
        Drive-type label to look for (e.g. ``"SSD"``, ``"Flash Storage"``).

    Returns
    -------
    float
        Total capacity in GB (TB values are multiplied by 1024). ``0.0`` when
        ``memory`` is not a string or contains no drive of that type. Always a
        float so the resulting column has the same dtype whether or not any
        row matches.

    Raises
    ------
    ValueError
        If a matching part does not hold a parseable number.
    """
    if not isinstance(memory, str):
        return 0.0  # missing value (e.g. NaN): no storage of this type

    total = 0.0
    for part in memory.split("+"):  # one drive per "+"-separated part
        part = part.strip()
        if etiqueta not in part:
            continue
        # Drop the label, leaving something like "512GB" or "1.0TB".
        num_unit = part.replace(etiqueta, "").strip()
        if "TB" in num_unit:
            total += float(num_unit.replace("TB", "").strip()) * 1024  # TB -> GB
        else:  # anything else is assumed to be GB
            total += float(num_unit.replace("GB", "").strip())
    return total


def extraer_ssd_gb(memory):
    """Return the total SSD capacity, in GB, described by ``memory`` (0.0 if none)."""
    return _capacidad_gb(memory, "SSD")


def extraer_hdd_gb(memory):
    """Return the total HDD capacity, in GB, described by ``memory`` (0.0 if none)."""
    return _capacidad_gb(memory, "HDD")


def extraer_hybrid_gb(memory):
    """Return the total Hybrid-drive capacity, in GB, described by ``memory`` (0.0 if none)."""
    return _capacidad_gb(memory, "Hybrid")


def extraer_flash_gb(memory):
    """Return the total Flash Storage capacity, in GB, described by ``memory`` (0.0 if none)."""
    return _capacidad_gb(memory, "Flash Storage")

def cpu_marca(cpu_str):
    """Return the CPU manufacturer named in ``cpu_str``.

    The brands are tried in order ("Intel" first, then "AMD") using substring
    matching; "Other" is returned when neither appears.
    """
    for fabricante in ("Intel", "AMD"):
        if fabricante in cpu_str:
            return fabricante
    return "Other"

def cpu_serie(cpu_str):
    """Return the CPU series found in ``cpu_str``, or "Other".

    Series are matched against whole whitespace-separated tokens (or a token
    that starts with ``"<serie>-"``, e.g. "A6-Series"), not raw substrings.
    Plain substring matching made "M" hit the "M" inside "AMD", so every AMD
    CPU was labelled "M" and the AMD series in the list were never reached.

    Parameters
    ----------
    cpu_str : str or any
        Raw CPU description, e.g. ``"Intel Core i7 7500U 2.7GHz"``.

    Returns
    -------
    str
        The first entry of the series list that matches, or "Other" when none
        does or when ``cpu_str`` is not a string.

    NOTE(review): if the training-set preprocessing still uses substring
    matching, apply the same fix there so both sets share the same categories.
    """
    lista_series = ["i3", "i5", "i7", "i9", "M", "Atom", "Celeron", "Pentium", "Xeon",
                   "A6", "A8", "A9", "A10", "FX", "Ryzen", "E-Series"]
    if not isinstance(cpu_str, str):
        return "Other"  # missing value: nothing to classify

    tokens = cpu_str.split()
    for serie in lista_series:
        prefijo = serie + "-"  # lets "A6" match the token "A6-Series"
        for token in tokens:
            if token == serie or token.startswith(prefijo):
                return serie
    return "Other"

def cpu_ghz(cpu_str):
    """Return the clock frequency, in GHz, found in ``cpu_str``.

    The first whitespace-separated token containing "GHz" is used.

    Parameters
    ----------
    cpu_str : str or any
        Raw CPU description, e.g. ``"Intel Core i7 7500U 2.7GHz"``.

    Returns
    -------
    float
        The parsed frequency, or ``0.0`` when ``cpu_str`` is not a string, has
        no "GHz" token, or that token is not a valid number.
    """
    if not isinstance(cpu_str, str):
        return 0.0  # missing value: no frequency available

    for part in cpu_str.split():
        if "GHz" in part:
            try:
                return float(part.replace("GHz", ""))
            except ValueError:
                # Only a malformed number is treated as "unknown"; a bare
                # except would also hide KeyboardInterrupt and real bugs.
                return 0.0
    return 0.0  # no token carries a GHz value
    
def gpu_marca(gpu_str):
    """Return the GPU manufacturer named in ``gpu_str``.

    The brands are tried in order ("Intel", "Nvidia", "AMD") using substring
    matching; "Other" is returned when none of them appears.
    """
    for fabricante in ("Intel", "Nvidia", "AMD"):
        if fabricante in gpu_str:
            return fabricante
    return "Other"

def gpu_tipo(gpu_str):
    """Return the GPU product family found in ``gpu_str``.

    The brand is identified first (Intel, Nvidia, AMD) and then the family
    within that brand. Within each brand the more specific name is tested
    before any name it contains: "UHD Graphics" before "HD Graphics" and
    "Iris Plus" before "Iris". Testing "HD Graphics" first made the UHD branch
    unreachable, because "UHD Graphics" contains "HD Graphics".

    Parameters
    ----------
    gpu_str : str or any
        Raw GPU description, e.g. ``"Intel UHD Graphics 620"``.

    Returns
    -------
    str
        The family name, "Other <brand>" when the brand is known but the
        family is not, or "Other" for an unknown brand or non-string input.

    NOTE(review): if the training-set preprocessing still tests "HD Graphics"
    first, apply the same fix there so both sets share the same categories.
    """
    if not isinstance(gpu_str, str):
        return "Other"  # missing value: nothing to classify

    gpu = gpu_str.strip()  # some raw values carry trailing spaces
    if "Intel" in gpu:
        if "UHD Graphics" in gpu:  # must precede "HD Graphics" (substring of it)
            return "UHD Graphics"
        elif "HD Graphics" in gpu:
            return "HD Graphics"
        elif "Iris Plus" in gpu:  # must precede "Iris" (substring of it)
            return "Iris Plus"
        elif "Iris" in gpu:
            return "Iris"
        else:
            return "Other Intel"
    elif "Nvidia" in gpu:
        if "GeForce" in gpu:
            return "GeForce"
        elif "Quadro" in gpu:
            return "Quadro"
        else:
            return "Other Nvidia"
    elif "AMD" in gpu:
        if "Radeon" in gpu:
            return "Radeon"
        elif "RX" in gpu:
            return "RX"
        elif "FirePro" in gpu:
            return "FirePro"
        else:
            return "Other AMD"
    else:
        return "Other"
    



### Lectura de CSV y primeros pasos

In [48]:
# Load the raw test set; the cells below add cleaned and derived columns to this frame.
df = pd.read_csv("data/test.csv")
df

Unnamed: 0,id,laptop_ID,Company,Product,TypeName,Inches,ScreenResolution,Cpu,Ram,Memory,Gpu,OpSys,Weight
0,181,1098,HP,Spectre x360,Ultrabook,13.3,IPS Panel 4K Ultra HD 3840x2160,Intel Core i7 7500U 2.7GHz,16GB,512GB SSD,Intel HD Graphics 620,Windows 10,1.3kg
1,708,330,Acer,Aspire 5,Notebook,15.6,1366x768,AMD A12-Series 9720P 2.7GHz,8GB,256GB SSD,AMD Radeon RX 540,Windows 10,2.2kg
2,862,1260,Acer,Aspire ES1-572,Notebook,15.6,1366x768,Intel Core i3 6006U 2.0GHz,4GB,500GB HDD,Intel HD Graphics 520,Linux,2.4kg
3,1064,1137,HP,EliteBook 1040,Notebook,14.0,Full HD 1920x1080,Intel Core i5 6200U 2.3GHz,8GB,256GB SSD,Intel HD Graphics 520,Windows 7,1.43kg
4,702,1015,HP,ENVY -,Notebook,13.3,IPS Panel Full HD 1920x1080,Intel Core i5 7200U 2.5GHz,8GB,256GB SSD,Intel HD Graphics 620,Windows 10,1.34kg
...,...,...,...,...,...,...,...,...,...,...,...,...,...
386,1281,145,Lenovo,Legion Y520-15IKBN,Gaming,15.6,IPS Panel Full HD 1920x1080,Intel Core i7 7700HQ 2.8GHz,8GB,256GB SSD,Nvidia GeForce GTX 1050M,No OS,2.4kg
387,524,1195,Lenovo,IdeaPad Y700-15ISK,Gaming,15.6,IPS Panel Full HD 1920x1080,Intel Core i7 6700HQ 2.6GHz,16GB,512GB SSD,Nvidia GeForce GTX 960,Windows 10,3.31kg
388,1015,1070,HP,250 G5,Notebook,15.6,1366x768,Intel Core i5 7200U 2.5GHz,4GB,500GB HDD,Intel HD Graphics 620,No OS,1.96kg
389,1236,104,HP,15-bw000nv (E2-9000e/4GB/500GB/Radeon,Notebook,15.6,Full HD 1920x1080,AMD E-Series E2-9000e 1.5GHz,4GB,500GB HDD,AMD Radeon R2,Windows 10,2.1kg


In [49]:
# Column dtypes and non-null counts of the raw data.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 391 entries, 0 to 390
Data columns (total 13 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   id                391 non-null    int64  
 1   laptop_ID         391 non-null    int64  
 2   Company           391 non-null    object 
 3   Product           391 non-null    object 
 4   TypeName          391 non-null    object 
 5   Inches            391 non-null    float64
 6   ScreenResolution  391 non-null    object 
 7   Cpu               391 non-null    object 
 8   Ram               391 non-null    object 
 9   Memory            391 non-null    object 
 10  Gpu               391 non-null    object 
 11  OpSys             391 non-null    object 
 12  Weight            391 non-null    object 
dtypes: float64(1), int64(2), object(10)
memory usage: 39.8+ KB


In [50]:
# Column names available in the raw data.
df.columns

Index(['id', 'laptop_ID', 'Company', 'Product', 'TypeName', 'Inches',
       'ScreenResolution', 'Cpu', 'Ram', 'Memory', 'Gpu', 'OpSys', 'Weight'],
      dtype='object')

De todas estas columnas, hay que saber cuales son importantes para el precio de un laptop. Investigando un poco:

1. CPU
2. GPU
3. RAM
4. SSD_GB
5. PPI (píxeles por pulgada: la densidad de píxeles de la pantalla)
6. Memoria / HDD_GB / Hybrid_GB / Flash_GB
7. Notebook, gaming...
8. SO
9. Peso
10. Marca

De primeras: columnas como Ram y Weight deberían ser numéricas (float), y en Memory hay que mirar si los valores están en TB o en GB, porque 1TB es más que 256GB y compararlos sin convertir daría resultados erróneos.

In [51]:
# Distinct raw Ram strings, to check their format before converting to numbers.
df["Ram"].unique()

array(['16GB', '8GB', '4GB', '64GB', '12GB', '6GB', '32GB', '2GB', '24GB'],
      dtype=object)

In [52]:
# Strip the "GB" suffix and store Ram as a float number of GB.
# astype(str) keeps the cell re-runnable once Ram is already numeric, and
# regex=False makes "GB" a literal string regardless of the pandas default.
df["Ram"] = df["Ram"].astype(str).str.replace("GB", "", regex=False).astype(float)


In [53]:
# Display the frame to confirm Ram is now numeric.
df

Unnamed: 0,id,laptop_ID,Company,Product,TypeName,Inches,ScreenResolution,Cpu,Ram,Memory,Gpu,OpSys,Weight
0,181,1098,HP,Spectre x360,Ultrabook,13.3,IPS Panel 4K Ultra HD 3840x2160,Intel Core i7 7500U 2.7GHz,16.0,512GB SSD,Intel HD Graphics 620,Windows 10,1.3kg
1,708,330,Acer,Aspire 5,Notebook,15.6,1366x768,AMD A12-Series 9720P 2.7GHz,8.0,256GB SSD,AMD Radeon RX 540,Windows 10,2.2kg
2,862,1260,Acer,Aspire ES1-572,Notebook,15.6,1366x768,Intel Core i3 6006U 2.0GHz,4.0,500GB HDD,Intel HD Graphics 520,Linux,2.4kg
3,1064,1137,HP,EliteBook 1040,Notebook,14.0,Full HD 1920x1080,Intel Core i5 6200U 2.3GHz,8.0,256GB SSD,Intel HD Graphics 520,Windows 7,1.43kg
4,702,1015,HP,ENVY -,Notebook,13.3,IPS Panel Full HD 1920x1080,Intel Core i5 7200U 2.5GHz,8.0,256GB SSD,Intel HD Graphics 620,Windows 10,1.34kg
...,...,...,...,...,...,...,...,...,...,...,...,...,...
386,1281,145,Lenovo,Legion Y520-15IKBN,Gaming,15.6,IPS Panel Full HD 1920x1080,Intel Core i7 7700HQ 2.8GHz,8.0,256GB SSD,Nvidia GeForce GTX 1050M,No OS,2.4kg
387,524,1195,Lenovo,IdeaPad Y700-15ISK,Gaming,15.6,IPS Panel Full HD 1920x1080,Intel Core i7 6700HQ 2.6GHz,16.0,512GB SSD,Nvidia GeForce GTX 960,Windows 10,3.31kg
388,1015,1070,HP,250 G5,Notebook,15.6,1366x768,Intel Core i5 7200U 2.5GHz,4.0,500GB HDD,Intel HD Graphics 620,No OS,1.96kg
389,1236,104,HP,15-bw000nv (E2-9000e/4GB/500GB/Radeon,Notebook,15.6,Full HD 1920x1080,AMD E-Series E2-9000e 1.5GHz,4.0,500GB HDD,AMD Radeon R2,Windows 10,2.1kg


In [54]:
# Distinct raw Weight strings, to check their format before converting to numbers.
df["Weight"].unique()

array(['1.3kg', '2.2kg', '2.4kg', '1.43kg', '1.34kg', '2kg', '1.95kg',
       '2.1kg', '3.58kg', '1.42kg', '1.32kg', '1.86kg', '1.35kg', '1.6kg',
       '1.1kg', '1.25kg', '1.48kg', '1.37kg', '1.16kg', '3.74kg',
       '1.21kg', '2.9kg', '1.4kg', '1.99kg', '3kg', '2.18kg', '1.47kg',
       '1.8kg', '4.42kg', '1.7kg', '1.45kg', '2.43kg', '1.9kg', '1.05kg',
       '2.15kg', '4.7kg', '2.3kg', '2.5kg', '1.62kg', '1.49kg', '1.44kg',
       '1.5kg', '1.36kg', '1.84kg', '1.252kg', '4.5kg', '2.36kg', '2.8kg',
       '3.42kg', '2.23kg', '1.74kg', '2.17kg', '2.25kg', '2.04kg',
       '3.14kg', '3.35kg', '3.2kg', '2.6kg', '1.91kg', '1.65kg', '1.29kg',
       '0.98kg', '1.64kg', '1.2kg', '2.02kg', '1.55kg', '2.31kg',
       '1.71kg', '2.24kg', '3.78kg', '1.54kg', '2.0kg', '2.19kg', '4.3kg',
       '2.7kg', '1.24kg', '0.91kg', '4.36kg', '1.26kg', '1.58kg',
       '1.83kg', '1.59kg', '2.591kg', '2.59kg', '3.31kg', '2.06kg',
       '1.13kg', '1.08kg', '2.03kg', '1.18kg', '2.33kg', '2.16kg',
       '2

In [55]:
# Strip the "kg" suffix and store Weight as a float number of kg.
# astype(str) keeps the cell re-runnable once Weight is already numeric, and
# regex=False makes "kg" a literal string regardless of the pandas default.
df["Weight"] = df["Weight"].astype(str).str.replace("kg", "", regex=False).astype(float)

In [56]:
# Display the frame to confirm Weight is now numeric.
df

Unnamed: 0,id,laptop_ID,Company,Product,TypeName,Inches,ScreenResolution,Cpu,Ram,Memory,Gpu,OpSys,Weight
0,181,1098,HP,Spectre x360,Ultrabook,13.3,IPS Panel 4K Ultra HD 3840x2160,Intel Core i7 7500U 2.7GHz,16.0,512GB SSD,Intel HD Graphics 620,Windows 10,1.30
1,708,330,Acer,Aspire 5,Notebook,15.6,1366x768,AMD A12-Series 9720P 2.7GHz,8.0,256GB SSD,AMD Radeon RX 540,Windows 10,2.20
2,862,1260,Acer,Aspire ES1-572,Notebook,15.6,1366x768,Intel Core i3 6006U 2.0GHz,4.0,500GB HDD,Intel HD Graphics 520,Linux,2.40
3,1064,1137,HP,EliteBook 1040,Notebook,14.0,Full HD 1920x1080,Intel Core i5 6200U 2.3GHz,8.0,256GB SSD,Intel HD Graphics 520,Windows 7,1.43
4,702,1015,HP,ENVY -,Notebook,13.3,IPS Panel Full HD 1920x1080,Intel Core i5 7200U 2.5GHz,8.0,256GB SSD,Intel HD Graphics 620,Windows 10,1.34
...,...,...,...,...,...,...,...,...,...,...,...,...,...
386,1281,145,Lenovo,Legion Y520-15IKBN,Gaming,15.6,IPS Panel Full HD 1920x1080,Intel Core i7 7700HQ 2.8GHz,8.0,256GB SSD,Nvidia GeForce GTX 1050M,No OS,2.40
387,524,1195,Lenovo,IdeaPad Y700-15ISK,Gaming,15.6,IPS Panel Full HD 1920x1080,Intel Core i7 6700HQ 2.6GHz,16.0,512GB SSD,Nvidia GeForce GTX 960,Windows 10,3.31
388,1015,1070,HP,250 G5,Notebook,15.6,1366x768,Intel Core i5 7200U 2.5GHz,4.0,500GB HDD,Intel HD Graphics 620,No OS,1.96
389,1236,104,HP,15-bw000nv (E2-9000e/4GB/500GB/Radeon,Notebook,15.6,Full HD 1920x1080,AMD E-Series E2-9000e 1.5GHz,4.0,500GB HDD,AMD Radeon R2,Windows 10,2.10


Con Memory creo una columna por cada tipo de almacenamiento (SSD, HDD, Hybrid y Flash) con su capacidad en GB.

In [57]:
# Distinct raw Memory strings: drive types, units (GB/TB) and "+"-combined drives.
df["Memory"].unique()

array(['512GB SSD', '256GB SSD', '500GB HDD', '1TB HDD', '1TB SSD',
       '32GB Flash Storage', '128GB SSD', '128GB SSD +  1TB HDD',
       '256GB SSD +  1TB HDD', '64GB Flash Storage',
       '512GB SSD +  1TB HDD', '2TB HDD', '32GB SSD', '1.0TB Hybrid',
       '128GB SSD +  2TB HDD', '256GB SSD +  256GB SSD',
       '256GB SSD +  2TB HDD', '256GB Flash Storage',
       '512GB SSD +  2TB HDD', '128GB HDD', '16GB Flash Storage',
       '508GB Hybrid', '512GB SSD +  256GB SSD', '128GB Flash Storage'],
      dtype=object)

Hay diferentes tipos y algunos están combinados. Hay que separar por SSD, HDD, Flash y Hybrid, y sumar las capacidades cuando una fila combina dos discos. Ojo también con los valores decimales como "1.0TB". Además convierto los TB a GB (1 TB = 1024 GB) para tener todo en la misma unidad.

In [58]:
# Total SSD capacity per row, in GB (0 when the row has no SSD).
df["SSD_GB"] = df["Memory"].apply(extraer_ssd_gb)

In [59]:
# Total HDD capacity per row, in GB (0 when the row has no HDD).
df["HDD_GB"] = df["Memory"].apply(extraer_hdd_gb)

In [60]:
# Total Hybrid-drive capacity per row, in GB (0 when the row has none).
df["Hybrid_GB"] = df["Memory"].apply(extraer_hybrid_gb)

In [61]:
# Total Flash Storage capacity per row, in GB (0 when the row has none).
df["Flash_GB"] = df["Memory"].apply(extraer_flash_gb)

In [62]:
# Spot-check the parsed capacities against the raw Memory text.
# random_state pins the sample so the displayed rows are the same on every run.
df[["Memory","SSD_GB","HDD_GB","Hybrid_GB","Flash_GB"]].sample(20, random_state=42)


Unnamed: 0,Memory,SSD_GB,HDD_GB,Hybrid_GB,Flash_GB
104,1.0TB Hybrid,0.0,0.0,1024.0,0.0
110,128GB SSD,128.0,0.0,0.0,0.0
66,500GB HDD,0.0,500.0,0.0,0.0
154,256GB SSD,256.0,0.0,0.0,0.0
170,512GB SSD,512.0,0.0,0.0,0.0
115,256GB SSD + 1TB HDD,256.0,1024.0,0.0,0.0
11,500GB HDD,0.0,500.0,0.0,0.0
374,32GB Flash Storage,0.0,0.0,0.0,32.0
8,1TB SSD,1024.0,0.0,0.0,0.0
366,500GB HDD,0.0,500.0,0.0,0.0


In [63]:
# Distinct Hybrid capacities, to check that TB values were converted to GB.
df["Hybrid_GB"].unique()

array([   0., 1024.,  508.])

In [64]:
# Distinct Flash Storage capacities found in the data.
df["Flash_GB"].unique()

array([  0.,  32.,  64., 256.,  16., 128.])

In [65]:
# First rows with the four new storage columns.
df.head(20)

Unnamed: 0,id,laptop_ID,Company,Product,TypeName,Inches,ScreenResolution,Cpu,Ram,Memory,Gpu,OpSys,Weight,SSD_GB,HDD_GB,Hybrid_GB,Flash_GB
0,181,1098,HP,Spectre x360,Ultrabook,13.3,IPS Panel 4K Ultra HD 3840x2160,Intel Core i7 7500U 2.7GHz,16.0,512GB SSD,Intel HD Graphics 620,Windows 10,1.3,512.0,0.0,0.0,0.0
1,708,330,Acer,Aspire 5,Notebook,15.6,1366x768,AMD A12-Series 9720P 2.7GHz,8.0,256GB SSD,AMD Radeon RX 540,Windows 10,2.2,256.0,0.0,0.0,0.0
2,862,1260,Acer,Aspire ES1-572,Notebook,15.6,1366x768,Intel Core i3 6006U 2.0GHz,4.0,500GB HDD,Intel HD Graphics 520,Linux,2.4,0.0,500.0,0.0,0.0
3,1064,1137,HP,EliteBook 1040,Notebook,14.0,Full HD 1920x1080,Intel Core i5 6200U 2.3GHz,8.0,256GB SSD,Intel HD Graphics 520,Windows 7,1.43,256.0,0.0,0.0,0.0
4,702,1015,HP,ENVY -,Notebook,13.3,IPS Panel Full HD 1920x1080,Intel Core i5 7200U 2.5GHz,8.0,256GB SSD,Intel HD Graphics 620,Windows 10,1.34,256.0,0.0,0.0,0.0
5,585,1220,Dell,Inspiron 5579,2 in 1 Convertible,15.6,Full HD / Touchscreen 1920x1080,Intel Core i7 8550U 1.8GHz,16.0,512GB SSD,Intel UHD Graphics 620,Windows 10,2.0,512.0,0.0,0.0,0.0
6,195,787,Razer,Blade Pro,Gaming,14.0,Full HD 1920x1080,Intel Core i7 7700HQ 2.8GHz,16.0,512GB SSD,Nvidia GeForce GTX 1060,Windows 10,1.95,512.0,0.0,0.0,0.0
7,463,841,Asus,VivoBook Max,Notebook,15.6,Full HD 1920x1080,Intel Core i5 7200U 2.5GHz,4.0,1TB HDD,Nvidia GeForce 920,Linux,2.1,0.0,1024.0,0.0,0.0
8,14,1081,Asus,ROG G701VO,Gaming,17.3,IPS Panel Full HD 1920x1080,Intel Core i7 6820HK 2.7GHz,64.0,1TB SSD,Nvidia GeForce GTX 980,Windows 10,3.58,1024.0,0.0,0.0,0.0
9,621,701,Lenovo,ThinkPad X1,2 in 1 Convertible,14.0,Touchscreen 2560x1440,Intel Core i7 7500U 2.7GHz,8.0,256GB SSD,Intel HD Graphics 620,Windows 10,1.42,256.0,0.0,0.0,0.0


Aparte de las memorias, otros puntos importantes para el precio de un laptop seran la pantalla, la grafica y la cpu

### Pantalla. 
Limpiar Screen Resolution para ver que tipos hay y que resoluciones

In [66]:
# Distinct raw ScreenResolution strings: panel/feature keywords followed by WIDTHxHEIGHT.
df["ScreenResolution"].unique()

array(['IPS Panel 4K Ultra HD 3840x2160', '1366x768', 'Full HD 1920x1080',
       'IPS Panel Full HD 1920x1080', 'Full HD / Touchscreen 1920x1080',
       'Touchscreen 2560x1440',
       'IPS Panel Full HD / Touchscreen 1920x1080',
       'Quad HD+ / Touchscreen 3200x1800',
       'IPS Panel Retina Display 2736x1824', 'Quad HD+ 3200x1800',
       'IPS Panel Full HD 1366x768', 'IPS Panel Full HD 2160x1440',
       'IPS Panel Quad HD+ 3200x1800', 'Touchscreen 2256x1504',
       '1600x900', 'IPS Panel 1366x768',
       'IPS Panel 4K Ultra HD / Touchscreen 3840x2160',
       'IPS Panel Retina Display 2560x1600', '2560x1440',
       'IPS Panel Touchscreen 1366x768',
       'IPS Panel Retina Display 2880x1800',
       '4K Ultra HD / Touchscreen 3840x2160', 'IPS Panel 2560x1440',
       'IPS Panel Quad HD+ 2560x1440', 'IPS Panel Touchscreen 2560x1440',
       '1440x900', '4K Ultra HD 3840x2160',
       'Touchscreen / Quad HD+ 3200x1800',
       'IPS Panel Quad HD+ / Touchscreen 3200x1800',
  

La resolucion va al final de cada linea. 

In [67]:
# Keep only the last whitespace-separated token, which is the "WIDTHxHEIGHT" part.
df['ScreenResolution_clean'] = df['ScreenResolution'].apply(lambda x: x.split()[-1])


In [68]:
# Split "WIDTHxHEIGHT" on the "x" into two columns (still strings at this point).
df[['Width_px', 'Height_px']] = df['ScreenResolution_clean'].str.split('x', expand=True)


In [69]:
# Convert both pixel dimensions from strings to integers so they can be used in arithmetic.
df['Width_px'] = df['Width_px'].astype(int)
df['Height_px'] = df['Height_px'].astype(int)


Con Width_px, Height_px e Inches calculo el PPI (píxeles por pulgada): $PPI = \sqrt{Width\_px^2 + Height\_px^2} \, / \, Inches$

In [70]:
# Pixels per inch: length of the pixel diagonal (Pythagoras on width and height)
# divided by Inches (presumably the screen diagonal in inches; verify against the data source).
df['PPI'] = ((df['Width_px']**2 + df['Height_px']**2)**0.5) / df['Inches']

Ahora, tipo de pantalla IPS, Touchscreen. Y despues el full HD de la misma manera

In [71]:
# Binary (0/1) screen-feature flags: each new column marks whether its
# case-insensitive pattern occurs in the raw ScreenResolution text.
patrones_pantalla = {
    'IPS': 'IPS',
    'Touchscreen': 'Touchscreen',
    'FullHD': 'Full HD',
    'QuadHD': 'Quad HD',
    'UltraHD_4K': '4K|Ultra HD',  # regex alternation: either spelling counts
    'Retina': 'Retina',
}
for columna, patron in patrones_pantalla.items():
    df[columna] = df['ScreenResolution'].str.contains(patron, case=False).astype(int)



In [72]:
# First rows with the resolution, PPI and screen-flag columns.
df.head()

Unnamed: 0,id,laptop_ID,Company,Product,TypeName,Inches,ScreenResolution,Cpu,Ram,Memory,...,ScreenResolution_clean,Width_px,Height_px,PPI,IPS,Touchscreen,FullHD,QuadHD,UltraHD_4K,Retina
0,181,1098,HP,Spectre x360,Ultrabook,13.3,IPS Panel 4K Ultra HD 3840x2160,Intel Core i7 7500U 2.7GHz,16.0,512GB SSD,...,3840x2160,3840,2160,331.264236,1,0,0,0,1,0
1,708,330,Acer,Aspire 5,Notebook,15.6,1366x768,AMD A12-Series 9720P 2.7GHz,8.0,256GB SSD,...,1366x768,1366,768,100.45467,0,0,0,0,0,0
2,862,1260,Acer,Aspire ES1-572,Notebook,15.6,1366x768,Intel Core i3 6006U 2.0GHz,4.0,500GB HDD,...,1366x768,1366,768,100.45467,0,0,0,0,0,0
3,1064,1137,HP,EliteBook 1040,Notebook,14.0,Full HD 1920x1080,Intel Core i5 6200U 2.3GHz,8.0,256GB SSD,...,1920x1080,1920,1080,157.350512,0,0,1,0,0,0
4,702,1015,HP,ENVY -,Notebook,13.3,IPS Panel Full HD 1920x1080,Intel Core i5 7200U 2.5GHz,8.0,256GB SSD,...,1920x1080,1920,1080,165.632118,1,0,1,0,0,0


### CPU

Limpiar el string CPU. Sacar fabricante, tipo de procesador y frecuencia

In [73]:
# Distinct raw Cpu strings: brand, series/model and a trailing frequency in GHz.
df["Cpu"].unique()

array(['Intel Core i7 7500U 2.7GHz', 'AMD A12-Series 9720P 2.7GHz',
       'Intel Core i3 6006U 2.0GHz', 'Intel Core i5 6200U 2.3GHz',
       'Intel Core i5 7200U 2.5GHz', 'Intel Core i7 8550U 1.8GHz',
       'Intel Core i7 7700HQ 2.8GHz', 'Intel Core i7 6820HK 2.7GHz',
       'Intel Atom X5-Z8350 1.44GHz', 'Intel Core i3 6006U 2GHz',
       'Intel Core M m7-6Y75 1.2GHz',
       'Intel Celeron Dual Core N3350 2.0GHz',
       'Intel Core i7 6600U 2.6GHz', 'Intel Core i7 6500U 2.5GHz',
       'Intel Core M 6Y54 1.1GHz', 'Intel Core i5 8250U 1.6GHz',
       'Intel Celeron Quad Core N3450 1.1GHz',
       'Intel Core i5 7300HQ 2.5GHz', 'AMD A6-Series 9220 2.5GHz',
       'Intel Core i7 6700HQ 2.6GHz',
       'Intel Celeron Quad Core N3710 1.6GHz',
       'AMD A10-Series 9620P 2.5GHz', 'Intel Core i7 7820HK 2.9GHz',
       'Intel Celeron Dual Core N3350 1.1GHz',
       'Intel Pentium Quad Core N3700 1.6GHz',
       'Intel Celeron Dual Core N3050 1.6GHz',
       'Intel Core i5 7300U 2.6GHz', 

Hay 2 marcas. Intel y AMD. Tipos de procesador hay bastantes. La frecuencia siempre va al final y acaba en GHz. 

Marcas

In [74]:
# CPU manufacturer ("Intel", "AMD" or "Other") for each row.
df["CPU_Marca"] = df["Cpu"].apply(cpu_marca)

In [75]:
# Sanity check: number of non-null values in the new column.
# NOTE(review): count() cannot reveal misclassified rows; value_counts() would show the category distribution.
df["CPU_Marca"].count()

np.int64(391)

In [76]:
# CPU series label (e.g. "i7", "Celeron") for each row, "Other" when none matches.
df["CPU_Serie"] = df["Cpu"].apply(cpu_serie)

In [77]:
# Sanity check: number of non-null values in the new column.
# NOTE(review): count() cannot reveal misclassified rows; value_counts() would show the category distribution.
df["CPU_Serie"].count()

np.int64(391)

In [78]:
# CPU clock frequency in GHz for each row (0.0 when it cannot be parsed).
df["CPU_GHz"] = df["Cpu"].apply(cpu_ghz)

In [79]:
# Sanity check: number of non-null values in the new column.
# NOTE(review): unparsed frequencies are stored as 0.0, so they still count as non-null here.
df["CPU_GHz"].count()

np.int64(391)

### GPU



In [80]:
# Distinct raw Gpu strings: brand followed by family and model (some with trailing spaces).
df["Gpu"].unique()

array(['Intel HD Graphics 620', 'AMD Radeon RX 540',
       'Intel HD Graphics 520', 'Intel UHD Graphics 620',
       'Nvidia GeForce GTX 1060', 'Nvidia GeForce 920',
       'Nvidia GeForce GTX 980 ', 'AMD Radeon 520',
       'Intel HD Graphics 400', 'Intel HD Graphics 515',
       'Intel HD Graphics 500', 'AMD Radeon RX 580',
       'Nvidia GeForce GTX 1050', 'AMD Radeon R4 Graphics',
       'AMD Radeon R5 M420', 'Intel HD Graphics 405', 'AMD Radeon 530',
       'Nvidia GeForce GTX 1050 Ti', 'Nvidia GeForce GTX 1070',
       'Nvidia GeForce 940MX', 'Nvidia GeForce 940M',
       'Nvidia GeForce GTX 1080', 'Nvidia GeForce GTX 930MX',
       'Nvidia GeForce 930MX ', 'Intel HD Graphics',
       'Nvidia GeForce GTX 1050M', 'Nvidia GeForce GTX 980M',
       'AMD Radeon R7 M445', 'Nvidia Quadro M1200', 'Nvidia Quadro M620',
       'AMD Radeon R5 M430', 'Nvidia GeForce 930MX',
       'Intel Iris Plus Graphics 640', 'Nvidia GeForce 920MX ',
       'Nvidia Quadro M2200', 'AMD Radeon R5', 'Intel

Mismo caso que con las CPU. Hay que sacar la marca y el modelo.

In [81]:
# GPU manufacturer ("Intel", "Nvidia", "AMD" or "Other") for each row.
df["GPU_Marca"] = df["Gpu"].apply(gpu_marca)

In [82]:
# Sanity check: number of non-null values in the new column.
# NOTE(review): count() cannot reveal misclassified rows; value_counts() would show the category distribution.
df["GPU_Marca"].count()

np.int64(391)

In [83]:
# GPU product family (e.g. "GeForce", "HD Graphics") for each row.
df["GPU_Tipo"] = df["Gpu"].apply(gpu_tipo)

In [84]:
# Sanity check: number of non-null values in the new column.
# NOTE(review): count() cannot reveal misclassified rows; value_counts() would show the category distribution.
df["GPU_Tipo"].count()

np.int64(391)

In [85]:
# Final check of dtypes and non-null counts after all derived columns were added.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 391 entries, 0 to 390
Data columns (total 32 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   id                      391 non-null    int64  
 1   laptop_ID               391 non-null    int64  
 2   Company                 391 non-null    object 
 3   Product                 391 non-null    object 
 4   TypeName                391 non-null    object 
 5   Inches                  391 non-null    float64
 6   ScreenResolution        391 non-null    object 
 7   Cpu                     391 non-null    object 
 8   Ram                     391 non-null    float64
 9   Memory                  391 non-null    object 
 10  Gpu                     391 non-null    object 
 11  OpSys                   391 non-null    object 
 12  Weight                  391 non-null    float64
 13  SSD_GB                  391 non-null    float64
 14  HDD_GB                  391 non-null    fl

In [87]:
# Write the processed test set to disk; index=False leaves the row index out of the file.
df.to_csv("data/test_trabajado.csv", index = False)