# Creación de Modelos de ML

## Importaciones

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet
from sklearn.preprocessing import PolynomialFeatures, StandardScaler, OneHotEncoder
from sklearn.metrics import r2_score,mean_absolute_error,mean_absolute_percentage_error, mean_squared_error
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV


## Carga del CSV de test trabajado

In [2]:
df = pd.read_csv("data/test_trabajado.csv" )
df.head()

Unnamed: 0,id,laptop_ID,Company,Product,TypeName,Inches,ScreenResolution,Cpu,Ram,Memory,...,Touchscreen,FullHD,QuadHD,UltraHD_4K,Retina,CPU_Marca,CPU_Serie,CPU_GHz,GPU_Marca,GPU_Tipo
0,181,1098,HP,Spectre x360,Ultrabook,13.3,IPS Panel 4K Ultra HD 3840x2160,Intel Core i7 7500U 2.7GHz,16.0,512GB SSD,...,0,0,0,1,0,Intel,i7,2.7,Intel,HD Graphics
1,708,330,Acer,Aspire 5,Notebook,15.6,1366x768,AMD A12-Series 9720P 2.7GHz,8.0,256GB SSD,...,0,0,0,0,0,AMD,M,2.7,AMD,Radeon
2,862,1260,Acer,Aspire ES1-572,Notebook,15.6,1366x768,Intel Core i3 6006U 2.0GHz,4.0,500GB HDD,...,0,0,0,0,0,Intel,i3,2.0,Intel,HD Graphics
3,1064,1137,HP,EliteBook 1040,Notebook,14.0,Full HD 1920x1080,Intel Core i5 6200U 2.3GHz,8.0,256GB SSD,...,0,1,0,0,0,Intel,i5,2.3,Intel,HD Graphics
4,702,1015,HP,ENVY -,Notebook,13.3,IPS Panel Full HD 1920x1080,Intel Core i5 7200U 2.5GHz,8.0,256GB SSD,...,0,1,0,0,0,Intel,i5,2.5,Intel,HD Graphics


## Escalado

Escalar las columnas numericas con StandardScaler para trabajar con valores escalados

In [3]:
df.columns

Index(['id', 'laptop_ID', 'Company', 'Product', 'TypeName', 'Inches',
       'ScreenResolution', 'Cpu', 'Ram', 'Memory', 'Gpu', 'OpSys', 'Weight',
       'SSD_GB', 'HDD_GB', 'Hybrid_GB', 'Flash_GB', 'ScreenResolution_clean',
       'Width_px', 'Height_px', 'PPI', 'IPS', 'Touchscreen', 'FullHD',
       'QuadHD', 'UltraHD_4K', 'Retina', 'CPU_Marca', 'CPU_Serie', 'CPU_GHz',
       'GPU_Marca', 'GPU_Tipo'],
      dtype='object')

In [4]:
lista_columnas_escala = ["Ram", "Weight", "SSD_GB", "HDD_GB", "Hybrid_GB", "Flash_GB", "PPI", "CPU_GHz"]

In [5]:
scaler = StandardScaler()

In [6]:
df[lista_columnas_escala] = scaler.fit_transform(df[lista_columnas_escala])

In [7]:
df[lista_columnas_escala].head()

Unnamed: 0,Ram,Weight,SSD_GB,HDD_GB,Hybrid_GB,Flash_GB,PPI,CPU_GHz
0,1.311382,-1.090284,1.651244,-0.817083,-0.084528,-0.168265,4.276645,0.768848
1,-0.152573,0.210807,0.32788,-0.817083,-0.084528,-0.168265,-1.099063,0.768848
2,-0.884551,0.499939,-0.995485,0.085932,-0.084528,-0.168265,-1.099063,-0.663063
3,-0.152573,-0.902349,0.32788,-0.817083,-0.084528,-0.168265,0.226079,-0.049387
4,-0.152573,-1.032458,0.32788,-0.817083,-0.084528,-0.168265,0.418963,0.35973


Una vez escalados los numericos hay que ver como afectan los no numericos

In [8]:
df["Company"].value_counts()

Company
Dell         90
Lenovo       89
HP           85
Asus         50
Acer         28
MSI          20
Toshiba      11
Apple         4
Samsung       3
Microsoft     3
LG            2
Razer         1
Vero          1
Chuwi         1
Huawei        1
Xiaomi        1
Google        1
Name: count, dtype: int64

Agrupar las marcas que menos frecuencia tienen en un unico grupo

In [9]:
marcas_frecuentes = ["Lenovo", "Dell", "HP", "Asus", "Acer", "Toshiba", "MSI", "Apple"]

df["Company_clean"] = df["Company"].apply(lambda x: x if x in marcas_frecuentes else "Otros")


In [10]:
df["TypeName"].value_counts()


TypeName
Notebook              205
Gaming                 67
Ultrabook              61
2 in 1 Convertible     36
Workstation            11
Netbook                11
Name: count, dtype: int64

In [11]:
tipos_frecuentes = ["Notebook", "Gaming", "Ultrabook", "2 in 1 Convertible"]
df["TypeName_clean"] = df["TypeName"].apply(lambda x: x if x in tipos_frecuentes else "Otros")

Para las CPU trabajamos con las series. Si hace falta mas trabajaremos con las frecuencias
Investigando y comparando series de AMD e Intel, quedaria mas o menos asi.

In [12]:
df["CPU_Serie"].unique()

array(['i7', 'M', 'i3', 'i5', 'Atom', 'Celeron', 'Pentium', 'Other'],
      dtype=object)

In [17]:
df["CPU_Serie"].count()

np.int64(391)

In [22]:
cpu_potencia = {
    "Atom": 0,
    "Celeron": 1,
    "M": 2,
    "Pentium": 3,
    "i3": 4,
    "i5": 5,
    "i7": 6,
    "Other": 3
}


In [23]:
df["CPU_Potencia"] = df["CPU_Serie"].map(cpu_potencia)


In [24]:
df["CPU_Potencia"].count()

np.int64(391)

Ahora las GPU

In [25]:
df["GPU_Tipo"].value_counts()

GPU_Tipo
HD Graphics    197
GeForce        121
Radeon          56
Quadro          11
Iris Plus        2
FirePro          2
Iris             1
Other            1
Name: count, dtype: int64

In [26]:
gpu_potencia = {
    "HD Graphics": 1,
    "Iris": 2,
    "Iris Plus": 2,
    "Radeon": 3,
    "GeForce": 4,
    "Quadro": 5,
    "FirePro": 5,
    "Other AMD": 3,
    "Other Nvidia": 4,
    "Other Intel": 1,
    "Other": 3
}


In [27]:
df["GPU_Potencia"] = df["GPU_Tipo"].map(gpu_potencia)

Ahora volvemos a escalar todo

In [28]:
columnas_a_escalar = [
    "Ram", "Weight", "SSD_GB", "HDD_GB", "Hybrid_GB", "Flash_GB",
    "PPI", "CPU_GHz", "CPU_Potencia", "GPU_Potencia"
]

In [29]:
df[columnas_a_escalar] = scaler.fit_transform(df[columnas_a_escalar])

In [30]:
df[columnas_a_escalar].head()


Unnamed: 0,Ram,Weight,SSD_GB,HDD_GB,Hybrid_GB,Flash_GB,PPI,CPU_GHz,CPU_Potencia,GPU_Potencia
0,1.311382,-1.090284,1.651244,-0.817083,-0.084528,-0.168265,4.276645,0.768848,0.76048,-0.948754
1,-0.152573,0.210807,0.32788,-0.817083,-0.084528,-0.168265,-1.099063,0.768848,-1.853567,0.445843
2,-0.884551,0.499939,-0.995485,0.085932,-0.084528,-0.168265,-1.099063,-0.663063,-0.546543,-0.948754
3,-0.152573,-0.902349,0.32788,-0.817083,-0.084528,-0.168265,0.226079,-0.049387,0.106969,-0.948754
4,-0.152573,-1.032458,0.32788,-0.817083,-0.084528,-0.168265,0.418963,0.35973,0.106969,-0.948754


Ahora One Hot Encoding para marca (Company) y tipo de portatil (TypeName)

In [31]:
df_encoded = pd.get_dummies(df, columns=["Company", "TypeName"], drop_first=True)

In [32]:
df_encoded.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 391 entries, 0 to 390
Data columns (total 55 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   id                      391 non-null    int64  
 1   laptop_ID               391 non-null    int64  
 2   Product                 391 non-null    object 
 3   Inches                  391 non-null    float64
 4   ScreenResolution        391 non-null    object 
 5   Cpu                     391 non-null    object 
 6   Ram                     391 non-null    float64
 7   Memory                  391 non-null    object 
 8   Gpu                     391 non-null    object 
 9   OpSys                   391 non-null    object 
 10  Weight                  391 non-null    float64
 11  SSD_GB                  391 non-null    float64
 12  HDD_GB                  391 non-null    float64
 13  Hybrid_GB               391 non-null    float64
 14  Flash_GB                391 non-null    fl

In [34]:
# Columnas numéricas escaladas
num_cols = ["Ram","Weight","SSD_GB","HDD_GB","Hybrid_GB","Flash_GB","PPI","CPU_GHz","CPU_Potencia","GPU_Potencia"]

# Columnas binarias (pantalla)
binary_cols = ["IPS","Touchscreen","FullHD","QuadHD","UltraHD_4K","Retina"]

# Columnas one-hot
company_cols = [col for col in df_encoded.columns if col.startswith("Company_")]
typename_cols = [col for col in df_encoded.columns if col.startswith("TypeName_")]
#Columna target


# Todas las columnas finales
model_cols = num_cols + binary_cols + company_cols + typename_cols

# DataFrame final
df_final = df_encoded[model_cols]

df_final.head()


Unnamed: 0,Ram,Weight,SSD_GB,HDD_GB,Hybrid_GB,Flash_GB,PPI,CPU_GHz,CPU_Potencia,GPU_Potencia,...,Company_Samsung,Company_Toshiba,Company_Vero,Company_Xiaomi,TypeName_clean,TypeName_Gaming,TypeName_Netbook,TypeName_Notebook,TypeName_Ultrabook,TypeName_Workstation
0,1.311382,-1.090284,1.651244,-0.817083,-0.084528,-0.168265,4.276645,0.768848,0.76048,-0.948754,...,False,False,False,False,Ultrabook,False,False,False,True,False
1,-0.152573,0.210807,0.32788,-0.817083,-0.084528,-0.168265,-1.099063,0.768848,-1.853567,0.445843,...,False,False,False,False,Notebook,False,False,True,False,False
2,-0.884551,0.499939,-0.995485,0.085932,-0.084528,-0.168265,-1.099063,-0.663063,-0.546543,-0.948754,...,False,False,False,False,Notebook,False,False,True,False,False
3,-0.152573,-0.902349,0.32788,-0.817083,-0.084528,-0.168265,0.226079,-0.049387,0.106969,-0.948754,...,False,False,False,False,Notebook,False,False,True,False,False
4,-0.152573,-1.032458,0.32788,-0.817083,-0.084528,-0.168265,0.418963,0.35973,0.106969,-0.948754,...,False,False,False,False,Notebook,False,False,True,False,False


Elimino las columnas con baja relacion como Flash_GB, Quad... y CPU_GHz por tener mucha correlacion con CPU_Potencia y estar las 2 bien relacionadas con Price

In [39]:
columnas_a_quitar = ["HDD_GB", "Hybrid_GB", "Flash_GB", "Weight", "FullHD", "QuadHD", "Retina", "CPU_GHz","Company_clean","TypeName_clean"]
df_simplificado = df_final.drop(columns=columnas_a_quitar)
df_simplificado.head()

Unnamed: 0,Ram,SSD_GB,PPI,CPU_Potencia,GPU_Potencia,IPS,Touchscreen,UltraHD_4K,Company_Apple,Company_Asus,...,Company_Razer,Company_Samsung,Company_Toshiba,Company_Vero,Company_Xiaomi,TypeName_Gaming,TypeName_Netbook,TypeName_Notebook,TypeName_Ultrabook,TypeName_Workstation
0,1.311382,1.651244,4.276645,0.76048,-0.948754,1,0,1,False,False,...,False,False,False,False,False,False,False,False,True,False
1,-0.152573,0.32788,-1.099063,-1.853567,0.445843,0,0,0,False,False,...,False,False,False,False,False,False,False,True,False,False
2,-0.884551,-0.995485,-1.099063,-0.546543,-0.948754,0,0,0,False,False,...,False,False,False,False,False,False,False,True,False,False
3,-0.152573,0.32788,0.226079,0.106969,-0.948754,0,0,0,False,False,...,False,False,False,False,False,False,False,True,False,False
4,-0.152573,0.32788,0.418963,0.106969,-0.948754,1,0,0,False,False,...,False,False,False,False,False,False,False,True,False,False


In [40]:
df_simplificado.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 391 entries, 0 to 390
Data columns (total 29 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   Ram                   391 non-null    float64
 1   SSD_GB                391 non-null    float64
 2   PPI                   391 non-null    float64
 3   CPU_Potencia          391 non-null    float64
 4   GPU_Potencia          391 non-null    float64
 5   IPS                   391 non-null    int64  
 6   Touchscreen           391 non-null    int64  
 7   UltraHD_4K            391 non-null    int64  
 8   Company_Apple         391 non-null    bool   
 9   Company_Asus          391 non-null    bool   
 10  Company_Chuwi         391 non-null    bool   
 11  Company_Dell          391 non-null    bool   
 12  Company_Google        391 non-null    bool   
 13  Company_HP            391 non-null    bool   
 14  Company_Huawei        391 non-null    bool   
 15  Company_LG            3

In [41]:
df_simplificado.to_csv("data/datos_a_predecir.csv", index = False)