# COMPETENCIA KAGGLE 1 - XIMENA PEREZ OLOGNERO

## Importación de librerías

In [1]:
import pandas as pd
import numpy as np

import urllib.request
from PIL import Image
import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning)

## Data

In [2]:
train = pd.read_csv("data/train.csv")

In [3]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 912 entries, 0 to 911
Data columns (total 13 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   laptop_ID         912 non-null    int64  
 1   Company           912 non-null    object 
 2   Product           912 non-null    object 
 3   TypeName          912 non-null    object 
 4   Inches            912 non-null    float64
 5   ScreenResolution  912 non-null    object 
 6   Cpu               912 non-null    object 
 7   Ram               912 non-null    object 
 8   Memory            912 non-null    object 
 9   Gpu               912 non-null    object 
 10  OpSys             912 non-null    object 
 11  Weight            912 non-null    object 
 12  Price_euros       912 non-null    float64
dtypes: float64(2), int64(1), object(10)
memory usage: 92.8+ KB


In [21]:
train.head()

Unnamed: 0,laptop_ID,Company,Product,TypeName,Inches,ScreenResolution,Cpu,Ram,Memory,Gpu,OpSys,Weight,Price_euros
0,1104,Acer,Aspire ES1-523,Notebook,15.6,1366x768,AMD A8-Series 7410 2.2GHz,4GB,500GB HDD,AMD Radeon R5,Windows 10,2.4kg,387.0
1,114,Dell,XPS 13,Ultrabook,13.3,Quad HD+ / Touchscreen 3200x1800,Intel Core i7 7560U 2.4GHz,8GB,256GB SSD,Intel Iris Plus Graphics 640,Windows 10,1.23kg,1379.0
2,172,Acer,Aspire A517-51G,Notebook,17.3,IPS Panel Full HD 1920x1080,Intel Core i5 8250U 1.6GHz,8GB,256GB SSD,Nvidia GeForce MX150,Windows 10,3kg,854.0
3,918,HP,Elitebook 820,Netbook,12.5,Full HD 1920x1080,Intel Core i7 7500U 2.7GHz,8GB,512GB SSD,Intel HD Graphics 620,Windows 10,1.26kg,1483.0
4,447,Lenovo,IdeaPad 320-15AST,Notebook,17.3,1600x900,AMD A6-Series 9220 2.5GHz,8GB,1TB HDD,AMD Radeon R4,Windows 10,2.8kg,519.0


In [22]:
train.columns

Index(['laptop_ID', 'Company', 'Product', 'TypeName', 'Inches',
       'ScreenResolution', 'Cpu', 'Ram', 'Memory', 'Gpu', 'OpSys', 'Weight',
       'Price_euros'],
      dtype='object')

In [11]:
train['TypeName'].unique()

array(['Notebook', 'Ultrabook', 'Netbook', 'Gaming', '2 in 1 Convertible',
       'Workstation'], dtype=object)

## TRAIN - LIMPIEZA

In [5]:
df = train.copy()

### COLUMNA TypeName -- tn

In [6]:
df_dummies = pd.get_dummies(df['TypeName'], prefix='tn')
df[df_dummies.columns] = df_dummies.astype(int)

### COLUMNA ScreenResolution -- src

In [7]:
df['src'] = df['ScreenResolution'].apply(lambda x: 'HD' if '1366x768' in x else 'Full HD' if '1920x1080' in x else 'Quad HD+' if '3200x1800' in x else '4K Ultra HD' if '3840x2160' in x else 'Other')


In [8]:
df = pd.get_dummies(df, columns=['src'], prefix='scr')


### COLUMNA Ram y Weight

In [45]:
df.columns

Index(['laptop_ID', 'Company', 'Product', 'TypeName', 'Inches',
       'ScreenResolution', 'Cpu', 'Ram', 'Memory', 'Gpu', 'OpSys', 'Weight',
       'Price_euros', 'tn_2 in 1 Convertible', 'tn_Gaming', 'tn_Netbook',
       'tn_Notebook', 'tn_Ultrabook', 'tn_Workstation', 'scr_4K Ultra HD',
       'scr_Full HD', 'scr_HD', 'scr_Other', 'scr_Quad HD+'],
      dtype='object')

In [9]:
df['Ram'] = df['Ram'].map(lambda x:x.strip('GB'))
df['Weight'] = df['Weight'].map(lambda x:x.strip('kg'))

In [10]:
df['Ram'] = df['Ram'].astype(int)
df['Weight'] = df['Weight'].astype(float)

### Columna 'cpu' - 'cpu_ghz', 'cpu_intel' y 'cpu_amd'

In [17]:
# 1. Extraer solo los números de GHz y crear la columna 'cpu_GHz' asignando dichos valores
df['cpu_ghz'] = df['Cpu'].str.extract(r'(\d+\.\d+)GHz').astype(float)

# 2. Crear las columnas 'cpu_intel' y 'cpu_amd' y asignar 0 o 1 según la información en cada fila
df['cpu_intel'] = df['Cpu'].str.contains('Intel').astype(int)
df['cpu_amd'] = df['Cpu'].str.contains('AMD').astype(int)


In [20]:
df.isnull().sum()

laptop_ID                 0
Company                   0
Product                   0
TypeName                  0
Inches                    0
ScreenResolution          0
Cpu                       0
Ram                       0
Memory                    0
Gpu                       0
OpSys                     0
Weight                    0
Price_euros               0
tn_2 in 1 Convertible     0
tn_Gaming                 0
tn_Netbook                0
tn_Notebook               0
tn_Ultrabook              0
tn_Workstation            0
scr_4K Ultra HD           0
scr_Full HD               0
scr_HD                    0
scr_Other                 0
scr_Quad HD+              0
cpu_ghz                  62
cpu_intel                 0
cpu_amd                   0
dtype: int64

In [21]:
median_cpu_GHz = df['cpu_ghz'].median()

# Llena los valores nulos en 'cpu_GHz' con la mediana
df['cpu_ghz'].fillna(median_cpu_GHz, inplace=True)

### Columna 'OpSys' - 'so_windows', 'so_no', 'so_linux', 'so_chrome', 'so_mac'

In [26]:
# Crear columnas binarias
df['so_windows'] = df['OpSys'].str.contains('windows', case=False).astype(int)
df['so_no'] = df['OpSys'].eq('no os').astype(int)
df['so_linux'] = df['OpSys'].str.contains('linux', case=False).astype(int)
df['so_chrome'] = df['OpSys'].str.contains('chrome', case=False).astype(int)
df['so_mac'] = df['OpSys'].str.contains('mac', case=False).astype(int)


   so_windows  so_no  so_linux  so_chrome  so_mac
0           1      0         0          0       0
1           1      0         0          0       0
2           1      0         0          0       0
3           1      0         0          0       0
4           1      0         0          0       0


### Columna 'Gpu' - 

In [30]:
# Crear columnas binarias
df['gpu_intel'] = df['Gpu'].str.contains('intel', case=False).astype(int)
df['gpu_nvidia'] = df['Gpu'].str.contains('nvidia', case=False).astype(int)
df['gpu_amd'] = df['Gpu'].str.contains('amd', case=False).astype(int)

### Columna 'Memory' - 'memory_gb'

In [45]:
import re 

In [48]:
def process_memory(memory_str):
    # Dividir la cadena por el signo '+'
    parts = memory_str.split('+')
    
    # Tomar el primer valor antes del signo '+'
    first_part = parts[0].strip()
    
    # Extraer el primer número antes del signo más
    match = re.search(r'(\d+\.?\d*)', first_part)
    
    if match:
        value = float(match.group())
        
        # Convertir a GB si es TB
        if 'tb' in first_part.lower():
            value *= 1024  # Convertir TB a GB
        
        return value
    else:
        return None

# Aplicar la función a la columna 'Memory'
df['memory_gb'] = df['Memory'].apply(process_memory)

# Verificar los resultados
print(df[['Memory', 'memory_gb']])

                   Memory  memory_gb
0               500GB HDD      500.0
1               256GB SSD      256.0
2               256GB SSD      256.0
3               512GB SSD      512.0
4                 1TB HDD     1024.0
..                    ...        ...
907             256GB SSD      256.0
908    1TB SSD +  1TB HDD     1024.0
909  256GB SSD +  1TB HDD      256.0
910  512GB SSD +  1TB HDD      512.0
911             256GB SSD      256.0

[912 rows x 2 columns]


In [50]:
df['memory_gb'].value_counts()

memory_gb
256.0     357
1024.0    173
128.0     130
512.0      98
500.0      95
32.0       29
64.0       11
2048.0      9
16.0        7
180.0       2
508.0       1
Name: count, dtype: int64

### Columna 'Company' - get_dummies 

In [57]:
# Convertir toda la columna 'Company' a minúsculas
df['Company'] = df['Company'].str.lower()

# Aplicar get_dummies a la columna 'Company'
company_dummies = pd.get_dummies(df['Company'], prefix='company')

# Concatenar las nuevas columnas al DataFrame original
df = pd.concat([df, company_dummies], axis=1)


### Nombres de columnas a minúscula y eliminación de espacios

In [64]:
df.columns = df.columns.str.lower()

In [72]:
# Reemplazar espacios con guiones bajos en los nombres de las columnas
df.columns = df.columns.str.replace(' ', '_')

### Eliminar columnas innecesarias luego de la limpieza

In [69]:
columns_to_drop = ['company', 'typename', 'screenresolution', 'cpu', 'memory', 'gpu', 'opsys']
df = df.drop(columns=columns_to_drop)


In [74]:
df.columns

Index(['laptop_id', 'product', 'inches', 'ram', 'weight', 'price_euros',
       'tn_2_in_1_convertible', 'tn_gaming', 'tn_netbook', 'tn_notebook',
       'tn_ultrabook', 'tn_workstation', 'scr_4k_ultra_hd', 'scr_full_hd',
       'scr_hd', 'scr_other', 'scr_quad_hd+', 'cpu_ghz', 'cpu_intel',
       'cpu_amd', 'so_windows', 'so_no', 'so_linux', 'so_chrome', 'so_mac',
       'gpu_intel', 'gpu_nvidia', 'gpu_amd', 'memory_gb', 'company_acer',
       'company_apple', 'company_asus', 'company_chuwi', 'company_dell',
       'company_fujitsu', 'company_google', 'company_hp', 'company_huawei',
       'company_lenovo', 'company_lg', 'company_mediacom', 'company_microsoft',
       'company_msi', 'company_razer', 'company_samsung', 'company_toshiba',
       'company_vero', 'company_xiaomi'],
      dtype='object')

### Guardar DataFrame limpio

In [77]:
df.to_csv('./data/df_limpieza_Train_Xime.csv')

### df 'train' con columnas definitivas para ML

In [31]:
df = pd.read_csv('./data/df_limpieza_Xime.csv')

In [32]:
train = df[['inches', 'ram', 'weight', 'price_euros','tn_2_in_1_convertible', 'tn_gaming', 'tn_netbook', 'tn_notebook',
       'tn_ultrabook', 'tn_workstation', 'scr_4k_ultra_hd', 'scr_full_hd',
       'scr_hd', 'scr_other', 'scr_quad_hd+', 'cpu_ghz', 'cpu_intel',
       'cpu_amd', 'so_windows', 'so_no', 'so_linux', 'so_chrome', 'so_mac',
       'gpu_intel', 'gpu_nvidia', 'gpu_amd', 'memory_gb', 'company_acer',
       'company_apple', 'company_asus', 'company_chuwi', 'company_dell',
       'company_fujitsu', 'company_google', 'company_hp', 'company_huawei',
       'company_lenovo', 'company_lg', 'company_mediacom', 'company_microsoft',
       'company_msi', 'company_razer', 'company_samsung', 'company_toshiba',
       'company_vero', 'company_xiaomi']]

In [107]:
train.columns

Index(['inches', 'ram', 'weight', 'price_euros', 'tn_2_in_1_convertible',
       'tn_gaming', 'tn_netbook', 'tn_notebook', 'tn_ultrabook',
       'tn_workstation', 'scr_4k_ultra_hd', 'scr_full_hd', 'scr_hd',
       'scr_other', 'scr_quad_hd+', 'cpu_ghz', 'cpu_intel', 'cpu_amd',
       'so_windows', 'so_no', 'so_linux', 'so_chrome', 'so_mac', 'gpu_intel',
       'gpu_nvidia', 'gpu_amd', 'memory_gb', 'company_acer', 'company_apple',
       'company_asus', 'company_chuwi', 'company_dell', 'company_fujitsu',
       'company_google', 'company_hp', 'company_huawei', 'company_lenovo',
       'company_lg', 'company_mediacom', 'company_microsoft', 'company_msi',
       'company_razer', 'company_samsung', 'company_toshiba', 'company_vero',
       'company_xiaomi'],
      dtype='object')

## TRAIN - Entrenando modelos

### Normalizar valores

In [5]:
from sklearn.preprocessing import StandardScaler

# Selecciona las columnas que quieres estandarizar (las primeras en este caso)
columns_to_standardize = ['inches', 'ram', 'weight', 'price_euros']

# Crea un nuevo DataFrame solo con las columnas que quieres estandarizar
data_to_standardize = train[columns_to_standardize]

# Aplica el StandardScaler a las columnas seleccionadas
standard_scaler = StandardScaler()
train[columns_to_standardize] = standard_scaler.fit_transform(data_to_standardize)

# Muestra las primeras 10 filas del DataFrame con datos estandarizados
print(train.head(10))


     inches       ram    weight  price_euros  tn_2_in_1_convertible  \
0  0.435726 -0.885886  0.578996    -1.071514                      0   
1 -1.187970 -0.055965 -1.212277     0.384643                      0   
2  1.635848 -0.055965  1.497597    -0.386005                      0   
3 -1.752733 -0.055965 -1.166347     0.537304                      0   
4  1.635848 -0.055965  1.191397    -0.877751                      0   
5  0.435726  1.603876  0.732096    -0.026369                      0   
6 -0.693801 -0.055965 -0.492706    -0.760319                      0   
7 -0.693801 -0.470925 -0.676426    -0.672245                      0   
8 -1.187970 -0.055965 -1.212277     1.103913                      0   
9  1.635848 -0.055965  2.033448     0.108677                      0   

   tn_gaming  tn_netbook  tn_notebook  tn_ultrabook  tn_workstation  ...  \
0          0           0            1             0               0  ...   
1          0           0            0             1               

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train[columns_to_standardize] = standard_scaler.fit_transform(data_to_standardize)


In [33]:
X = train.drop(['price_euros'], axis=1)
y = train['price_euros'].copy()

In [34]:
X.shape

(912, 45)

In [35]:
y.shape

(912,)

### 2. Dividir X_train, X_test, y_train, y_test

In [36]:
from sklearn.model_selection import train_test_split

In [37]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.20, random_state = 42)

In [38]:
from sklearn.model_selection import cross_val_score

### Linear Regression

In [39]:
# 2do mejor prediciendo
from sklearn.linear_model import LinearRegression

model = LinearRegression()

### Ridge

In [129]:
# 1ro prediciendo 
from sklearn.linear_model import Ridge

model = Ridge(alpha=10)

### Lasso

In [140]:
from sklearn.linear_model import Lasso

model = Lasso()

### SVM - SVR Regression

In [43]:
from sklearn.svm import SVR

model = SVR()

### Random Forest

In [145]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
import numpy as np

# Instancia el modelo Random Forest
model = RandomForestRegressor(n_estimators=100, random_state=42)

# Entrena el modelo con tus datos de entrenamiento
model.fit(X_train, y_train)

# Realiza predicciones en el conjunto de prueba
rf_pred = model.predict(X_test)

# Calcula el error cuadrático medio (RMSE)
rf_rmse = np.sqrt(mean_squared_error(y_test, rf_pred))

# Muestra el RMSE
print('RMSE con Random Forest:', rf_rmse)

RMSE con Random Forest: 326.974144889498


### Cross Validation

In [146]:
scores = -cross_val_score(model, X, y, cv=5, scoring= 'neg_root_mean_squared_error')
np.mean(scores)

310.16314166773157

In [147]:
model.fit(X,y)

In [148]:
pred = model.predict(X_test)
pred

array([1715.86252857,  265.7625    ,  806.465705  , 1407.57148849,
       1654.85716667,  281.952     ,  704.22985   , 1866.0911    ,
        865.87481   , 1487.43      , 1304.851     ,  534.2397    ,
        314.76733333, 1620.92693333, 1666.1969    , 1897.58754762,
       1909.692     , 1248.23895238,  615.48693333,  642.047     ,
        568.39474571,  449.4955    ,  795.0944    ,  729.6256    ,
        955.2       ,  541.4945    , 1198.4296    , 2044.3918    ,
        872.74848   ,  678.11945   , 1408.77      , 1314.16      ,
        797.1271    , 1710.5776    , 1094.1918    ,  386.79223333,
        638.5551    ,  529.79      , 1664.24      ,  800.5301    ,
       1024.096     , 1718.1924    , 1235.17216667, 2917.5055    ,
        900.3288    ,  286.279     , 1110.8098    ,  500.9642    ,
        874.201     ,  406.875     , 1616.4361    ,  933.8572    ,
        469.24406667, 3166.3114    , 1072.746275  , 1870.2312    ,
       1586.918     , 1124.6992    , 2586.4999    , 1975.14083

In [149]:
from sklearn.metrics import mean_squared_error
import numpy as np

print('RMSE :',np.sqrt(mean_squared_error(y_test,pred)))

RMSE : 124.67987839447422


## TEST - LIMPIEZA

In [78]:
X_pred = pd.read_csv("data/test.csv")

### COLUMNA TypeName -- tn

In [79]:
df_dummies = pd.get_dummies(X_pred['TypeName'], prefix='tn')

In [80]:
X_pred[df_dummies.columns] = df_dummies.astype(int)

### COLUMNA ScreenResolution -- src

In [81]:
X_pred['src'] = X_pred['ScreenResolution'].apply(lambda x: 'HD' if '1366x768' in x else 'Full HD' if '1920x1080' in x else 'Quad HD+' if '3200x1800' in x else '4K Ultra HD' if '3840x2160' in x else 'Other')


In [82]:
X_pred = pd.get_dummies(X_pred, columns=['src'], prefix='scr')

In [83]:
scr_columns = [col for col in X_pred.columns if col.startswith('scr')]

In [84]:
# Convertir las columnas a enteros (0 o 1)
X_pred['scr_4K Ultra HD'] = X_pred['scr_4K Ultra HD'].astype(int)
X_pred['scr_Full HD'] = X_pred['scr_Full HD'].astype(int)
X_pred['scr_HD'] = X_pred['scr_HD'].astype(int)
X_pred['scr_Other'] = X_pred['scr_Other'].astype(int)
X_pred['scr_Quad HD+'] = X_pred['scr_Quad HD+'].astype(int)


### COLUMNA Ram y Weight

In [85]:
X_pred['Ram'] = X_pred['Ram'].map(lambda x:x.strip('GB'))

X_pred['Weight'] = X_pred['Weight'].map(lambda x:x.strip('kg'))


In [86]:
X_pred['Ram'] = X_pred['Ram'].astype(int)
X_pred['Weight'] = X_pred['Weight'].astype(float)

### Columna 'cpu' - 'cpu_ghz', 'cpu_intel' y 'cpu_amd'

In [87]:
# 1. Extraer solo los números de GHz y crear la columna 'cpu_GHz' asignando dichos valores
X_pred['cpu_ghz'] = X_pred['Cpu'].str.extract(r'(\d+\.\d+)GHz').astype(float)

# 2. Crear las columnas 'cpu_intel' y 'cpu_amd' y asignar 0 o 1 según la información en cada fila
X_pred['cpu_intel'] = X_pred['Cpu'].str.contains('Intel').astype(int)
X_pred['cpu_amd'] = X_pred['Cpu'].str.contains('AMD').astype(int)

In [88]:
X_pred.isnull().sum()

laptop_ID                 0
Company                   0
Product                   0
TypeName                  0
Inches                    0
ScreenResolution          0
Cpu                       0
Ram                       0
Memory                    0
Gpu                       0
OpSys                     0
Weight                    0
tn_2 in 1 Convertible     0
tn_Gaming                 0
tn_Netbook                0
tn_Notebook               0
tn_Ultrabook              0
tn_Workstation            0
scr_4K Ultra HD           0
scr_Full HD               0
scr_HD                    0
scr_Other                 0
scr_Quad HD+              0
cpu_ghz                  24
cpu_intel                 0
cpu_amd                   0
dtype: int64

In [89]:
median_cpu_GHz = X_pred['cpu_ghz'].median()

# Llena los valores nulos en 'cpu_GHz' con la mediana
X_pred['cpu_ghz'].fillna(median_cpu_GHz, inplace=True)

### Columna 'OpSys' - 'so_windows', 'so_no', 'so_linux', 'so_chrome', 'so_mac'

In [90]:
# Crear columnas binarias
X_pred['so_windows'] = X_pred['OpSys'].str.contains('windows', case=False).astype(int)
X_pred['so_no'] = X_pred['OpSys'].eq('no os').astype(int)
X_pred['so_linux'] = X_pred['OpSys'].str.contains('linux', case=False).astype(int)
X_pred['so_chrome'] = X_pred['OpSys'].str.contains('chrome', case=False).astype(int)
X_pred['so_mac'] = X_pred['OpSys'].str.contains('mac', case=False).astype(int)

### Columna 'Gpu' - 

In [91]:
# Crear columnas binarias
X_pred['gpu_intel'] = X_pred['Gpu'].str.contains('intel', case=False).astype(int)
X_pred['gpu_nvidia'] = X_pred['Gpu'].str.contains('nvidia', case=False).astype(int)
X_pred['gpu_amd'] = X_pred['Gpu'].str.contains('amd', case=False).astype(int)

### Columna 'Memory' - 'memory_gb'

In [92]:
import re 

In [93]:
def process_memory(memory_str):
    # Dividir la cadena por el signo '+'
    parts = memory_str.split('+')
    
    # Tomar el primer valor antes del signo '+'
    first_part = parts[0].strip()
    
    # Extraer el primer número antes del signo más
    match = re.search(r'(\d+\.?\d*)', first_part)
    
    if match:
        value = float(match.group())
        
        # Convertir a GB si es TB
        if 'tb' in first_part.lower():
            value *= 1024  # Convertir TB a GB
        
        return value
    else:
        return None

# Aplicar la función a la columna 'Memory'
X_pred['memory_gb'] = X_pred['Memory'].apply(process_memory)

# Verificar los resultados
print(X_pred[['Memory', 'memory_gb']])

                   Memory  memory_gb
0               256GB SSD      256.0
1                32GB HDD       32.0
2      32GB Flash Storage       32.0
3                 1TB HDD     1024.0
4      32GB Flash Storage       32.0
..                    ...        ...
386  256GB SSD +  1TB HDD      256.0
387             256GB SSD      256.0
388             256GB SSD      256.0
389               1TB HDD     1024.0
390             240GB SSD      240.0

[391 rows x 2 columns]


In [94]:
X_pred['memory_gb'].value_counts()

memory_gb
256.0     151
1024.0     77
128.0      47
512.0      42
500.0      37
32.0       16
2048.0      7
64.0        6
180.0       3
16.0        3
8.0         1
240.0       1
Name: count, dtype: int64

### Columna 'Company' - get_dummies 

In [95]:
# Convertir toda la columna 'Company' a minúsculas
X_pred['Company'] = X_pred['Company'].str.lower()

# Aplicar get_dummies a la columna 'Company'
company_dummies = pd.get_dummies(X_pred['Company'], prefix='company')

# Concatenar las nuevas columnas al DataFrame original
X_pred = pd.concat([X_pred, company_dummies], axis=1)

### Nombres de columnas a minúscula y eliminación de espacios

In [96]:
X_pred.columns = X_pred.columns.str.lower()

In [97]:
# Reemplazar espacios con guiones bajos en los nombres de las columnas
X_pred.columns = X_pred.columns.str.replace(' ', '_')

### Eliminar columnas innecesarias luego de la limpieza

In [100]:
columns_to_drop = ['laptop_id','company', 'product', 'typename', 'screenresolution', 'cpu', 'memory', 'gpu', 'opsys']
X_pred = X_pred.drop(columns=columns_to_drop)


In [101]:
X_pred.columns

Index(['inches', 'ram', 'weight', 'tn_2_in_1_convertible', 'tn_gaming',
       'tn_netbook', 'tn_notebook', 'tn_ultrabook', 'tn_workstation',
       'scr_4k_ultra_hd', 'scr_full_hd', 'scr_hd', 'scr_other', 'scr_quad_hd+',
       'cpu_ghz', 'cpu_intel', 'cpu_amd', 'so_windows', 'so_no', 'so_linux',
       'so_chrome', 'so_mac', 'gpu_intel', 'gpu_nvidia', 'gpu_amd',
       'memory_gb', 'company_acer', 'company_apple', 'company_asus',
       'company_chuwi', 'company_dell', 'company_fujitsu', 'company_hp',
       'company_huawei', 'company_lenovo', 'company_lg', 'company_mediacom',
       'company_msi', 'company_razer', 'company_samsung', 'company_toshiba',
       'company_vero'],
      dtype='object')

In [110]:
# Lista de columnas booleanas
columns_to_convert = ['company_acer', 'company_apple', 'company_asus', 'company_chuwi',
                       'company_dell', 'company_fujitsu', 'company_hp', 'company_huawei',
                       'company_lenovo', 'company_lg', 'company_mediacom', 'company_msi',
                       'company_razer', 'company_samsung', 'company_toshiba', 'company_vero']

# Convertir valores booleanos a 0 y 1 en las columnas seleccionadas
X_pred[columns_to_convert] = X_pred[columns_to_convert].astype(int)


### Guardar DataFrame de Test - 'X_pred' con la limpieza igual que 'Train'

In [113]:
X_pred.to_csv('./data/df_limpieza_Test_Xime.csv')

In [111]:
X_pred

Unnamed: 0,inches,ram,weight,tn_2_in_1_convertible,tn_gaming,tn_netbook,tn_notebook,tn_ultrabook,tn_workstation,scr_4k_ultra_hd,...,company_hp,company_huawei,company_lenovo,company_lg,company_mediacom,company_msi,company_razer,company_samsung,company_toshiba,company_vero
0,14.0,8,1.25,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,1,0
1,14.0,4,1.40,0,0,0,1,0,0,0,...,0,0,0,0,1,0,0,0,0,0
2,14.0,2,1.40,0,0,0,1,0,0,0,...,0,0,0,0,1,0,0,0,0,0
3,15.6,8,2.65,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,11.6,2,1.10,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
386,15.6,16,3.49,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
387,12.5,8,1.25,0,0,0,0,1,0,1,...,0,0,0,0,0,0,1,0,0,0
388,15.6,4,2.30,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
389,15.6,4,2.65,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [117]:
# Lista completa de columnas esperadas por el modelo
all_expected_columns = ['inches', 'ram', 'weight', 'tn_2_in_1_convertible', 'tn_gaming', 'tn_netbook',
                         'tn_notebook', 'tn_ultrabook', 'tn_workstation', 'scr_4k_ultra_hd', 'scr_full_hd',
                         'scr_hd', 'scr_other', 'scr_quad_hd+', 'cpu_ghz', 'cpu_intel', 'cpu_amd', 'so_windows',
                         'so_no', 'so_linux', 'so_chrome', 'so_mac', 'gpu_intel', 'gpu_nvidia', 'gpu_amd',
                         'memory_gb', 'company_acer', 'company_apple', 'company_asus', 'company_chuwi',
                         'company_dell', 'company_fujitsu', 'company_google', 'company_hp', 'company_huawei',
                         'company_lenovo', 'company_lg', 'company_mediacom', 'company_microsoft', 'company_msi',
                         'company_razer', 'company_samsung', 'company_toshiba', 'company_vero', 'company_xiaomi']

# Asegúrate de que X_pred tenga todas las columnas esperadas
for column in all_expected_columns:
    if column not in X_pred.columns:
        # Si falta la columna, agrégala con un valor predeterminado
        X_pred[column] = False  # O puedes usar 0 para columnas numéricas




In [119]:
# Ordena las columnas en X_pred para que coincidan con el orden del conjunto de entrenamiento
X_pred = X_pred[X_train.columns]

# Ahora puedes realizar predicciones
pred = model.predict(X_pred)


## PREDICCIONES CON 'X_pred'

In [150]:
predictions_submit = model.predict(X_pred)
predictions_submit

array([1547.46066667,  393.3654    ,  242.893     ,  922.07333333,
        380.102     ,  540.99652   ,  534.98371667, 1381.1101    ,
       1509.5448    , 1182.18803333,  770.82113333, 1055.2153    ,
       1093.4371    ,  711.0508    ,  577.45273333, 1656.5336    ,
       1575.68566667, 2421.8934    ,  643.5948    ,  728.17      ,
       1169.407     ,  970.42916667,  559.16413333,  741.1806    ,
        320.828125  , 2028.4119    ,  732.50605   ,  785.7975    ,
        442.07423333, 1182.18803333,  996.018     , 1172.25236667,
        655.7161    ,  907.509     , 1981.768     , 1427.75      ,
       1084.9432    , 1331.4052    ,  438.355     , 1680.8788    ,
       2045.4707    ,  949.9901    ,  324.584     , 1819.4206    ,
       1038.98475   , 2070.38670736,  911.3343    , 1897.58754762,
       1036.3716    , 1513.3992    ,  887.48978333,  623.819     ,
        722.1774    , 1033.7628    ,  477.0667    ,  729.1604    ,
        990.24703333,  412.819     , 1532.22783333, 1165.5343 

In [151]:
sample = pd.read_csv("data/sample_submission.csv")

In [152]:
sample.head()

Unnamed: 0,laptop_ID,Price_euros
0,750,500
1,726,500
2,633,500
3,363,650
4,319,650


In [153]:
submission = pd.DataFrame({"laptop_ID": sample['laptop_ID'], "Price_euros": predictions_submit})

In [154]:
submission.head()

Unnamed: 0,laptop_ID,Price_euros
0,750,1547.460667
1,726,393.3654
2,633,242.893
3,363,922.073333
4,319,380.102


In [155]:
submission.shape

(391, 2)

In [156]:
def chequeator(df_to_submit):
    """
    Esta función se asegura de que tu submission tenga la forma requerida por Kaggle.
    
    Si es así, se guardará el dataframe en un `csv` y estará listo para subir a Kaggle.
    
    Si no, LEE EL MENSAJE Y HAZLE CASO.
    
    Si aún no:
    - apaga tu ordenador, 
    - date una vuelta, 
    - enciendelo otra vez, 
    - abre este notebook y 
    - leelo todo de nuevo. 
    Todos nos merecemos una segunda oportunidad. También tú.
    """
    if df_to_submit.shape == sample.shape:
        if df_to_submit.columns.all() == sample.columns.all():
            if df_to_submit.laptop_ID.all() == sample.laptop_ID.all():
                print("You're ready to submit!")
                submission.to_csv("submission.csv", index = False) #muy importante el index = False
                urllib.request.urlretrieve("https://i.kym-cdn.com/photos/images/facebook/000/747/556/27a.jpg", "gfg.png")     
                img = Image.open("gfg.png")
                img.show()   
            else:
                print("Check the ids and try again")
        else:
            print("Check the names of the columns and try again")
    else:
        print("Check the number of rows and/or columns and try again")
        print("\nMensaje secreto de Clara: No me puedo creer que después de todo este notebook hayas hecho algún cambio en las filas de `diamonds_test.csv`. Lloro.")

In [157]:
chequeator(submission)

You're ready to submit!
