In [2187]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
import regex as re
import seaborn as sns
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

from sklearn.linear_model import LinearRegression
from sklearn.neighbors import KNeighborsRegressor
from sklearn.metrics import r2_score, mean_squared_error

from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestRegressor
from sklearn.pipeline import Pipeline

In [2188]:
train = pd.read_csv('Price_euros_train.csv')
test = pd.read_csv('Price_euros_test.csv')


train.head()


Unnamed: 0,laptop_ID,Company,Product,TypeName,Inches,ScreenResolution,Cpu,Ram,Memory,Gpu,OpSys,Weight,Price_euros
0,2,Apple,Macbook Air,Ultrabook,13.3,1440x900,Intel Core i5 1.8GHz,8GB,128GB Flash Storage,Intel HD Graphics 6000,macOS,1.34kg,898.94
1,4,Apple,MacBook Pro,Ultrabook,15.4,IPS Panel Retina Display 2880x1800,Intel Core i7 2.7GHz,16GB,512GB SSD,AMD Radeon Pro 455,macOS,1.83kg,2537.45
2,5,Apple,MacBook Pro,Ultrabook,13.3,IPS Panel Retina Display 2560x1600,Intel Core i5 3.1GHz,8GB,256GB SSD,Intel Iris Plus Graphics 650,macOS,1.37kg,1803.6
3,6,Acer,Aspire 3,Notebook,15.6,1366x768,AMD A9-Series 9420 3GHz,4GB,500GB HDD,AMD Radeon R5,Windows 10,2.1kg,400.0
4,7,Apple,MacBook Pro,Ultrabook,15.4,IPS Panel Retina Display 2880x1800,Intel Core i7 2.2GHz,16GB,256GB Flash Storage,Intel Iris Pro Graphics,Mac OS X,2.04kg,2139.97


### Отчистка и обработка данных

In [2189]:
train.isnull().sum()

laptop_ID            0
Company              0
Product              0
TypeName             0
Inches               0
ScreenResolution    11
Cpu                  0
Ram                  0
Memory               0
Gpu                  0
OpSys               53
Weight              22
Price_euros          0
dtype: int64

In [2190]:
test.isnull().sum()

laptop_ID            0
Company              0
Product              0
TypeName             0
Inches               0
ScreenResolution     2
Cpu                  0
Ram                  0
Memory               0
Gpu                  0
OpSys               12
Weight               4
dtype: int64

In [2191]:
def delete_laptop_id(df):
    new_df = df.copy()
    new_df = df.drop('laptop_id', axis=1)
    return new_df

In [2192]:
def clear_ram(df):
    new_df = df.copy()
    new_df['Ram'] = new_df['Ram'].str.replace('GB', '')
    new_df['Ram'] = new_df['Ram'].astype('int32')
    return new_df

In [2193]:
def clear_weight(_df):
    new_df = _df.copy()
    new_df['Weight'] = new_df['Weight'].str.replace('kg', '')
    new_df['Weight'] = new_df['Weight'].astype('float32')
    new_df['Weight'].fillna(train['Weight'].mean(), inplace=True)
    return new_df

In [2194]:
from numpy import mean


def get_mean_screen_size_form_train():
    new_df = train.copy()
    new_df['ScreenResolutionArray'] = new_df['ScreenResolution'].str.split().str[-1].str.split('x')
    new_df['screen_width'] = new_df['ScreenResolutionArray'].str[0].astype('float')
    new_df['screen_height'] = new_df['ScreenResolutionArray'].str[1].astype('float')
    return new_df['screen_width'].mean(), new_df['screen_height'].mean()

def add_screen_width_and_height(_df):
    mean_screen_width, mean_screen_height = get_mean_screen_size_form_train()
    
    new_df = _df.copy()
    new_df['ScreenResolutionArray'] = new_df['ScreenResolution'].str.split().str[-1].str.split('x')
    new_df['screen_width'] = new_df['ScreenResolutionArray'].str[0].astype('float')
    new_df['screen_height'] = new_df['ScreenResolutionArray'].str[1].astype('float')
    new_df['screen_width'] = new_df['screen_width'].fillna(mean_screen_width)
    new_df['screen_height'] = new_df['screen_height'].fillna(mean_screen_height)
    # fill empty screenResolution with mean empty string
    new_df['ScreenResolution'] = new_df['ScreenResolution'].fillna(' ')
    return new_df

In [2195]:
def is_touchscreen(x):
        if type(x) != str:
            return 0
        if "Touchscreen" in x:
            return 1
        else:
            return 0
        
def add_touchscreen(df):
    new_df = df.copy()
    new_df["Touchscreen"] = new_df["ScreenResolution"].apply(is_touchscreen)
    return new_df

In [2196]:
def is_ips(x):
        if type(x) != str:
            return 0
        if "IPS" in x:
            return 1
        else:
            return 0

def add_ips(df):
    new_df = df.copy()
    new_df["IPS"] = new_df["ScreenResolution"].apply(is_ips)
    return new_df

In [2197]:
def add_cpu_features(df):
    freq_reg = r"([\d\.]+)(?:GHz)?$"
    
    new_df = df.copy()
    cpu_df = new_df.Cpu.str.split(freq_reg, expand=True).iloc[:, 0:2]
    new_df[["CpuType", "CpuFrequency"]] = cpu_df
    new_df = new_df.astype({"CpuFrequency": "float64"})
    return new_df

In [2198]:
ssd_re = re.compile(r"(\d+\.?\d*)GB\s+(SSD|Flash Storage)")
hdd_re_gb = re.compile(r"(\d+\.?\d*)GB\s+(HDD|Hybrid)")
hdd_re_tb = re.compile(r"(\d+\.?\d*)TB\s+(HDD|Hybrid)")


def extract_memory_size_ssd(memory_str):
    ssd_capacities = []
    for match in ssd_re.finditer(memory_str):
        ssd_capacities.append(int(match.group(1).split(".")[0]))
    size = sum(ssd_capacities)
    return size


def extract_memory_size_hdd(memory_str):
    hdd_gb_capacities = []
    hdd_tb_capacities = []
    for match in hdd_re_gb.finditer(memory_str):
        hdd_gb_capacities.append(int(match.group(1).split(".")[0]))
    for match in hdd_re_tb.finditer(memory_str):
        hdd_tb_capacities.append(int(match.group(1).split(".")[0]))
    size = sum(hdd_gb_capacities) + sum(hdd_tb_capacities) * 1024
    return size

def add_memory_features(df):
    new_df = df.copy()
    new_df["SSD_capacity"] = new_df["Memory"].apply(extract_memory_size_ssd)
    new_df["HDD_capacity"] = new_df["Memory"].apply(extract_memory_size_hdd)
    return new_df
    
    

In [2199]:
def add_ppi(df):
    new_df = df.copy()
    new_df["ppi"] = (
        ((new_df["screen_width"] ** 2) + (new_df["screen_height"] ** 2)) ** 0.5
    ) / new_df["Inches"].astype("float")
    return new_df

In [2200]:
def add_is_gaming_flag(df):
    new_df = df.copy()
    new_df["is_gaming"] = new_df['Gpu'].apply(lambda x: 1 if 'GTX' in x or 'GeForce' in x else 0)
    return new_df

In [2201]:
def add_is_ssd_flag(df):
    new_df = df.copy()
    new_df["is_ssd"] = new_df['SSD_capacity'].apply(lambda x: 1 if x > 0 else 0)
    return new_df

In [2202]:
def add_gpu_brand(df):
    new_df = df.copy()
    new_df['gpu_brand'] = new_df['Gpu'].apply(lambda x: x.split(' ')[0])
    new_df = new_df[new_df['gpu_brand'] != 'ARM']
    return new_df

In [2203]:
def add_cpu_brand(df):
    new_df = df.copy()
    new_df['cpu_brand'] = new_df['Cpu'].apply(lambda x: x.split(' ')[0])
    return new_df

In [2204]:
def get_os(os):
    if type(os) != str:
        return 'other'
    if 'Windows' in os or 'windows' in os:
        return 'Windows'
    elif 'Linux' in os or 'linux' in os:
        return 'Linux'
    elif 'Mac' in os or 'mac' in os:
        return 'Mac'
    else:
        return 'other'

def clear_os(df):
    new_df = df.copy()
    new_df['os'] = new_df['OpSys'].apply(get_os)
    return new_df

In [2205]:
def add_weight_inches_ratio(df):
    new_df = df.copy()
    new_df['weight_inches_ratio'] = new_df['Weight'] / new_df['Inches']
    return new_df

In [2206]:
def preprocess_data(df):
    new_df = df.copy()
    new_df = clear_ram(new_df)
    new_df = clear_weight(new_df)
    new_df = add_screen_width_and_height(new_df)
    new_df = add_touchscreen(new_df)
    new_df = add_ips(new_df)
    new_df = add_cpu_features(new_df)
    new_df = add_memory_features(new_df)
    new_df = add_ppi(new_df)
    new_df = add_is_gaming_flag(new_df)
    new_df = add_is_ssd_flag(new_df)
    new_df = add_gpu_brand(new_df)
    new_df = add_cpu_brand(new_df)
    new_df = clear_os(new_df)
    new_df = add_weight_inches_ratio(new_df)
    return new_df

### Анализ данных

In [2207]:
data = preprocess_data(train)

data.head()

Unnamed: 0,laptop_ID,Company,Product,TypeName,Inches,ScreenResolution,Cpu,Ram,Memory,Gpu,...,CpuFrequency,SSD_capacity,HDD_capacity,ppi,is_gaming,is_ssd,gpu_brand,cpu_brand,os,weight_inches_ratio
0,2,Apple,Macbook Air,Ultrabook,13.3,1440x900,Intel Core i5 1.8GHz,8,128GB Flash Storage,Intel HD Graphics 6000,...,1.8,128,0,127.67794,0,1,Intel,Intel,Mac,0.100752
1,4,Apple,MacBook Pro,Ultrabook,15.4,IPS Panel Retina Display 2880x1800,Intel Core i7 2.7GHz,16,512GB SSD,AMD Radeon Pro 455,...,2.7,512,0,220.534624,0,1,AMD,Intel,Mac,0.118831
2,5,Apple,MacBook Pro,Ultrabook,13.3,IPS Panel Retina Display 2560x1600,Intel Core i5 3.1GHz,8,256GB SSD,Intel Iris Plus Graphics 650,...,3.1,256,0,226.983005,0,1,Intel,Intel,Mac,0.103008
3,6,Acer,Aspire 3,Notebook,15.6,1366x768,AMD A9-Series 9420 3GHz,4,500GB HDD,AMD Radeon R5,...,3.0,0,500,100.45467,0,0,AMD,AMD,Windows,0.134615
4,7,Apple,MacBook Pro,Ultrabook,15.4,IPS Panel Retina Display 2880x1800,Intel Core i7 2.2GHz,16,256GB Flash Storage,Intel Iris Pro Graphics,...,2.2,256,0,220.534624,0,1,Intel,Intel,Mac,0.132468


In [2208]:
# Выделим фичи из датасета и поделим их на числовые и категориальные.

num_cols = [
    # "Inches",
    "Ram",
    # "Weight",
    # "screen_width",
    # "screen_height",
    "CpuFrequency",
    "SSD_capacity",
    "HDD_capacity",
    "ppi",
    "is_ssd",
    "is_gaming",
    "Touchscreen",
    "IPS",
    "weight_inches_ratio"
]

# cat_cols = ["Company", "TypeName", "os", "gpu_brand", "cpu_brand"]
cat_cols = ["TypeName", "os", "gpu_brand", "cpu_brand"]

target_col = "Price_euros"

cols = num_cols + cat_cols + [target_col]

In [2209]:
# Посмотрим на распределение категориальных признаков

for col in cat_cols:
    print(f"{col} DISTRIBUTION")
    print(data[col].value_counts())
    print()

TypeName DISTRIBUTION
Notebook              627
Gaming                170
Ultrabook             166
2 in 1 Convertible     99
Workstation            25
Netbook                20
Name: TypeName, dtype: int64

os DISTRIBUTION
Windows    905
other      134
Linux       53
Mac         15
Name: os, dtype: int64

gpu_brand DISTRIBUTION
Intel     611
Nvidia    340
AMD       156
Name: gpu_brand, dtype: int64

cpu_brand DISTRIBUTION
Intel    1054
AMD        53
Name: cpu_brand, dtype: int64



In [2210]:
# посмотрим на корреляции между фичами

data.corr().style.background_gradient(cmap='coolwarm').set_precision(2)

  data.corr().style.background_gradient(cmap='coolwarm').set_precision(2)


Unnamed: 0,laptop_ID,Inches,Ram,Weight,Price_euros,screen_width,screen_height,Touchscreen,IPS,CpuFrequency,SSD_capacity,HDD_capacity,ppi,is_gaming,is_ssd,weight_inches_ratio
laptop_ID,1.0,-0.1,-0.04,-0.03,0.07,-0.04,-0.05,0.06,-0.03,0.08,-0.05,-0.02,0.0,-0.07,-0.06,-0.0
Inches,-0.1,1.0,0.24,0.83,0.05,-0.08,-0.1,-0.35,-0.12,0.28,-0.18,0.55,-0.42,0.46,-0.28,0.72
Ram,-0.04,0.24,1.0,0.38,0.73,0.42,0.41,0.1,0.21,0.37,0.45,0.12,0.29,0.4,0.24,0.38
Weight,-0.03,0.83,0.38,1.0,0.18,-0.04,-0.06,-0.28,0.02,0.3,-0.14,0.53,-0.33,0.53,-0.18,0.98
Price_euros,0.07,0.05,0.73,0.18,1.0,0.56,0.55,0.2,0.25,0.43,0.56,-0.1,0.48,0.23,0.36,0.18
screen_width,-0.04,-0.08,0.42,-0.04,0.56,1.0,0.99,0.36,0.27,0.18,0.42,-0.12,0.93,0.16,0.29,-0.05
screen_height,-0.05,-0.1,0.41,-0.06,0.55,0.99,1.0,0.37,0.28,0.16,0.42,-0.13,0.94,0.14,0.29,-0.06
Touchscreen,0.06,-0.35,0.1,-0.28,0.2,0.36,0.37,1.0,0.16,-0.07,0.22,-0.22,0.46,-0.15,0.15,-0.26
IPS,-0.03,-0.12,0.21,0.02,0.25,0.27,0.28,0.16,1.0,0.06,0.23,-0.09,0.29,0.13,0.22,0.04
CpuFrequency,0.08,0.28,0.37,0.3,0.43,0.18,0.16,-0.07,0.06,1.0,0.21,0.13,0.04,0.31,0.06,0.29


### Работа с категориальными признаками

In [2211]:
def get_cat_data(data, cat_cols):
    cat_data_encoded = pd.get_dummies(data, columns=cat_cols)
    cat_data_encoded.drop(columns=["Company", "Memory", "Gpu", "Cpu", "OpSys", "laptop_ID", "Product", "Inches", "ScreenResolution", "Ram", "Weight", "ScreenResolutionArray", "screen_width", "screen_height", "Touchscreen", "IPS", "CpuType", "CpuFrequency", "SSD_capacity", "HDD_capacity", "ppi", "is_gaming", "is_ssd", "weight_inches_ratio"], inplace=True)
    return cat_data_encoded

In [2212]:
cat_data_encoded = get_cat_data(data, cat_cols)
cat_data_encoded.drop(columns=["Price_euros"], inplace=True)


cat_data_encoded.head(5)

Unnamed: 0,TypeName_2 in 1 Convertible,TypeName_Gaming,TypeName_Netbook,TypeName_Notebook,TypeName_Ultrabook,TypeName_Workstation,os_Linux,os_Mac,os_Windows,os_other,gpu_brand_AMD,gpu_brand_Intel,gpu_brand_Nvidia,cpu_brand_AMD,cpu_brand_Intel
0,0,0,0,0,1,0,0,1,0,0,0,1,0,0,1
1,0,0,0,0,1,0,0,1,0,0,1,0,0,0,1
2,0,0,0,0,1,0,0,1,0,0,0,1,0,0,1
3,0,0,0,1,0,0,0,0,1,0,1,0,0,1,0
4,0,0,0,0,1,0,0,1,0,0,0,1,0,0,1


### Масштабирование данных

In [2213]:
num_and_cat_data = pd.concat([data[num_cols], cat_data_encoded], axis=1)

In [2214]:
pca = StandardScaler()
pca.fit(num_and_cat_data)
# Выход pca - numpy матрица, положим ее в новую переменную со всеми фичами
X = pca.transform(num_and_cat_data)
Y = data[target_col]

print(X.shape)

(1107, 25)


In [2215]:
# pca = StandardScaler()
# pca.fit(data[num_cols])
# # Выход pca - numpy матрица, положим ее в новую переменную со всеми фичами
# X = pca.transform(data[num_cols])
# Y = data[target_col]

# print(X.shape)

### Разделение на train/test

In [2216]:
X_train, X_test, y_train, y_test = train_test_split(
    X, Y, test_size=0.15, random_state=5
)

### Обучение моделей

In [2217]:
def print_metrics(y_preds, y):
    print(f'R^2: {r2_score(y_preds, y)}')
    print(f'MSE: {mean_squared_error(y_preds, y)}')

In [2218]:
lr = LinearRegression()
lr.fit(X_train, y_train)

print_metrics(lr.predict(X_test), y_test)

R^2: 0.8815960041858666
MSE: 80009.30436825524


In [2219]:
# Используем линейную регрессию, минимизирующую сумму квадратов ошибки
knn = KNeighborsRegressor(n_neighbors=5)
knn.fit(X_train, y_train)

print_metrics(knn.predict(X_test), y_test)

R^2: 0.9109511089819917
MSE: 51601.317759999976


### Test csv

In [2220]:
data_test = preprocess_data(test)

data_test.head()

Unnamed: 0,laptop_ID,Company,Product,TypeName,Inches,ScreenResolution,Cpu,Ram,Memory,Gpu,...,CpuFrequency,SSD_capacity,HDD_capacity,ppi,is_gaming,is_ssd,gpu_brand,cpu_brand,os,weight_inches_ratio
0,86,Acer,Aspire 3,Notebook,15.6,1366x768,AMD A9-Series 9420 3GHz,4,1TB HDD,AMD Radeon R5,...,3.0,0,1024,100.45467,0,0,AMD,AMD,Windows,0.134615
1,1249,Razer,Blade Pro,Gaming,14.0,Full HD 1920x1080,Intel Core i7 7700HQ 2.8GHz,16,1TB SSD,Nvidia GeForce GTX 1060,...,2.8,0,0,157.350512,1,0,Nvidia,Intel,Windows,0.139286
2,122,Asus,VivoBook S15,Notebook,15.6,Full HD 1920x1080,Intel Core i7 8550U 1.8GHz,8,256GB SSD,Nvidia GeForce 940MX,...,1.8,256,0,141.211998,1,1,Nvidia,Intel,Windows,0.108974
3,993,Asus,Rog GL753VE-DS74,Gaming,17.3,Full HD 1920x1080,Intel Core i7 7700HQ 2.8GHz,16,256GB SSD + 1TB HDD,Nvidia GeForce GTX 1050 Ti,...,2.8,256,1024,127.335675,1,1,Nvidia,Intel,Windows,0.172832
4,739,Dell,Inspiron 3567,Notebook,15.6,1366x768,Intel Core i5 7200U 2.5GHz,12,1TB HDD,Intel HD Graphics 620,...,2.5,0,1024,100.45467,0,0,Intel,Intel,Windows,0.144231


### Предикт

In [2221]:
# num + cat fetures
cat_data_encoded = get_cat_data(data_test, cat_cols)
num_and_cat_data_train = pd.concat([data_test[num_cols], cat_data_encoded], axis=1)

predict = lr.predict(pca.transform(num_and_cat_data_train))

submit = pd.DataFrame({'Price_euros': predict})
submit['Index'] = submit.index
submit = submit[['Index', 'Price_euros']]
submit.head()


Unnamed: 0,Index,Price_euros
0,0,386.577972
1,1,1731.486218
2,2,817.249391
3,3,1790.090169
4,4,1104.433405


In [2222]:
# predict = knn.predict(pca.transform(data_test[num_cols]))

# submit = pd.DataFrame({'Price_euros': predict})
# submit['Index'] = submit.index
# submit = submit[['Index', 'Price_euros']]
# submit.head()

In [2223]:
submit.to_csv('submission.csv', index=False)