In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error,r2_score,mean_absolute_error

import warnings
warnings.filterwarnings('ignore')



In [2]:
df_train = pd.read_csv('Price_euros_train.csv')

In [3]:
df_test = pd.read_csv('Price_euros_test.csv')

In [4]:
df_train.isnull().sum()

laptop_ID            0
Company              0
Product              0
TypeName             0
Inches               0
ScreenResolution    11
Cpu                  0
Ram                  0
Memory               0
Gpu                  0
OpSys               53
Weight              22
Price_euros          0
dtype: int64

In [None]:
df_test.isnull().sum()

laptop_ID            0
Company              0
Product              0
TypeName             0
Inches               0
ScreenResolution     2
Cpu                  0
Ram                  0
Memory               0
Gpu                  0
OpSys               12
Weight               4
dtype: int64

In [None]:
df_train.duplicated().sum()
df_test.duplicated().sum()

0

In [None]:
df_train.dtypes
#df_test.dtypes

laptop_ID             int64
Company              object
Product              object
TypeName             object
Inches              float64
ScreenResolution     object
Cpu                  object
Ram                  object
Memory               object
Gpu                  object
OpSys                object
Weight               object
Price_euros         float64
dtype: object

In [None]:
df_train.nunique()

laptop_ID           1108
Company               18
Product              548
TypeName               6
Inches                16
ScreenResolution      36
Cpu                  112
Ram                    9
Memory                37
Gpu                  106
OpSys                  9
Weight               171
Price_euros          703
dtype: int64

In [None]:
df_train = df_train.dropna(subset=['OpSys'])
df_train = df_train.dropna(subset=['ScreenResolution'])
df_train = df_train.dropna(subset=['Weight'])
df_train = df_train.drop(['laptop_ID'], axis=1)
df_test = df_test.dropna(subset=['OpSys'])
df_test = df_test.dropna(subset=['ScreenResolution'])
df_test = df_test.dropna(subset=['Weight'])
df_test = df_test.drop(['laptop_ID'], axis=1)

In [None]:
df_test.head()

Unnamed: 0,Company,Product,TypeName,Inches,ScreenResolution,Cpu,Ram,Memory,Gpu,OpSys,Weight
0,Acer,Aspire 3,Notebook,15.6,1366x768,AMD A9-Series 9420 3GHz,4GB,1TB HDD,AMD Radeon R5,Windows 10,2.1kg
1,Razer,Blade Pro,Gaming,14.0,Full HD 1920x1080,Intel Core i7 7700HQ 2.8GHz,16GB,1TB SSD,Nvidia GeForce GTX 1060,Windows 10,1.95kg
2,Asus,VivoBook S15,Notebook,15.6,Full HD 1920x1080,Intel Core i7 8550U 1.8GHz,8GB,256GB SSD,Nvidia GeForce 940MX,Windows 10,1.7kg
3,Asus,Rog GL753VE-DS74,Gaming,17.3,Full HD 1920x1080,Intel Core i7 7700HQ 2.8GHz,16GB,256GB SSD + 1TB HDD,Nvidia GeForce GTX 1050 Ti,Windows 10,2.99kg
4,Dell,Inspiron 3567,Notebook,15.6,1366x768,Intel Core i5 7200U 2.5GHz,12GB,1TB HDD,Intel HD Graphics 620,Windows 10,2.25kg


In [None]:
df_train.head()

Unnamed: 0,Company,Product,TypeName,Inches,ScreenResolution,Cpu,Ram,Memory,Gpu,OpSys,Weight,Price_euros
0,Apple,Macbook Air,Ultrabook,13.3,1440x900,Intel Core i5 1.8GHz,8GB,128GB Flash Storage,Intel HD Graphics 6000,macOS,1.34kg,898.94
1,Apple,MacBook Pro,Ultrabook,15.4,IPS Panel Retina Display 2880x1800,Intel Core i7 2.7GHz,16GB,512GB SSD,AMD Radeon Pro 455,macOS,1.83kg,2537.45
2,Apple,MacBook Pro,Ultrabook,13.3,IPS Panel Retina Display 2560x1600,Intel Core i5 3.1GHz,8GB,256GB SSD,Intel Iris Plus Graphics 650,macOS,1.37kg,1803.6
3,Acer,Aspire 3,Notebook,15.6,1366x768,AMD A9-Series 9420 3GHz,4GB,500GB HDD,AMD Radeon R5,Windows 10,2.1kg,400.0
4,Apple,MacBook Pro,Ultrabook,15.4,IPS Panel Retina Display 2880x1800,Intel Core i7 2.2GHz,16GB,256GB Flash Storage,Intel Iris Pro Graphics,Mac OS X,2.04kg,2139.97


In [5]:
df_train['Company'].value_counts()

Dell         255
Lenovo       251
HP           232
Asus         135
Acer          91
MSI           44
Toshiba       41
Apple         17
Samsung        9
Mediacom       6
Microsoft      5
Razer          4
Xiaomi         4
Vero           4
Google         3
Fujitsu        3
Chuwi          2
LG             2
Name: Company, dtype: int64

In [6]:
df_test['Company'].value_counts()

Lenovo       46
Dell         42
HP           42
Asus         23
Acer         12
MSI          10
Toshiba       7
Apple         4
Razer         3
Huawei        2
Mediacom      1
LG            1
Microsoft     1
Chuwi         1
Name: Company, dtype: int64

In [7]:

numerical_columns = ['Inches','Price_euros']
categorical_columns = ['Company', 'Product', 'TypeName','ScreenResolution','Cpu','Ram','Memory','Gpu','OpSys','Weight']
numerical_columns_test = ['Inches']

label_encoder = LabelEncoder()
for column in categorical_columns:
    df_train[column] = label_encoder.fit_transform(df_train[column])
    df_test[column] = label_encoder.fit_transform(df_test[column])

for column in numerical_columns_test:
    df_test[f'{column}_squared'] = df_test[column] ** 2
    df_train[f'{column}_squared'] = df_train[column] ** 2

scaler = StandardScaler()
df_train[numerical_columns_test] = scaler.fit_transform(df_train[numerical_columns_test])
df_test[numerical_columns_test] = scaler.fit_transform(df_test[numerical_columns_test])
#df_test[categorical_columns] = scaler.fit_transform(df_test[categorical_columns])
#df_train[categorical_columns] = scaler.fit_transform(df_train[categorical_columns])
df_train.head()

Unnamed: 0,laptop_ID,Company,Product,TypeName,Inches,ScreenResolution,Cpu,Ram,Memory,Gpu,OpSys,Weight,Price_euros,Inches_squared
0,2,1,269,4,-1.221219,1,60,8,2,49,8,34,898.94,176.89
1,4,1,268,4,0.258515,24,81,1,29,8,8,68,2537.45,237.16
2,5,1,268,4,-1.221219,22,64,8,16,57,8,37,1803.6,176.89
3,6,0,51,3,0.399442,0,14,5,26,16,5,102,400.0,243.36
4,7,1,268,4,0.258515,24,80,1,15,58,3,87,2139.97,237.16


In [14]:
X_train = df_train.drop(['Price_euros'], axis=1)
y_train = df_train['Price_euros']
#X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
print(X_train,y_train)
X_test = df_test

model = LinearRegression()
model.fit(X_train, y_train)

w0 = model.intercept_
w1 = model.coef_

#Предсказание модели
y_pred_test = model.predict(X_test)

print(w0,len(w1))

      laptop_ID  Company  Product  TypeName    Inches  ScreenResolution  Cpu  \
0             2        1      269         4 -1.221219                 1   60   
1             4        1      268         4  0.258515                24   81   
2             5        1      268         4 -1.221219                22   64   
3             6        0       51         3  0.399442                 0   14   
4             7        1      268         4  0.258515                24   80   
...         ...      ...      ...       ...       ...               ...  ...   
1103       1315        2      505         3  0.399442                 0   85   
1104       1316        9      514         0 -0.727974                13   85   
1105       1318        9      176         3 -0.727974                 0   33   
1106       1319        7        2         3  0.399442                 0   85   
1107       1320        2      503         3  0.399442                 0   33   

      Ram  Memory  Gpu  OpSys  Weight  

In [None]:
#mse = mean_squared_error(y_train, y_pred)
#r2_square = r2_score(y_train,y_pred)
#mae= mean_absolute_error(y_train, y_pred)
#print(f" R-squared: {r2_square}")
#print(f'Mean Squared Error: {mse}')
#print(f'Mean Absolute Error: {mae}')

ValueError: Found input variables with inconsistent numbers of samples: [1026, 177]

In [15]:
y_pred_test = pd.DataFrame(y_pred_test, columns=['Price_euros'])
y_pred_test = y_pred_test.reset_index()

y_pred_test.to_csv("solution.csv", index=False)