In [1]:
import pandas as pd
import numpy as np
import re
from sklearn.preprocessing import StandardScaler

df_test = pd.read_csv('Price_euros_test.csv')
df_train = pd.read_csv('Price_euros_train.csv')

In [2]:
df_train.head(3)

Unnamed: 0,laptop_ID,Company,Product,TypeName,Inches,ScreenResolution,Cpu,Ram,Memory,Gpu,OpSys,Weight,Price_euros
0,2,Apple,Macbook Air,Ultrabook,13.3,1440x900,Intel Core i5 1.8GHz,8GB,128GB Flash Storage,Intel HD Graphics 6000,macOS,1.34kg,898.94
1,4,Apple,MacBook Pro,Ultrabook,15.4,IPS Panel Retina Display 2880x1800,Intel Core i7 2.7GHz,16GB,512GB SSD,AMD Radeon Pro 455,macOS,1.83kg,2537.45
2,5,Apple,MacBook Pro,Ultrabook,13.3,IPS Panel Retina Display 2560x1600,Intel Core i5 3.1GHz,8GB,256GB SSD,Intel Iris Plus Graphics 650,macOS,1.37kg,1803.6


In [3]:
df_train_nan = df_train.loc[:, df_train.isnull().any()]
df_train_nan.isnull().sum() / len(df_train) * 100

ScreenResolution    0.992780
OpSys               4.783394
Weight              1.985560
dtype: float64

In [4]:
df_train['OpSys'].fillna(df_train['OpSys'].mode()[0], inplace=True)
df_train['ScreenResolution'] = df_train['ScreenResolution'].fillna(method='ffill')
df_train['Weight'].fillna(df_train['Weight'].mode()[0], inplace=True)

df_train.isnull().sum()

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df_train['OpSys'].fillna(df_train['OpSys'].mode()[0], inplace=True)
  df_train['ScreenResolution'] = df_train['ScreenResolution'].fillna(method='ffill')
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df_train['Weight'].fillna(df_train['Weight'].mode()[0], inplace=True)


laptop_ID           0
Company             0
Product             0
TypeName            0
Inches              0
ScreenResolution    0
Cpu                 0
Ram                 0
Memory              0
Gpu                 0
OpSys               0
Weight              0
Price_euros         0
dtype: int64

In [5]:
df_train['Weight'] = df_train['Weight'].str.replace('kg', '').astype(float)
df_train['Ram'] = df_train['Ram'].str.replace('GB', '').astype(int)
df_train[['Width', 'Height']] = df_train['ScreenResolution'].str.extract(r'(\d+)x(\d+)').astype(int)
df_train['Cpu'] = df_train['Cpu'].str.extract(r'(\d+\.?\d*)GHz').astype(float)

df_train.drop(['ScreenResolution', 'laptop_ID'], axis=1, inplace=True)

df_train.head(3)

Unnamed: 0,Company,Product,TypeName,Inches,Cpu,Ram,Memory,Gpu,OpSys,Weight,Price_euros,Width,Height
0,Apple,Macbook Air,Ultrabook,13.3,1.8,8,128GB Flash Storage,Intel HD Graphics 6000,macOS,1.34,898.94,1440,900
1,Apple,MacBook Pro,Ultrabook,15.4,2.7,16,512GB SSD,AMD Radeon Pro 455,macOS,1.83,2537.45,2880,1800
2,Apple,MacBook Pro,Ultrabook,13.3,3.1,8,256GB SSD,Intel Iris Plus Graphics 650,macOS,1.37,1803.6,2560,1600


In [6]:
def process_memory(memory):
    memory_value = memory.split(' ')[0]
    num = re.search(r'\d+', memory_value).group()
    if 'TB' in memory_value:
        return int(num) * 1024
    elif 'GB' in memory_value:
        return int(num)
    else:
        return 0

df_train['Memory'] = df_train['Memory'].apply(process_memory)
df_train.head(3)


Unnamed: 0,Company,Product,TypeName,Inches,Cpu,Ram,Memory,Gpu,OpSys,Weight,Price_euros,Width,Height
0,Apple,Macbook Air,Ultrabook,13.3,1.8,8,128,Intel HD Graphics 6000,macOS,1.34,898.94,1440,900
1,Apple,MacBook Pro,Ultrabook,15.4,2.7,16,512,AMD Radeon Pro 455,macOS,1.83,2537.45,2880,1800
2,Apple,MacBook Pro,Ultrabook,13.3,3.1,8,256,Intel Iris Plus Graphics 650,macOS,1.37,1803.6,2560,1600


In [7]:
df_test.head(3)

Unnamed: 0,laptop_ID,Company,Product,TypeName,Inches,ScreenResolution,Cpu,Ram,Memory,Gpu,OpSys,Weight
0,86,Acer,Aspire 3,Notebook,15.6,1366x768,AMD A9-Series 9420 3GHz,4GB,1TB HDD,AMD Radeon R5,Windows 10,2.1kg
1,1249,Razer,Blade Pro,Gaming,14.0,Full HD 1920x1080,Intel Core i7 7700HQ 2.8GHz,16GB,1TB SSD,Nvidia GeForce GTX 1060,Windows 10,1.95kg
2,122,Asus,VivoBook S15,Notebook,15.6,Full HD 1920x1080,Intel Core i7 8550U 1.8GHz,8GB,256GB SSD,Nvidia GeForce 940MX,Windows 10,1.7kg


In [8]:
df_test_nan = df_test.loc[:, df_test.isnull().any()]
df_test_nan.isnull().sum() / len(df_test) * 100

ScreenResolution    1.025641
OpSys               6.153846
Weight              2.051282
dtype: float64

In [9]:
df_test['OpSys'].fillna(df_test['OpSys'].mode()[0], inplace=True)
df_test['ScreenResolution'] = df_test['ScreenResolution'].fillna(method='ffill')
df_test['Weight'].fillna(df_test['Weight'].mode()[0], inplace=True)

df_test.isnull().sum()

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df_test['OpSys'].fillna(df_test['OpSys'].mode()[0], inplace=True)
  df_test['ScreenResolution'] = df_test['ScreenResolution'].fillna(method='ffill')
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df_test['Weight'].fillna(df_test['Weight'].mode()[0], inplace=True)


laptop_ID           0
Company             0
Product             0
TypeName            0
Inches              0
ScreenResolution    0
Cpu                 0
Ram                 0
Memory              0
Gpu                 0
OpSys               0
Weight              0
dtype: int64

In [10]:
df_test['Weight'] = df_test['Weight'].str.replace('kg', '').astype(float)
df_test['Ram'] = df_test['Ram'].str.replace('GB', '').astype(int)
df_test[['Width', 'Height']] = df_test['ScreenResolution'].str.extract(r'(\d+)x(\d+)').astype(int)
df_test['Cpu'] = df_test['Cpu'].str.extract(r'(\d+\.?\d*)GHz').astype(float)

df_test.drop(['ScreenResolution', 'laptop_ID'], axis=1, inplace=True)

df_test['Memory'] = df_test['Memory'].apply(process_memory)

df_test.head(3)


Unnamed: 0,Company,Product,TypeName,Inches,Cpu,Ram,Memory,Gpu,OpSys,Weight,Width,Height
0,Acer,Aspire 3,Notebook,15.6,3.0,4,1024,AMD Radeon R5,Windows 10,2.1,1366,768
1,Razer,Blade Pro,Gaming,14.0,2.8,16,1024,Nvidia GeForce GTX 1060,Windows 10,1.95,1920,1080
2,Asus,VivoBook S15,Notebook,15.6,1.8,8,256,Nvidia GeForce 940MX,Windows 10,1.7,1920,1080


In [11]:
class LinearRegression:
    def __init__(self, learning_rate=0.01, iterations=1000):
        self.learning_rate = learning_rate
        self.iterations = iterations
        self.weights = None
        self.bias = None

    def fit(self, X, Y):
        n_samples, n_features = X.shape
        self.weights = np.zeros(n_features)
        self.bias = 0

        for _ in range(self.iterations):
            self.update_weights(X, Y, n_samples)

    def update_weights(self, X, Y, n_samples):
        Y_predicted = np.dot(X, self.weights) + self.bias
        dw = (1 / n_samples) * np.dot(X.T, (Y_predicted - Y))
        db = (1 / n_samples) * np.sum(Y_predicted - Y)

        self.weights -= self.learning_rate * dw
        self.bias -= self.learning_rate * db

    def predict(self, X):
        return np.dot(X, self.weights) + self.bias

In [12]:
df_numeric_train = df_train.select_dtypes(include=np.number)
df_numeric_test = df_test.select_dtypes(include=np.number)

In [13]:
X_train = df_numeric_train.drop('Price_euros', axis=1)
y_train = df_numeric_train['Price_euros']

In [14]:
scaler = StandardScaler()
X_train_std = scaler.fit_transform(X_train)
X_train_std = pd.DataFrame(X_train_std, columns=X_train.columns)

df_numeric_test_std = scaler.fit_transform(df_numeric_test)
X_test = df_numeric_test_std

In [15]:
model = LinearRegression(iterations = 1000, learning_rate = 0.01)
model.fit(X_train_std, y_train)

y_test = model.predict(X_test)

In [16]:
output = pd.DataFrame({
    'index': range(len(y_test)),
    'Price_euros': y_test
})
output.to_csv('output.csv', index=False)