# Homework Week 2

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

The data we used here from:
- https://raw.githubusercontent.com/alexeygrigorev/datasets/master/laptops.csv

In [4]:
df = pd.read_csv('data/laptops.csv')
df.head()

Unnamed: 0,Laptop,Status,Brand,Model,CPU,RAM,Storage,Storage type,GPU,Screen,Touch,Final Price
0,ASUS ExpertBook B1 B1502CBA-EJ0436X Intel Core...,New,Asus,ExpertBook,Intel Core i5,8,512,SSD,,15.6,No,1009.0
1,Alurin Go Start Intel Celeron N4020/8GB/256GB ...,New,Alurin,Go,Intel Celeron,8,256,SSD,,15.6,No,299.0
2,ASUS ExpertBook B1 B1502CBA-EJ0424X Intel Core...,New,Asus,ExpertBook,Intel Core i3,8,256,SSD,,15.6,No,789.0
3,MSI Katana GF66 12UC-082XES Intel Core i7-1270...,New,MSI,Katana,Intel Core i7,16,1000,SSD,RTX 3050,15.6,No,1199.0
4,HP 15S-FQ5085NS Intel Core i5-1235U/16GB/512GB...,New,HP,15S,Intel Core i5,16,512,SSD,,15.6,No,669.01


In [5]:
# Preparing the dataset
df.columns = df.columns.str.lower().str.replace(' ', '_')
df.head()

Unnamed: 0,laptop,status,brand,model,cpu,ram,storage,storage_type,gpu,screen,touch,final_price
0,ASUS ExpertBook B1 B1502CBA-EJ0436X Intel Core...,New,Asus,ExpertBook,Intel Core i5,8,512,SSD,,15.6,No,1009.0
1,Alurin Go Start Intel Celeron N4020/8GB/256GB ...,New,Alurin,Go,Intel Celeron,8,256,SSD,,15.6,No,299.0
2,ASUS ExpertBook B1 B1502CBA-EJ0424X Intel Core...,New,Asus,ExpertBook,Intel Core i3,8,256,SSD,,15.6,No,789.0
3,MSI Katana GF66 12UC-082XES Intel Core i7-1270...,New,MSI,Katana,Intel Core i7,16,1000,SSD,RTX 3050,15.6,No,1199.0
4,HP 15S-FQ5085NS Intel Core i5-1235U/16GB/512GB...,New,HP,15S,Intel Core i5,16,512,SSD,,15.6,No,669.01


In [6]:
df.final_price

0       1009.00
1        299.00
2        789.00
3       1199.00
4        669.01
         ...   
2155    2699.99
2156    2899.99
2157    3399.99
2158    1899.99
2159    1699.99
Name: final_price, Length: 2160, dtype: float64

The columns used are:
- `ram`
- `storage`
- `screen`
- `final_price`

In [8]:
df_used = df[['ram', 'storage', 'screen', 'final_price']]
df_used.head()

Unnamed: 0,ram,storage,screen,final_price
0,8,512,15.6,1009.0
1,8,256,15.6,299.0
2,8,256,15.6,789.0
3,16,1000,15.6,1199.0
4,16,512,15.6,669.01


## Q1: There's one columns with missing value, what is it?

In [10]:
df_used.isnull().sum()

ram            0
storage        0
screen         4
final_price    0
dtype: int64

The only column that have a missing value is `screen`

## Q2: What's the median for `ram`

In [13]:
df_used.ram.median()

16.0

## Q3: Modelling 

In [15]:
def splitting_data(df):
    # Shuffling the data
    n = len(df)
    
    idx = np.arange(n)
    np.random.shuffle(idx)
    
    # Splitting the dataset into train/val/test 60%/20%/20%
    n_test = int(n * 0.2)
    n_val = int(n * 0.2)
    n_train = n - n_val - n_test

    df_train = df.iloc[idx[:n_train]]
    df_val = df.iloc[idx[n_train:n_train+n_val]]
    df_test = df.iloc[idx[n_train+n_val:]]    

    # Reset index
    df_train = df_train.reset_index(drop=True)
    df_val = df_val.reset_index(drop=True)
    df_test = df_test.reset_index(drop=True)

    return df_train, df_val, df_test

In [16]:
def train_lin_reg(X, y):
    ones = np.ones(X.shape[0])
    X = np.column_stack([ones, X])

    XTX = X.T.dot(X)
    XTX_inv = np.linalg.inv(XTX)
    w_full = XTX_inv.dot(X.T).dot(y)

    return w_full[0], w_full[1:]

In [17]:
def rmse(y, y_pred):
    se = (y - y_pred) ** 2
    mse = se.mean()
    return np.sqrt(mse)

In [18]:
def prepare_mean(df):
    df = df.fillna(df.mean())
    return df

def prepare_0(df):
    df = df.fillna(0)
    return df

In [19]:
np.random.seed(42)
df_mean = prepare_mean(df_used)
df_0 = prepare_0(df_used)

train_mean, val_mean, test_mean = splitting_data(df_mean)
train_0, val_0, test_0 = splitting_data(df_0)

In [20]:
# RMSE for filling with mean
w0, w = train_lin_reg(train_mean.drop('final_price', axis=1), train_mean['final_price'])

X_val = val_mean.drop('final_price', axis=1)
y_val = val_mean['final_price']
y_pred = w0 + X_val.dot(w)

rmse(y_val, y_pred).round(2)

600.27

In [21]:
# RMSE for filling with 0
w0, w = train_lin_reg(train_0.drop('final_price', axis=1), train_0['final_price'])

X_val = val_0.drop('final_price', axis=1)
y_val = val_0['final_price']
y_pred = w0 + X_val.dot(w)

rmse(y_val, y_pred)

559.9658524480305

It's just slighlty different between using 0 and using mean. The data using 0 is lower than the data using mean.

## Q4: Using regularization

In [24]:
def train_linear_regression_reg(X, y, r=0.001):
    ones = np.ones(X.shape[0])
    X = np.column_stack([ones, X])

    XTX = X.T.dot(X)
    XTX = XTX + r * np.eye(XTX.shape[0])
    
    XTX_inv = np.linalg.inv(XTX)
    w_full = XTX_inv.dot(X.T).dot(y)

    return w_full[0], w_full[1:]

In [25]:
# checking the best regularization
r = [0, 0.01, 0.1, 1, 5, 10, 100]
X = train_0.drop('final_price', axis=1)
y = train_0['final_price']

for num in r:
    w0, w = train_linear_regression_reg(X, y, r=num)
    
    X_val = val_0.drop('final_price', axis=1)
    y_val = val_0['final_price']
    y_pred = w0 + X_val.dot(w)
    
    print(num, (rmse(y_val, y_pred)).round(2))

0 559.97
0.01 559.97
0.1 559.98
1 560.14
5 560.86
10 561.55
100 563.81


The lowest `RSME` is with `0` and `0.01` which the value is 559.97

## Q5: Randomize seed

In [27]:
seed = np.arange(0,10)

In [28]:
score = []
for num in seed:
    np.random.seed(num)
    df_0 = prepare_0(df_used)
    
    train_0, val_0, test_0 = splitting_data(df_0)

    w0, w = train_lin_reg(train_0.drop('final_price', axis=1), train_0['final_price'])

    X_val = val_0.drop('final_price', axis=1)
    y_val = val_0['final_price']
    y_pred = w0 + X_val.dot(w)
    
    score.append(rmse(y_val, y_pred))

In [29]:
std = np.std(score)
std.round(3)

29.176

The standard deviation for all of the scores is 29.176

## Q6: Case

In [32]:
np.random.seed(9)
df_0 = prepare_0(df_used)

train_0, val_0, test_0 = splitting_data(df_0)
df_combine = pd.concat([train_0, val_0])

X = df_combine.drop('final_price', axis=1)
y = df_combine['final_price']

w0, w = train_linear_regression_reg(X, y, r =0.001)

X_test = test_0.drop('final_price', axis=1)
y_test = test_0['final_price']
y_pred = w0 + X_test.dot(w)

rmse(y_test, y_pred)

608.6099822049559

So using seed 9, and combine the train and val dataset, also regularize with `0.001`, then test the `RMSE` using test dataset, The `RMSE` got is `608.6099`