In [1]:
import pandas as pd
import numpy as np

### Get the version information

In [2]:
# Show the version string of the imported pandas package.
pd.__version__

'2.2.2'

In [3]:
!wget https://raw.githubusercontent.com/alexeygrigorev/datasets/master/laptops.csv

--2024-09-19 12:42:51--  https://raw.githubusercontent.com/alexeygrigorev/datasets/master/laptops.csv
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 2606:50c0:8003::154, 2606:50c0:8000::154, 2606:50c0:8001::154, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|2606:50c0:8003::154|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 298573 (292K) [text/plain]
Saving to: ‘laptops.csv.3’


2024-09-19 12:42:51 (4.89 MB/s) - ‘laptops.csv.3’ saved [298573/298573]



In [4]:
# Load the downloaded CSV, then print column dtypes and non-null counts.
df = pd.read_csv(filepath_or_buffer='laptops.csv')
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2160 entries, 0 to 2159
Data columns (total 12 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   Laptop        2160 non-null   object 
 1   Status        2160 non-null   object 
 2   Brand         2160 non-null   object 
 3   Model         2160 non-null   object 
 4   CPU           2160 non-null   object 
 5   RAM           2160 non-null   int64  
 6   Storage       2160 non-null   int64  
 7   Storage type  2118 non-null   object 
 8   GPU           789 non-null    object 
 9   Screen        2156 non-null   float64
 10  Touch         2160 non-null   object 
 11  Final Price   2160 non-null   float64
dtypes: float64(2), int64(2), object(8)
memory usage: 202.6+ KB


In [5]:
# Preview the first five rows.
df.head(n=5)

Unnamed: 0,Laptop,Status,Brand,Model,CPU,RAM,Storage,Storage type,GPU,Screen,Touch,Final Price
0,ASUS ExpertBook B1 B1502CBA-EJ0436X Intel Core...,New,Asus,ExpertBook,Intel Core i5,8,512,SSD,,15.6,No,1009.0
1,Alurin Go Start Intel Celeron N4020/8GB/256GB ...,New,Alurin,Go,Intel Celeron,8,256,SSD,,15.6,No,299.0
2,ASUS ExpertBook B1 B1502CBA-EJ0424X Intel Core...,New,Asus,ExpertBook,Intel Core i3,8,256,SSD,,15.6,No,789.0
3,MSI Katana GF66 12UC-082XES Intel Core i7-1270...,New,MSI,Katana,Intel Core i7,16,1000,SSD,RTX 3050,15.6,No,1199.0
4,HP 15S-FQ5085NS Intel Core i5-1235U/16GB/512GB...,New,HP,15S,Intel Core i5,16,512,SSD,,15.6,No,669.01


### Laptop brands in the dataset

In [6]:
# Distinct brand names, in order of first appearance.
df.loc[:, 'Brand'].unique()

array(['Asus', 'Alurin', 'MSI', 'HP', 'Lenovo', 'Medion', 'Acer', 'Apple',
       'Razer', 'Gigabyte', 'Dell', 'LG', 'Samsung', 'PcCom', 'Microsoft',
       'Primux', 'Prixton', 'Dynabook Toshiba', 'Thomson', 'Denver',
       'Deep Gaming', 'Vant', 'Innjoo', 'Jetwing', 'Millenium', 'Realme',
       'Toshiba'], dtype=object)

In [7]:
# Number of distinct brands (missing values are not counted).
df['Brand'].nunique(dropna=True)

27

### Columns with NA values

In [8]:
# Keep the column labels for which at least one row is missing.
df.columns[df.isna().any(axis=0)]

Index(['Storage type', 'GPU', 'Screen'], dtype='object')

### Maximum final price of Dell notebooks in the dataset

In [9]:
# Five highest Dell prices, largest first; the first row is the maximum.
df.loc[df['Brand'] == 'Dell', 'Final Price'].nlargest(n=5)

1335    3936.00
1334    3469.00
1346    3012.09
1347    2818.09
1323    2450.71
Name: Final Price, dtype: float64

### Most common screen size in the dataset


1. Find the median value of the `Screen` column in the dataset.
2. Calculate the most frequent value of the same `Screen` column.
3. Use the `fillna` method to fill the missing values in the `Screen` column with the most frequent value from the previous step.
4. Calculate the median value of `Screen` once again.

In [10]:
# Five most frequent screen sizes with their counts.
df['Screen'].value_counts().head(n=5)

Screen
15.6    1009
14.0     392
16.0     174
17.3     161
13.3     131
Name: count, dtype: int64

In [11]:
# Median screen size before filling; missing values are skipped.
df['Screen'].median(skipna=True)

15.6

In [12]:
# mode() always returns a Series (several sorted values when there is a tie).
# Take the first entry so screen_mode is a scalar even on a tie; squeeze() would
# hand back a Series in that case and fillna would then align on index instead
# of filling every gap with one value.
screen_mode = df['Screen'].mode().iloc[0]
screen_mode

15.6

In [13]:
# Median after replacing missing screen sizes with the most frequent one
# (df itself is not modified).
df['Screen'].fillna(value=screen_mode).median()

15.6

### A starter on linear regression using the normal equation


Step 1: Select a subset of the data

- Innjoo laptops

- and 3 numerical columns only (RAM, Storage, Screen)

- no categorical columns yet


In [14]:
# search "Innjoo" in the dataset
innjoo = pd.DataFrame()
for col in df.select_dtypes(object).columns:
    contain_innjoo = df.loc[df[col].str.contains('Innjoo', case=False, na=False), col]
    innjoo = pd.concat([innjoo, contain_innjoo], axis=1).dropna(how='all', axis=1)
innjoo


Unnamed: 0,Laptop,Brand
1478,InnJoo Voom Excellence Intel Celeron N4020/8GB...,Innjoo
1479,InnJoo Voom Excellence Pro Intel Celeron N4020...,Innjoo
1480,Innjoo Voom Intel Celeron N3350/4GB/64GB eMMC/...,Innjoo
1481,Innjoo Voom Laptop Max Intel Celeron N3350/6GB...,Innjoo
1482,Innjoo Voom Laptop Pro Intel Celeron N3350/6GB...,Innjoo
1483,Innjoo Voom Pro Intel Celeron N3350/6GB/128GB ...,Innjoo


In [15]:
# Feature table: Innjoo rows only, restricted to the three numeric columns.
X = df.loc[
    df['Brand'] == 'Innjoo',
    ['RAM', 'Storage', 'Screen'],
]
X


Unnamed: 0,RAM,Storage,Screen
1478,8,256,15.6
1479,8,512,15.6
1480,4,64,14.1
1481,6,64,14.1
1482,6,128,14.1
1483,6,128,14.1


In [16]:
# Convert the feature table to a plain ndarray. np.asarray accepts a DataFrame as
# well as an ndarray, so re-running this cell is harmless; X.to_numpy() would raise
# AttributeError on the second run because X has already been rebound to an array.
X = np.asarray(X)
X

array([[  8. , 256. ,  15.6],
       [  8. , 512. ,  15.6],
       [  4. ,  64. ,  14.1],
       [  6. ,  64. ,  14.1],
       [  6. , 128. ,  14.1],
       [  6. , 128. ,  14.1]])

Step 2: Perform linear regression using the normal equation, $w = (X^T X)^{-1} X^T y$

In [17]:
# Gram matrix X^T X of the feature matrix.
XTX = np.matmul(X.T, X)
XTX

array([[2.52000e+02, 8.32000e+03, 5.59800e+02],
       [8.32000e+03, 3.68640e+05, 1.73952e+04],
       [5.59800e+02, 1.73952e+04, 1.28196e+03]])

In [18]:
# Invert the Gram matrix X^T X; np.linalg.inv raises LinAlgError if it is singular.
XTX_inv = np.linalg.inv(XTX)
XTX_inv

array([[ 2.78025381e-01, -1.51791334e-03, -1.00809855e-01],
       [-1.51791334e-03,  1.58286725e-05,  4.48052175e-04],
       [-1.00809855e-01,  4.48052175e-04,  3.87214888e-02]])

In [19]:
# Target vector: one value per row of X (six entries).
y = np.asarray([
    1100, 1300, 800,
    900, 1000, 1100,
])
y

array([1100, 1300,  800,  900, 1000, 1100])

In [20]:
# Weights from the normal equation: w = (X^T X)^-1 X^T y.
w = (XTX_inv @ X.T).dot(y)
w

array([45.58076606,  0.42783519, 45.29127938])

In [21]:
# Sum of all fitted weights.
np.sum(w)

91.2998806299557