In [1]:
import os
import pandas as pd

os.chdir(os.getcwd())

In [135]:
# Step 1: Get a general sense of the dataset
laptops = pd.read_csv("laptops.csv", encoding="Latin-1")
laptops.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1303 entries, 0 to 1302
Data columns (total 13 columns):
 #   Column                    Non-Null Count  Dtype 
---  ------                    --------------  ----- 
 0   Manufacturer              1303 non-null   object
 1   Model Name                1303 non-null   object
 2   Category                  1303 non-null   object
 3   Screen Size               1303 non-null   object
 4   Screen                    1303 non-null   object
 5   CPU                       1303 non-null   object
 6   RAM                       1303 non-null   object
 7    Storage                  1303 non-null   object
 8   GPU                       1303 non-null   object
 9   Operating System          1303 non-null   object
 10  Operating System Version  1133 non-null   object
 11  Weight                    1303 non-null   object
 12  Price (Euros)             1303 non-null   object
dtypes: object(13)
memory usage: 132.5+ KB


In [136]:
# Step 2: Relabel columns if needed
# Remove whitespace for the Storage column
new_columns = []
for column in laptops.columns:
    new_columns.append(column.strip())

laptops.columns = new_columns

In [137]:
# Since we have a variety of upper and lowercase letters in the column labels
# Replace spaces with underscores
# Remove special characters
# Making all labels lowercase
# Shortening any long column names


def clean_col(col):
    col = col.strip()
    col = col.replace("Operating System", "os")
    col = col.replace(" ", "_")
    col = col.replace("(", "")
    col = col.replace(")", "")
    col = col.lower()
    return col


new_columns = []
for c in laptops.columns:
    clean_c = clean_col(c)
    new_columns.append(clean_c)

laptops.columns = new_columns
print(laptops.columns)

Index(['manufacturer', 'model_name', 'category', 'screen_size', 'screen',
       'cpu', 'ram', 'storage', 'gpu', 'os', 'os_version', 'weight',
       'price_euros'],
      dtype='object')


In [138]:
# Step 3: Convert string columns to numeric if needed
# Look at a few rows
laptops.head(10)

Unnamed: 0,manufacturer,model_name,category,screen_size,screen,cpu,ram,storage,gpu,os,os_version,weight,price_euros
0,Apple,MacBook Pro,Ultrabook,"13.3""",IPS Panel Retina Display 2560x1600,Intel Core i5 2.3GHz,8GB,128GB SSD,Intel Iris Plus Graphics 640,macOS,,1.37kg,133969
1,Apple,Macbook Air,Ultrabook,"13.3""",1440x900,Intel Core i5 1.8GHz,8GB,128GB Flash Storage,Intel HD Graphics 6000,macOS,,1.34kg,89894
2,HP,250 G6,Notebook,"15.6""",Full HD 1920x1080,Intel Core i5 7200U 2.5GHz,8GB,256GB SSD,Intel HD Graphics 620,No OS,,1.86kg,57500
3,Apple,MacBook Pro,Ultrabook,"15.4""",IPS Panel Retina Display 2880x1800,Intel Core i7 2.7GHz,16GB,512GB SSD,AMD Radeon Pro 455,macOS,,1.83kg,253745
4,Apple,MacBook Pro,Ultrabook,"13.3""",IPS Panel Retina Display 2560x1600,Intel Core i5 3.1GHz,8GB,256GB SSD,Intel Iris Plus Graphics 650,macOS,,1.37kg,180360
5,Acer,Aspire 3,Notebook,"15.6""",1366x768,AMD A9-Series 9420 3GHz,4GB,500GB HDD,AMD Radeon R5,Windows,10,2.1kg,40000
6,Apple,MacBook Pro,Ultrabook,"15.4""",IPS Panel Retina Display 2880x1800,Intel Core i7 2.2GHz,16GB,256GB Flash Storage,Intel Iris Pro Graphics,Mac OS,X,2.04kg,213997
7,Apple,Macbook Air,Ultrabook,"13.3""",1440x900,Intel Core i5 1.8GHz,8GB,256GB Flash Storage,Intel HD Graphics 6000,macOS,,1.34kg,115870
8,Asus,ZenBook UX430UN,Ultrabook,"14.0""",Full HD 1920x1080,Intel Core i7 8550U 1.8GHz,16GB,512GB SSD,Nvidia GeForce MX150,Windows,10,1.3kg,149500
9,Acer,Swift 3,Ultrabook,"14.0""",IPS Panel Full HD 1920x1080,Intel Core i5 8250U 1.6GHz,8GB,256GB SSD,Intel UHD Graphics 620,Windows,10,1.6kg,77000


In [139]:
# screen_size, ram, weight, price_euros often evaluated as numeric
laptops["weight"] = laptops["weight"].str.replace(r"[a-zA-Z]+$", "", regex=True)
laptops["screen_size"] = laptops["screen_size"].str.replace('"', "")
laptops["ram"] = laptops["ram"].str.replace(r"[a-zA-Z]+$", "", regex=True)

In [140]:
laptops["price_euros"] = laptops["price_euros"].str.replace(",", ".")

In [141]:
laptops.head()

Unnamed: 0,manufacturer,model_name,category,screen_size,screen,cpu,ram,storage,gpu,os,os_version,weight,price_euros
0,Apple,MacBook Pro,Ultrabook,13.3,IPS Panel Retina Display 2560x1600,Intel Core i5 2.3GHz,8,128GB SSD,Intel Iris Plus Graphics 640,macOS,,1.37,1339.69
1,Apple,Macbook Air,Ultrabook,13.3,1440x900,Intel Core i5 1.8GHz,8,128GB Flash Storage,Intel HD Graphics 6000,macOS,,1.34,898.94
2,HP,250 G6,Notebook,15.6,Full HD 1920x1080,Intel Core i5 7200U 2.5GHz,8,256GB SSD,Intel HD Graphics 620,No OS,,1.86,575.0
3,Apple,MacBook Pro,Ultrabook,15.4,IPS Panel Retina Display 2880x1800,Intel Core i7 2.7GHz,16,512GB SSD,AMD Radeon Pro 455,macOS,,1.83,2537.45
4,Apple,MacBook Pro,Ultrabook,13.3,IPS Panel Retina Display 2560x1600,Intel Core i5 3.1GHz,8,256GB SSD,Intel Iris Plus Graphics 650,macOS,,1.37,1803.6


In [176]:
laptops["weight_kg"].value_counts()

weight_kg
2.20    126
2.10     58
2.00     45
2.40     44
2.30     41
       ... 
4.50      1
1.14      1
3.80      1
3.25      1
2.34      1
Name: count, Length: 171, dtype: int64

In [143]:
laptops[["weight", "screen_size", "price_euros"]] = laptops[
    ["weight", "screen_size", "price_euros"]
].astype(float)
laptops["ram"] = laptops["ram"].astype(int)

In [144]:
laptops.dtypes

manufacturer     object
model_name       object
category         object
screen_size     float64
screen           object
cpu              object
ram               int64
storage          object
gpu              object
os               object
os_version       object
weight          float64
price_euros     float64
dtype: object

In [145]:
laptops.rename(
    {"screen_size": "screen_size_inches", "ram": "ram_gb", "weight": "weight_kg"},
    axis=1,
    inplace=True,
)

In [146]:
laptops[["screen_size_inches", "ram_gb", "weight_kg", "price_euros"]].describe()

Unnamed: 0,screen_size_inches,ram_gb,weight_kg,price_euros
count,1303.0,1303.0,1303.0,1303.0
mean,15.017191,8.382195,2.038734,1123.686992
std,1.426304,5.084665,0.665475,699.009043
min,10.1,2.0,0.69,174.0
25%,14.0,4.0,1.5,599.0
50%,15.6,8.0,2.04,977.0
75%,15.6,8.0,2.3,1487.88
max,18.4,64.0,4.7,6099.0


In [147]:
# Step 4: extract values from strings such as the cpu and gpu column
laptops["cpu_manufacturer"] = laptops["cpu"].str.split().str[0]
laptops["gpu_manufacturer"] = laptops["gpu"].str.split().str[0]

In [148]:
laptops["cpu_manufacturer"].value_counts()

cpu_manufacturer
Intel      1240
AMD          62
Samsung       1
Name: count, dtype: int64

In [149]:
laptops["gpu_manufacturer"].value_counts()

gpu_manufacturer
Intel     722
Nvidia    400
AMD       180
ARM         1
Name: count, dtype: int64

In [150]:
# Step 5: Correcting Bad Values
print(laptops["os"].value_counts())

os
Windows      1125
No OS          66
Linux          62
Chrome OS      27
macOS          13
Mac OS          8
Android         2
Name: count, dtype: int64


In [151]:
# Two variations of the Apple operating system
mapping_dict = {
    "Android": "Android",
    "Chrome OS": "Chrome OS",
    "Linux": "Linux",
    "Mac OS": "macOS",
    "No OS": "No OS",
    "Windows": "Windows",
    "macOS": "macOS",
}
laptops["os"] = laptops["os"].map(mapping_dict)

In [152]:
# Step 6: Deal with missing values
laptops.isnull().sum()

manufacturer            0
model_name              0
category                0
screen_size_inches      0
screen                  0
cpu                     0
ram_gb                  0
storage                 0
gpu                     0
os                      0
os_version            170
weight_kg               0
price_euros             0
cpu_manufacturer        0
gpu_manufacturer        0
dtype: int64

In [153]:
laptops["os_version"].value_counts(dropna=False)

os_version
10      1072
NaN      170
7         45
X          8
10 S       8
Name: count, dtype: int64

In [154]:
os_with_null_v = laptops.loc[laptops["os_version"].isnull(), "os"]

In [155]:
os_with_null_v.value_counts()

os
No OS        66
Linux        62
Chrome OS    27
macOS        13
Android       2
Name: count, dtype: int64

The most frequent value is "No OS"; Thirteen of the laptops that come with macOS do not specify the version. os_version of macOS should be equal to X.

In [156]:
laptops.loc[laptops["os"] == "macOS", "os_version"] = "X"

In [157]:
laptops.loc[laptops["os"] == "No OS", "os_version"] = "Version Unknown"

In [158]:
laptops.to_csv("laptops_cleaned.csv", index=False)

In [159]:
# Analysis Questions: Are laptops made by Apple more expensive than those made by other manufacturers?
m_price = {}
for m in laptops["manufacturer"].unique():
    avg_price = laptops.loc[laptops["manufacturer"] == m, "price_euros"].mean()
    m_price[m] = avg_price

In [160]:
sorted_m_price = sorted(m_price.items(), key=lambda x: x[1], reverse=True)

In [161]:
sorted_m_price

[('Razer', 3346.1428571428573),
 ('LG', 2099.0),
 ('MSI', 1728.9081481481483),
 ('Google', 1677.6666666666667),
 ('Microsoft', 1612.3083333333334),
 ('Apple', 1564.1985714285713),
 ('Huawei', 1424.0),
 ('Samsung', 1413.4444444444443),
 ('Toshiba', 1267.8125),
 ('Dell', 1186.06898989899),
 ('Xiaomi', 1133.4625),
 ('Asus', 1104.1693670886077),
 ('Lenovo', 1086.3844444444446),
 ('HP', 1067.7748540145985),
 ('Fujitsu', 729.0),
 ('Acer', 626.7758252427185),
 ('Chuwi', 314.2966666666667),
 ('Mediacom', 295.0),
 ('Vero', 217.425)]

Apple laptops are more expensive than many other brands like Dell, Asus, Lenovo, HP, Acer, etc.
However, there are brands like Razer, LG, MSI, Google, and Microsoft that, on average, offer laptops at a higher price point than Apple.
So, while Apple laptops are certainly among the more expensive brands on the market, they are not the most expensive when compared to all other manufacturers. It's important to note that averages can be influenced by the range of products offered - some brands might have a wider range of prices (from budget to high-end), while others might focus only on premium segments.

For a general-purpose laptop, you might consider:

- Processor (CPU) Performance: Speed and number of cores.
- Graphics Performance (GPU): Especially important for gaming or graphic-intensive tasks.
- RAM: More RAM typically means better multitasking and performance.
- Storage: SSDs offer faster performance than HDDs. Consider both size and type of storage.
- Battery Life: Important for portability and convenience.
- Others:
Build Quality and Design: Durability and aesthetics.
Additional Features: Such as keyboard quality, number of ports, webcam, etc.

In [200]:
# Q2: What is the best value laptop with a screen size of 15" or more?
import numpy as np

laptops_over15 = laptops[laptops["screen_size_inches"] >= 15]

# Define weights (based on personal needs)
weights = {"cpu": 0.1, "ram": 0.1, "storage": 0.1, "price": 0.5, "weight": 0.2}

# Score the features (assuming they are numeric)
# For simplicity,  ram 8,16,24 gb - score 1,2,3
# cpu Intel, AMD, Samsung - score 3,2,1
# storage HDD 3 SSD 2 others 1
# price 499, 899, 1299 - score 3,2,1
# weight 1, 2, 3 - score 3,2,1


bins = [2, 8, 16, np.inf]  # Defines the ranges: 0-8, 8-16, >16
labels = [1, 2, 3]  # Score for each bin
laptops_over15["ram_score"] = pd.cut(
    laptops_over15["ram_gb"], bins=bins, labels=labels, right=False
).astype(int)

cpu_map = {"Intel": 3, "AMD": 2}
laptops_over15["cpu_score"] = laptops_over15["cpu_manufacturer"].map(cpu_map).fillna(1)

storage_function = lambda x: 3 if "SSD" in x else (1 if "HDD" in x else 2)
laptops_over15["storage_score"] = laptops_over15["storage"].apply(storage_function)

price_bins = [0.0, 899.0, 1299.0, np.inf]
price_labels = [3, 2, 1]
laptops_over15["price_score"] = pd.cut(
    laptops_over15["price_euros"], bins=price_bins, labels=price_labels, right=False
).astype(int)

weight_bins = [1.0, 2.0, 3.0, np.inf]
weight_labels = [3, 2, 1]
laptops_over15["weight_score"] = pd.cut(
    laptops_over15["weight_kg"], bins=weight_bins, labels=weight_labels, right=False
).astype(int)


# Calculate the total score
weighted_cols = []
for feature, weight in weights.items():
    laptops_over15["weighted_" + feature] = laptops_over15[feature + "_score"] * weight
    weighted_cols.append("weighted_" + feature)
laptops_over15["total_score"] = laptops_over15[weighted_cols].sum(axis=1)

# Rank laptops based on value score
laptops_over15 = laptops_over15.sort_values(by="total_score", ascending=False)

# Display top 10 laptops
print(laptops_over15[["model_name", "os", "total_score"]].head(20))

                                   model_name       os  total_score
2                                      250 G6    No OS          2.9
1182                              Vostro 5568    Linux          2.9
450       15-bs024nv (i5-7200U/8GB/128GB/W10)  Windows          2.9
747                                    250 G6  Windows          2.9
86                        Pavilion 15-CK000nv  Windows          2.9
671              V310-15ISK (i5-7200U/8GB/1TB  Windows          2.9
797                             Latitude 3580  Windows          2.9
501     15-BS026nv (i5-7200U/8GB/256GB/Radeon  Windows          2.9
24    15-BS101nv (i7-8550U/8GB/256GB/FHD/W10)  Windows          2.9
1133                                   250 G6  Windows          2.9
1170                                   250 G5  Windows          2.9
178              V310-15IKB (i5-7200U/8GB/1TB  Windows          2.9
100     15-bs017nv (i7-7500U/8GB/256GB/Radeon  Windows          2.9
64                                     250 G6  W

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  laptops_over15["ram_score"] = pd.cut(laptops_over15["ram_gb"], bins=bins, labels=labels, right=False).astype(int)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  laptops_over15["cpu_score"] = laptops_over15["cpu_manufacturer"].map(cpu_map).fillna(1)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  lap

In [201]:
laptops_over15.loc[1182]

manufacturer                                Dell
model_name                           Vostro 5568
category                                Notebook
screen_size_inches                          15.6
screen                         Full HD 1920x1080
cpu                   Intel Core i7 7500U 2.7GHz
ram_gb                                         8
storage                                256GB SSD
gpu                      Nvidia GeForce GT 940MX
os                                         Linux
os_version                                   NaN
weight_kg                                   1.98
price_euros                               895.01
cpu_manufacturer                           Intel
gpu_manufacturer                          Nvidia
ram_score                                      2
cpu_score                                      3
storage_score                                  3
price_score                                    3
weight_score                                   3
weighted_cpu        

As the customer weighted the laptop's price as the most important factor, weight the second.  Vostro 5568  with 1.98kg and 895.01 euros is one of the best-valued among the larger-than-15-inch laptops.

Ideal for users requiring a balance between performance and price, such as small business owners, students, or professionals who don't need a Windows-based system.

Good for tasks requiring moderate to high CPU and GPU performance, but might not suit users who need top-tier gaming or intensive graphics capabilities.

Customer Reviews could be combined to provide a more comprehensive evaluation of "best-values"