Required libraries

In [3]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from scipy import stats
import seaborn as sns

Import dataset

In [4]:
# Load the raw CPU table; the path is resolved relative to the notebook's working directory.
df = pd.read_csv(filepath_or_buffer='cpu.csv')

Copy the dataset and show the shape

In [5]:
# Work on an independent (deep) copy so the frame loaded from disk stays untouched.
dfc = df.copy(deep=True)
dfc.shape

(3825, 12)

Get the first 10 rows

In [6]:
# Preview the first ten rows of the working copy.
dfc.head(n=10)

Unnamed: 0,cpuName,price,cpuMark,cpuValue,threadMark,threadValue,TDP,powerPerf,cores,testDate,socket,category
0,AMD Ryzen Threadripper PRO 5995WX,,108822,,3330,,280.0,388.65,64,2022,sWRX8,Desktop
1,AMD EPYC 7763,7299.99,88338,12.1,2635,0.36,280.0,315.49,64,2021,SP3,Server
2,AMD EPYC 7J13,,86006,,2387,,,,64,2021,unknown,Server
3,AMD EPYC 7713,7060.0,85861,12.16,2727,0.39,225.0,381.6,64,2021,SP3,Server
4,AMD Ryzen Threadripper PRO 3995WX,6807.98,83971,12.33,2626,0.39,280.0,299.9,64,2020,sWRX8,Desktop
5,AMD Ryzen Threadripper 3990X,8399.69,81568,9.71,2569,0.31,280.0,291.31,64,2020,sTRX4,Desktop
6,AMD Ryzen Threadripper PRO 5975WX,,80842,,3340,,280.0,288.72,32,2022,sWRX8,Desktop
7,AMD EPYC 7B13,,77460,,2564,,,,60,2021,unknown,Server
8,AMD EPYC 7643,5424.99,76455,14.09,2695,0.5,225.0,339.8,48,2021,SP3,Server
9,AMD EPYC 7702,4000.0,71646,17.91,2097,0.52,200.0,358.23,64,2020,SP3,Server


Get the columns

In [7]:
# List the column labels of the working copy.
dfc.columns

Index(['cpuName', 'price', 'cpuMark', 'cpuValue', 'threadMark', 'threadValue',
       'TDP', 'powerPerf', 'cores', 'testDate', 'socket', 'category'],
      dtype='object')

Get the dataset information

In [8]:
# Print per-column non-null counts and dtypes, plus the frame's memory usage.
dfc.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3825 entries, 0 to 3824
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   cpuName      3825 non-null   object 
 1   price        1967 non-null   float64
 2   cpuMark      3825 non-null   int64  
 3   cpuValue     1967 non-null   float64
 4   threadMark   3825 non-null   int64  
 5   threadValue  1967 non-null   float64
 6   TDP          3140 non-null   float64
 7   powerPerf    3140 non-null   object 
 8   cores        3825 non-null   int64  
 9   testDate     3825 non-null   int64  
 10  socket       3825 non-null   object 
 11  category     3825 non-null   object 
dtypes: float64(4), int64(4), object(4)
memory usage: 358.7+ KB


Check for the missing values

In [9]:
# Number of missing values in each column.
dfc.isnull().sum()

cpuName           0
price          1858
cpuMark           0
cpuValue       1858
threadMark        0
threadValue    1858
TDP             685
powerPerf       685
cores             0
testDate          0
socket            0
category          0
dtype: int64

Find the columns in which more than 25% of the values are missing

In [10]:
# Flag the columns whose share of missing values exceeds MISSING_FRACTION.
# The share is computed on the working copy `dfc` itself (not the originally
# loaded frame), so the result stays correct if rows of `dfc` have changed.
# NOTE: the variable keeps its historical name `nulled_10` because later cells
# iterate over it; the threshold actually applied is 25%.
MISSING_FRACTION = 0.25
missing_share = dfc.isna().mean()  # fraction of NaN per column
nulled_10 = dfc.columns[missing_share > MISSING_FRACTION]
nulled_10

Index(['price', 'cpuValue', 'threadValue'], dtype='object')

Drop every row that has a missing value in one of the columns flagged above (those with more than 25% missing values)

In [11]:
# Keep only the rows that have a value in every flagged column.
# One dropna over the whole subset is equivalent to dropping column by column,
# and rebinding the result (instead of inplace=True) keeps the step chainable.
dfc = dfc.dropna(axis=0, subset=list(nulled_10))

dfc.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1967 entries, 1 to 3806
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   cpuName      1967 non-null   object 
 1   price        1967 non-null   float64
 2   cpuMark      1967 non-null   int64  
 3   cpuValue     1967 non-null   float64
 4   threadMark   1967 non-null   int64  
 5   threadValue  1967 non-null   float64
 6   TDP          1938 non-null   float64
 7   powerPerf    1938 non-null   object 
 8   cores        1967 non-null   int64  
 9   testDate     1967 non-null   int64  
 10  socket       1967 non-null   object 
 11  category     1967 non-null   object 
dtypes: float64(4), int64(4), object(4)
memory usage: 199.8+ KB


Convert numeric values that are stored as text (with comma separators) into floats

A. PowerPerformance

In [12]:
# Convert the powerPerf column from text to float.
# The column is loaded with dtype object (presumably because some entries
# contain a comma as thousands separator - TODO confirm against the raw file),
# so commas are stripped literally before casting. Missing entries stay NaN.
# Going through the 'string' dtype keeps this cell re-runnable: a plain `.str`
# accessor would raise once the column has already been converted to float.
dfc['powerPerf'] = (
    dfc['powerPerf']
    .astype('string')
    .str.replace(',', '', regex=False)
    .astype('float')
)
dfc['powerPerf']

1       315.49
3       381.60
4       299.90
5       291.31
8       339.80
         ...  
3727      2.75
3747     31.70
3765       NaN
3796       NaN
3806       NaN
Name: powerPerf, Length: 1967, dtype: float64

Replace the remaining missing values with the mean of their column

In [13]:
# Collect the columns that still contain at least one missing value.
# NOTE: this rebinds `nulled_10`; the name is kept only because the next cell
# iterates over it. Here it means "any missing value", not a percentage cut-off.
nulled_10 = dfc.columns[dfc.isna().any()]
nulled_10

Index(['TDP', 'powerPerf'], dtype='object')

In [14]:
# Impute each column listed in `nulled_10` with that column's mean.
# The filled Series is assigned back to the frame explicitly: calling
# fillna(..., inplace=True) on `dfc[col]` operates on a column selection, which
# pandas 2.x warns about and which does not update `dfc` under Copy-on-Write.
for nulled in nulled_10:
    dfc[nulled] = dfc[nulled].fillna(dfc[nulled].mean())

dfc.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1967 entries, 1 to 3806
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   cpuName      1967 non-null   object 
 1   price        1967 non-null   float64
 2   cpuMark      1967 non-null   int64  
 3   cpuValue     1967 non-null   float64
 4   threadMark   1967 non-null   int64  
 5   threadValue  1967 non-null   float64
 6   TDP          1967 non-null   float64
 7   powerPerf    1967 non-null   float64
 8   cores        1967 non-null   int64  
 9   testDate     1967 non-null   int64  
 10  socket       1967 non-null   object 
 11  category     1967 non-null   object 
dtypes: float64(5), int64(4), object(3)
memory usage: 199.8+ KB


In [15]:
# Pairwise Pearson correlation between the numeric columns only.
dfc.corr(method='pearson', numeric_only=True)

Unnamed: 0,price,cpuMark,cpuValue,threadMark,threadValue,TDP,powerPerf,cores,testDate
price,1.0,0.6926,-0.232707,0.253788,-0.310359,0.56195,0.241716,0.767044,0.331479
cpuMark,0.6926,1.0,0.055265,0.624744,-0.299743,0.666466,0.527553,0.883971,0.611278
cpuValue,-0.232707,0.055265,1.0,0.194678,0.657488,0.097428,0.042758,-0.016071,0.09079
threadMark,0.253788,0.624744,0.194678,1.0,-0.201755,0.26998,0.64653,0.33487,0.779804
threadValue,-0.310359,-0.299743,0.657488,-0.201755,1.0,-0.141544,-0.286619,-0.27825,-0.283609
TDP,0.56195,0.666466,0.097428,0.26998,-0.141544,1.0,-0.070035,0.674119,0.132323
powerPerf,0.241716,0.527553,0.042758,0.64653,-0.286619,-0.070035,1.0,0.341354,0.718021
cores,0.767044,0.883971,-0.016071,0.33487,-0.27825,0.674119,0.341354,1.0,0.418999
testDate,0.331479,0.611278,0.09079,0.779804,-0.283609,0.132323,0.718021,0.418999,1.0
