In [95]:
# Recap of the earlier loading and inspection steps.
import pandas as pd

df = pd.read_csv("data.csv")

# Quick looks at the raw frame. A cell only renders its final expression, so
# these intermediate calls are evaluated but produce no visible output here.
df.head()
df.tail()
df.shape
df.columns

# Shorten two column names; later cells refer to `proc_gn` and `ram`.
short_names = {"processor_gnrtn": "proc_gn", "ram_gb": "ram"}
df = df.rename(columns=short_names)

df.count()
df.info()  # prints dtypes and non-null counts
df.describe()
print(f"Number of Rows: {df.shape[0]} and Number of Columns: {df.shape[1]}")
df['brand'].unique()
df.nunique()

# Missing values per column; this is the value the cell displays.
df.isna().sum()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5000 entries, 0 to 4999
Data columns (total 15 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   brand              4900 non-null   object 
 1   processor_brand    5000 non-null   object 
 2   processor_name     4900 non-null   object 
 3   proc_gn            5000 non-null   int64  
 4   ram                4900 non-null   float64
 5   ram_type           5000 non-null   object 
 6   ssd                4900 non-null   float64
 7   hdd                4900 non-null   float64
 8   os                 5000 non-null   object 
 9   os_bit             5000 non-null   int64  
 10  graphic_card_gb    5000 non-null   int64  
 11  rating             5000 non-null   int64  
 12  Number of Ratings  5000 non-null   int64  
 13  Number of Reviews  5000 non-null   int64  
 14  Price              5000 non-null   int64  
dtypes: float64(3), int64(7), object(5)
memory usage: 586.1+ KB
Number of Row

brand                100
processor_brand        0
processor_name       100
proc_gn                0
ram                  100
ram_type               0
ssd                  100
hdd                  100
os                     0
os_bit                 0
graphic_card_gb        0
rating                 0
Number of Ratings      0
Number of Reviews      0
Price                  0
dtype: int64

<div class="alert alert-block alert-info">
    Import <b>Libraries</b>
</div>

In [96]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder

<div class="alert alert-block alert-info">
    1. <b>Data Pre-Processing</b>
</div>

<div class="alert alert-block alert-warning">
    1.1. <b>Manage Null Values</b>
</div>

In [97]:
# Drop the rating/review and OS columns.
# Reassigning (instead of `inplace=True`) together with `errors='ignore'` keeps
# this cell re-runnable: executing it a second time no longer raises KeyError
# for columns that have already been removed.
df = df.drop(
    columns=['rating', 'Number of Ratings', 'Number of Reviews', 'os', 'os_bit'],
    errors='ignore',
)

In [98]:
# Average of the Price column (shown as the cell output).
df['Price'].mean()

np.float64(151718.9478)

In [99]:
# Most frequent Price. mode() returns a Series (ties give several values),
# so the first entry is taken.
df['Price'].mode().iloc[0]

np.int64(140274)

In [100]:
# Replace missing brands with the most frequent brand, then print the column.
most_common_brand = df['brand'].mode()[0]
df['brand'] = df['brand'].fillna(most_common_brand)
print(df['brand'])

0        Apple
1         Acer
2          MSI
3        Apple
4       Lenovo
         ...  
4995       MSI
4996      Asus
4997    Lenovo
4998    Lenovo
4999      Dell
Name: brand, Length: 5000, dtype: object


In [101]:
# Replace missing RAM sizes with the most frequent value, then print the column.
most_common_ram = df['ram'].mode()[0]
df['ram'] = df['ram'].fillna(most_common_ram)
print(df['ram'])

0       16.0
1        8.0
2       16.0
3        8.0
4       32.0
        ... 
4995    32.0
4996    16.0
4997    32.0
4998     8.0
4999     4.0
Name: ram, Length: 5000, dtype: float64


In [102]:
def fillnaObjectMode(cols, frame=None):
    """Fill missing values in the given columns with each column's mode.

    Parameters
    ----------
    cols : iterable of str
        Column names to process. Names that are not present in the frame are
        skipped silently.
    frame : pandas.DataFrame, optional
        Frame to modify in place. Defaults to the notebook-level ``df`` so
        existing ``fillnaObjectMode(cols)`` calls keep working.

    Notes
    -----
    A column without any non-null value has no mode; such a column is filled
    with the string ``'Unknown'`` instead.
    """
    target = df if frame is None else frame
    for col in cols:
        if col not in target.columns:
            continue
        # Compute the mode once instead of once for the emptiness check and
        # again for the fill value.
        modes = target[col].mode()
        fill_value = modes.iloc[0] if not modes.empty else 'Unknown'
        target[col] = target[col].fillna(fill_value)

# Columns whose gaps are filled with the column mode.
# The names must match the renamed frame: the load cell renamed
# `processor_gnrtn` -> `proc_gn` and `ram_gb` -> `ram`, and fillnaObjectMode
# silently skips any name it cannot find, so the old names were never processed.
# Entries that are no longer present in `df` are skipped the same way.
object_cols = [
    'brand', 'processor_brand', 'processor_name', 'proc_gn',
    'ram', 'ram_type', 'ssd', 'hdd', 'os', 'os_bit',
    'graphic_card_gb', 'rating'
]

fillnaObjectMode(object_cols)

In [103]:
def fillnaNumericMode(cols, frame=None):
    """Fill missing values in the given numeric columns with each column's mode.

    Parameters
    ----------
    cols : iterable of str
        Column names to process. Names that are not present in the frame are
        skipped silently.
    frame : pandas.DataFrame, optional
        Frame to modify in place. Defaults to the notebook-level ``df`` so
        existing ``fillnaNumericMode(cols)`` calls keep working.

    Notes
    -----
    A column without any non-null value has no mode; such a column is filled
    with ``0`` instead.
    """
    target = df if frame is None else frame
    for col in cols:
        if col not in target.columns:
            continue
        # Compute the mode once instead of once for the emptiness check and
        # again for the fill value.
        modes = target[col].mode()
        fill_value = modes.iloc[0] if not modes.empty else 0
        target[col] = target[col].fillna(fill_value)

numeric_cols = ['Price', 'Number of Ratings', 'Number of Reviews']
fillnaNumericMode(numeric_cols)

print("\nNull Values After Filling:")
print(df.isnull().sum())

# Keep the first run of digits found in each value and store it as int64;
# values that contain no digit become 0.
# NOTE(review): the load cell renamed `ram_gb` to `ram`, so the 'ram_gb' entry
# is skipped by the guard below; `ram` is converted in the
# "Manage Data Types" cell instead.
for col in ['ram_gb', 'ssd', 'hdd', 'graphic_card_gb', 'rating']:
    if col in df.columns:
        # Raw string: '\d' inside a normal literal is an invalid escape sequence.
        # expand=False returns a Series rather than a one-column DataFrame.
        digits = df[col].astype(str).str.extract(r'(\d+)', expand=False)
        df[col] = digits.fillna(0).astype('int64')

# Coerce the price and count columns to int64; unparseable values become 0.
# Columns that are not present in `df` are skipped.
numeric_columns = ['Number of Ratings', 'Number of Reviews']
for col in ['Price', *numeric_columns]:
    if col in df.columns:
        df[col] = pd.to_numeric(df[col], errors='coerce').fillna(0).astype('int64')


Null Values After Filling:
brand              0
processor_brand    0
processor_name     0
proc_gn            0
ram                0
ram_type           0
ssd                0
hdd                0
graphic_card_gb    0
Price              0
dtype: int64


In [104]:
# Check non-null counts and dtypes after the null handling.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5000 entries, 0 to 4999
Data columns (total 10 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   brand            5000 non-null   object 
 1   processor_brand  5000 non-null   object 
 2   processor_name   5000 non-null   object 
 3   proc_gn          5000 non-null   int64  
 4   ram              5000 non-null   float64
 5   ram_type         5000 non-null   object 
 6   ssd              5000 non-null   int64  
 7   hdd              5000 non-null   int64  
 8   graphic_card_gb  5000 non-null   int64  
 9   Price            5000 non-null   int64  
dtypes: float64(1), int64(5), object(4)
memory usage: 390.8+ KB


<div class="alert alert-block alert-warning">
    1.2. <b>Manage Data Types</b>
</div>

In [105]:
# Make sure Price is stored as int64.
df['Price'] = df['Price'].astype('int64')

# Round-trip through text and keep the first run of digits (e.g. "16.0" -> 16);
# values without any digit become 0.
# The pattern is a raw string because '\d' is an invalid escape sequence in a
# normal literal, and expand=False yields a Series instead of a one-column
# DataFrame.
for col in ('proc_gn', 'ram'):
    digits = df[col].astype(str).str.extract(r'(\d+)', expand=False)
    df[col] = digits.fillna(0).astype('int64')

In [106]:
# Check the dtypes after the integer casts.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5000 entries, 0 to 4999
Data columns (total 10 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   brand            5000 non-null   object
 1   processor_brand  5000 non-null   object
 2   processor_name   5000 non-null   object
 3   proc_gn          5000 non-null   int64 
 4   ram              5000 non-null   int64 
 5   ram_type         5000 non-null   object
 6   ssd              5000 non-null   int64 
 7   hdd              5000 non-null   int64 
 8   graphic_card_gb  5000 non-null   int64 
 9   Price            5000 non-null   int64 
dtypes: int64(6), object(4)
memory usage: 390.8+ KB


In [107]:
def convertObjToInt(cols, frame=None):
    """Label-encode the given columns in place and return the fitted encoders.

    Parameters
    ----------
    cols : iterable of str
        Columns to encode. Names that are not present in the frame are skipped.
    frame : pandas.DataFrame, optional
        Frame to modify in place. Defaults to the notebook-level ``df`` so
        existing ``convertObjToInt(cols)`` calls keep working.

    Returns
    -------
    dict
        Maps each encoded column name to its fitted ``LabelEncoder``, so the
        integer codes can be translated back to the original labels
        (``classes_`` / ``inverse_transform``). Previously the encoders were
        discarded and the mapping was lost.
    """
    target = df if frame is None else frame
    encoders = {}
    for col in cols:
        if col not in target.columns:
            continue
        encoder = LabelEncoder()
        # Values are cast to str before fitting, so the learned classes are strings.
        target[col] = encoder.fit_transform(target[col].astype(str))
        encoders[col] = encoder
    return encoders

# Columns to encode; names that are not present in `df` are skipped by the
# function's guard.
categorical_columns = ['brand', 'processor_brand', 'processor_name', 'ram_type', 'os', 'os_bit']

# Keep the fitted encoders so the codes written to disk later can be decoded.
label_encoders = convertObjToInt(categorical_columns)

In [108]:
# Check the dtypes after label encoding.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5000 entries, 0 to 4999
Data columns (total 10 columns):
 #   Column           Non-Null Count  Dtype
---  ------           --------------  -----
 0   brand            5000 non-null   int64
 1   processor_brand  5000 non-null   int64
 2   processor_name   5000 non-null   int64
 3   proc_gn          5000 non-null   int64
 4   ram              5000 non-null   int64
 5   ram_type         5000 non-null   int64
 6   ssd              5000 non-null   int64
 7   hdd              5000 non-null   int64
 8   graphic_card_gb  5000 non-null   int64
 9   Price            5000 non-null   int64
dtypes: int64(10)
memory usage: 390.8 KB


In [109]:
# Preview the first two rows of the encoded frame.
df.head(2)

Unnamed: 0,brand,processor_brand,processor_name,proc_gn,ram,ram_type,ssd,hdd,graphic_card_gb,Price
0,1,1,7,12,16,0,1024,0,0,130926
1,0,1,0,10,8,0,0,1000,0,89246


In [110]:
# Write the processed frame to disk; index=False leaves the row index out of
# the file and header=True writes the column names as the first line.
df.to_csv('processed_data.csv', index=False, header=True)

In [111]:
# Read the file back; note that this rebinds `df` to the reloaded frame.
df = pd.read_csv('processed_data.csv')
# Bare expression: the frame is rendered as the cell output.
df

Unnamed: 0,brand,processor_brand,processor_name,proc_gn,ram,ram_type,ssd,hdd,graphic_card_gb,Price
0,1,1,7,12,16,0,1024,0,0,130926
1,0,1,0,10,8,0,0,1000,0,89246
2,6,0,2,11,16,1,256,0,6,167363
3,1,0,2,10,8,0,128,1000,6,143284
4,5,2,7,9,32,2,512,500,2,181685
...,...,...,...,...,...,...,...,...,...,...
4995,6,0,0,13,32,1,256,500,4,211992
4996,2,0,5,9,16,0,128,0,2,128870
4997,5,1,2,12,32,0,1024,0,0,186082
4998,5,2,6,12,8,0,512,500,2,117771
