In [26]:
## Previous code 
import pandas as pd
df = pd.read_csv("data.csv")
df.head(5)
df.tail(5)
df.shape
df.columns
df = df.rename(columns={
    "processor_gnrtn": "proc_gn",
    "ram_gb": "ram"
})
df.count()
df.info()
df.describe()
print(f"Number of Rows: {df.shape[0]} and Number of Columns: {df.shape[1]}")
df['brand'].unique()
df.nunique()
df.isnull().sum()          

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 823 entries, 0 to 822
Data columns (total 19 columns):
 #   Column             Non-Null Count  Dtype 
---  ------             --------------  ----- 
 0   brand              823 non-null    object
 1   processor_brand    823 non-null    object
 2   processor_name     823 non-null    object
 3   proc_gn            823 non-null    object
 4   ram                823 non-null    object
 5   ram_type           823 non-null    object
 6   ssd                823 non-null    object
 7   hdd                823 non-null    object
 8   os                 823 non-null    object
 9   os_bit             823 non-null    object
 10  graphic_card_gb    823 non-null    object
 11  weight             823 non-null    object
 12  warranty           823 non-null    object
 13  Touchscreen        823 non-null    object
 14  msoffice           823 non-null    object
 15  Price              823 non-null    int64 
 16  rating             823 non-null    object
 1

brand                0
processor_brand      0
processor_name       0
proc_gn              0
ram                  0
ram_type             0
ssd                  0
hdd                  0
os                   0
os_bit               0
graphic_card_gb      0
weight               0
warranty             0
Touchscreen          0
msoffice             0
Price                0
rating               0
Number of Ratings    0
Number of Reviews    0
dtype: int64

<div class="alert alert-block alert-info">
    Import <b>Libraries</b>
</div>

In [27]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder

<div class="alert alert-block alert-info">
    1. <b>Data Pre-Processing</b>
</div>

<div class="alert alert-block alert-warning">
    1.1. <b>Manage Null Values</b>
</div>

In [28]:
df.drop(['weight', 'warranty', 'Touchscreen', 'msoffice'], axis=1, inplace=True)

In [29]:
df['Price'].mean()

np.float64(76745.17739975698)

In [30]:
df['Price'].mode().iloc[0]

np.int64(59990)

In [31]:
df['brand'] = df['brand'].fillna(df['brand'].mode()[0])
print(df['brand'])

0        ASUS
1      Lenovo
2      Lenovo
3        ASUS
4        ASUS
        ...  
818      ASUS
819      ASUS
820      ASUS
821      ASUS
822    Lenovo
Name: brand, Length: 823, dtype: object


In [32]:
def fillnaObjectMode(cols):
    for col in cols:
        if col in df.columns:
            df[col] = df[col].fillna(df[col].mode()[0] if not df[col].mode().empty else 'Unknown')

object_cols = [
    'brand', 'processor_brand', 'processor_name', 'processor_gnrtn',
    'ram_gb', 'ram_type', 'ssd', 'hdd', 'os', 'os_bit',
    'graphic_card_gb', 'rating'
]

fillnaObjectMode(object_cols)

In [33]:
def fillnaNumericMode(cols):
    for col in cols:
        if col in df.columns:
            df[col] = df[col].fillna(df[col].mode()[0] if not df[col].mode().empty else 0)

numeric_cols = ['Price', 'Number of Ratings', 'Number of Reviews']
fillnaNumericMode(numeric_cols)

print("\nNull Values After Filling:")
print(df.isnull().sum())

for col in ['ram_gb', 'ssd', 'hdd', 'graphic_card_gb', 'rating']:
    if col in df.columns:
        df[col] = df[col].astype(str).str.extract('(\d+)').fillna(0).astype('int64')

if 'Price' in df.columns:
    df['Price'] = pd.to_numeric(df['Price'], errors='coerce').fillna(0).astype('int64')

numeric_columns = ['Number of Ratings', 'Number of Reviews']
for col in numeric_columns:
    if col in df.columns:
        df[col] = pd.to_numeric(df[col], errors='coerce').fillna(0).astype('int64')


Null Values After Filling:
brand                0
processor_brand      0
processor_name       0
proc_gn              0
ram                  0
ram_type             0
ssd                  0
hdd                  0
os                   0
os_bit               0
graphic_card_gb      0
Price                0
rating               0
Number of Ratings    0
Number of Reviews    0
dtype: int64


In [34]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 823 entries, 0 to 822
Data columns (total 15 columns):
 #   Column             Non-Null Count  Dtype 
---  ------             --------------  ----- 
 0   brand              823 non-null    object
 1   processor_brand    823 non-null    object
 2   processor_name     823 non-null    object
 3   proc_gn            823 non-null    object
 4   ram                823 non-null    object
 5   ram_type           823 non-null    object
 6   ssd                823 non-null    int64 
 7   hdd                823 non-null    int64 
 8   os                 823 non-null    object
 9   os_bit             823 non-null    object
 10  graphic_card_gb    823 non-null    int64 
 11  Price              823 non-null    int64 
 12  rating             823 non-null    int64 
 13  Number of Ratings  823 non-null    int64 
 14  Number of Reviews  823 non-null    int64 
dtypes: int64(7), object(8)
memory usage: 96.6+ KB


<div class="alert alert-block alert-warning">
    1.2. <b>Manage Data Types</b>
</div>

In [35]:
df['Price'] = df['Price'].astype('int64')

In [36]:
def convertObjToInt(cols):
    for col in cols:
        if col in df.columns:
            le = LabelEncoder()
            df[col] = le.fit_transform(df[col].astype(str))

categorical_columns = ['brand', 'processor_brand', 'processor_name', 'processor_gnrtn',
                       'ram_type', 'os', 'os_bit']
convertObjToInt(categorical_columns)

In [37]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 823 entries, 0 to 822
Data columns (total 15 columns):
 #   Column             Non-Null Count  Dtype 
---  ------             --------------  ----- 
 0   brand              823 non-null    int64 
 1   processor_brand    823 non-null    int64 
 2   processor_name     823 non-null    int64 
 3   proc_gn            823 non-null    object
 4   ram                823 non-null    object
 5   ram_type           823 non-null    int64 
 6   ssd                823 non-null    int64 
 7   hdd                823 non-null    int64 
 8   os                 823 non-null    int64 
 9   os_bit             823 non-null    int64 
 10  graphic_card_gb    823 non-null    int64 
 11  Price              823 non-null    int64 
 12  rating             823 non-null    int64 
 13  Number of Ratings  823 non-null    int64 
 14  Number of Reviews  823 non-null    int64 
dtypes: int64(13), object(2)
memory usage: 96.6+ KB


In [38]:
df.head(2)

Unnamed: 0,brand,processor_brand,processor_name,proc_gn,ram,ram_type,ssd,hdd,os,os_bit,graphic_card_gb,Price,rating,Number of Ratings,Number of Reviews
0,1,1,1,10th,4 GB,1,0,1024,2,1,0,34649,2,3,0
1,5,1,1,10th,4 GB,1,0,1024,2,1,0,38999,3,65,5


In [39]:
df.to_csv('processed_data.csv', index=False, header=True)

In [40]:
df = pd.read_csv('processed_data.csv')
df

Unnamed: 0,brand,processor_brand,processor_name,proc_gn,ram,ram_type,ssd,hdd,os,os_bit,graphic_card_gb,Price,rating,Number of Ratings,Number of Reviews
0,1,1,1,10th,4 GB,1,0,1024,2,1,0,34649,2,3,0
1,5,1,1,10th,4 GB,1,0,1024,2,1,0,38999,3,65,5
2,5,1,1,10th,4 GB,1,0,1024,2,1,0,39999,3,8,1
3,1,1,2,10th,8 GB,1,512,0,2,0,2,69990,3,0,0
4,1,1,0,Not Available,4 GB,1,0,512,2,1,0,26990,3,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
818,1,0,10,Not Available,4 GB,1,1024,0,2,1,0,135990,3,0,0
819,1,0,10,Not Available,4 GB,1,1024,0,2,1,0,144990,3,0,0
820,1,0,10,Not Available,4 GB,1,1024,0,2,1,4,149990,3,0,0
821,1,0,10,Not Available,4 GB,1,1024,0,2,1,4,142990,3,0,0
