<a href="https://colab.research.google.com/github/Ste881/Laptop-price-prediction/blob/main/laptop_ML_project.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Data Exploration and Understanding:

  * Dive into the dataset to understand the landscape of laptop specifications.

  * Visualize trends in laptop prices and identify potential influential features.

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import re
# Read the raw laptop listings from the Colab working directory and preview
# the first rows to check the columns that were loaded.
csv_path = '/content/laptop.csv'
df = pd.read_csv(csv_path)
df.head()

Unnamed: 0.2,Unnamed: 0.1,Unnamed: 0,Company,TypeName,Inches,ScreenResolution,Cpu,Ram,Memory,Gpu,OpSys,Weight,Price
0,0,0.0,Apple,Ultrabook,13.3,IPS Panel Retina Display 2560x1600,Intel Core i5 2.3GHz,8GB,128GB SSD,Intel Iris Plus Graphics 640,macOS,1.37kg,71378.6832
1,1,1.0,Apple,Ultrabook,13.3,1440x900,Intel Core i5 1.8GHz,8GB,128GB Flash Storage,Intel HD Graphics 6000,macOS,1.34kg,47895.5232
2,2,2.0,HP,Notebook,15.6,Full HD 1920x1080,Intel Core i5 7200U 2.5GHz,8GB,256GB SSD,Intel HD Graphics 620,No OS,1.86kg,30636.0
3,3,3.0,Apple,Ultrabook,15.4,IPS Panel Retina Display 2880x1800,Intel Core i7 2.7GHz,16GB,512GB SSD,AMD Radeon Pro 455,macOS,1.83kg,135195.336
4,4,4.0,Apple,Ultrabook,13.3,IPS Panel Retina Display 2560x1600,Intel Core i5 3.1GHz,8GB,256GB SSD,Intel Iris Plus Graphics 650,macOS,1.37kg,96095.808


# Data Preprocessing:

  Handle missing values, outliers, and encode categorical variables.

  Ensure the dataset is ready for model training.

  ---
  1. To handle missing values and outliers and to encode the categorical variables, we first clean the dataset by removing the redundant columns and the unit suffixes attached to values (such as `GB` and `kg`).
  We also need to inspect the types of values present in each column.

  ---

  2. The price could also depend on `OpSys`, `Company`, and `Gpu` (a hypothesis to verify).

  ---
  3. Then visualise the cleaned data.

  ---

In [None]:
# Remove the leftover index columns that carry no information.
# The drop is non-mutating and uses errors='ignore' so this cell can be re-run
# safely: an in-place drop raises KeyError on a second execution because the
# columns are already gone.
eliminate = ['Unnamed: 0.1', 'Unnamed: 0']
df = df.drop(columns=eliminate, errors='ignore')
df

Unnamed: 0,Company,TypeName,Inches,ScreenResolution,Cpu,Ram,Memory,Gpu,OpSys,Weight,Price
0,Apple,Ultrabook,13.3,IPS Panel Retina Display 2560x1600,Intel Core i5 2.3GHz,8GB,128GB SSD,Intel Iris Plus Graphics 640,macOS,1.37kg,71378.6832
1,Apple,Ultrabook,13.3,1440x900,Intel Core i5 1.8GHz,8GB,128GB Flash Storage,Intel HD Graphics 6000,macOS,1.34kg,47895.5232
2,HP,Notebook,15.6,Full HD 1920x1080,Intel Core i5 7200U 2.5GHz,8GB,256GB SSD,Intel HD Graphics 620,No OS,1.86kg,30636.0000
3,Apple,Ultrabook,15.4,IPS Panel Retina Display 2880x1800,Intel Core i7 2.7GHz,16GB,512GB SSD,AMD Radeon Pro 455,macOS,1.83kg,135195.3360
4,Apple,Ultrabook,13.3,IPS Panel Retina Display 2560x1600,Intel Core i5 3.1GHz,8GB,256GB SSD,Intel Iris Plus Graphics 650,macOS,1.37kg,96095.8080
...,...,...,...,...,...,...,...,...,...,...,...
1298,Lenovo,2 in 1 Convertible,14,IPS Panel Full HD / Touchscreen 1920x1080,Intel Core i7 6500U 2.5GHz,4GB,128GB SSD,Intel HD Graphics 520,Windows 10,1.8kg,33992.6400
1299,Lenovo,2 in 1 Convertible,13.3,IPS Panel Quad HD+ / Touchscreen 3200x1800,Intel Core i7 6500U 2.5GHz,16GB,512GB SSD,Intel HD Graphics 520,Windows 10,1.3kg,79866.7200
1300,Lenovo,Notebook,14,1366x768,Intel Celeron Dual Core N3050 1.6GHz,2GB,64GB Flash Storage,Intel HD Graphics,Windows 10,1.5kg,12201.1200
1301,HP,Notebook,15.6,1366x768,Intel Core i7 6500U 2.5GHz,6GB,1TB HDD,AMD Radeon R5 M330,Windows 10,2.19kg,40705.9200


In [None]:
# Function to extract numbers
def extract_numbers(text):
    """Return the numeric tokens found in ``text`` as one space-separated string.

    A token is either a ``<digits>x<digits>`` pair (kept intact, e.g.
    ``'2560x1600'``) or a plain integer/decimal number (e.g. ``'8'``,
    ``'1.37'``). Tokens keep the order in which they appear in ``text``.

    Parameters
    ----------
    text : object
        Value to scan. Only ``str`` inputs are searched.

    Returns
    -------
    str
        The matched tokens joined by single spaces; ``''`` when ``text`` is
        not a string or contains no match.
    """
    # Anything that is not a string (NaN, numbers, None) yields an empty result.
    if not isinstance(text, str):
        return ''
    # The 'NxM' alternative is listed first so pairs are not split at the 'x'.
    return ' '.join(re.findall(r'\d+x\d+|\d*\.?\d+', text))

# Strip the unit suffixes from the RAM and weight columns, then rename the
# columns so the unit is recorded in the column name instead of in each value.
unit_columns = {'Ram': 'Ram_Gb', 'Weight': 'Weight_Kg'}
for raw_name in unit_columns:
    df[raw_name] = df[raw_name].apply(extract_numbers)

df = df.rename(columns=unit_columns)

df

Unnamed: 0,Company,TypeName,Inches,ScreenResolution,Cpu,Ram_Gb,Memory,Gpu,OpSys,Weight_Kg,Price
0,Apple,Ultrabook,13.3,IPS Panel Retina Display 2560x1600,Intel Core i5 2.3GHz,8,128GB SSD,Intel Iris Plus Graphics 640,macOS,1.37,71378.6832
1,Apple,Ultrabook,13.3,1440x900,Intel Core i5 1.8GHz,8,128GB Flash Storage,Intel HD Graphics 6000,macOS,1.34,47895.5232
2,HP,Notebook,15.6,Full HD 1920x1080,Intel Core i5 7200U 2.5GHz,8,256GB SSD,Intel HD Graphics 620,No OS,1.86,30636.0000
3,Apple,Ultrabook,15.4,IPS Panel Retina Display 2880x1800,Intel Core i7 2.7GHz,16,512GB SSD,AMD Radeon Pro 455,macOS,1.83,135195.3360
4,Apple,Ultrabook,13.3,IPS Panel Retina Display 2560x1600,Intel Core i5 3.1GHz,8,256GB SSD,Intel Iris Plus Graphics 650,macOS,1.37,96095.8080
...,...,...,...,...,...,...,...,...,...,...,...
1298,Lenovo,2 in 1 Convertible,14,IPS Panel Full HD / Touchscreen 1920x1080,Intel Core i7 6500U 2.5GHz,4,128GB SSD,Intel HD Graphics 520,Windows 10,1.8,33992.6400
1299,Lenovo,2 in 1 Convertible,13.3,IPS Panel Quad HD+ / Touchscreen 3200x1800,Intel Core i7 6500U 2.5GHz,16,512GB SSD,Intel HD Graphics 520,Windows 10,1.3,79866.7200
1300,Lenovo,Notebook,14,1366x768,Intel Celeron Dual Core N3050 1.6GHz,2,64GB Flash Storage,Intel HD Graphics,Windows 10,1.5,12201.1200
1301,HP,Notebook,15.6,1366x768,Intel Core i7 6500U 2.5GHz,6,1TB HDD,AMD Radeon R5 M330,Windows 10,2.19,40705.9200


In [95]:
def convert_to_categorical(df, columns):
    """Impute missing values with the mode and cast columns to ``category``.

    For every column in ``columns`` the placeholder ``'?'`` is treated as
    missing, missing entries are filled with the column's most frequent value,
    and the column is converted to the pandas ``category`` dtype.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to convert. It is modified in place.
    columns : iterable of str
        Names of the columns to convert.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object that was passed in.

    Raises
    ------
    KeyError
        If a name in ``columns`` is not a column of ``df``.
    """
    for col in columns:
        # Use np.nan rather than None as the replacement value: older pandas
        # releases ignored an explicit None and forward-filled the previous
        # row instead of marking the entry as missing.
        cleaned = df[col].replace('?', np.nan)
        modes = cleaned.mode()
        # mode() is empty when every entry is missing; in that case there is
        # nothing to impute with, so the column is left as all-missing.
        if not modes.empty:
            cleaned = cleaned.fillna(modes.iloc[0])
        df[col] = cleaned.astype('category')
    return df

def convert_to_numeric(df, columns):
    """Coerce columns to a numeric dtype and impute missing values with the mean.

    For every column in ``columns`` the values are parsed as numbers; entries
    that cannot be parsed (including the placeholder ``'?'``) become missing.
    Missing entries are then filled with the mean of the parsed values, or
    with ``0`` when the column has no parsable value at all.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to convert. It is modified in place.
    columns : iterable of str
        Names of the columns to convert.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object that was passed in.

    Raises
    ------
    KeyError
        If a name in ``columns`` is not a column of ``df``.
    """
    for col in columns:
        # errors='coerce' already turns '?' and any other unparsable text into
        # NaN, so no separate replace step is needed. This also avoids
        # replace('?', None), which forward-fills on older pandas releases.
        numeric = pd.to_numeric(df[col], errors='coerce')
        # Series.mean() skips NaN by default; fall back to 0 when nothing parsed.
        fill_value = numeric.mean() if numeric.notna().any() else 0
        df[col] = numeric.fillna(fill_value)
    return df

# Column groups handed to the dtype conversion helpers.
cat_columns = [
    'Company', 'TypeName', 'ScreenResolution', 'Cpu',
    'Memory', 'Gpu', 'OpSys',
]
num_columns = ['Inches', 'Ram_Gb', 'Weight_Kg', 'Price']

# The categorical conversion runs first and its result is passed straight to
# the numeric conversion.
df = convert_to_numeric(convert_to_categorical(df, cat_columns), num_columns)

df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1303 entries, 0 to 1302
Data columns (total 11 columns):
 #   Column            Non-Null Count  Dtype   
---  ------            --------------  -----   
 0   Company           1303 non-null   category
 1   TypeName          1303 non-null   category
 2   Inches            1303 non-null   float64 
 3   ScreenResolution  1303 non-null   category
 4   Cpu               1303 non-null   category
 5   Ram_Gb            1303 non-null   float64 
 6   Memory            1303 non-null   category
 7   Gpu               1303 non-null   category
 8   OpSys             1303 non-null   category
 9   Weight_Kg         1303 non-null   float64 
 10  Price             1303 non-null   float64 
dtypes: category(7), float64(4)
memory usage: 63.7 KB


In [96]:
# Count the missing values per column and overall, then print a short report.
null_count = df.isna().sum()
total_null_count = null_count.sum()

print('Number of null values:\n')
# Column labels are stripped of surrounding whitespace for display only.
for column, count in null_count.rename(index=str.strip).items():
    print(f"{column}: {count}")
print('\nTotal null count:', total_null_count)


Number of null values:

Company: 0
TypeName: 0
Inches: 0
ScreenResolution: 0
Cpu: 0
Ram_Gb: 0
Memory: 0
Gpu: 0
OpSys: 0
Weight_Kg: 0
Price: 0

Total null count: 0
