In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression


In [None]:
# Load the raw laptop listings and keep every column except the first one
# (presumably a saved row index -- confirm against the CSV).
raw_listings = pd.read_csv('laptopData.csv')
kept_columns = raw_listings.columns[1:]
df = raw_listings[kept_columns]
df

In [None]:
# Print a summary of the frame (columns, non-null counts, dtypes) before cleaning.
df.info()

In [None]:
# Report per-column missing-value counts, discard every row that has any
# missing value, then report the counts again to confirm the result.
null_counts_before = df.isna().sum()
print(f'Null Values: {null_counts_before}')

df = df.dropna()

null_counts_after = df.isna().sum()
print(f'Null Values: {null_counts_after}')

In [None]:
# Split the free-text 'Memory' column into up to two drives:
#   '<size><GB|TB> <type>'  optionally followed by  '+ <size><GB|TB> <type>'
# e.g. '256GB SSD', '1.0TB Hybrid', '64GB Flash Storage +  1TB HDD'.
#
# A regular expression is used instead of splitting on single spaces and
# picking fixed token positions: multi-word drive types such as
# 'Flash Storage' shift the tokens, which would put the second drive's size
# into the type column, and selecting positions 4/5 fails outright when no row
# describes a second drive. Rows that do not match the pattern get NaN in all
# four new columns.
memory_pattern = (
    r'^\s*(?P<Memory_1>\d+(?:\.\d+)?(?:GB|TB))\s+(?P<Memory_Type_1>[^+]+?)'
    r'(?:\s*\+\s*(?P<Memory_2>\d+(?:\.\d+)?(?:GB|TB))\s+(?P<Memory_Type_2>.+?))?\s*$'
)
memory = df['Memory'].str.extract(memory_pattern)

# The extracted frame shares df's index, so the join is row-aligned.
df = df.join(memory).drop(columns='Memory')
df.head()

In [None]:
# Convert the textual capacities ('256GB', '1TB', '1.0TB', ...) into numeric
# gigabytes, using 1 TB = 1000 GB.
for capacity_column in ('Memory_1', 'Memory_2'):
    raw_capacity = df[capacity_column]

    # na=False: a missing drive must not be flagged as terabytes. Without it
    # the NaN result of endswith() is treated as truthy by numpy.
    is_terabyte = raw_capacity.str.endswith('TB', na=False)

    # Strip the two-character unit suffix; anything that is not a number
    # (empty string, stray token) becomes NaN instead of raising.
    magnitude = pd.to_numeric(raw_capacity.str[:-2], errors='coerce').astype(float)

    df[capacity_column] = magnitude.mask(is_terabyte, magnitude * 1000)


In [None]:
# Display the frame to check the result of the memory-column processing.
df

In [None]:
# 'Weight' and 'Ram' are stored as text: drop the last two characters of each
# value, turn values that end up empty into NaN, and convert the rest to float.
for unit_column in ('Weight', 'Ram'):
    without_unit = df[unit_column].str[:-2]
    df[unit_column] = without_unit.replace('', np.nan).astype(float)

# 'Inches' uses '?' as a placeholder; map it to NaN before converting to float.
inches_cleaned = df['Inches'].replace('?', np.nan)
df['Inches'] = inches_cleaned.astype(float)

df

In [None]:
# Histogram of the target variable, 60 bins.
fig, ax = plt.subplots()
df['Price'].hist(bins=60, ax=ax)
ax.set_title("Price Distribution")
ax.set_xlabel("Price")
ax.set_ylabel("Frequency")
plt.show()

In [None]:
# Number of listings per brand, most frequent first.
brand_counts = df['Company'].value_counts()
ax = brand_counts.plot.bar()
ax.set_title("Brand Counts")
ax.set_ylabel("Count")
# Tilt the brand names so they stay readable.
plt.setp(ax.get_xticklabels(), rotation=45)
plt.show()

In [None]:
# Price spread for each RAM size, with rows ordered by RAM beforehand.
rows_by_ram = df.sort_values(by='Ram')
sns.boxplot(data=rows_by_ram, x='Ram', y='Price')

In [None]:
# Price spread per brand; brand names are tilted so they stay readable.
brand_axes = sns.boxplot(data=df, x='Company', y='Price')
plt.xticks(rotation=45)

In [None]:
# Average price per brand, sorted ascending, drawn as horizontal bars.
mean_price_by_brand = df.groupby('Company')['Price'].mean()
mean_price_by_brand.sort_values().plot.barh()


In [None]:
# Columns whose values are replaced by integer codes. Each column gets its own
# freshly fitted LabelEncoder, so codes are independent between columns.
categorical_columns = [
    'Company',
    'Cpu',
    'Gpu',
    'OpSys',
    'Memory_Type_1',
    'Memory_Type_2',
    'ScreenResolution',
]

for column in categorical_columns:
    le = LabelEncoder()
    encoded_values = le.fit_transform(df[column])
    df[column] = encoded_values

df

In [None]:
# Pairwise correlations between the numeric columns, drawn as an annotated heatmap.
corr_matrix = df.corr(numeric_only=True)

fig, ax = plt.subplots()
sns.heatmap(corr_matrix, annot=True, cmap='coolwarm', ax=ax)
ax.set_title("Correlation Matrix")
plt.show()

In [None]:
# Remove the columns that are not used as model features.
unused_columns = ['TypeName', 'Memory_2', 'Memory_Type_2', 'Company']
df = df.drop(columns=unused_columns)


In [None]:
# Fill the remaining gaps, then one-hot encode any non-numeric columns.
#
# Numeric features are imputed with their column median *before* the string
# placeholder is applied. Filling a numeric column with a string would turn it
# into an object column, and get_dummies would then expand every distinct
# numeric value into its own indicator column instead of keeping one
# continuous feature. The target column is deliberately left out of the
# imputation.
# NOTE(review): the medians are computed on the full data set, i.e. before the
# train/test split -- acceptable for a quick baseline, revisit if leakage matters.
numeric_feature_columns = (
    df.select_dtypes(include='number').columns.drop('Price', errors='ignore')
)
df = df.fillna(df[numeric_feature_columns].median())

# Anything still missing is given the placeholder category (spelling kept
# as-is so derived column names do not change).
df = df.fillna('Unkown')
df_encoded = pd.get_dummies(df, drop_first=True)

In [None]:
# Features are every encoded column except the target; the target is 'Price'.
y = df_encoded['Price']
X = df_encoded.drop(columns='Price')

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=53,
)

# Ordinary least-squares baseline, fitted on the training rows only.
model = LinearRegression().fit(X_train, y_train)
y_pred = model.predict(X_test)

# Evaluate on the held-out rows.
r2 = r2_score(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)
rmse = np.sqrt(mean_squared_error(y_test, y_pred))

print(f"R² Score: {r2:.4f}")
print(f"Mean Absolute Error: {mae:,.0f}")
print(f"Root Mean Squared Error: {rmse:,.0f}")


In [None]:
# Scatter of actual vs. predicted prices with the y = x reference line.
fig, ax = plt.subplots(figsize=(10, 6))
ax.scatter(y_test, y_pred, alpha=0.5)

# The reference line spans the overall range covered by both series.
min_val = min(y_test.min(), y_pred.min())
max_val = max(y_test.max(), y_pred.max())
diagonal = [min_val, max_val]
ax.plot(diagonal, diagonal, 'r--', label='Perfect Prediction')

ax.set_xlabel('Actual Price')
ax.set_ylabel('Predicted Price')
ax.set_title('Actual vs Predicted')
ax.legend()
ax.grid(True)
plt.show()