# Import Packages

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.preprocessing import StandardScaler, LabelEncoder, OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn import metrics
from sklearn.linear_model import Lasso, Ridge, LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from xgboost import XGBRegressor
from sklearn.svm import SVR
from sklearn.neighbors import KNeighborsRegressor
from sklearn import tree

# Data Preprocessing

In [None]:
df = pd.read_csv("/content/laptop_data.csv").drop('Unnamed: 0', axis=1)

In [None]:
df.shape

In [None]:
df.info()

In [None]:
df.describe()

In [None]:
df.head()

In [None]:
# Count duplicate rows
df.duplicated().sum()

In [None]:
catvars = df.select_dtypes(include=['object']).columns
numvars = df.select_dtypes(include=['int32', 'int64', 'float32', 'float64']).columns

In [None]:
# Show the unique values of a column in the dataset
def value_counts(x, data=None):
  """Print the distinct values found in one column, followed by a blank line.

  Args:
    x: Name of the column to inspect.
    data: DataFrame to read from. When omitted, the notebook-level ``df``
      is used, so existing one-argument calls keep working.
  """
  frame = df if data is None else data
  # The label says "Unique values" because what follows is the array of
  # distinct entries, not a count of them.
  print(f"Unique values in {x}:", frame[x].unique())
  print()

In [None]:
for i in catvars:
  value_counts(i)
  print('-'*55)

In [None]:
value_counts("Inches")

In [None]:
df.head(2)

## Processing Ram and Weight

In [None]:
# Ram: strip the "GB" suffix, then parse the remaining text as a number
df['Ram'] = pd.to_numeric(df['Ram'].str.replace('GB', ""))

# Weight: strip the "kg" suffix and store the value as float32
df['Weight'] = df['Weight'].str.replace('kg', '').astype('float32')

In [None]:
df.info()

## Data Visualization

In [None]:
sns.displot(df['Price'], color='red', kde=True, bins=50)

In [None]:
# plotting the categorical variable

def drawplot(col):
  """Show a count plot of one column of the notebook-level ``df``.

  Args:
    col: Name of the column whose value frequencies are plotted.
  """
  plt.figure(figsize=(15, 7))
  sns.countplot(data=df, x=col, palette='plasma')
  # rotation has to be a number (or 'vertical' / 'horizontal'); the string
  # '45' is rejected by current matplotlib releases.
  plt.xticks(rotation=45)
  plt.show()

to_view = ['Company', 'TypeName', 'Ram', 'OpSys']
for i in to_view:
  drawplot(i)

In [None]:
# Average price for each laptop brand.
# This shows how the price of a laptop varies by company.

plt.figure(figsize=(15, 7))
sns.barplot(data=df, x='Company', y='Price')
plt.xticks(rotation=45)
plt.show()

In [None]:
# type of laptops

plt.figure(figsize=(15, 7))
sns.barplot(data=df, x='TypeName', y='Price')
plt.xticks(rotation=45)
plt.show()

In [None]:
plt.figure(figsize=(10, 7))
sns.scatterplot(data=df, x='Inches', y='Price')
plt.xticks(rotation=45)
plt.show()

In [None]:
df['ScreenResolution'].value_counts()

In [None]:
df['Touchscreen'] = df['ScreenResolution'].apply(lambda e: 1 if 'Touchscreen' in e else 0)

In [None]:
df.tail()

## Working with Screen resolution

In [None]:
sns.countplot(data=df, x='Touchscreen')
plt.show()

In [None]:
sns.barplot(data=df, x='Touchscreen', y='Price')
plt.show()

In [None]:
df['IPS'] = df['ScreenResolution'].apply(lambda x: 1 if 'IPS' in x else 0)

In [None]:
sns.countplot(data=df, x='IPS')
plt.show()

In [None]:
sns.barplot(data=df, x='IPS', y='Price')
plt.show()

In [None]:
# Extracting resolution

splitdf = df['ScreenResolution'].str.split('x', n=1, expand=True)

In [None]:
splitdf.head()

In [None]:
x_reg = splitdf[0].str.split(' ')

In [None]:
df['X_reg'] = x_reg.apply(lambda x: x[-1]).astype('int32')
df['Y_reg'] = splitdf[1].astype('int32')

In [None]:
df.head()

In [None]:
df.info()

In [None]:
plt.figure(figsize=(14, 7))
# Correlate numeric columns only: df still contains text columns at this
# point, and DataFrame.corr() raises on those with pandas >= 2.0.
sns.heatmap(df.select_dtypes(include='number').corr(), annot=True)
plt.show()

In [None]:
# Correlation of every numeric column with Price (text columns are excluded,
# since DataFrame.corr() raises on them with pandas >= 2.0).
df.select_dtypes(include='number').corr()['Price']

In [None]:
df['PPI'] = (((df['X_reg']**2+df['Y_reg']**2)**0.5)/df['Inches']).astype('float')

In [None]:
df.head()

In [None]:
# Re-check the correlations with Price now that PPI has been added; only
# numeric columns are passed because corr() raises on text with pandas >= 2.0.
df.select_dtypes(include='number').corr()['Price']

In [None]:
df.drop(["ScreenResolution", "Inches", "X_reg", "Y_reg"], axis=1, inplace=True)

In [None]:
df.head()

## Working with CPU

In [None]:
df['Cpu'].value_counts()

In [None]:
df['CPU_Name'] = df['Cpu'].apply(lambda x: " ".join( x.split()[:3]))

In [None]:
df['CPU_Name'].head()

In [None]:
def processorType(x):
  """Collapse a CPU name into a coarse processor group.

  Args:
    x: CPU name string (the notebook passes the first three words of 'Cpu').

  Returns:
    ``x`` itself for Intel Core i3 / i5 / i7, "Intel Other Processor" for any
    other name whose first word is 'Intel', and "AMD Processor" for
    everything else.
  """
  mainstream_intel = ("Intel Core i5", "Intel Core i7", "Intel Core i3")
  if x in mainstream_intel:
    return x

  vendor = x.split()[0]
  return "Intel Other Processor" if vendor == 'Intel' else "AMD Processor"

In [None]:
df['CPU_Name'] = df['CPU_Name'].apply(processorType)

In [None]:
df['CPU_Name'].value_counts()

In [None]:
plt.figure(figsize=(8, 6))
sns.barplot(data=df, x='CPU_Name', y='Price')
plt.show()

In [None]:
# Drop the raw Cpu column (replaced by CPU_Name)
df.drop('Cpu', axis=1, inplace=True)

In [None]:
df.head()

## Ram Analysis

In [None]:
sns.countplot(data=df, x='Ram')

In [None]:
plt.figure(figsize=(8, 6))
sns.barplot(data=df, x='Ram', y='Price')
plt.show()

## Memory 

In [None]:
df['Memory'].value_counts()

In [None]:
## 4 most common variable observed : HDD, SSD, Hybrid, Flash

# Remove the ".0" in capacities such as "1.0TB".
# Raw string: '\.' is not a valid Python escape sequence, so a plain literal
# triggers an invalid-escape warning on recent interpreters.
df['Memory'] = df['Memory'].astype(str).replace(r'\.0', '', regex=True)

# Strip the "GB" unit (plain substring match, no regex needed)
df['Memory'] = df['Memory'].str.replace('GB', '', regex=False)

# Express terabytes in gigabytes: "1TB" -> "1000"
df["Memory"] = df['Memory'].str.replace('TB', '000', regex=False)

In [None]:
newdf = df['Memory'].str.split('+', n=1, expand=True)

In [None]:
newdf

In [None]:
df['Memory'] = newdf[0].apply(lambda x: x.strip(' '))

In [None]:
df.head()

In [None]:
def applyChange(value):
  """Add a 0/1 column named 'Layers1' + value to ``df``.

  A row gets 1 when its 'Memory' text contains ``value`` and 0 otherwise.
  """
  def contains_value(text):
    if value in text:
      return 1
    return 0

  df['Layers1'+value] = df['Memory'].apply(contains_value)


# Storage kinds to flag in the primary memory description
listToApply = ['SSD', 'HDD', 'Hybrid', 'Flash Storage']

for i in listToApply:
  applyChange(i)

In [None]:
# Keep only the digits of the primary memory description.
# Raw string instead of f'\D': there is nothing to interpolate, and '\D' is an
# invalid escape sequence in a non-raw literal.
df['Memory'] = df['Memory'].str.replace(r'\D', '', regex=True)

In [None]:
df['Memory'].value_counts()

In [None]:
df['Second_Memory'] = newdf[1]

In [None]:
# Rows without a second drive get "0". The result is assigned back because
# fillna(inplace=True) on a column selection is not guaranteed to update df.
df['Second_Memory'] = df['Second_Memory'].fillna("0")

In [None]:
def applyChange(value):
  """Add a 0/1 column named 'Layers2' + value to ``df``.

  A row gets 1 when its 'Second_Memory' text contains ``value`` and 0
  otherwise. This redefinition replaces the earlier 'Layers1' helper of the
  same name.
  """
  def contains_value(text):
    if value in text:
      return 1
    return 0

  df['Layers2'+value] = df['Second_Memory'].apply(contains_value)


# Storage kinds to flag in the secondary memory description
listToApply = ['SSD', 'HDD', 'Hybrid', 'Flash Storage']

for i in listToApply:
  applyChange(i)

In [None]:
# Keep only the digits of the secondary memory description.
# Raw string: '\D' is an invalid escape sequence in a non-raw literal.
df['Second_Memory'] = df['Second_Memory'].str.replace(r'\D', '', regex=True)

In [None]:
df['Memory'] = df['Memory'].astype('int')
df['Second_Memory'] = df['Second_Memory'].astype('int')

In [None]:
# Multiply each capacity by its 0/1 storage-type flag and store the result in the per-type columns

df['SSD'] = (df['Memory']*df['Layers1SSD']+df['Second_Memory']*df['Layers2SSD'])
df['HDD'] = (df['Memory']*df['Layers1HDD']+df['Second_Memory']*df['Layers2HDD'])
df['Hybrid'] = (df['Memory']*df['Layers1Hybrid']+df['Second_Memory']*df['Layers2Hybrid'])
df['Flash_Storage'] = (df['Memory']*df['Layers1Flash Storage']+df['Second_Memory']*df['Layers2Flash Storage'])

df.replace('', 0, inplace=True)

In [None]:
df.drop(['Memory', 'Second_Memory', 'Layers1SSD', 'Layers2SSD', 'Layers1HDD',
         'Layers2HDD', 'Layers1Hybrid', 'Layers2Hybrid', 'Layers1Flash Storage',
         'Layers2Flash Storage'], axis=1, inplace=True)

In [None]:
# Correlation of the numeric columns (including the new storage columns) with
# Price; text columns are excluded because corr() raises on them with pandas >= 2.0.
df.select_dtypes(include='number').corr()['Price']

In [None]:
df.drop(['Hybrid'], axis=1, inplace=True)

## Gpu Preprocess

In [None]:
df['Gpu'].value_counts()

In [None]:
df['Gpu_Brand'] = df['Gpu'].apply(lambda x: x.split(' ')[0])

In [None]:
sns.countplot(data=df, x='Gpu_Brand')
plt.show()

In [None]:
sns.barplot(data=df, x='Gpu_Brand', y='Price', estimator=np.mean)
plt.show()

In [None]:
df.drop(['Gpu'], axis=1, inplace=True)

## Insights About the Operating System

In [None]:
plt.figure(figsize=(10, 6))
sns.countplot(data=df, x='OpSys')
plt.show()

In [None]:
plt.figure(figsize=(10, 6))
sns.barplot(data=df, x='OpSys', y='Price')
plt.show()

In [None]:
df['OpSys'].value_counts()

In [None]:
def precessOs(x):
  """Map a raw operating-system label onto one of three groups.

  Returns 'MacOS' for "macOS" / "Mac OS X", 'Windows' for "Windows 10",
  "Windows 7" and "Windows 10 S", and "OtherOs" for any other value.
  """
  if x in ("macOS", "Mac OS X"):
    return 'MacOS'
  if x in ("Windows 10", "Windows 7", "Windows 10 S"):
    return 'Windows'
  return "OtherOs"

df['OpSys'] = df['OpSys'].apply(precessOs)

In [None]:
df.head()

In [None]:
plt.figure(figsize=(10, 6))
sns.barplot(data=df, x='OpSys', y='Price')
plt.show()

## Price Analysis

In [None]:
sns.displot(x=df["Price"])

In [None]:
sns.displot(x=np.log(df["Price"]))

In [None]:
plt.figure(figsize=(12, 8))
# Correlate numeric columns only: df still holds text columns here, and
# DataFrame.corr() raises on those with pandas >= 2.0.
sns.heatmap(df.select_dtypes(include='number').corr(), annot=True)
plt.show()

In [None]:
# change Price into log price
y = np.log(df['Price'].values)
x = df.drop(['Price'], axis=1)

In [None]:
x_hot = pd.get_dummies(x, drop_first=True)

In [None]:
x_train, x_test, y_train, y_test = train_test_split(x_hot.values, y, test_size=0.2, random_state=2)

## Linear Regression

In [None]:
model_lr = LinearRegression()
model_lr.fit(x_train, y_train)

In [None]:
y_pred = model_lr.predict(x_test)

In [None]:
print(metrics.r2_score(y_test, y_pred))

In [None]:
print(metrics.mean_squared_error(y_test, y_pred))

In [None]:
# Exponentiate the mean squared error of the current predictions (the target
# is log price). Computed live instead of from a number pasted out of an
# earlier run, which goes stale whenever the data or the split changes.
np.exp(metrics.mean_squared_error(y_test, y_pred))

## Ridge Regression

In [None]:
model_rg = Ridge(alpha=10)

model_rg.fit(x_train, y_train)

In [None]:
model_rg.score(x_train, y_train)

In [None]:
y_pred = model_rg.predict(x_test)
print(metrics.r2_score(y_test, y_pred))
print(metrics.mean_squared_error(y_test, y_pred))

## Lasso regression

In [None]:
model_l = Lasso(alpha=0.0001)
model_l.fit(x_train, y_train)


In [None]:
print('training score', model_l.score(x_train, y_train))

In [None]:
y_pred = model_l.predict(x_test)
print(metrics.r2_score(y_test, y_pred))
print(metrics.mean_squared_error(y_test, y_pred))

## Decision Tree

In [None]:
model_dt = DecisionTreeRegressor(max_depth=8)
model_dt.fit(x_train, y_train)

In [None]:
print('training score', model_dt.score(x_train, y_train))
y_pred = model_dt.predict(x_test)
print(metrics.r2_score(y_test, y_pred))
print(metrics.mean_squared_error(y_test, y_pred))

## Random Forest

In [None]:
model_dt = RandomForestRegressor(
    n_estimators=100,
    random_state=3,
    max_samples=0.5,
    max_features=0.75,
    max_depth=15
)
model_dt.fit(x_train, y_train)

In [None]:
print('training score', model_dt.score(x_train, y_train))
y_pred = model_dt.predict(x_test)
print(metrics.r2_score(y_test, y_pred))
print(metrics.mean_squared_error(y_test, y_pred))

## Hyperparameter Tuning

In [None]:
# Fit a decision tree with default (unpruned) settings and draw it
reg = DecisionTreeRegressor(random_state=0)
reg.fit(x_train, y_train)
plt.figure(figsize=(16, 9))
# Pass the column names as a plain list: plot_tree validates feature_names and
# some scikit-learn releases do not accept a pandas Index here.
tree.plot_tree(reg, filled=True, feature_names=list(x_hot.columns))
plt.show()

In [None]:
path = reg.cost_complexity_pruning_path(x_train, y_train)
ccp_alphas = path.ccp_alphas

In [None]:
alphalist = []
for alpha in ccp_alphas:
  reg = DecisionTreeRegressor(random_state=0, ccp_alpha=alpha)
  reg.fit(x_train, y_train)
  alphalist.append(reg)

In [None]:
# Score every pruned tree on the training and the test split
train_score = [fitted.score(x_train, y_train) for fitted in alphalist]
test_score = [fitted.score(x_test, y_test) for fitted in alphalist]

plt.xlabel('ccp Alpha')
# These are regressors, so .score() is the R2 coefficient, not an accuracy
plt.ylabel('R2 score')
plt.plot(ccp_alphas, train_score, marker='o', label='training', color='magenta')
plt.plot(ccp_alphas, test_score, marker='+', label='testing', color='red')
plt.legend()
plt.show()



In [None]:
# Fit and draw a tree pruned with a fixed ccp_alpha
# (0.0085 was presumably read off the score plot above -- confirm)
reg = DecisionTreeRegressor(random_state=0, ccp_alpha=0.0085)
reg.fit(x_train, y_train)
plt.figure(figsize=(16, 9))
# Pass the column names as a plain list: plot_tree validates feature_names and
# some scikit-learn releases do not accept a pandas Index here.
tree.plot_tree(reg, filled=True, feature_names=list(x_hot.columns))
plt.show()

In [None]:
# Search spaces for RandomizedSearchCV, keyed by a display name.
# Each entry holds the estimator ("model") and its parameter grid ("params").
#
# max_features uses None instead of 'auto': 'auto' has been removed from the
# scikit-learn tree estimators, and for regressors it meant "use all
# features", which is exactly what None selects.
params = {
    'RandomForest': {
        "model": RandomForestRegressor(),
        'params':{
            'n_estimators': [int(x) for x in np.linspace(100, 1200, 10)],
            'criterion': ['squared_error', 'absolute_error'],
            'max_depth': [int(x) for x in np.linspace(1, 30, 5)],
            "max_features": [None, 'sqrt', 'log2'],
            'ccp_alpha': list(np.linspace(0.0025, 0.0125, 5)),
            'min_samples_split': [2, 5, 10, 14],
            'min_samples_leaf': [2, 5, 10, 14],
        }
    },
    'Decision Tree': {
        "model": DecisionTreeRegressor(),
        'params':{
            'criterion': ['squared_error', 'absolute_error'],
            'max_depth': [int(x) for x in np.linspace(1, 30, 5)],
            "max_features": [None, 'sqrt', 'log2'],
            'ccp_alpha': list(np.linspace(0.0025, 0.0125, 5)),
            'min_samples_split': [2, 5, 10, 14],
            'min_samples_leaf': [2, 5, 10, 14],
        }
    }
}

In [None]:
# Run one randomized search per candidate model and record its best result.
# Scoring is negative mean squared error, so best_score_ values closer to
# zero are better.
score = []
for modelname, mp in params.items():
  clf = RandomizedSearchCV(mp['model'],
                           param_distributions=mp['params'], cv=5,
                           n_iter=10, scoring='neg_mean_squared_error', verbose=2)
  clf.fit(x_train, y_train)
  score.append({
      'model_name':modelname,
      'best_score': clf.best_score_,
      'best_estimator': clf.best_estimator_
  })

In [None]:
score

In [None]:
model_1 = RandomForestRegressor(ccp_alpha=0.0025, criterion='absolute_error', max_depth=8,
                        max_features='log2', min_samples_leaf=5,
                        min_samples_split=10, n_estimators=833)

In [None]:
model_2 = RandomForestRegressor(n_estimators=1000,
    random_state=3,
    max_samples=0.5,
    max_features=0.75,
    max_depth=15)

In [None]:
model_2.fit(x_train, y_train)

In [None]:
model_2.score(x_train, y_train)

In [None]:
model_2.score(x_test, y_test)

# Prediction on the whole dataset

In [None]:
%%time
# Predict every row with the fitted forest (model_2). No variable named
# `model` is defined in this notebook, so the previous loop raised NameError.
# A single predict() call also replaces the row-by-row Python loop.
predicted = model_2.predict(x_hot.values).tolist()

In [None]:
predicted_price = np.exp(np.array(predicted))

In [None]:
# Draw both densities on the same axes so they can be compared directly.
# displot is figure-level: two calls produce two separate figures, and the
# legend would only cover the last one.
sns.kdeplot(x=df['Price'], color='orange', label='price')
sns.kdeplot(x=predicted_price, color='blue', label='predicted price')
plt.legend()
plt.show()

In [None]:
import pickle

# Persist the fitted forest. The context manager closes the file even if
# pickle.dump raises.
with open('laptopprice.pkl', 'wb') as file:
  pickle.dump(model_2, file)