# Regression - Laptop Price

In [None]:
#import libraries to be used 
import pandas as pd
import numpy as np
import patsy
import plotly.express as px
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
import statsmodels.formula.api as smf
import statsmodels.api as sm
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import RidgeCV
from sklearn.metrics import mean_squared_error, mean_absolute_error
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor

In [None]:
# Load laptop_price.csv into the working DataFrame used by every later cell.
df = pd.read_csv("laptop_price.csv")

In [None]:
# Remove the laptop_ID column in place.
# NOTE(review): presumably a row identifier with no predictive value — confirm.
df.drop(columns="laptop_ID" , inplace= True)

In [None]:
df.rename(columns= {'Price_euros': 'Price'}, inplace = True)  #rename The Column

In [None]:
# Relabel every "Netbook" entry as "Notebook" so both share one TypeName category.
df["TypeName"]=df["TypeName"].str.replace("Netbook","Notebook")

In [None]:
# 1 when the resolution text mentions "Touchscreen", otherwise 0.
df["Touchscreen"] = df["ScreenResolution"].apply(lambda res: int("Touchscreen" in res))

In [None]:
# Split the resolution text at the first "x" only: column 0 holds everything
# before it, column 1 everything after it.
screen = df["ScreenResolution"].str.split("x",n=1,expand=True)

In [None]:
# Store the two halves of the split resolution text as new columns.
df["sc_x"], df["sc_y"] = screen[0], screen[1]

In [None]:
# Remove commas, then keep the first run of two or more digits found in the
# text that preceded the "x".
df["sc_x"] = (
    df["sc_x"]
    .str.replace(",", "")
    .str.findall(r"(\d\d+)")
    .apply(lambda runs: runs[0])
)

In [None]:
 #change type from str to float
# Both screen-dimension columns hold digit strings at this point; cast them together.
df = df.astype({"sc_x": float, "sc_y": float})


In [None]:
# 1 when the resolution text mentions "IPS", otherwise 0.
df["IPS"] = df["ScreenResolution"].apply(lambda res: int("IPS" in res))

In [None]:
# Remove the "kg" text from Weight; the column is cast to float in the next cell.
df['Weight'] = df['Weight'].str.replace('kg','')

In [None]:
df['Weight']= df['Weight'].astype(float) #change type from str to float 


In [None]:
# Remove the "GB" text from Ram; the column is cast to int in a later cell.
df['Ram'] = df['Ram'].str.replace('GB','') 

In [None]:
# Pixel density: length of the pixel diagonal divided by the Inches column.
df['ppi'] = (df['sc_x'].pow(2) + df['sc_y'].pow(2)).pow(0.5).div(df['Inches'])

In [None]:
df.drop(columns="Inches" , inplace= True)

In [None]:
def Cpus(x):
    """Bucket a raw CPU description into one of a few coarse labels.

    Only the first three whitespace-separated words of ``x`` are inspected.
    An exact "Intel Core i7", "Intel Core i5" or "Intel Core i3" prefix is
    returned unchanged; any other description whose first word is "Intel"
    becomes "other Intel"; a first word of "AMD" becomes "AMD"; everything
    else falls through to "Samsung " (the trailing space is part of the
    returned label).
    """
    words = x.split()[:3]
    prefix = " ".join(words)
    if prefix in ("Intel Core i7", "Intel Core i5", "Intel Core i3"):
        return prefix

    vendor = words[0]
    if vendor == "Intel":
        return "other Intel"
    if vendor == "AMD":
        return 'AMD'
    return 'Samsung '
    
# Derive the coarse CPU label for every row.
df["cpuName"] = df["Cpu"].apply(Cpus)

In [None]:
df.drop(columns="Cpu" , inplace= True)

In [None]:
# The raw screen dimensions are no longer needed once ppi has been computed.
df.drop(columns=["sc_x", "sc_y"], inplace=True)

In [None]:
df.drop(columns="ScreenResolution" , inplace= True)

In [None]:

def aos(x):
    """Collapse an OpSys value into 'Windows', 'Mac' or 'Others OS/Linux'.

    Only the exact strings listed below are recognised; any other value,
    including other Windows or macOS spellings, maps to 'Others OS/Linux'.
    """
    if x in ('Windows 10', 'Windows 7', 'Windows 10 S'):
        return 'Windows'
    if x in ('macOS', 'Mac OS X'):
        return 'Mac'
    return 'Others OS/Linux'
df['os'] = df['OpSys'].apply(aos)

In [None]:
df['Ram']=df['Ram'].astype(int)

In [None]:
# Pull the HDD capacity out of the free-text Memory column, expressed in GB.
# Patterns are raw strings so "\d" / "\s" reach the regex engine untouched.
# Step 1: isolate the "<size>GB HDD", "<n>TB HDD" or "<n>.0TB HDD" fragment.
df['HDD'] = df['Memory'].str.extract(r'(\d\d\d?GB\sHDD|\dTB\sHDD|\d\.0TB\sHDD)', expand=False)
# Step 2: keep only the size token (plain digits, or a digit with a TB suffix).
df['HDD'] = df['HDD'].str.extract(r'(\d\d\d?|\dTB|\d\.0TB)', expand=False)
# Step 3: rewrite "1TB" / "1.0TB" as "1000" so every value is in GB.
df['HDD'] = df['HDD'].str.replace(r'(TB|\.0TB)', '000', regex=True)
# Rows with no HDD fragment are NaN here; record them as zero capacity.
# Assign the result back instead of calling fillna(inplace=True) on the column
# selection, which is deprecated and does not update df under Copy-on-Write.
df['HDD'] = df['HDD'].fillna('0')


In [None]:
# Pull the SSD capacity out of the free-text Memory column, expressed in GB.
# Patterns are raw strings so "\d" / "\s" reach the regex engine untouched.
# Step 1: isolate the "<size>GB SSD", "<n>TB SSD" or "<n>.0TB SSD" fragment.
df['SSD'] = df['Memory'].str.extract(r'(\d\d\d?GB\sSSD|\dTB\sSSD|\d\.0TB\sSSD)', expand=False)
# Step 2: keep only the size token (plain digits, or a digit with a TB suffix).
df['SSD'] = df['SSD'].str.extract(r'(\d\d\d?|\dTB|\d\.0TB)', expand=False)
# Step 3: rewrite "1TB" / "1.0TB" as "1000" so every value is in GB.
df['SSD'] = df['SSD'].str.replace(r'(TB|\.0TB)', '000', regex=True)
# Rows with no SSD fragment are NaN here; record them as zero capacity.
# Assign the result back instead of calling fillna(inplace=True) on the column
# selection, which is deprecated and does not update df under Copy-on-Write.
df['SSD'] = df['SSD'].fillna('0')


In [None]:
# Pull the flash-storage capacity out of the free-text Memory column, in GB.
# Patterns are raw strings so "\d" / "\s" reach the regex engine untouched.
# Step 1: isolate the "<size>GB Flash Storage" / "<n>TB Flash Storage" /
# "<n>.0TB Flash Storage" fragment. Every alternative must name Flash Storage,
# otherwise an SSD entry would be counted as flash.
df['Flash'] = df['Memory'].str.extract(
    r'(\d\d\d?GB\sFlash Storage|\dTB\sFlash Storage|\d\.0TB\sFlash Storage)',
    expand=False,
)
# Step 2: keep only the size token; the TB alternatives are needed so that
# terabyte-sized flash drives are not silently turned into 0.
df['Flash'] = df['Flash'].str.extract(r'(\d\d\d?|\dTB|\d\.0TB)', expand=False)
# Step 3: rewrite "1TB" / "1.0TB" as "1000" so every value is in GB.
df['Flash'] = df['Flash'].str.replace(r'(TB|\.0TB)', '000', regex=True)
# Rows with no flash fragment are NaN here; record them as zero capacity.
# Assign the result back instead of calling fillna(inplace=True) on the column
# selection, which is deprecated and does not update df under Copy-on-Write.
df['Flash'] = df['Flash'].fillna('0')


In [None]:
# Pull the hybrid-drive capacity out of the free-text Memory column, in GB.
# Patterns are raw strings so "\d" / "\s" reach the regex engine untouched.
# Step 1: isolate the "<size>GB Hybrid", "<n>TB Hybrid" or "<n>.0TB Hybrid"
# fragment. Every alternative must name Hybrid, otherwise a "1TB SSD" entry
# would be counted as a hybrid drive as well as an SSD.
df['Hybrid'] = df['Memory'].str.extract(r'(\d\d\d?GB\sHybrid|\dTB\sHybrid|\d\.0TB\sHybrid)', expand=False)
# Step 2: keep only the size token (plain digits, or a digit with a TB suffix).
df['Hybrid'] = df['Hybrid'].str.extract(r'(\d\d\d?|\dTB|\d\.0TB)', expand=False)
# Step 3: rewrite "1TB" / "1.0TB" as "1000" so every value is in GB.
df['Hybrid'] = df['Hybrid'].str.replace(r'(TB|\.0TB)', '000', regex=True)
# Rows with no hybrid fragment are NaN here; record them as zero capacity.
# Assign the result back instead of calling fillna(inplace=True) on the column
# selection, which is deprecated and does not update df under Copy-on-Write.
df['Hybrid'] = df['Hybrid'].fillna('0')


In [None]:
# Convert the four storage-capacity columns to integers in a single cast.
df = df.astype({'HDD': int, 'SSD': int, 'Hybrid': int, 'Flash': int})

In [None]:
df.drop(['Memory','Gpu'], axis =1 , inplace =True)

# What is the average price of laptops per company?

In [None]:
# Mean Price per Company, with Company kept as an ordinary column.
avg_price = df.groupby('Company')['Price'].mean().reset_index()
avg_price

In [None]:
fig = px.bar(avg_price,x='Company',y='Price')
fig


# What is the minimum laptop weight for each company?

In [None]:
# Smallest Weight per Company, ordered from lightest to heaviest.
min_w = df.groupby('Company', as_index=False)['Weight'].min().sort_values(by="Weight")


In [None]:
min_w.head()

In [None]:
fig = px.bar(min_w,x='Company',y='Weight')
fig


# Which company sells the most expensive laptop?

In [None]:
# Highest Price per Company, most expensive first, with a fresh 0..n-1 index.
exp_laptop = (
    df.groupby('Company')[['Price']]
    .max()
    .sort_values('Price', ascending=False)
    .reset_index()
)
exp_laptop

In [None]:
df[df['Price']== df['Price'].max()]


In [None]:
fig = px.bar(exp_laptop,x='Company',y='Price')

fig


# What is the most common type of hard drive across laptops?

In [None]:
# Drive-type columns to compare; the hard-disk column built earlier is "HDD".
a = ["SSD","HDD"]

In [None]:
# For each drive type listed in `a`, report how many laptops have a non-zero
# capacity of that type, so the most common one can be read off directly.
for x in a:
    if x not in df.columns:
        # Guard against labels that do not match a column (e.g. a misspelling).
        print(f"{x}: no such column")
        continue
    print(x, int((df[x] > 0).sum()))

# Which brand is the most frequent in the dataframe?

In [None]:
d=df["Company"].value_counts() 

In [None]:
d

In [None]:
# Companies on the x-axis, number of laptops per company on the y-axis.
# Passing the index and values explicitly avoids relying on how value_counts()
# names its result, which differs between pandas versions.
px.bar(x=d.index, y=d.values, labels={'x': 'Company', 'y': 'count'})


# Algorithm


In [None]:
# One indicator column per Company value; drop_first=True omits the first
# category so it acts as the baseline.
Company = pd.get_dummies(df['Company'], drop_first=True)

In [None]:
# Indicator columns for the grouped operating-system label (first category dropped).
# NOTE(review): the name `os` would shadow the standard-library module if that
# module were ever imported in this notebook.
os = pd.get_dummies(df['os'], drop_first=True)

In [None]:
# Indicator columns for the laptop TypeName (first category dropped).
TypeName = pd.get_dummies(df['TypeName'], drop_first=True)

In [None]:
# Indicator columns for the coarse CPU label (first category dropped).
cpuName=pd.get_dummies(df["cpuName"],drop_first=True)

In [None]:
#Company=pd.get_dummies(df["Company"],drop_first=True)

In [None]:
# Append all indicator columns to the right of the original columns (axis=1).
# The source categorical columns are still present here and are dropped in the
# next cell.
df_u = pd.concat([df,cpuName,TypeName,os,Company],axis = 1)

In [None]:
df_u.drop(["Product","TypeName","OpSys","cpuName","os","Company"], axis =1 , inplace =True)

In [None]:
df_u

In [None]:
# Give every CPU indicator column an identifier-style name without spaces.
# "Intel Core i5" is included so the three Core families are named consistently.
# rename() ignores keys that are not present in df_u.
# NOTE(review): the 'Samsung ' CPU label (trailing space) is left untouched on
# purpose, because a Company indicator may also be called 'Samsung' and
# stripping the space could create duplicate column names — confirm.
df_u.rename(
    inplace=True,
    columns={
        "Intel Core i7": "Intel_Core_i7",
        "Intel Core i5": "Intel_Core_i5",
        "Intel Core i3": "Intel_Core_i3",
        "other Intel": "other_Intel",
    })

In [None]:
df_u.corr()['Price'].sort_values(ascending=False)

In [None]:
# Target is the Price column; every remaining column of df_u is a feature.
y = df_u["Price"]
X = df_u.drop(columns="Price")

# Experiment 1: Linear Regression


In [None]:
def split_and_validate(X, y):
    """Fit a linear regression on a 60/20/20 split and print R^2 for each part.

    The data is first split 80/20 into train+validation and test, then the
    80% portion is split 75/25 into train and validation. Both splits use
    fixed random seeds, so repeated calls on the same data give the same
    partitions.

    Parameters
    ----------
    X : feature matrix accepted by scikit-learn (e.g. a DataFrame).
    y : target values aligned with the rows of ``X``.

    Returns
    -------
    None. The three R^2 scores are printed.
    """
    X_train_val, X_test, y_train_val, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42)
    X_train, X_val, y_train, y_val = train_test_split(
        X_train_val, y_train_val, test_size=.25, random_state=43)

    lr_model = LinearRegression()
    lr_model.fit(X_train, y_train)

    # Name each score after the partition it was computed on, so the printed
    # labels cannot drift out of sync with the values.
    train_score = lr_model.score(X_train, y_train)
    val_score = lr_model.score(X_val, y_val)
    test_score = lr_model.score(X_test, y_test)

    # report results
    print('\nTrain R^2 score was:', train_score)
    print('\nValidation R^2 score was:', val_score)
    print('\nTest R^2 score was:', test_score)


In [None]:
split_and_validate(X, y)

In [None]:
# Work on a copy so the baseline feature matrix X stays unchanged.
X2 = X.copy()

# Add a squared Weight term. It is stored under lowercase 'weight', which is a
# different column from the original 'Weight'.
X2['weight'] = X2['Weight'] ** 2


# Re-run the same split/fit/report routine with the extra feature.
split_and_validate(X2, y)

In [None]:
# Start from X2 (which already carries the squared weight term).
X3 = X2.copy()

# Multiplicative interaction terms; the column names are arbitrary labels.
#   '12'    : Ram x Intel_Core_i7
#   '132'   : Ram x ppi
#   '1322'  : Intel_Core_i7 x ppi
#   '13252' : Razer x Gaming
X3['12']   = X3['Ram'] * X3['Intel_Core_i7']
X3['132']  = X3['Ram'] * X3['ppi']
X3['1322'] = X3['Intel_Core_i7'] * X3['ppi']
X3['13252']= X3['Razer'] * X3['Gaming']

# Re-run the same split/fit/report routine with the interaction features.
split_and_validate(X3, y)

# Experiment 2: OLS Regression Results


In [None]:
# statsmodels' OLS does not add an intercept on its own, so prepend a constant
# column to match the scikit-learn models fitted above. The cast to float keeps
# boolean indicator columns from turning the design matrix into an object array.
model = sm.OLS(y, sm.add_constant(X.astype(float)))
fit = model.fit()
fit.summary()

# Experiment 3: Decision Tree Regression

In [None]:
from sklearn.tree import DecisionTreeRegressor

In [None]:

# Keep the model under its own name so the df_u feature table is not overwritten.
# Score on held-out rows: a fully grown tree can reproduce its training targets
# almost exactly, so a training-set R^2 says little about model quality.
X_train_dt, X_test_dt, y_train_dt, y_test_dt = train_test_split(
    X, y, test_size=0.2, random_state=42)
dt_model = DecisionTreeRegressor(random_state=42)  # fixed seed for repeatable results
dt_model.fit(X_train_dt, y_train_dt)
dt_model.score(X_test_dt, y_test_dt)