In [13]:
# Basic Libraries
import numpy as np
import pandas as pd
import math

In [14]:
df_user1 = pd.DataFrame([['Model 1','255,0,0', 'round','4'], ['Model 2', '0,0,255', 'round','1'], ['Model 3', '0,0,0','cat','3'],['Model 4','255,0,0','oval','5'],
                         ['Model 5', '128,0,128','cat','3.5'],['Model 6', '0,0,0','rectangle','3'],['Model 7', '255,20,147','rectangle','4']],
     columns=['Name', 'Colour','Shape','Rating'])
df_user1

Unnamed: 0,Name,Colour,Shape,Rating
0,Model 1,25500,round,4.0
1,Model 2,255,round,1.0
2,Model 3,0,cat,3.0
3,Model 4,25500,oval,5.0
4,Model 5,1280128,cat,3.5
5,Model 6,0,rectangle,3.0
6,Model 7,25520147,rectangle,4.0


In [15]:
class RGB:
    '''RGB class to handle colour feature'''
    def __init__(self,r,g,b):
        self.r = r
        self.b = b
        self.g = g
'''red = RGB(255,0,0)
blue = RGB(0,0,255)
purple = RGB(128,0,128)
pink = RGB(255,20,147)
black = RGB(0,0,0)
gb = RGB(0,255,255)
green = RGB(0,255,0)'''       


'red = RGB(255,0,0)\nblue = RGB(0,0,255)\npurple = RGB(128,0,128)\npink = RGB(255,20,147)\nblack = RGB(0,0,0)\ngb = RGB(0,255,255)\ngreen = RGB(0,255,0)'

In [16]:
#colour_dist_red function get the colour distance of input colour with red. 
#Colour distance to represent how different/similar two colours are
def colour_dist_red(e1):
    e2 = RGB(255,0,0)
    rmean = (e1.r + e2.r ) // 2
    r = int(e1.r - e2.r)
    g = int(e1.g - e2.g)
    b = int(e1.b - e2.b)
    return math.sqrt((((512+rmean)*r*r)>>8) + 4*g*g + (((767-rmean)*b*b)>>8))/764.84

def colour_dist_green(e1):
    e2 = RGB(0,255,0)
    rmean = (e1.r + e2.r ) // 2
    r = int(e1.r - e2.r)
    g = int(e1.g - e2.g)
    b = int(e1.b - e2.b)
    return math.sqrt((((512+rmean)*r*r)>>8) + 4*g*g + (((767-rmean)*b*b)>>8))/764.84

def colour_dist_blue(e1):
    e2 = RGB(0,0,255)
    rmean = (e1.r + e2.r ) // 2
    r = int(e1.r - e2.r)
    g = int(e1.g - e2.g)
    b = int(e1.b - e2.b)
    return math.sqrt((((512+rmean)*r*r)>>8) + 4*g*g + (((767-rmean)*b*b)>>8))/764.84

In [17]:
colour = df_user1["Colour"]
colour_dist_list = []
for c in colour:
    c_list = list(map(int,c.split(",")))
    #print(c_list)
    colour_list =[]
    colour_RGB = RGB(c_list[0],c_list[1],c_list[2])
    colour_list.append(round(1-colour_dist_red(colour_RGB),5))
    colour_list.append(round(1-colour_dist_green(colour_RGB),5))
    colour_list.append(round(1-colour_dist_blue(colour_RGB),5))
    colour_dist_list.append(colour_list)
    
print(colour_dist_list)

[[1.0, 0.15024, 0.25478], [0.25478, 0.11815, 1.0], [0.47326, 0.33319, 0.42291], [1.0, 0.15024, 0.25478], [0.62753, 0.23543, 0.62753], [0.47326, 0.33319, 0.42291], [0.72321, 0.13546, 0.42551]]


In [18]:
df_user1_RGBSim = pd.DataFrame(colour_dist_list,columns=["sim_R","sim_G","sim_B"])
df_user1_NC = df_user1.drop(columns = "Colour")
df_user1_RGBSim = df_user1_RGBSim.reset_index(drop=True)
df_user1_newC = pd.concat([df_user1_NC,df_user1_RGBSim], axis=1)
df_user1_newC

Unnamed: 0,Name,Shape,Rating,sim_R,sim_G,sim_B
0,Model 1,round,4.0,1.0,0.15024,0.25478
1,Model 2,round,1.0,0.25478,0.11815,1.0
2,Model 3,cat,3.0,0.47326,0.33319,0.42291
3,Model 4,oval,5.0,1.0,0.15024,0.25478
4,Model 5,cat,3.5,0.62753,0.23543,0.62753
5,Model 6,rectangle,3.0,0.47326,0.33319,0.42291
6,Model 7,rectangle,4.0,0.72321,0.13546,0.42551


In [19]:
# Extract Response and Predictors
df_shape = pd.get_dummies(df_user1["Shape"])
df_features = df_user1_newC = pd.concat([df_shape,df_user1_RGBSim], axis=1)
df_rating = pd.DataFrame(df_user1["Rating"])

In [25]:
# Import train_test_split from sklearn
from sklearn.model_selection import train_test_split

# Split the Dataset into Train and Test
features_train, features_test, rating_train, rating_test = train_test_split(df_features, df_rating, test_size = 0.25)

# Check the sample sizes
print("Train Set :", rating_train.shape, features_train.shape)
print("Test Set  :", rating_test.shape,features_test.shape)

Train Set : (5, 1) (5, 7)
Test Set  : (2, 1) (2, 7)


In [26]:
# Import LinearRegression model from Scikit-Learn
from sklearn.linear_model import LinearRegression


# Linear Regression using Train Data
linreg = LinearRegression()         # create the linear regression object
linreg.fit(features_train, rating_train)        # train the linear regression model

# Coefficients of the Linear Regression line
print('Intercept of Regression \t: b = ', linreg.intercept_)
print('Coefficients of Regression \t: a = ', linreg.coef_)
print()

# Print the Coefficients against Predictors
pd.DataFrame(list(zip(features_train.columns, linreg.coef_[0])), columns = ["Predictors", "Coefficients"])

Intercept of Regression 	: b =  [1.41431341]
Coefficients of Regression 	: a =  [[ 3.55102598e-01  9.99200722e-16  3.55102598e-01 -7.10205197e-01
   3.52748434e+00 -6.49261464e-01 -5.26130435e-01]]



Unnamed: 0,Predictors,Coefficients
0,cat,0.3551026
1,oval,9.992007e-16
2,rectangle,0.3551026
3,round,-0.7102052
4,sim_R,3.527484
5,sim_G,-0.6492615
6,sim_B,-0.5261304


In [27]:
# Predict the Total values from Predictors
rating_train_pred = linreg.predict(features_train)
rating_test_pred = linreg.predict(features_test)
rating_test_pred

array([[4.00870524],
       [4.7102052 ]])

In [28]:
rating_test_actual = ""
for idx in features_test.index:
    rating_test_actual += df_user1.iloc[idx]['Rating'] + " "
print(f'Actual rating is: {rating_test_actual}')
print()
print(f'Predicted rating is: {rating_test_pred}')

Actual rating is: 4 5 

Predicted rating is: [[4.00870524]
 [4.7102052 ]]


In [29]:
# Import mean_squared_error from sklearn
from sklearn.metrics import mean_squared_error

# Check the Goodness of Fit (on Train Data)
print("Goodness of Fit of Model \tTrain Dataset")
print("Explained Variance (R^2) \t:", linreg.score(features_train, rating_train))
print("Mean Squared Error (MSE) \t:", mean_squared_error(rating_train, rating_train_pred))
print()

# Check the Goodness of Fit (on Test Data)
print("Goodness of Fit of Model \tTest Dataset")
print("Explained Variance (R^2) \t:", linreg.score(features_test,rating_test))
print("Mean Squared Error (MSE) \t:", mean_squared_error(rating_test,rating_test_pred))
print()

Goodness of Fit of Model 	Train Dataset
Explained Variance (R^2) 	: 1.0
Mean Squared Error (MSE) 	: 3.1554436208840474e-31

Goodness of Fit of Model 	Test Dataset
Explained Variance (R^2) 	: 0.8318863813508859
Mean Squared Error (MSE) 	: 0.042028404662278515

