In [None]:
import numpy as np
import pandas as pd
from sklearn import preprocessing
import matplotlib.pyplot as plt

import cv2

from sklearn.pipeline import Pipeline

from sklearn.preprocessing import StandardScaler

from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso


from sklearn.preprocessing import PolynomialFeatures

import sklearn.metrics as sm

from sklearn.metrics import accuracy_score

In [None]:
# Columns that are handed to clean() further down as the ones to mean-fill.
cols = ["Gender", "Red", "Blue", "Green", "Hb"]

# Load the dataset (relative path) and preview the first rows.
df = pd.read_csv("datahb.csv")
df.head()

Unnamed: 0,Name,Gender,Red,Green,Blue,Hb
0,Shahalam,M,47.6372,26.7953,25.5675,15.1
1,Md.Ismail,M,45.6906,30.0735,24.2359,9.1
2,Saiful,M,43.4098,31.2315,25.3586,12.9
3,Yamin,M,44.4252,28.7436,26.8312,14.7
4,Tarek,M,44.3297,28.7617,26.9086,14.1


In [None]:
def encoder(series):
  """Label-encode ``series`` and return the encoded values.

  A fresh ``LabelEncoder`` is fitted on every call and discarded afterwards,
  so the label-to-code mapping is not kept for the caller.
  """
  return preprocessing.LabelEncoder().fit_transform(series)

In [None]:
# cleaning the data
def clean(data,cols):
  """Drop the ``Name`` column and mean-fill missing values in ``cols``.

  Parameters
  ----------
  data : pandas.DataFrame
      Input frame. It is not modified; a new frame is returned.
  cols : iterable of str
      Numeric columns whose missing values are replaced by that column's mean.

  Returns
  -------
  pandas.DataFrame
      Copy of ``data`` without ``Name`` and with every column in ``cols`` filled.
  """
  # errors="ignore" keeps the function idempotent: calling it again on an
  # already-cleaned frame (e.g. re-running the cell) no longer raises KeyError.
  data = data.drop(["Name"], axis = 1, errors = "ignore")

  for col in cols:
    # Assign the filled column back instead of fillna(inplace=True) on a
    # column selection, which is not guaranteed to write through to `data`.
    data[col] = data[col].fillna(data[col].mean())

  # Return only after *all* columns were processed (previously the return sat
  # inside the loop, so only the first column was ever filled).
  return data

In [None]:
# Replace the textual Gender labels with the values produced by encoder().
df = df.assign(Gender=encoder(df["Gender"]))

In [None]:
# Run the cleaning step on the listed columns, then preview the result.
df = clean(data=df, cols=cols)
df.head()


Unnamed: 0,Gender,Red,Green,Blue,Hb
0,1,47.6372,26.7953,25.5675,15.1
1,1,45.6906,30.0735,24.2359,9.1
2,1,43.4098,31.2315,25.3586,12.9
3,1,44.4252,28.7436,26.8312,14.7
4,1,44.3297,28.7617,26.9086,14.1


# Feature Engineering

In [None]:
def get_Xy(data, Target, inp):
  """Split one gender's rows of ``data`` into features and target.

  Parameters
  ----------
  data : pandas.DataFrame
      Frame containing an encoded ``Gender`` column and the ``Target`` column.
  Target : str
      Name of the column to predict. It is returned as ``y`` and removed
      from ``X``.
  inp : str
      ``"F"`` selects rows with ``Gender == 0``; any other value selects
      rows with ``Gender == 1``.

  Returns
  -------
  (pandas.DataFrame, pandas.Series)
      ``X`` (all columns except ``Target``) and ``y`` (the ``Target`` column)
      for the selected rows.
  """
  gender_code = 0 if inp == "F" else 1
  subset = data[data["Gender"] == gender_code]

  # Use the requested target column for y as well; it was previously
  # hard-coded to "Hb" regardless of the Target argument.
  y = subset[Target]
  X = subset.drop(Target, axis = 1)

  return X,y

In [None]:
# Work on the rows selected by inp="F" (Gender == 0); Hb is the target.
X, y = get_Xy(data=df, Target="Hb", inp="F")

# Adding new features suggested by Taff

In [None]:
# Engineered colour terms: Blue*Green^5, Red*Green^5 and Red^3.
# The raw "Blue" column is dropped afterwards and survives only inside BG5.
X = X.assign(
    BG5=X["Blue"] * X["Green"] ** 5,
    RG5=X["Red"] * X["Green"] ** 5,
    R3=X["Red"] ** 3,
).drop("Blue", axis=1)
X

Unnamed: 0,Gender,Red,Green,BG5,RG5,R3
30,0,47.1003,28.3195,447725200.0,857929500.0,104489.107582
36,0,50.3562,29.2206,435080300.0,1072750000.0,127690.57696
37,0,43.9224,29.8338,620251700.0,1038075000.0,84734.093605
38,0,45.0693,29.8506,594421600.0,1068184000.0,91546.646169
41,0,38.7968,31.3618,905365800.0,1177066000.0,58396.620968
42,0,45.4201,29.4684,558027300.0,1009325000.0,93701.006982
43,0,44.1609,29.5993,596169000.0,1003333000.0,86121.928688
44,0,44.1609,29.5993,596169000.0,1003333000.0,86121.928688
45,0,42.5892,31.5405,807503400.0,1329359000.0,77249.992681
46,0,46.9597,28.4993,461384700.0,882869000.0,103556.160831


In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for validation. A fixed random_state makes the
# split -- and every score computed from it -- repeatable across kernel
# restarts; with random_state=None each run produced a different split.
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size = 0.2, random_state = 42)

print(type(X_val))

<class 'pandas.core.frame.DataFrame'>


# Pipeline

In [None]:
# Pipeline stages, in the order they are applied:
#   1. standardise the features,
#   2. expand them with polynomial terms up to degree 3,
#   3. fit a ridge regression (alpha=5, with intercept) on the expanded features.
steps = [
    ('scalar', StandardScaler()),
    ('poly_features', PolynomialFeatures(degree=3)),
    ('regularization', Ridge(alpha=5, fit_intercept=True)),
]

In [None]:
# Build the pipeline from `steps` and fit it on the training split in one go.
clf = Pipeline(steps).fit(X_train, y_train)

In [None]:
# Score the fitted pipeline on the held-out validation rows.
y_pred = clf.predict(X_val)

# Note: despite its name, `accuracy` holds the R2 score rounded to 2 decimals.
accuracy = round(sm.r2_score(y_val, y_pred), 2)
print("The R2 score is", accuracy, "(Best possible score is 1.0)")

The R2 score is -0.54 (Best possible score is 1.0)


# Training is complete up to this point

In [None]:
# Function to get RGB % of an image
def rgb_percentage(string):
  """Return the share of red, green and blue pixels of an image, in percent.

  The image is loaded with ``cv2.imread`` and thresholded three times with
  ``cv2.inRange``, once per colour range defined below. Each percentage is
  that colour's matching-pixel count divided by the sum of the three counts.
  A pixel that falls inside more than one range is counted once per range.

  Parameters
  ----------
  string : str
      Path of the image file to analyse.

  Returns
  -------
  tuple of float
      ``(red, green, blue)`` percentages, in that order.

  Raises
  ------
  OSError
      If ``cv2.imread`` cannot load the image (it returns ``None``).
  ZeroDivisionError
      If no pixel falls inside any of the three colour ranges.
  """
  # (lower, upper) bounds per colour, written in the channel order of the
  # array returned by cv2.imread (B, G, R per the OpenCV documentation).
  bounds = {
      "red": ([50, 50, 102], [255, 255, 144]),
      "green": ([50, 108, 50], [255, 130, 255]),
      "blue": ([110, 50, 50], [130, 255, 255]),
  }

  image = cv2.imread(string)  # string is image name
  if image is None:
    # imread signals failure by returning None instead of raising.
    raise OSError(f"cv2.imread could not load image: {string!r}")

  counts = {}
  for colour, (lower, upper) in bounds.items():
    mask = cv2.inRange(
        image,
        np.array(lower, dtype = "uint8"),
        np.array(upper, dtype = "uint8"),
    )
    # Vectorised count of matching pixels (replaces the per-pixel Python loops).
    counts[colour] = int(np.count_nonzero(mask))

  total = counts["red"] + counts["green"] + counts["blue"]
  if total == 0:
    # Same exception type as the bare division used to raise, but with a
    # message that explains what went wrong.
    raise ZeroDivisionError(
        f"no pixel of {string!r} falls inside the red, green or blue range"
    )

  red_px_p = counts["red"]*100/total
  green_px_p = counts["green"]*100/total
  blue_px_p = counts["blue"]*100/total

  return red_px_p, green_px_p, blue_px_p

In [None]:
# Function for converting real-time inputs into one row of model features
def new_df( gender, red, green, blue):
    """Build the feature values for a single prediction.

    Parameters
    ----------
    gender : int
        Encoded gender value.
    red, green, blue : float
        Colour percentages of the image.

    Returns
    -------
    tuple
        ``(gender, red, green, blue*green**5, red*green**5, red**3)``.
        This order matches the column labels
        ``Gender, Red, Green, BG5, RG5, R3`` that the training features use.
        The values are labelled positionally by the caller, so the order
        matters (previously red/green/gender were returned in the wrong slots).
    """
    green_pow5 = pow(green, 5)
    feature_1 = blue * green_pow5   # BG5
    feature_2 = red * green_pow5    # RG5
    red_corr = pow(red, 3)          # R3

    return (gender, red, green, feature_1, feature_2, red_corr)


In [None]:
# Image file to analyse (relative path); it is passed to rgb_percentage() below.
img = "noorain.jpg"

In [None]:
# Colour percentages extracted from the image.
red_p, green_p, blue_p = rgb_percentage(img)
gender = 0  # 0 is the Gender value that get_Xy selects for inp == "F"

# One-row feature frame. The values from new_df() are labelled purely by
# position, so new_df() must return them in exactly this column order.
x_sub = pd.DataFrame(
    np.asarray(new_df(gender, red_p, green_p, blue_p)).reshape(1, -1),
    columns = ["Gender" , "Red", "Green", "BG5", "RG5", "R3"],
    index = [1],
)

print(np.asarray(x_sub))
y_pred = clf.predict(x_sub)
print(red_p, green_p, blue_p)
print(y_pred)

[[5.29224001e+01 3.03087733e+01 0.00000000e+00 4.28888569e+08
  1.35357190e+09 1.48224023e+05]]
52.922400147164694 30.308773332303787 16.76882652053152
[1292.32304704]
