In [154]:
import pandas as pd
import numpy as np
from PIL import Image
from sklearn.model_selection import train_test_split
import cv2
import pickle

In [155]:
celebDf  = pd.read_csv("Celeb.csv")
usDf = pd.read_csv("Hamza.csv")

In [156]:
#Get Celeb data and then your image data
#Concatnate the two dataframes to get a joint train, test split
celeb_train, celeb_test = train_test_split(celebDf, test_size=0.3, random_state = 8)
us_train, us_test = train_test_split(usDf, test_size=0.3, random_state = 8)

frames = [celeb_train, us_train]
train = pd.concat(frames)

frames = [celeb_test, us_test]
test = pd.concat(frames)

In [157]:
#Seperate the y_train and y_test. 
#We can have different objectives/y
y1_train = train["Identity"].to_numpy()
y2_train = train["Expression"].to_numpy()
y3_train = train["Age"].to_numpy()
y1_test = test["Identity"].to_numpy()
y2_test = test["Expression"].to_numpy()
y3_test = test["Age"].to_numpy()

In [158]:
#Get the train set image paths and convert it into a numpy array
#train_paths = train["Path"].to_numpy()
#Iterate over each image path, open the image, convert to numpy array, resize the image, 
#and reshape it into a row vector
#Append to a list
#Convert it back to numpy array

train_paths = train["Path"]
X_train = []
for path in train_paths:
    image = cv2.imread(path)
    new_image = cv2.resize(image, (32, 32)) 
    rows, columns, channels = new_image.shape
    new_image = new_image.reshape(rows * columns * channels)
    
    X_train.append(new_image / 255)
    
X_train = np.array(X_train)

In [159]:
#Do the same as the train split path
test_paths = test["Path"].to_numpy()

X_test = []
for path in test_paths:
    image = cv2.imread(path)
    new_image = cv2.resize(image, (32, 32)) 
    rows, columns, channels = new_image.shape
    new_image = new_image.reshape(rows * columns * channels)
    
    X_test.append(new_image / 255)
        
X_test = np.array(X_test)

In [160]:
X_train.shape

(140, 3072)

In [161]:
X_test.shape

(60, 3072)

# Linear Regression


In [162]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score
from sklearn.metrics import mean_squared_error

model1 = LinearRegression()
model1 = model1.fit(X_train, y3_train)

In [163]:
print("The coefficients are:",model1.coef_)
print("The coefficients are:",model1.intercept_)

The coefficients are: [-0.48741445 -0.56560441 -0.61233362 ...  0.28530772  0.12465966
 -0.45250063]
The coefficients are: 21.61918866166709


In [164]:
preds = model1.predict(X_test)
print("The MSE for the test set is:", mean_squared_error(y3_test, preds))
print("The R2 score for the test set is:", r2_score(y3_test, preds))


The MSE for the test set is: 432.4553090201663
The R2 score for the test set is: 0.19873476951202773


In [165]:
#Overfitting on training
preds = model1.predict(X_train)
print("The MSE for the test set is:", mean_squared_error(y3_train, preds))
print("The R2 score for the test set is:", r2_score(y3_train, preds))


The MSE for the test set is: 6.348684948569684e-27
The R2 score for the test set is: 1.0


# SGD regression


In [166]:
from sklearn.linear_model import SGDRegressor

model2 = SGDRegressor(max_iter = 2000)
model2 = model2.fit(X_train, y3_train)

print("The coefficients are:",model2.coef_)
print("The coefficients are:",model2.intercept_)

The coefficients are: [-9.62535302e+07 -1.33575036e+06 -1.11179278e+06 ...  8.10098171e+08
  5.36910240e+08  3.75841381e+08]
The coefficients are: [-5.76129076e+08]


In [167]:
preds = model2.predict(X_test)
print("The MSE for the test set is:", mean_squared_error(y3_test, preds))
print("The R2 score for the test set is:", r2_score(y3_test, preds))

The MSE for the test set is: 4.438849514426494e+20
The R2 score for the test set is: -8.22442390021049e+17


In [168]:
#Overfitting on training
preds = model2.predict(X_train)
print("The MSE for the test set is:", mean_squared_error(y3_train, preds))
print("The R2 score for the test set is:", r2_score(y3_train, preds))


The MSE for the test set is: 4.1049207343000447e+20
The R2 score for the test set is: -9.022775090420236e+17


# Pickling the models

In [169]:
#pickle converts the data into serialized format. You can convert it 
#into any language and web app
#You can also store it into AWS cloud as well. You can store it in AWS buckets as well

In [170]:
pickle.dump(model1, open("models/linModel.pkl", "wb")) #wb is write byte mode
pickle.dump(model2, open("models/sdgModel.pkl", "wb")) 

In [171]:
model1 = pickle.load(open("models/linModel.pkl", "rb")) #rb is write byte mode
model2 = pickle.load(open("models/sdgModel.pkl", "rb")) #rb is write byte mode

In [172]:
preds = model1.predict(X_test)
print("The MSE for the test set is:", mean_squared_error(y3_test, preds))
print("The R2 score for the test set is:", r2_score(y3_test, preds))

The MSE for the test set is: 432.4553090201663
The R2 score for the test set is: 0.19873476951202773


In [173]:
preds = model2.predict(X_test)
print("The MSE for the test set is:", mean_squared_error(y3_test, preds))
print("The R2 score for the test set is:", r2_score(y3_test, preds))

The MSE for the test set is: 4.438849514426494e+20
The R2 score for the test set is: -8.22442390021049e+17
