In [1]:
#required Libraries
import pandas as pd
import numpy as np
import cv2 as cv
from matplotlib import pyplot as plt
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import GridSearchCV
from sklearn.decomposition import PCA
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

In [74]:
#Preprocess training set ### 
#splits gender and image name out of the training data and forms independent dataframe out of them
data=pd.read_csv('../Datasets/celeba/labels.csv')

labelsTrain = data["\timg_name\tgender\tsmiling"].str.split(pat="\t", n=-1, expand=True)
labelsTrain.drop(columns =[0,3], inplace = True)
labelsTrain.columns = ["img_name"  , "gender"]
labelsTrain = labelsTrain.astype({'gender': 'int32'})
labelsTrain = labelsTrain.astype({'img_name': 'string'})
labelsTrain["gender"] = labelsTrain["gender"].replace(-1, 0)
#Preprocess testing set###
#splits gender and image name out of the testing data and forms independent dataframe out of them
data=pd.read_csv('../Datasets/celeba_test/labels.csv')

labelsTest = data["\timg_name\tgender\tsmiling"].str.split(pat="\t", n=-1, expand=True)
labelsTest.drop(columns =[0,3], inplace = True)
labelsTest.columns = ["img_name"  , "gender"]
labelsTest = labelsTest.astype({'gender': 'int32'})
labelsTest = labelsTest.astype({'img_name': 'string'})
labelsTest["gender"] = labelsTest["gender"].replace(-1, 0)

In [77]:
#Load training set################################
#reads image data from training set ,converts to grayscale, flattens it , and stores it into imageTrain array 
imagesTrain = np.zeros((5000, 218, 178))

for i in range(0,5000):
    image = cv.imread('../Datasets/celeba/img/' + labelsTrain["img_name"][i])
    image = cv.cvtColor(image, cv.COLOR_BGR2GRAY)
    imagesTrain[i] = image
    
imagesTrain = imagesTrain.reshape(5000,38804)    
imagesTrain = pd.DataFrame(imagesTrain) 


#Load Testing set################################
#reads image data from Testing set ,converts to grayscale, flattens it , and stores it into imageTrain array 
imagesTest = np.zeros((1000, 218, 178))

for i in range(0,1000):
    image = cv.imread('../Datasets/celeba_test/img/' + labelsTest["img_name"][i])
    image = cv.cvtColor(image, cv.COLOR_BGR2GRAY)
    imagesTest[i] = image
    
imagesTest = imagesTest.reshape(1000,38804)   

imagesTest = pd.DataFrame(imagesTest) 

In [78]:
#scale coversion
scaler = MinMaxScaler()

imagesTrain_scaled = scaler.fit_transform(imagesTrain)
imagesTest_scaled = scaler.transform(imagesTest)

In [79]:
#PCA conversion
pca = PCA(n_components = 500)

imagesTrain_pca = pca.fit_transform(imagesTrain_scaled)
imagesTest_pca = pca.transform(imagesTest_scaled)

imagesTrain_pca = pd.DataFrame(imagesTrain_pca) 
imagesTest_pca = pd.DataFrame(imagesTest_pca) 

In [80]:
pca.explained_variance_ratio_.sum()

0.9586751736425905

In [81]:
#stores pca transformed image data into x_train and x_test respectively then joins labelsTrain and x_train into singular dataframe
x_train = pd.DataFrame(imagesTrain_pca) 
x_test = pd.DataFrame(imagesTest_pca)
x_train = pd.concat([labelsTrain,x_train],axis=1, join='inner')

In [86]:
#logistic regression#######
model = LogisticRegression(C=0.012742749857031334, max_iter=50, solver='sag')
model.fit(x_train.iloc[:,2:], labelsTrain['gender'])
model.score(x_test,labelsTest['gender'])



0.895

In [146]:
# sets hyper parameter testing for cross validation
hyper_params = {
    'max_iter' : [25,50,100],
    'solver' : ['lbfgs','newton-cg','liblinear','sag','saga'],
    'penalty' : ['l1','l2','elasticnet','none'],
    'C': np.logspace(-4,4,20)
    
    
}

In [147]:
#defines crossvalidation function for logistic regression
model = LogisticRegression()
grid=GridSearchCV(model , param_grid=hyper_params , cv=10 , n_jobs=-1 , verbose = True)

In [148]:
#starts cross validation
grid.fit(x_train.iloc[:,2:],labelsTrain['gender'])

Fitting 10 folds for each of 1200 candidates, totalling 12000 fits


5400 fits failed out of a total of 12000.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
600 fits failed with the following error:
Traceback (most recent call last):
  File "C:\Users\X99S5\AppData\Local\Programs\Python\Python310\lib\site-packages\sklearn\model_selection\_validation.py", line 686, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\X99S5\AppData\Local\Programs\Python\Python310\lib\site-packages\sklearn\linear_model\_logistic.py", line 1091, in fit
    solver = _check_solver(self.solver, self.penalty, self.dual)
  File "C:\Users\X99S5\AppData\Local\Programs\Python\Python310\lib\site-packages\sklearn\linear_model\_logistic.py", line 61, in _check_solver
    raise ValueError(
ValueError: Sol

In [152]:
#cross valdiation best estimator
grid.best_estimator_

In [150]:
#cross validation best score
grid.best_score_

0.8906000000000001