In [None]:
import pandas as pd
import numpy as np
from PIL import Image
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.decomposition import PCA
from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.pipeline import Pipeline
from sklearn.svm import SVC
from sklearn.preprocessing import OrdinalEncoder

In [None]:
# Celebrity labels treated as male when deriving the binary gender target.
# Spellings are kept exactly as written so they match the `celeb` column values.
male = [
    'Tom Holland',
    'Robert Downey Jr',
    'Jason Momoa',
    'Hugh Jackman',
    'Henry Cavil',
    'Dwayne Johnson',
    'Chris Pratt',
    'Chris Hemsworth',
    'Andy Samberg',
]

In [None]:
# Load the labelled training images (one row per image, one column per pixel)
# and replace any missing values with 0.
path = '/content/drive/MyDrive/DataMining/DataMining/dataset/training_celeb100x100.csv'
df = pd.read_csv(path, header=0).fillna(0)

# Binary target: 1 when the celebrity is listed in `male`, otherwise 0.
df['gender'] = df['celeb'].isin(male).astype('int64')
df.head(5)

Unnamed: 0,celeb,r1c1,r1c2,r1c3,r1c4,r1c5,r1c6,r1c7,r1c8,r1c9,...,r100c92,r100c93,r100c94,r100c95,r100c96,r100c97,r100c98,r100c99,r100c100,gender
0,Alexandra Daddario,0.117725,0.113804,0.125569,0.125529,0.113765,0.098078,0.086314,0.101569,0.10549,...,0.460549,0.374275,0.452706,0.274784,0.361059,0.425255,0.412627,0.470549,0.487373,0
1,Alexandra Daddario,0.799255,0.804353,0.803216,0.795725,0.812627,0.795843,0.828,0.863333,0.274,...,0.776314,0.812471,0.816392,0.792863,0.793176,0.789255,0.785333,0.781412,0.781412,0
2,Alexandra Daddario,0.917922,0.917922,0.918235,0.925216,0.925216,0.915451,0.920941,0.902471,0.889569,...,0.295098,0.388588,0.318,0.314549,0.280902,0.34051,0.400824,0.310314,0.448157,0
3,Alexandra Daddario,0.187647,0.187647,0.187686,0.187686,0.187686,0.191569,0.193059,0.199765,0.195843,...,0.57749,0.589255,0.604941,0.620627,0.620627,0.620627,0.603804,0.586196,0.543059,0
4,Alexandra Daddario,0.948392,0.937804,0.936667,0.917961,0.898549,0.878667,0.87102,0.87502,0.856157,...,0.091059,0.176627,0.16098,0.212667,0.158314,0.110667,0.086353,0.090275,0.16902,0


In [None]:
# Split the frame by column position: the first column holds the celebrity
# label, the last column is the derived gender flag, and every column in
# between is used as a feature.
y, y_gender = df.iloc[:, 0], df.iloc[:, -1]
x = df.iloc[:, 1:-1]

In [None]:
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report


def _print_report(label, y_true, y_pred):
    """Print accuracy, classification report and confusion matrix under a `label` heading."""
    clf_report = pd.DataFrame(classification_report(y_true, y_pred, output_dict=True))
    print(f"{label} Result:\n================================================")
    print(f"Accuracy Score: {accuracy_score(y_true, y_pred) * 100:.2f}%")
    print("_______________________________________________")
    print(f"CLASSIFICATION REPORT:\n{clf_report}")
    print("_______________________________________________")
    print(f"Confusion Matrix: \n {confusion_matrix(y_true, y_pred)}\n")


def print_score(clf, X_train, y_train, X_test, y_test, train=True):
    """Print evaluation metrics for a fitted classifier on one data split.

    Parameters
    ----------
    clf : fitted estimator exposing ``predict``.
    X_train, y_train : training features and labels.
    X_test, y_test : held-out features and labels.
    train : bool, default True
        When truthy the training split is evaluated; any falsy value
        evaluates the test split.

    Returns
    -------
    None. The report is written to stdout.
    """
    # A plain if/else guarantees a report is always produced; the previous
    # `elif train == False` silently printed nothing for flags such as None.
    if train:
        _print_report("Train", y_train, clf.predict(X_train))
    else:
        _print_report("Test", y_test, clf.predict(X_test))

In [None]:
# Feature preprocessing: min-max scaling followed by standardisation.
pipeline = Pipeline(steps=[
    ('min_max_scaler', MinMaxScaler()),
    ('std_scaler', StandardScaler()),
])

# Hold out 30% of the rows for evaluation; the target is the binary gender flag.
# The fixed random_state keeps the split reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    x,
    y_gender,
    test_size=0.3,
    random_state=42,
)

In [None]:
# Class balance of the training split (count of rows per gender flag).
y_train.value_counts()

gender
0    1257
1    1077
Name: count, dtype: int64

In [None]:
# Fit the scaling pipeline on the training split only, then apply the fitted
# transform to the held-out split.
# NOTE: X_train / X_test are overwritten in place, so running this cell a
# second time would re-fit the pipeline on already-scaled data; re-run from
# the train/test split cell instead.
X_train = pipeline.fit_transform(X_train)
X_test = pipeline.transform(X_test)

In [None]:
# Dimensionality reduction: a fractional n_components asks PCA to keep as many
# components as are needed to explain 95% of the variance. Fitted on the
# training split only and then applied to the held-out split.
pca = PCA(n_components=0.95)
X_train = pca.fit_transform(X_train)
X_test = pca.transform(X_test)

# Standardise the principal-component scores before they reach the classifier.
# Any new data must go through this same fitted scaler after the PCA step.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

In [None]:
# Number of principal components PCA selected to reach the requested 95% variance.
pca.n_components_

429

In [None]:
from sklearn.model_selection import GridSearchCV

# Search space for the RBF-kernel SVM.
param_grid = {'C': [10],
              'gamma': [0.01, 0.001],
              'kernel': ['rbf']}

# Wider search space, currently disabled:
# param_grid = {'C': [0.01, 0.1, 0.5, 1, 10, 100],
#               'gamma': [1, 0.75, 0.5, 0.25, 0.1, 0.01, 0.001],
#               'kernel': ['rbf', 'poly', 'linear']}

# NOTE(review): X_train was already scaled and PCA-projected using every
# training row, so each CV validation fold influenced the preprocessing and
# the cross-validated scores may be slightly optimistic.
grid = GridSearchCV(SVC(), param_grid, refit=True, verbose=1, cv=5)
grid.fit(X_train, y_train)
best_params = grid.best_params_
print(f"Best params: {best_params}")

# refit=True has already retrained the best configuration on all of X_train,
# so reuse that estimator instead of fitting an identical SVC a second time.
svm_clf = grid.best_estimator_

print_score(svm_clf, X_train, y_train, X_test, y_test, train=True)
print_score(svm_clf, X_train, y_train, X_test, y_test, train=False)

Fitting 5 folds for each of 2 candidates, totalling 10 fits
Best params: {'C': 10, 'gamma': 0.001, 'kernel': 'rbf'}
Train Result:
Accuracy Score: 99.57%
_______________________________________________
CLASSIFICATION REPORT:
                     0            1  accuracy    macro avg  weighted avg
precision     0.996813     0.994439  0.995716     0.995626      0.995718
recall        0.995227     0.996286  0.995716     0.995756      0.995716
f1-score      0.996019     0.995362  0.995716     0.995690      0.995716
support    1257.000000  1077.000000  0.995716  2334.000000   2334.000000
_______________________________________________
Confusion Matrix: 
 [[1251    6]
 [   4 1073]]

Test Result:
Accuracy Score: 81.62%
_______________________________________________
CLASSIFICATION REPORT:
                    0           1  accuracy    macro avg  weighted avg
precision    0.808550    0.825054  0.816184     0.816802      0.816547
recall       0.843023    0.787629  0.816184     0.815326      0.81

In [None]:
# Unlabelled images to classify. Missing values are zero-filled exactly as is
# done for the training frame, so both sets receive identical preprocessing
# and no NaN reaches the fitted scalers / PCA.
test = pd.read_csv(
    '/content/drive/MyDrive/DataMining/DataMining/dataset/unlabelledtest_celeb100x100.csv',
    header=0,
).fillna(0)
test.head(5)

Unnamed: 0,r1c1,r1c2,r1c3,r1c4,r1c5,r1c6,r1c7,r1c8,r1c9,r1c10,...,r100c91,r100c92,r100c93,r100c94,r100c95,r100c96,r100c97,r100c98,r100c99,r100c100
0,0.800863,0.804784,0.804784,0.800863,0.804784,0.800863,0.808706,0.808706,0.815098,0.799725,...,0.742314,0.73498,0.718784,0.718784,0.71051,0.711961,0.706588,0.714431,0.710941,0.719647
1,0.368784,0.253922,0.20702,0.298627,0.408235,0.37302,0.247451,0.206784,0.275922,0.399608,...,1.0,1.0,1.0,1.0,1.0,1.0,0.992157,0.99651,0.997686,0.862157
2,0.091059,0.091059,0.09498,0.093843,0.093843,0.097804,0.09898,0.102902,0.102941,0.102941,...,0.098392,0.067216,0.08702,0.038863,0.189294,0.407412,0.407098,0.463373,0.510353,0.266196
3,0.157529,0.137882,0.140627,0.132784,0.133647,0.117961,0.121725,0.122157,0.136353,0.140549,...,0.095647,0.090863,0.087804,0.091725,0.090235,0.090235,0.090235,0.090235,0.090235,0.086314
4,0.383373,0.383373,0.424039,0.478824,0.471255,0.451647,0.366235,0.337686,0.363725,0.355882,...,0.471059,0.603529,0.792157,0.722706,0.463569,0.593137,0.641294,0.566039,0.577216,0.546784


In [None]:
# Apply the scaling pipeline that was fitted on the training split
# (transform only, no re-fitting) to the unlabelled data.
X_new = pipeline.transform(test)

In [None]:
# Project the scaled unlabelled data onto the principal components learned
# from the training split.
# NOTE: X_new is overwritten, so this cell must run exactly once after the
# pipeline transform. The training features were additionally standardised
# with `scaler` after PCA; the same step has to be applied to this data
# before it is handed to the classifier.
X_new = pca.transform(X_new)

In [None]:
# Sanity check: (number of unlabelled rows, number of PCA components).
# The second value is expected to equal pca.n_components_.
X_new.shape

(190, 450)

In [None]:
# The classifier was trained on PCA scores standardised by `scaler`, so the
# unlabelled PCA scores must pass through the same fitted scaler before
# prediction; otherwise the SVM receives features on a different scale.
X_new_scaled = scaler.transform(X_new)
result = svm_clf.predict(X_new_scaled)

# One row per unlabelled image: the raw 0/1 prediction plus a readable label.
results = pd.DataFrame(result, columns=['binary'])
results['gender'] = results['binary'].apply(lambda x: 'male' if x == 1 else 'female')

results.to_csv("results_finetune_svm.csv")