In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.metrics import classification_report
from sklearn.cluster import KMeans
from sklearn.metrics import accuracy_score

In [3]:
# Load the dataset from the CSV file located in the notebook's working directory.
df = pd.read_csv(filepath_or_buffer="beauty.csv")


In [13]:
# Relabel the columns positionally. The list must supply exactly one name per
# existing column of df; pandas raises a length-mismatch ValueError otherwise.
df.columns = [
    'wage', 'exper', 'union', 'goodhlth', 'black',
    'female', 'married', 'service', 'educ', 'looks',
]

In [14]:
# Display the frame as the cell output to check the assigned column names.
df

Unnamed: 0,wage,exper,union,goodhlth,black,female,married,service,educ,looks
0,5.73,30,0,1,0,1,1,1,14,4
1,4.28,28,0,1,0,1,1,0,12,3
2,7.96,35,0,1,0,1,0,0,10,4
3,11.57,38,0,1,0,0,1,1,16,3
4,11.42,27,0,1,0,0,1,0,16,3
...,...,...,...,...,...,...,...,...,...,...
1255,1.61,25,0,1,1,1,0,1,12,3
1256,1.68,4,0,1,0,1,1,1,12,2
1257,3.29,35,0,1,1,1,0,1,12,3
1258,2.31,15,0,1,1,1,1,1,10,3


In [20]:
# Coerce every column to a numeric dtype. errors="raise" is the pandas default;
# it is spelled out so that a non-numeric value fails loudly at this step.
df = df.apply(pd.to_numeric, errors="raise")
# Show the converted frame as the cell output.
df

Unnamed: 0,wage,exper,union,goodhlth,black,female,married,service,educ,looks
0,5.73,30,0,1,0,1,1,1,14,4
1,4.28,28,0,1,0,1,1,0,12,3
2,7.96,35,0,1,0,1,0,0,10,4
3,11.57,38,0,1,0,0,1,1,16,3
4,11.42,27,0,1,0,0,1,0,16,3
...,...,...,...,...,...,...,...,...,...,...
1255,1.61,25,0,1,1,1,0,1,12,3
1256,1.68,4,0,1,0,1,1,1,12,2
1257,3.29,35,0,1,1,1,0,1,12,3
1258,2.31,15,0,1,1,1,1,1,10,3


In [16]:
# Build the clustering features: every column of df except 'wage'.
# Only 'wage' is dropped here; the 'cluster' column does not exist yet and is
# appended at the end of this cell.
X = df.drop(columns='wage')

# n_init is pinned explicitly because scikit-learn changed its default
# (10 -> 'auto' in 1.4); leaving it implicit makes the cluster labels depend
# on the installed version. random_state fixes the centroid initialisation.
# NOTE(review): the features are clustered unscaled, so columns with a wider
# numeric range presumably dominate the distance — confirm this is intended.
kmeans = KMeans(n_clusters=3, n_init=10, random_state=42)
clusters = kmeans.fit_predict(X)

# Store the label of each row next to its features for the following cell.
X['cluster'] = clusters

In [17]:
# Use the KMeans labels as the classification target.
# y is built from `clusters` rather than read back out of X so this cell can
# be re-run safely: reading X['cluster'] and then dropping it from X would
# raise KeyError on a second execution.
y = pd.Series(clusters, index=X.index, name='cluster')

# Remove the label column from the features; errors='ignore' keeps the drop
# a no-op when the column has already been removed by an earlier run.
X = X.drop(columns='cluster', errors='ignore')

# Hold out 20% of the rows for evaluation; the seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [18]:
# Candidate classifiers and their hyper-parameter grids.
# Each entry maps a display name to (estimator, param_grid). Grid keys carry
# the 'clf__' prefix because the estimator is later placed in a Pipeline step
# named 'clf'.
models = {
    'LogisticRegression': (
        LogisticRegression(max_iter=1000),
        {'clf__C': [0.01, 0.1, 1, 10]}
    ),
    'RandomForest': (
        # random_state is fixed so the bootstrap samples and feature subsets —
        # and therefore the selected parameters and the reported accuracy —
        # are reproducible between runs, matching the seed used elsewhere.
        RandomForestClassifier(random_state=42),
        {'clf__n_estimators': [50, 100], 'clf__max_depth': [5, 10, None]}
    ),
    'SVC': (
        SVC(),
        {'clf__C': [0.1, 1, 10], 'clf__kernel': ['linear', 'rbf']}
    )
}

In [27]:
# Train, tune and evaluate each candidate model in turn.
for name, (model, params) in models.items():
    # Scaling lives inside the pipeline, so it is re-fitted on the training
    # part of every cross-validation fold rather than on the full training set.
    pipe = Pipeline(steps=[('scaler', StandardScaler()), ('clf', model)])

    # Exhaustive 5-fold search over this model's grid, using all CPU cores.
    grid = GridSearchCV(estimator=pipe, param_grid=params, cv=5, n_jobs=-1)

    # fit() returns the fitted search object, whose predict() delegates to the
    # best estimator refitted on the whole training set.
    y_pred = grid.fit(X_train, y_train).predict(X_test)

    # Hold-out accuracy expressed as a percentage.
    accuracy = 100 * accuracy_score(y_test, y_pred)

    print(f"\n{name} ")
    print("Best Parameters:", grid.best_params_)
    print(f"Accuracy: {accuracy:.2f}%")
    print("Classification Report:")
    print(classification_report(y_test, y_pred))


LogisticRegression 
Best Parameters: {'clf__C': 10}
Accuracy: 99.60%
Classification Report:
              precision    recall  f1-score   support

           0       0.99      1.00      0.99        73
           1       1.00      1.00      1.00       129
           2       1.00      0.98      0.99        50

    accuracy                           1.00       252
   macro avg       1.00      0.99      0.99       252
weighted avg       1.00      1.00      1.00       252


RandomForest 
Best Parameters: {'clf__max_depth': 5, 'clf__n_estimators': 100}
Accuracy: 99.60%
Classification Report:
              precision    recall  f1-score   support

           0       1.00      0.99      0.99        73
           1       1.00      1.00      1.00       129
           2       0.98      1.00      0.99        50

    accuracy                           1.00       252
   macro avg       0.99      1.00      0.99       252
weighted avg       1.00      1.00      1.00       252


SVC 
Best Parameters: {'