In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.metrics import classification_report, accuracy_score

In [None]:
# Read the raw file, relabel its ten columns positionally, and coerce every
# column to a numeric dtype in one chained expression.
df = (
    pd.read_csv("beauty (1).csv")
    .set_axis(
        ['wage', 'exper', 'union', 'goodhlth', 'black',
         'female', 'married', 'service', 'educ', 'looks'],
        axis=1,
    )
    .apply(pd.to_numeric)
)

# Preview the first rows as the cell output.
df.head()

Unnamed: 0,wage,exper,union,goodhlth,black,female,married,service,educ,looks
0,5.73,30,0,1,0,1,1,1,14,4
1,4.28,28,0,1,0,1,1,0,12,3
2,7.96,35,0,1,0,1,0,0,10,4
3,11.57,38,0,1,0,0,1,1,16,3
4,11.42,27,0,1,0,0,1,0,16,3


In [None]:
# Standardise the features and assign each row to one of three K-Means clusters.
# A 'cluster' column left over from an earlier run of this cell is excluded, so
# re-running the cell never feeds old labels back into the scaler or K-Means.
feature_frame = df.drop(columns='cluster', errors='ignore')

scaler01 = StandardScaler()
X_scaled = scaler01.fit_transform(feature_frame)

# n_init is set explicitly because its default differs between scikit-learn
# releases; pinning it keeps the labels stable across environments.
kmeansCLS = KMeans(n_clusters=3, n_init=10, random_state=42)
df['cluster'] = kmeansCLS.fit_predict(X_scaled)

df.head()

Unnamed: 0,wage,exper,union,goodhlth,black,female,married,service,educ,looks,cluster
0,5.73,30,0,1,0,1,1,1,14,4,0
1,4.28,28,0,1,0,1,1,0,12,3,0
2,7.96,35,0,1,0,1,0,0,10,4,0
3,11.57,38,0,1,0,0,1,1,16,3,2
4,11.42,27,0,1,0,0,1,0,16,3,1


In [None]:
# Target is the K-Means label; the features are every other column of df.
y = df['cluster']
X = df.drop(columns=['cluster'])

# Hold out 20% of the rows for testing, stratified on the label so each
# cluster keeps its share in both splits.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)


In [6]:
# Candidate classifiers and the hyper-parameter grid searched for each one.
# Each entry maps a display name to:
#   'model'  - an unfitted estimator instance
#   'params' - the parameter grid handed to GridSearchCV
models_params = {
    'LogisticRegression': {
        'model': LogisticRegression(max_iter=1000),
        'params': {
            'C': [0.1, 1, 10],
            'solver': ['lbfgs', 'liblinear']
        }
    },
    'RandomForest': {
        # Seeded so the grid-search scores and the selected parameters are
        # reproducible, matching the random_state=42 used elsewhere in this notebook.
        'model': RandomForestClassifier(random_state=42),
        'params': {
            'n_estimators': [50, 100],
            'max_depth': [None, 10, 20]
        }
    },
    'SVM': {
        'model': SVC(),
        'params': {
            'C': [0.1, 1, 10],
            'kernel': ['linear', 'rbf']
        }
    }
}


In [None]:
# Tune every candidate with 5-fold grid search on the training split, then
# score the selected configuration on the held-out test split.
# `results` maps model name -> best parameters, test accuracy, and the
# classification report as a dict.
results = {}

for name, mp in models_params.items():
    print(f"\nTraining {name}...")
    clf = GridSearchCV(mp['model'], mp['params'], cv=5)
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)

    # Compute accuracy once and reuse it for both the printout and the summary.
    accuracy = accuracy_score(y_test, y_pred)

    print("\nBest Params:", clf.best_params_)
    print("\nAccuracy:", accuracy)
    print("\nClassification Report:\n", classification_report(y_test, y_pred))

    results[name] = {
        'best_params': clf.best_params_,
        'accuracy': accuracy,
        'report': classification_report(y_test, y_pred, output_dict=True)
    }


Training LogisticRegression...

Best Parameters: {'C': 10, 'solver': 'lbfgs'}

Accuracy: 0.9920634920634921

Classification Report:
               precision    recall  f1-score   support

           0       0.99      0.99      0.99        84
           1       0.99      1.00      1.00       102
           2       1.00      0.98      0.99        66

    accuracy                           0.99       252
   macro avg       0.99      0.99      0.99       252
weighted avg       0.99      0.99      0.99       252


Training RandomForest...

Best Parameters: {'max_depth': None, 'n_estimators': 100}

Accuracy: 0.9563492063492064

Classification Report:
               precision    recall  f1-score   support

           0       0.97      0.99      0.98        84
           1       0.93      1.00      0.96       102
           2       1.00      0.85      0.92        66

    accuracy                           0.96       252
   macro avg       0.96      0.95      0.95       252
weighted avg       