In [5]:
import pandas as pd
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report
import joblib
from scipy.stats import randint

# Load the CSV file
data = pd.read_csv('matches.csv')

# Encode interests as dummy variables
interests = data['interests'].str.get_dummies(':')
data = pd.concat([data, interests], axis=1)
data.drop('interests', axis=1, inplace=True)

# Encode categorical variables as dummy variables
categorical_vars = ['gender', 'height', 'skin_complexion', 'body_type', 'hair_size', 'education', 'career', 'religion']
data = pd.get_dummies(data, columns=categorical_vars)

# Drop unnecessary columns
data.drop(['id', 'description'], axis=1, inplace=True)

# Select features and target
X = data.drop('match', axis=1)
y = data['match']

# Ensure all data types are numeric
X = X.apply(pd.to_numeric)

# Split the data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Define the parameter grid
param_dist = {
    'n_estimators': randint(50, 200),
    'max_depth': randint(5, 20),
    'min_samples_split': randint(2, 20),
    'min_samples_leaf': randint(1, 20),
    'bootstrap': [True, False]
}

# Perform RandomizedSearchCV
clf = RandomForestClassifier(random_state=42)
random_search = RandomizedSearchCV(clf, param_distributions=param_dist, n_iter=100, cv=5, scoring='accuracy', random_state=42, n_jobs=-1)
random_search.fit(X_train, y_train)

# Get the best estimator
best_clf = random_search.best_estimator_

# Predict on the test set
y_pred = best_clf.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
report = classification_report(y_test, y_pred)

print(f"Best Parameters: {random_search.best_params_}")
print(f"Accuracy: {accuracy:.2f}")
print(f"Precision: {precision:.2f}")
print(f"Recall: {recall:.2f}")
print(f"F1 Score: {f1:.2f}")
print("Classification Report:")
print(report)

# Save the model if accuracy is above a certain threshold (e.g., 70%)
if accuracy > 0.7:
    joblib.dump(best_clf, 'match_model.pkl')
    print("Model saved as 'match_model.pkl'")
else:
    print("Model accuracy is below the threshold, not saving the model.")


Best Parameters: {'bootstrap': True, 'max_depth': 17, 'min_samples_leaf': 7, 'min_samples_split': 6, 'n_estimators': 71}
Accuracy: 0.50
Precision: 0.53
Recall: 0.91
F1 Score: 0.67
Classification Report:
              precision    recall  f1-score   support

           0       0.00      0.00      0.00         9
           1       0.53      0.91      0.67        11

    accuracy                           0.50        20
   macro avg       0.26      0.45      0.33        20
weighted avg       0.29      0.50      0.37        20

Model accuracy is below the threshold, not saving the model.
