In [16]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import precision_score, recall_score
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn import svm
from sklearn.metrics import accuracy_score
from mlxtend.plotting import plot_decision_regions
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import log_loss

%matplotlib inline

In [17]:
pairs_with_profiles = pd.read_csv('C:/Users/maksim/QuickDatingModel/Data/pairs_with_profiles.csv')
filtered_profiles = pd.read_csv('C:/Users/maksim/QuickDatingModel/Data/filtered_profiles.csv')

In [18]:
pairs_with_profiles.head()

Unnamed: 0.1,Unnamed: 0,age_diff,education_diff,drinks_same,smokes_same,body_type_same,match,match_probability,user1,user2,sex_user1,sex_user2
0,0,19.0,0,0,0,1,0,0.348889,55676.0,16029.0,0,1
1,1,25.0,0,0,0,1,0,0.055556,55676.0,59533.0,0,1
2,2,3.0,0,0,0,1,1,0.74,55676.0,49474.0,0,1
3,3,8.0,0,0,0,0,0,0.578889,55676.0,57955.0,0,1
4,4,6.0,0,0,0,1,1,0.71,55676.0,16774.0,0,1


In [20]:
pairs_with_profiles.shape

(244375, 12)

Чистим и делим данные

In [21]:
features = ['age_diff', 'education_diff', 'drinks_same', 'smokes_same', 'body_type_same']
X = pairs_with_profiles[features]
y = pairs_with_profiles['match_probability']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

Линейная модель

In [22]:
linear_model = LinearRegression()
linear_model.fit(X_train, y_train)

y_pred = linear_model.predict(X_test)

mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f"Mean Squared Error: {mse}")
print(f"R-squared: {r2}")

Mean Squared Error: 0.004080234516564435
R-squared: 0.9382559999418978


Логистическая модель

In [23]:
X = pairs_with_profiles[features]
y = pairs_with_profiles['match']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

logistic_model = LogisticRegression()
logistic_model.fit(X_train, y_train)

y_pred_prob = logistic_model.predict_proba(X_test)[:, 1]

log_loss_value = log_loss(y_test, y_pred_prob)

print(f"Log Loss: {log_loss_value}")

Log Loss: 0.058262902214948384


array([9.98306281e-01, 1.95836542e-07, 9.99999965e-01, ...,
       1.00000000e+00, 9.99999392e-01, 2.60899591e-31])

In [24]:
test_df = pd.DataFrame(pairs_with_profiles.loc[1:1])

In [25]:
test_df = test_df[['age_diff', 'education_diff', 'drinks_same', 'smokes_same', 'body_type_same']]

In [26]:
linear_model.predict(test_df)

array([0.19169843])

In [27]:
# На вход id пользователя, на выходе: таблица с вероятностями метчей со всеми пользователями
def sigmoid(x):
    return 1 / (1 + np.exp(-x))

def predict_matches_reg(user_id, profiles_df, model):
    user_profile = profiles_df.loc[user_id]
    data_for_prediction = pd.DataFrame(columns=['id', 'age_diff', 'education_diff', 'drinks_same', 'smokes_same', 'body_type_same'])

    rows = []
    for other_user_id in profiles_df.index:
        if other_user_id != user_id:
            age_diff = abs(user_profile['age'] - profiles_df.loc[other_user_id, 'age'])
            education_diff = abs(user_profile['education_'] - profiles_df.loc[other_user_id, 'education_'])
            drinks_same = int(user_profile['drinks_'] == profiles_df.loc[other_user_id, 'drinks_'])
            smokes_same = int(user_profile['smokes_'] == profiles_df.loc[other_user_id, 'smokes_'])
            body_type_same = int(user_profile['body_type_'] == profiles_df.loc[other_user_id, 'body_type_'])
            
            rows.append({
                'id': other_user_id,
                'age_diff': age_diff,
                'education_diff': education_diff,
                'drinks_same': drinks_same,
                'smokes_same': smokes_same,
                'body_type_same': body_type_same
            })


    data_for_prediction = pd.DataFrame(rows)

    X = data_for_prediction.drop(columns='id')
    data_for_prediction['match_probability'] = sigmoid(model.predict(X))
    
    return data_for_prediction[['id', 'match_probability']]

def predict_matches_clf(user_id, profiles_df, model):
    user_profile = profiles_df.loc[user_id]
    data_for_prediction = pd.DataFrame(columns=['id', 'age_diff', 'education_diff', 'drinks_same', 'smokes_same', 'body_type_same'])

    rows = []
    for other_user_id in profiles_df.index:
        if other_user_id != user_id:
            age_diff = abs(user_profile['age'] - profiles_df.loc[other_user_id, 'age'])
            education_diff = abs(user_profile['education_'] - profiles_df.loc[other_user_id, 'education_'])
            drinks_same = int(user_profile['drinks_'] == profiles_df.loc[other_user_id, 'drinks_'])
            smokes_same = int(user_profile['smokes_'] == profiles_df.loc[other_user_id, 'smokes_'])
            body_type_same = int(user_profile['body_type_'] == profiles_df.loc[other_user_id, 'body_type_'])
            
            rows.append({
                'id': other_user_id,
                'age_diff': age_diff,
                'education_diff': education_diff,
                'drinks_same': drinks_same,
                'smokes_same': smokes_same,
                'body_type_same': body_type_same
            })


    data_for_prediction = pd.DataFrame(rows)

    X = data_for_prediction.drop(columns='id')
    data_for_prediction['match_probability'] = model.predict_proba(X)[:, 1]
    # data_for_prediction['match_probability'] = data_for_prediction['match_probability'].round(6)
    
    return data_for_prediction[['id', 'match_probability']]

In [28]:
user_id = 1
predicted_matches = predict_matches_reg(user_id, filtered_profiles, linear_model)
predicted_matches.sort_values(by='match_probability',ascending=False )

Unnamed: 0,id,match_probability
862,863,0.737358
861,862,0.737358
908,909,0.737358
992,993,0.737358
793,794,0.732640
...,...,...
81,82,0.485866
624,625,0.482369
416,417,0.482369
396,397,0.476325


In [12]:
user_id = 1
predicted_matches = predict_matches_clf(user_id, filtered_profiles, logistic_model)
predicted_matches.head(10)

Unnamed: 0,id,match_probability
0,0,0.9999998
1,2,0.9998551
2,3,1.0
3,4,1.0
4,5,0.9999998
5,6,4.89061e-18
6,7,1.0
7,8,0.9999842
8,9,0.002762631
9,10,0.9452937


In [13]:
predicted_matches.shape

(999, 2)