In [1]:
# SPARQL query to extract lead transitions
query = """
PREFIX crm: <http://www.example.org/crm_detailed_ontology#>
PREFIX xsd: <http://www.w3.org/2001/XMLSchema#>

SELECT ?lead_id ?lead_name ?params_id ?old_status_id ?old_status_name ?new_status_id ?new_status_name ?change_date ?item_id
WHERE {
    ?params_class rdf:type crm:ParamsClass .
    ?lead crm:hasItemFromLead ?item .


    ?lead crm:hasID ?lead_id .
    ?lead crm:hasName ?lead_name .
    ?params_class crm:hasID ?params_id .

    ?params_class crm:belongFromParamsClassToOldStatus ?old_status .

    ?old_status crm:hasID ?old_status_id .
    ?old_status crm:hasName ?old_status_name .

    ?params_class crm:belongFromParamsClassToNewStatus ?new_status .
    ?new_status crm:hasID ?new_status_id .

    ?new_status crm:hasName ?new_status_name .

    ?item crm:hasID ?item_id .
    ?item crm:hasDateModified ?change_date .

}
ORDER BY ?lead_id ?change_date
"""

In [2]:
query_features = """
PREFIX crm: <http://www.example.org/crm_detailed_ontology#>
PREFIX xsd: <http://www.w3.org/2001/XMLSchema#>

SELECT ?lead_id ?lead_name ?created_at ?responsible_user_id (COUNT(?item) AS ?interaction_count)
WHERE {
    ?lead rdf:type crm:Lead .
    ?lead crm:hasID ?lead_id .
    ?lead crm:hasName ?lead_name .
    ?lead crm:hasCreatedAt ?created_at .
    OPTIONAL { ?lead crm:hasResponsibleUserID ?responsible_user_id . }
    OPTIONAL { ?lead crm:hasItemFromLead ?item . }
}
GROUP BY ?lead_id ?lead_name ?created_at ?responsible_user_id
"""

In [3]:
# Import necessary libraries
from collections import defaultdict
from datetime import datetime, timedelta
from rdflib import Graph, Namespace, URIRef, Literal
from rdflib.namespace import RDF, RDFS, XSD
from rdflib.plugins.sparql import prepareQuery

# Load the RDF graph
g = Graph()
g.parse('crm_graph.rdf', format='xml')

# Define the namespace
CRM = Namespace("http://www.example.org/crm_detailed_ontology#")
g.bind("crm", CRM)

In [4]:
# Execute the status transitions query
results = g.query(query)

# Process the results
transitions_data = []
for row in results:
    lead_id = row.lead_id.toPython()
    lead_name = row.lead_name.toPython()
    params_id = row.params_id.toPython()
    old_status_id = row.old_status_id.toPython()
    old_status_name = row.old_status_name.toPython()
    new_status_id = row.new_status_id.toPython()
    new_status_name = row.new_status_name.toPython()
    change_date = row.change_date.toPython()
    item_id = row.item_id.toPython()
    
    
    transitions_data.append({
        'lead_id': lead_id,
        'lead_name': lead_name,
        'params_id': params_id,
        'old_status_id': old_status_id,
        'old_status_name': old_status_name,
        'new_status_id': new_status_id,
        'new_status_name': new_status_name,
        'change_date': change_date,
        'item_id': item_id
    })

# Similarly execute and process the features query
results_features = g.query(query_features)

features_data = []
for row in results_features:
    lead_id = row.lead_id.toPython()
    lead_name = row.lead_name.toPython()
    created_at = row.created_at.toPython()
    responsible_user_id = row.responsible_user_id.toPython() if row.responsible_user_id else None
    interaction_count = int(row.interaction_count)
    
    features_data.append({
        'lead_id': lead_id,
        'lead_name': lead_name,
        'created_at': created_at,
        'responsible_user_id': responsible_user_id,
        'interaction_count': interaction_count
    })

In [5]:
import pandas as pd

# Convert lists to DataFrames
df_transitions = pd.DataFrame(transitions_data)
df_features = pd.DataFrame(features_data)

# Merge data on 'lead_id'
df_merged = pd.merge(df_transitions, df_features, on=['lead_id', 'lead_name'], how='left')

# Ensure 'change_date' is in datetime format
df_merged['change_date'] = pd.to_datetime(df_merged['change_date'])
df_merged['created_at'] = pd.to_datetime(df_merged['created_at'])

In [6]:
df_transitions.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 60388 entries, 0 to 60387
Data columns (total 9 columns):
 #   Column           Non-Null Count  Dtype         
---  ------           --------------  -----         
 0   lead_id          60388 non-null  int64         
 1   lead_name        60388 non-null  object        
 2   params_id        60388 non-null  int64         
 3   old_status_id    60388 non-null  int64         
 4   old_status_name  60388 non-null  object        
 5   new_status_id    60388 non-null  int64         
 6   new_status_name  60388 non-null  object        
 7   change_date      60388 non-null  datetime64[ns]
 8   item_id          60388 non-null  int64         
dtypes: datetime64[ns](1), int64(5), object(3)
memory usage: 4.1+ MB


In [7]:
df_merged.head()

Unnamed: 0,lead_id,lead_name,params_id,old_status_id,old_status_name,new_status_id,new_status_name,change_date,item_id,created_at,responsible_user_id,interaction_count
0,24050537,Proskater.Ru,278,41159701,возвращение в работу,34649023,Клиент квалифицирован,2020-10-07 16:20:20,47620073,1970-01-01 00:00:01.602076820,11089494,200
1,24050537,Proskater.Ru,278,41159701,возвращение в работу,34649023,Клиент квалифицирован,2020-10-07 16:20:20,47620074,1970-01-01 00:00:01.602076820,11089494,200
2,24050537,Proskater.Ru,278,41159701,возвращение в работу,34649023,Клиент квалифицирован,2020-10-07 16:20:20,47620075,1970-01-01 00:00:01.602076820,11089494,200
3,24050537,Proskater.Ru,278,41159701,возвращение в работу,34649023,Клиент квалифицирован,2020-10-07 16:20:20,47620076,1970-01-01 00:00:01.602076820,11089494,200
4,24050537,Proskater.Ru,278,41159701,возвращение в работу,34649023,Клиент квалифицирован,2020-10-07 16:20:20,47620077,1970-01-01 00:00:01.602076820,11089494,200


In [8]:
df_merged.describe()

Unnamed: 0,lead_id,params_id,old_status_id,new_status_id,change_date,item_id,created_at,responsible_user_id,interaction_count
count,60388.0,60388.0,60388.0,60388.0,60388,60388.0,60388,60388.0,60388.0
mean,26986850.0,264.870968,29321820.0,31608600.0,2023-10-21 21:50:59.080081920,46365600.0,1970-01-01 00:00:01.665628697,8692636.0,89.817248
min,24050540.0,12.0,142.0,142.0,2020-10-07 16:20:20,1.0,1970-01-01 00:00:01.602076820,7737097.0,14.0
25%,25690820.0,149.0,34639410.0,143.0,2023-05-18 15:19:41,47618190.0,1970-01-01 00:00:01.629795367,7737097.0,45.0
50%,27736090.0,307.0,34639410.0,34649890.0,2024-02-20 05:10:50.500000,47619810.0,1970-01-01 00:00:01.679552085,7737097.0,82.0
75%,27925200.0,376.0,34649030.0,52893910.0,2024-08-22 12:20:54,47620050.0,1970-01-01 00:00:01.685633521,11089490.0,93.0
max,28788200.0,490.0,52893910.0,52893910.0,2024-12-19 16:17:26,47620330.0,1970-01-01 00:00:01.714049212,11089490.0,200.0
std,1667223.0,137.962507,16724710.0,21389870.0,,6924573.0,,1446659.0,60.319156


In [9]:
# Define the time window in days
time_window = pd.Timedelta(days=7)

# Sort transitions by lead_id and change_date
df_merged.sort_values(by=['lead_id', 'change_date'], inplace=True)

# Initialize labels
df_merged['label'] = 0

# Group by lead
for lead_id, group in df_merged.groupby('lead_id'):
    transitions = group.reset_index()
    df_sorted_transitions = transitions.sort_values(by=['change_date'], ascending=False)
    # print(len(df_sorted_transitions['item_id'].sort_values().unique()))
    # df_sorted_transitions.to_csv('out.csv', sep='\t')
    # break
    counter = 0
    for i in range(len(transitions)):
        current_transition = transitions.loc[i]
        current_date = current_transition['change_date']
        current_new_status = current_transition['new_status_id']
        if i + 1 < len(transitions):
            next_transition = transitions.loc[i + 1]
            next_date = next_transition['change_date']
            next_new_status = next_transition['new_status_id']
            
            if next_new_status != current_new_status:
                counter+=1
                
                time_diff = next_date - current_date
                if time_diff <= time_window:
                    # Lead moved to next status within time window
                    df_merged.loc[current_transition['index'], 'label'] = 1
                else:
                    df_merged.loc[current_transition['index'], 'label'] = 0
        else:
            # No subsequent transition; label depends on business logic
            df_merged.loc[current_transition['index'], 'label'] = 0
    print("current_new_status", counter)
    

current_new_status 5129
current_new_status 1709
current_new_status 1319
current_new_status 2009
current_new_status 659
current_new_status 599
current_new_status 839
current_new_status 209
current_new_status 809
current_new_status 449
current_new_status 1139
current_new_status 269
current_new_status 1829
current_new_status 929
current_new_status 839
current_new_status 1019
current_new_status 2129


In [10]:
# Time since lead creation
df_merged['time_since_creation'] = (df_merged['change_date'] - df_merged['created_at']).dt.total_seconds() / (3600*24)

# Convert categorical variables
df_merged['responsible_user_id'] = df_merged['responsible_user_id'].astype('category')
df_merged['new_status_id'] = df_merged['new_status_id'].astype('category')

# For time since last status change, we can calculate the difference between current and previous change dates
df_merged['time_since_last_change'] = df_merged.groupby('lead_id')['change_date'].diff().dt.total_seconds() / (3600*24)
# Fill NaN with time since creation for the first transition
df_merged['time_since_last_change'].fillna(df_merged['time_since_creation'], inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df_merged['time_since_last_change'].fillna(df_merged['time_since_creation'], inplace=True)


In [11]:
# Convert categorical features to numerical codes
df_merged['responsible_user_code'] = df_merged['responsible_user_id'].cat.codes
df_merged['new_status_code'] = df_merged['new_status_id'].cat.codes

# Select features and label
feature_columns = [
    'time_since_creation',
    'time_since_last_change',
    'interaction_count',
    'responsible_user_code',
    'new_status_code',
    # Add other features if necessary
]
X = df_merged[feature_columns]
y = df_merged['label']

In [17]:
# Import necessary libraries
import numpy as np
import pandas as pd
from rdflib import Namespace

# Ensure IDs are strings
df_merged['lead_id'] = df_merged['lead_id'].astype(str)
df_merged['old_status_id'] = df_merged['old_status_id'].astype(str)
df_merged['new_status_id'] = df_merged['new_status_id'].astype(str)
df_merged['responsible_user_id'] = df_merged['responsible_user_id'].astype(str)

# Define the CRM namespace
CRM = Namespace("http://www.example.org/crm_detailed_ontology#")

# Generate URIs for entities
df_merged['lead_uri'] = df_merged['lead_id'].apply(lambda x: f"Lead{x}")
df_merged['old_status_uri'] = df_merged['old_status_id'].apply(lambda x: f"Status{x}")
df_merged['new_status_uri'] = df_merged['new_status_id'].apply(lambda x: f"Status{x}")
df_merged['responsible_user_uri'] = df_merged['responsible_user_id'].apply(lambda x: f"User{x}" if x != 'nan' else None)

# Build triples from the data
triples = []

for idx, row in df_merged.iterrows():
    lead_uri = row['lead_uri']
    old_status_uri = row['old_status_uri']
    new_status_uri = row['new_status_uri']
    responsible_user_uri = row['responsible_user_uri']

    # Triple for old status
    triples.append((lead_uri, 'changedFromStatus', old_status_uri))

    # Triple for new status
    triples.append((lead_uri, 'changedToStatus', new_status_uri))

    # Triple for current status
    triples.append((lead_uri, 'hasStatus', new_status_uri))

    # Triple for responsible user (if available)
    if responsible_user_uri is not None and responsible_user_uri != 'Usernan':
        triples.append((lead_uri, 'hasResponsibleUser', responsible_user_uri))

# Optionally, include interaction data or other features as triples

In [18]:
from pykeen.triples import TriplesFactory
from pykeen.pipeline import pipeline

# Convert the list of triples to a NumPy array
triples_array = np.array(triples, dtype=str)

# Create a TriplesFactory from the constructed triples
tf = TriplesFactory.from_labeled_triples(
    triples=triples_array
)

# Split the TriplesFactory into training and testing splits
training_tf, testing_tf = tf.split([0.8, 0.2])

# Train the embedding model using PyKEEN's pipeline
result = pipeline(
    training=training_tf,
    testing=testing_tf,
    model='TransE',
    training_kwargs={
        'num_epochs': 100,
        'batch_size': 256
    },
    optimizer='adam',
    optimizer_kwargs={
        'lr': 0.001
    },
    random_seed=42,
    device='cpu'
)

using automatically assigned random_state=539197513
Training epochs on cpu: 100%|██████████| 100/100 [01:25<00:00,  1.17epoch/s, loss=0.477, prev_loss=0.522]
Evaluating on cpu: 100%|██████████| 123/123 [00:00<00:00, 546triple/s]
INFO:pykeen.evaluation.evaluator:Evaluation took 0.27s seconds


In [35]:
# Retrieve evaluation results
metric_results = result.metric_results

# Extract metrics
mr = metric_results.get_metric('adjusted_mean_rank')
mrr = metric_results.get_metric('adjusted_mean_reciprocal_rank')
hits_at_1 = metric_results.get_metric('hits_at_1')
hits_at_3 = metric_results.get_metric('hits_at_3')
hits_at_10 = metric_results.get_metric('hits_at_10')

# Display the metrics
print(f"Mean Rank (MR): {mr:.2f}")
print(f"Mean Reciprocal Rank (MRR): {mrr:.4f}")
print(f"Hits@1: {hits_at_1:.2f}")
print(f"Hits@3: {hits_at_3:.2f}")
print(f"Hits@10: {hits_at_10:.2f}")

Mean Rank (MR): 0.32
Mean Reciprocal Rank (MRR): 0.2511
Hits@1: 0.00
Hits@3: 0.76
Hits@10: 0.97


In [36]:
from pykeen.evaluation import RankBasedEvaluator

# Define an evaluator
evaluator = RankBasedEvaluator(filtered=True)

# Evaluate the model on the test set
metric_results = evaluator.evaluate(
    model=result.model,
    mapped_triples=testing_tf.mapped_triples,
    additional_filter_triples=[training_tf.mapped_triples],
)

# Extract metrics
mr = metric_results.get_metric('adjusted_mean_rank')
mrr = metric_results.get_metric('adjusted_mean_reciprocal_rank')
hits_at_1 = metric_results.get_metric('hits_at_1')
hits_at_3 = metric_results.get_metric('hits_at_3')
hits_at_10 = metric_results.get_metric('hits_at_10')

# Display the metrics
print(f"Mean Rank (MR): {mr:.2f}")
print(f"Mean Reciprocal Rank (MRR): {mrr:.4f}")
print(f"Hits@1: {hits_at_1:.2f}")
print(f"Hits@3: {hits_at_3:.2f}")
print(f"Hits@10: {hits_at_10:.2f}")

Evaluating on cpu: 100%|██████████| 123/123 [00:00<00:00, 1.08ktriple/s]
INFO:pykeen.evaluation.evaluator:Evaluation took 0.13s seconds


Mean Rank (MR): 0.32
Mean Reciprocal Rank (MRR): 0.2511
Hits@1: 0.00
Hits@3: 0.76
Hits@10: 0.97


In [30]:
# After training, get the entity embeddings
entity_embedding_model = result.model.entity_representations[0]
entity_embeddings = entity_embedding_model().detach().cpu().numpy()

# Get the mapping from entity labels to IDs
entity_to_id = training_tf.entity_to_id
id_to_entity = {idx: entity for entity, idx in entity_to_id.items()}

# Create a dictionary mapping entity labels to embeddings
entity_to_embedding = {
    id_to_entity[idx]: embedding
    for idx, embedding in enumerate(entity_embeddings)
    if idx in id_to_entity
}

embedding_dimension = entity_embeddings.shape[1]

# Function to get the embedding for a given entity URI
def get_entity_embedding(entity_uri):
    embedding = entity_to_embedding.get(entity_uri)
    if embedding is not None:
        return embedding
    else:
        return np.zeros(embedding_dimension)

# Add lead embeddings to the DataFrame
df_merged['lead_embedding'] = df_merged['lead_uri'].apply(get_entity_embedding)

# Add status embeddings to the DataFrame
df_merged['status_embedding'] = df_merged['new_status_uri'].apply(get_entity_embedding)

# Encode categorical variables (if not already done)
df_merged['responsible_user_id'] = df_merged['responsible_user_id'].astype('category')
df_merged['responsible_user_code'] = df_merged['responsible_user_id'].cat.codes

# Ensure feature columns are present
feature_columns = [
    'time_since_creation',
    'time_since_last_change',
    'interaction_count',
    'responsible_user_code',
    # Add other features as needed
]

# Combine embeddings and features
def combine_features(row):
    features = []
    # Lead embedding
    features.extend(row['lead_embedding'])
    # Status embedding
    features.extend(row['status_embedding'])
    # Other numeric features
    numeric_features = row[feature_columns].values.astype(float).tolist()
    features.extend(numeric_features)
    return features

# Apply combine_features to each row
df_merged['combined_features'] = df_merged.apply(combine_features, axis=1)

# Prepare the final feature matrix X and target vector y
X = np.vstack(df_merged['combined_features'].values)
y = df_merged['label'].values.astype(int)

In [33]:
X

array([[0.00000000e+00, 0.00000000e+00, 0.00000000e+00, ...,
        1.85426808e+04, 2.00000000e+02, 0.00000000e+00],
       [0.00000000e+00, 0.00000000e+00, 0.00000000e+00, ...,
        0.00000000e+00, 2.00000000e+02, 0.00000000e+00],
       [0.00000000e+00, 0.00000000e+00, 0.00000000e+00, ...,
        0.00000000e+00, 2.00000000e+02, 0.00000000e+00],
       ...,
       [0.00000000e+00, 0.00000000e+00, 0.00000000e+00, ...,
        0.00000000e+00, 9.00000000e+01, 2.00000000e+00],
       [0.00000000e+00, 0.00000000e+00, 0.00000000e+00, ...,
        0.00000000e+00, 9.00000000e+01, 2.00000000e+00],
       [0.00000000e+00, 0.00000000e+00, 0.00000000e+00, ...,
        0.00000000e+00, 9.00000000e+01, 2.00000000e+00]],
      shape=(60388, 104))

In [32]:
y

array([0, 0, 0, ..., 0, 1, 0], shape=(60388,))

In [42]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import (
    roc_auc_score,
    accuracy_score,
    precision_score,
    recall_score,
    f1_score,
    confusion_matrix,
    classification_report
)
import numpy as np
import pandas as pd

# Разделение данных на обучающую и тестовую выборки
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Инициализация и обучение классификатора
model = RandomForestClassifier(random_state=42)
model.fit(X_train, y_train)

# Предсказание вероятностей и меток на тестовой выборке
y_pred_proba = model.predict_proba(X_test)[:, 1]
y_pred = model.predict(X_test)

# Вычисление метрик
roc_auc = roc_auc_score(y_test, y_pred_proba)
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, zero_division=0)
recall = recall_score(y_test, y_pred, zero_division=0)
f1 = f1_score(y_test, y_pred, zero_division=0)
conf_matrix = confusion_matrix(y_test, y_pred)

# Вывод метрик с автоматическим анализом
print(f"ROC-AUC Score: {roc_auc:.4f}")
print(f"Accuracy: {accuracy:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall (Sensitivity): {recall:.4f}")
print(f"F1 Score: {f1:.4f}")
print("\nConfusion Matrix:")
print(conf_matrix)

# Дополнительный отчет о классификации
print("\nClassification Report:")
print(classification_report(y_test, y_pred, zero_division=0))

ROC-AUC Score: 0.8039
Accuracy: 0.7843
Precision: 0.7645
Recall (Sensitivity): 0.5833
F1 Score: 0.6617

Confusion Matrix:
[[6925  785]
 [1820 2548]]

Classification Report:
              precision    recall  f1-score   support

           0       0.79      0.90      0.84      7710
           1       0.76      0.58      0.66      4368

    accuracy                           0.78     12078
   macro avg       0.78      0.74      0.75     12078
weighted avg       0.78      0.78      0.78     12078



In [45]:
def predict_transition_probability(deal, entity_to_embedding, model,
                                   responsible_user_category_mapping,
                                   new_status_category_mapping,
                                   embedding_dimension):
    """
    Predicts the probability of a lead transitioning to the next stage in the sales funnel.
    
    Parameters:
        deal (dict): A dictionary containing the deal's features.
        entity_to_embedding (dict): A mapping from entity URIs to embeddings.
        model: The trained machine learning model.
        responsible_user_category_mapping (dict): Mapping from responsible_user_id to codes.
        new_status_category_mapping (dict): Mapping from status_id to codes.
        embedding_dimension (int): Dimension of the embeddings.
        
    Returns:
        float: The predicted probability of transitioning to the next stage.
    """
    # Extract necessary fields from the deal
    lead_id = str(deal.get('lead_id'))
    current_status_id = str(deal.get('current_status_id'))
    responsible_user_id = str(deal.get('responsible_user_id', 'UnknownUser'))
    interaction_count = deal.get('interaction_count', 0)
    created_at = pd.to_datetime(deal.get('created_at'))
    change_date = pd.to_datetime(deal.get('change_date', datetime.now()))
    last_change_date = pd.to_datetime(deal.get('last_change_date', created_at))
    
    # Compute features similar to the training data
    
    # Time since creation (in days)
    time_since_creation = (change_date - created_at).total_seconds() / (3600 * 24)
    
    # Time since last change
    time_since_last_change = (change_date - last_change_date).total_seconds() / (3600 * 24)
    
    # Encode responsible_user_id
    if responsible_user_id in responsible_user_category_mapping:
        responsible_user_code = responsible_user_category_mapping[responsible_user_id]
    else:
        responsible_user_code = max(responsible_user_category_mapping.values()) + 1  # Assign a new code
    
    # Encode current_status_id
    if current_status_id in new_status_category_mapping:
        new_status_code = new_status_category_mapping[current_status_id]
    else:
        new_status_code = max(new_status_category_mapping.values()) + 1  # Assign a new code
    
    # Generate entity URIs
    lead_uri = f"Lead{lead_id}"
    status_uri = f"Status{current_status_id}"
    
    # Retrieve embeddings
    def get_entity_embedding(entity_uri):
        embedding = entity_to_embedding.get(entity_uri)
        if embedding is not None:
            return embedding
        else:
            return np.zeros(embedding_dimension)
    
    lead_embedding = get_entity_embedding(lead_uri)
    status_embedding = get_entity_embedding(status_uri)
    
    # Combine features
    feature_columns = [
        time_since_creation,
        time_since_last_change,
        interaction_count,
        responsible_user_code,
        new_status_code
    ]
    
    combined_features = []
    combined_features.extend(lead_embedding)
    combined_features.extend(status_embedding)
    combined_features.extend(feature_columns)
    
    # Convert to appropriate shape for model input
    X_input = np.array(combined_features).reshape(1, -1)
    
    # Predict probability
    probability = model.predict_proba(X_input)[0, 1]
    
    return probability

In [46]:
# Prepare the necessary components
embedding_dimension = next(iter(entity_to_embedding.values())).shape[0]

# Extract category mappings and maximum codes from training data as shown earlier

# Example deal
deal = {
    'lead_id': '123',
    'current_status_id': '100',
    'responsible_user_id': '42',
    'interaction_count': 5,
    'created_at': '2023-10-01T08:00:00',
    'change_date': '2023-10-10T10:00:00',
    'last_change_date': '2023-10-05T09:00:00'
}

df_merged['responsible_user_id'] = df_merged['responsible_user_id'].astype('category')
responsible_user_category_mapping = dict(enumerate(df_merged['responsible_user_id'].cat.categories))
responsible_user_category_mapping = {v: k for k, v in responsible_user_category_mapping.items()}

# Encoding new_status_id
df_merged['new_status_id'] = df_merged['new_status_id'].astype('category')
new_status_category_mapping = dict(enumerate(df_merged['new_status_id'].cat.categories))
new_status_category_mapping = {v: k for k, v in new_status_category_mapping.items()}

new_status_category_mapping = {v: k for k, v in new_status_category_mapping.items()}

# Predict the probability
probability = predict_transition_probability(
    deal,
    entity_to_embedding,
    model,
    responsible_user_category_mapping,
    new_status_category_mapping,
    embedding_dimension
)

print(f"Predicted Probability: {probability:.4f}")

TypeError: can only concatenate str (not "int") to str

In [40]:
total_triples = len(triples_array)
print(f"Общее количество триплетов в графе: {total_triples}")

Общее количество триплетов в графе: 241552


In [38]:
# Извлекаем все субъекты и объекты из триплетов
entities = set(triples_array[:, 0]).union(set(triples_array[:, 2]))
total_entities = len(entities)
print(f"Общее количество уникальных сущностей: {total_entities}")

Общее количество уникальных сущностей: 33


In [39]:
relations = set(triples_array[:, 1])
total_relations = len(relations)
print(f"Общее количество уникальных отношений: {total_relations}")

Общее количество уникальных отношений: 4


In [41]:
from collections import Counter

# Подсчёт количества связей для каждой сущности
degree_count = Counter()
for s, p, o in triples_array:
    degree_count[s] += 1
    degree_count[o] += 1

# Статистика по степеням
degrees = list(degree_count.values())
max_degree = max(degrees)
min_degree = min(degrees)
average_degree = sum(degrees) / total_entities

print(f"Максимальная степень сущности: {max_degree}")
print(f"Минимальная степень сущности: {min_degree}")
print(f"Средняя степень сущности: {average_degree:.2f}")

Максимальная степень сущности: 49600
Минимальная степень сущности: 868
Средняя степень сущности: 14639.52
