In [1]:
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer
import mlflow
from mlflow import pyfunc
import nltk
from nltk import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
import string
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA

Importing the dataset

In [2]:
# Read the CSV file
df = pd.read_csv("/Users/karthikeyans/kdemo/lib/Recommendation/eventsData4Processed.csv", parse_dates=['date'])

In [3]:
df.head(2)

Unnamed: 0,event_id,domain,title,date,time,location,speaker,event_mode,description
0,0,Cloud Computing,AWS Cloud Computing Event,2024-09-14,10:00,Rourkela,Michael Johnson,Meetup,Learn cloud computing fundamentals with AWS
1,1,Education,Education Administrator Education Event,2023-06-12,10:00,Villupuram,Daniel Johnson,Webinar,Create engaging learning experiences with Educ...


In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 423 entries, 0 to 422
Data columns (total 9 columns):
 #   Column       Non-Null Count  Dtype         
---  ------       --------------  -----         
 0   event_id     423 non-null    int64         
 1   domain       423 non-null    object        
 2   title        423 non-null    object        
 3   date         423 non-null    datetime64[ns]
 4   time         423 non-null    object        
 5   location     423 non-null    object        
 6   speaker      423 non-null    object        
 7   event_mode   423 non-null    object        
 8   description  423 non-null    object        
dtypes: datetime64[ns](1), int64(1), object(7)
memory usage: 29.9+ KB


In [5]:
description = df["description"]

In [6]:
df['features'] = df['description'] + ' ' + df['domain'] + ' ' + df['title']



Text proccessing

In [7]:
# Remove punctuation
def remove_punctuation(text):
    punctuation_list = string.punctuation + "-"
    return text.translate(str.maketrans(' ',' ',punctuation_list))

In [8]:
df["processed_feature"] = df['features'].apply(lambda x: remove_punctuation(x))
df_lookup = df.copy()

In [9]:
class LemmaTokenizer (object):
    def __init__(self):
        self.wnl = WordNetLemmatizer()
    def __call__(self, articles):
        return [self.wnl.lemmatize(t) for t in word_tokenize(articles)]

In [10]:
# Create vectorizer of ngrams using 
tfv = TfidfVectorizer( ngram_range=(1,1), analyzer='word', stop_words='english',tokenizer=LemmaTokenizer())

# Fit vectorizer on processed description
sparse_vector_matrix = tfv.fit_transform(df["processed_feature"] )



In [11]:
#pca = PCA(n_components=2)

In [12]:

'''def visualization(inputted_vector):
    dense_matrix = sparse_vector_matrix.toarray()

    # Use PCA to reduce dimensionality for visualization
    
    reduced_data = pca.fit_transform(dense_matrix)

    # Create a scatter plot
    plt.figure(figsize=(10, 7))
    plt.scatter(reduced_data[:, 0], reduced_data[:, 1])

    # Optionally, plot the input vector in a different color
    # Assuming `inputted_vector` is the vector for the input event
    inputted_vector_reduced = pca.transform(inputted_vector)
    plt.scatter(inputted_vector_reduced[0, 0], inputted_vector_reduced[0, 1], c='red')

    plt.title('Scatter Plot of Data Points in Vector Space')
    plt.xlabel('First Principal Component')
    plt.ylabel('Second Principal Component')
    plt.show()'''

"def visualization(inputted_vector):\n    dense_matrix = sparse_vector_matrix.toarray()\n\n    # Use PCA to reduce dimensionality for visualization\n    \n    reduced_data = pca.fit_transform(dense_matrix)\n\n    # Create a scatter plot\n    plt.figure(figsize=(10, 7))\n    plt.scatter(reduced_data[:, 0], reduced_data[:, 1])\n\n    # Optionally, plot the input vector in a different color\n    # Assuming `inputted_vector` is the vector for the input event\n    inputted_vector_reduced = pca.transform(inputted_vector)\n    plt.scatter(inputted_vector_reduced[0, 0], inputted_vector_reduced[0, 1], c='red')\n\n    plt.title('Scatter Plot of Data Points in Vector Space')\n    plt.xlabel('First Principal Component')\n    plt.ylabel('Second Principal Component')\n    plt.show()"

In [13]:
from sklearn.neighbors import NearestNeighbors
# Build KNN model
knn_model = NearestNeighbors(n_neighbors=10, algorithm='brute', metric='cosine')  
knn_model.fit(sparse_vector_matrix) 

Build prediction function

In [14]:
def recommendation(model_input, tfv=tfv,df_lookup=df_lookup):
    # Convert the input dates to datetime objects for comparison
    begin_date = pd.to_datetime(model_input['beginDate'])
    end_date = pd.to_datetime(model_input['endDate'])

    
    # Process the events as before
    input_df = pd.DataFrame([model_input], columns=['description', 'goalName','beginDate','endDate'])
    input_df['description'] = input_df['description'].astype(str)
    input_df['goalName'] = input_df['goalName'].astype(str)
    
    input_df['description'] = input_df['description'].apply(lambda x: ' '.join(x.split()))
    input_df['goalname'] = input_df['goalName'].apply(lambda x: ' '.join(x.split()))
    input_df['bag_of_words'] = input_df['description'] + ' ' + input_df['goalname']
    
    inputted_vector = tfv.transform(input_df['bag_of_words'])
    
    # Find k-nearest neighbors
    distances, indices = knn_model.kneighbors(inputted_vector)

    # Filter events that fall between beginDate and endDate
    filtered_indices_between = []
    for idx in indices[0]:
        event_date = pd.to_datetime(df_lookup.iloc[idx]['date'])
        if begin_date <= event_date <= end_date:
            filtered_indices_between.append(idx)
            if len(filtered_indices_between) >= 10:  # Limit to 10 recommendations
                break

    # Get the recommended events falling between beginDate and endDate
    recommended_events_between = df_lookup.iloc[filtered_indices_between][['title', 'date', 'time']]

    filtered_indices_after = []
    for idx in indices[0]:
        event_date = pd.to_datetime(df_lookup.iloc[idx]['date'])
        if event_date > end_date:
            filtered_indices_after.append(idx)
            if len(filtered_indices_after) >= 10:  # Limit to 10 recommendations
                break
    # Optionally, plot the input vector in a different color
    # Assuming `inputted_vector` is the vector for the input event
    #inputted_vector_reduced = pca.transform(inputted_vector)

    #visualization(inputted_vector_reduced)

    # Get the recommended events falling after endDate
    recommended_events_after = df_lookup.iloc[filtered_indices_after][['title', 'date', 'time']]
    
    return recommended_events_between,recommended_events_after

Defining custom model for Event Recommendation

In [15]:
class eventRecommendationKNN(pyfunc.PythonModel):
    ## defining objects needed for prediction
    def __init__(self,
                tfv,
                sparse_vector_matrix,
                df_lookup,
                recommendation):
        
        ## setting up all needed objects
        self.tfv = tfv
        self.sparse_vector_matrix = sparse_vector_matrix
        self.df_lookup = df_lookup
        self.recommendation = recommendation
        
    ## define function with processing and feeding data into prediction at the end
    def predict(self,context,model_input):
        recommended_events_between_dates,recommended_events_after_end_date = self.recommendation(model_input)
        #print(recommended_events_between_dates)
        #print(recommended_events_after_end_date)
        predictions = {
            'recommended_events_between_dates': recommended_events_between_dates.to_dict('records'),
            'recommended_events_after_end_date': recommended_events_after_end_date.to_dict('records')
        }

        return predictions

Testing the prediction class

In [16]:
m = eventRecommendationKNN(tfv = tfv,
                        sparse_vector_matrix=sparse_vector_matrix,
                        df_lookup=df_lookup,
                        recommendation=recommendation)

model_input = {
    "goalName" : "data analyst",
    "description" : "data analyst",
    "beginDate" : "2024-10-12",
    "endDate" : "2024-12-13"
}
output_df = m.predict(None,model_input)
print(output_df)

{'recommended_events_between_dates': [{'title': 'Risk Analyst Data Analysis Event', 'date': Timestamp('2024-12-11 00:00:00'), 'time': '13:00'}, {'title': 'Customer Insights Analyst Data Analysis Event', 'date': Timestamp('2024-11-29 00:00:00'), 'time': '18:00'}, {'title': 'Market Research Analyst Data Analysis Event', 'date': Timestamp('2024-11-30 00:00:00'), 'time': '18:00'}], 'recommended_events_after_end_date': []}


In [17]:
mlflow_conda_env ={
    'name': 'mlflow-env',
    'channels': ['defaults',
                 'conda-forge'],
    'dependencies': ['python = 3.6.2',
                     'nltk = 3.8.1',
                     'nltk_data',
                    {'pip': ['mlflow==2.10.2',
                    'scikit-learn',
                    'cloudpickle=1.2.2']}
                        ]
}

In [18]:
with mlflow .start_run(run_name="Content-Based event recommendation using KNN") as run:
    mlflow.log_param("num_events_returned", 10)
    # Log parameters
    mlflow.log_param("ngram_range", (1,1))
    mlflow.log_param("analyzer", 'word')
    mlflow.log_param("stop_words", 'english')
   
    pyfunc.log_model(
        artifact_path = "KNN_event_recommendation_pyfunc",
        python_model = eventRecommendationKNN(tfv = tfv,
                        sparse_vector_matrix=sparse_vector_matrix,
                        df_lookup=df_lookup,
                        recommendation=recommendation),
        
        conda_env = mlflow_conda_env
    )
    
    run_id = run.info.run_uuid
    experiment_id = run.info.experiment_id
    
    
    mlflow.end_run()  

