In [3]:
import pandas as pd
import spacy
from sklearn.preprocessing import normalize

# Load the spaCy model
nlp = spacy.load('en_core_web_md')

# Load the data
data = pd.read_csv('/workspaces/ML_ZOOMCAMP_2024/video_game_recommender/data/video_game_reviews.csv')  # Adjust the path accordingly

# Preprocess column names
data.columns = [col.lower().replace(' ', '_') for col in data.columns]

def process_data(data, categorical_cols):
    # Initialize an empty string for the combined features
    data['combined_features'] = ''

    # Loop through the categorical columns to combine features
    for col in categorical_cols:
        data['combined_features'] += data[col].astype(str) + ' '

    # Include numerical features in the combined features
    data['combined_features'] += 'Rating_' + data['user_rating'].astype(str) + ' ' + \
                                 'Price_' + data['price'].astype(str) + ' '

    return data

# Ensure combined_features column is treated as strings
data['combined_features'] = data['combined_features'].fillna('').astype(str)


# Combined text cleaning and vectorization function
def clean_and_vectorize(text):
    print(f"Processing text: {text}")  # Debugging line
    doc = nlp(text)
    cleaned_text = ' '.join([token.lemma_ for token in doc if not token.is_stop and not token.is_punct])
    print(f"Cleaned text: {cleaned_text}, Vector: {doc.vector}")  # Debugging line
    return cleaned_text, doc.vector.tolist()  # Ensure to return as a list

# Apply the cleaning and vectorization function to the dataset
data[['cleaned_combined_features', 'combined_features_vector']] = data['combined_features']\
    .apply(clean_and_vectorize).apply(pd.Series)

# Remove duplicates based on 'game_title'
data = data.drop_duplicates(subset=['game_title'])

# Normalize the vectors to unit length
normalized_vectors = normalize(data['combined_features_vector'].tolist(), norm='l2', axis=1)

# Split the normalized vectors into separate columns
vector_columns = pd.DataFrame(normalized_vectors,
                              columns=[f'combined_features_vec_{i}' for i in range(len(normalized_vectors[0]))])

# Concatenate the normalized vectors to the main dataframe and drop the original vector column
data = pd.concat([data, vector_columns], axis=1).drop(columns=['combined_features', 'combined_features_vector'])

# Final dataset
print(data.head())

KeyError: 'combined_features'

: 