In [1]:
from pathlib import Path

import joblib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import mean_squared_error, accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV, cross_val_score
from sklearn.preprocessing import StandardScaler

In [2]:
# Loading and Exploring the Data
# Read the two source tables from CSV.
user_data = pd.read_csv('../data/datasets/user_data.csv')
product_data = pd.read_csv('../data/datasets/product_data.csv')

# Inner-join on product_id: only rows whose product_id appears in both tables
# are kept, and the result gets a fresh 0..n-1 index.
data = pd.merge(user_data, product_data, how='inner', on='product_id')

# Exploratory data analysis
# DataFrame.info() writes its report to stdout and returns None, so it is
# called directly; wrapping it in print() would emit a stray "None" line.
data.info()
print(data.describe())

In [3]:
# Preprocessing
# Handle missing values per column type instead of a blanket fillna(''):
# writing '' into a numeric column turns it into an object column (which later
# breaks scaling), and writing '' into the target invents a bogus '' class.

# Rows without a rating cannot be used for supervised training. The index is
# reset so later positional concatenation stays aligned.
data = data.dropna(subset=['user_rating']).reset_index(drop=True)

# Non-numeric (text-like) columns: missing -> empty string, which is what the
# TF-IDF step needs for product_description.
fill_values = {column: '' for column in data.select_dtypes(exclude='number').columns}
# Numeric columns: missing -> column median, which keeps the dtype numeric.
# Columns that are entirely NaN have no median and are left as they are.
# NOTE(review): medians are computed on the full frame (before the train/test
# split); move the imputation after the split if that leakage matters.
fill_values.update(data.median(numeric_only=True).dropna().to_dict())
data = data.fillna(value=fill_values)

In [4]:
# Feature Engineering
# Turn each product description into a TF-IDF vector, keeping at most the
# 1000 highest-frequency terms.
# NOTE(review): the vocabulary and IDF weights are fit on every row, i.e.
# before the train/test split, so test rows influence the features.
vectorizer = TfidfVectorizer(max_features=1000)
product_descriptions = vectorizer.fit_transform(data['product_description']).toarray()

# Add features to the dataset
# - String column labels ("tfidf_<term>") instead of the default integers:
#   scikit-learn rejects frames whose column labels mix str and int, and named
#   columns make later coefficient plots readable.
# - index=data.index makes the row alignment explicit instead of relying on
#   both frames happening to share a 0..n-1 index.
tfidf_features = pd.DataFrame(
    product_descriptions,
    columns=[f'tfidf_{term}' for term in vectorizer.get_feature_names_out()],
    index=data.index,
)
# Dropping any tfidf_* columns left over from a previous run keeps this cell
# idempotent (re-running it does not duplicate columns).
data = pd.concat(
    [data.drop(columns=tfidf_features.columns, errors='ignore'), tfidf_features],
    axis=1,
)

In [5]:
# Separate features (X) and target (y)
# Keep only numeric feature columns: raw string columns such as
# product_description cannot be standardized or passed to LogisticRegression,
# and the description text is already represented by its TF-IDF columns.
# NOTE(review): numeric identifier columns, if present, are kept as features —
# confirm that is intended.
X = data.drop(columns=['user_rating']).select_dtypes(include='number')
# scikit-learn requires all column labels to be of one type; normalise to str
# in case the TF-IDF block carries integer labels.
X.columns = X.columns.astype(str)
y = data['user_rating']

In [6]:
# Splitting the dataset
# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [7]:
# Feature Scaling
# Learn the scaling statistics from the training split only, then apply that
# same fitted transform to both splits.
scaler = StandardScaler().fit(X_train)
X_train_scaled, X_test_scaled = (
    scaler.transform(split) for split in (X_train, X_test)
)

In [8]:
# Training a Model
# Baseline classifier: logistic regression with library-default settings,
# fitted on the standardized training features.
model = LogisticRegression()
model.fit(X=X_train_scaled, y=y_train)

In [9]:
# Evaluating the Model
# Predict a rating for every held-out row.
y_pred = model.predict(X=X_test_scaled)

# Share of held-out rows whose predicted rating equals the true rating.
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print(f'Model Accuracy: {accuracy * 100:.2f}%')

In [10]:
# Additional Evaluation Metrics
# Fix the class order explicitly (sorted union of true and predicted labels)
# and reuse it for the tick labels; otherwise the heatmap axes show positional
# indices 0..k-1 instead of the actual rating values.
class_labels = np.union1d(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred, labels=class_labels)

fig, ax = plt.subplots()
sns.heatmap(
    conf_matrix,
    annot=True,
    fmt="d",
    cmap="Blues",
    xticklabels=class_labels,
    yticklabels=class_labels,
    ax=ax,
)
# Rows are true ratings, columns are predicted ratings.
ax.set_title('Confusion Matrix')
ax.set_xlabel('Predicted')
ax.set_ylabel('True')
plt.show()

In [11]:
# Error Metrics
# Mean squared difference between predicted and true ratings.
# NOTE(review): only meaningful if user_rating is numeric — confirm.
mse = mean_squared_error(y_true=y_test, y_pred=y_pred)
print(f'Mean Squared Error: {mse:.4f}')

In [12]:
# Saving the Model
# Make sure the output directory exists; joblib.dump does not create it.
Path('../models').mkdir(parents=True, exist_ok=True)

# Persist every fitted object needed to score new data: the classifier, the
# scaler, and the TF-IDF vectorizer. Without the vectorizer a fresh process
# cannot rebuild the text features the model was trained on.
joblib.dump(model, '../models/recommendation_model.pkl')
joblib.dump(scaler, '../models/scaler.pkl')
joblib.dump(vectorizer, '../models/vectorizer.pkl')

In [13]:
# Model Loading and Testing
# Read the persisted classifier and scaler back from disk.
# joblib files are pickle-based: only load files from a trusted source.
loaded_model, loaded_scaler = (
    joblib.load(artifact_path)
    for artifact_path in (
        '../models/recommendation_model.pkl',
        '../models/scaler.pkl',
    )
)

In [14]:
# Prediction on New Data
new_data = pd.DataFrame({
    'product_description': ['New product description here'],
})
# TF-IDF features for the new description, using the vocabulary fitted in the
# feature-engineering cell.
new_data_transformed = vectorizer.transform(new_data['product_description']).toarray()

# The scaler and model were fitted on every column of X_train, not only the
# TF-IDF block, so passing the TF-IDF array alone fails with a feature-count
# mismatch. Build a full-width row instead:
# - start from the training means, which standardize to 0 (a neutral
#   placeholder for features we have no value for; replace with real values
#   when they are known);
# - overwrite the trailing TF-IDF block, which the feature-engineering cell
#   appended as the last columns of the frame.
new_features = pd.DataFrame([loaded_scaler.mean_], columns=X_train.columns)
new_features.iloc[:, -new_data_transformed.shape[1]:] = new_data_transformed
new_data_scaled = loaded_scaler.transform(new_features)

# Prediction
predicted_rating = loaded_model.predict(new_data_scaled)
print(f'Predicted Rating: {predicted_rating[0]}')

In [15]:
# Hyperparameter Tuning
# Search over regularization strength (C) and penalty type. liblinear is used
# because it supports both the l1 and l2 penalties.
param_grid = {
    'C': [0.1, 1, 10],
    'penalty': ['l1', 'l2'],
    'solver': ['liblinear']
}

# liblinear shuffles the data internally, so the estimator is seeded to keep
# the search reproducible (same seed as the train/test split).
grid_search = GridSearchCV(
    LogisticRegression(random_state=42),
    param_grid,
    cv=5,
    verbose=1,
)
# NOTE(review): the scaler was fitted on the whole training split, so each
# CV validation fold has already influenced the scaling statistics.
grid_search.fit(X_train_scaled, y_train)

print(f'Best Hyperparameters: {grid_search.best_params_}')

In [16]:
# Best model from the grid search
# GridSearchCV was built with its default refit=True, so best_estimator_ has
# already been refitted on the full training split with the winning
# hyperparameters. Fitting it again would only repeat that work.
best_model = grid_search.best_estimator_
best_model

In [17]:
# Final Evaluation on Test Data
# Score the tuned model on the same held-out split used for the baseline.
y_pred_best = best_model.predict(X=X_test_scaled)
accuracy_best = accuracy_score(y_true=y_test, y_pred=y_pred_best)
print(f'Best Model Accuracy: {accuracy_best * 100:.2f}%')

In [18]:
# Visualizing Feature Importances
# coef_ has one row per class (a single row for binary problems). Taking only
# coef_[0] would describe just the first class, so average the absolute
# coefficients over all rows. Features were standardized, so magnitudes are
# comparable across features.
TOP_N = 20  # plotting every feature would be unreadable
importance = np.abs(best_model.coef_).mean(axis=0)
# Largest magnitude first, limited to the TOP_N strongest features.
indices = np.argsort(importance)[::-1][:TOP_N]
feature_names = X_train.columns.astype(str)[indices]

fig, ax = plt.subplots(figsize=(10, 6))
ax.bar(range(len(indices)), importance[indices], color="b", align="center")
ax.set_xticks(range(len(indices)))
ax.set_xticklabels(feature_names, rotation=45, ha="right")
ax.set_title("Feature Importances")
ax.set_xlabel("Feature")
ax.set_ylabel("Mean |coefficient| across classes")
fig.tight_layout()
plt.show()

In [19]:
# Cross Validation
# 5-fold cross-validation of the tuned model on the scaled training split.
# cross_val_score fits a fresh clone per fold, so best_model is not modified.
# NOTE(review): the hyperparameters were chosen on this same data, and the
# scaler was fitted on the whole training split, so these scores are
# optimistic compared with the held-out test score.
cv_scores = cross_val_score(best_model, X_train_scaled, y_train, cv=5)
print(f'Cross-Validation Scores: {cv_scores}')
print(f'Mean CV Score: {cv_scores.mean():.4f}')
# Report the spread as well: a mean alone hides fold-to-fold variance.
print(f'CV Score Std: {cv_scores.std():.4f}')

In [20]:
# Save Best Model
# Write the tuned classifier to disk next to the baseline artifacts.
joblib.dump(
    value=best_model,
    filename='../models/best_recommendation_model.pkl',
)