In [4]:

# # PLANORA User Category Preference Prediction

# ## Step 1: Import Libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
import joblib
import os

# Set random seed for reproducibility.
# This seeds NumPy's global random state, which generate_synthetic_dataset()
# draws from through np.random.randint / np.random.choice. The train/test
# split and the scikit-learn estimators that take a random_state are given
# random_state=42 explicitly rather than relying on this call.
np.random.seed(42)

# ## Step 2: Load or Generate Data
# Try loading the dataset. If it’s missing or has incorrect features, generate a synthetic dataset with the correct features.
def generate_synthetic_dataset(n_samples=250):
    """Build a random user-preference table with a learnable category signal.

    All columns are first drawn independently from NumPy's global random
    state. Afterwards, one row at a time, the single score associated with
    the row's ``preferred_category`` is redrawn from the upper end of the
    1-5 scale so that the target correlates with the features.

    Args:
        n_samples: Number of rows to generate.

    Returns:
        A pandas DataFrame with the columns ``age``, ``gender``, seven 1-5
        interest scores and ``preferred_category``, in that order.
    """
    categories = ['Tech', 'Music', 'Sports', 'Business', 'Education', 'Art', 'Food', 'Health']
    score_columns = ['tech_spend', 'music_freq', 'sports_hours', 'business_interest',
                     'edu_freq', 'food_interest', 'health_priority']

    # Columns are drawn in this exact order so a given seed always yields
    # the same table.
    data = {
        'age': np.random.randint(18, 66, n_samples),
        'gender': np.random.choice(['Male', 'Female', 'Other'], n_samples),
    }
    for column in score_columns:
        data[column] = np.random.randint(1, 6, n_samples)
    data['preferred_category'] = np.random.choice(categories, n_samples)

    # category -> (score column to boost, inclusive lower bound of the redraw).
    # The upper bound is always 5 (np.random.randint excludes the high end).
    boosted_score = {
        'Tech': ('tech_spend', 4),
        'Music': ('music_freq', 4),
        'Sports': ('sports_hours', 4),
        'Business': ('business_interest', 4),
        'Education': ('edu_freq', 4),
        'Food': ('food_interest', 4),
        'Health': ('health_priority', 4),
        'Art': ('food_interest', 3),  # Art may correlate with food
    }

    # Adjust scores to correlate with categories: one redraw per row.
    for row, category in enumerate(data['preferred_category'].tolist()):
        column, low = boosted_score[category]
        data[column][row] = np.random.randint(low, 6)

    return pd.DataFrame(data)

# Load or generate dataset
# A CSV at dataset_path is used when it contains every required column.
# Otherwise a synthetic dataset is generated and written to dataset_path.
dataset_path = 'user_preferences_dataset.csv'
required_columns = ['age', 'gender', 'tech_spend', 'music_freq', 'sports_hours', 
                    'business_interest', 'edu_freq', 'food_interest', 'health_priority', 
                    'preferred_category']

generated_synthetic = False
if os.path.exists(dataset_path):
    df = pd.read_csv(dataset_path)
    missing_cols = [col for col in required_columns if col not in df.columns]
    if missing_cols:
        print(f"Missing columns: {missing_cols}. Generating synthetic dataset.")
        # The synthetic data is saved under dataset_path below, so move the
        # unusable file aside first instead of silently destroying it. Pick
        # a backup name that does not clobber an earlier backup.
        backup_path = f"{dataset_path}.bak"
        backup_index = 1
        while os.path.exists(backup_path):
            backup_path = f"{dataset_path}.bak{backup_index}"
            backup_index += 1
        os.replace(dataset_path, backup_path)
        print(f"Original file kept as {backup_path}.")
        df = generate_synthetic_dataset()
        generated_synthetic = True
    else:
        # Keep only the expected schema: the later steps encode and scale
        # exactly these columns, so stray extra columns would leak into the
        # feature matrix unprocessed.
        df = df[required_columns].copy()
else:
    print("Dataset not found. Generating synthetic dataset.")
    df = generate_synthetic_dataset()
    generated_synthetic = True

# Save the dataset for reference. A successfully loaded file is already on
# disk, so only freshly generated data needs to be written.
if generated_synthetic:
    df.to_csv(dataset_path, index=False)

# Display dataset info
print("First 5 rows of the dataset:")
print(df.head())
print("\nMissing values:")
print(df.isnull().sum())
print("\nDataset info:")
# DataFrame.info() prints its report itself and returns None, so wrapping it
# in print() would emit a stray "None" line.
df.info()
print("\nUnique categories in preferred_category:")
print(df['preferred_category'].value_counts())

# ## Step 3: Preprocess Data
# Encode categorical variables, split data, then scale numerical features
# using statistics learned from the training split only.

# Separate features and target
X = df.drop('preferred_category', axis=1)
y = df['preferred_category']

# Encode categorical variables
le_gender = LabelEncoder()
X['gender'] = le_gender.fit_transform(X['gender'])

le_category = LabelEncoder()
y = le_category.fit_transform(y)

numerical_cols = ['age', 'tech_spend', 'music_freq', 'sports_hours', 
                  'business_interest', 'edu_freq', 'food_interest', 'health_priority']
# Cast up front so the scaled (float) values can be written back into the
# same columns without a dtype change.
X[numerical_cols] = X[numerical_cols].astype(float)

# Split data BEFORE fitting the scaler. Fitting on the full dataset would
# let the test rows influence the mean/variance used for training, which
# leaks test information and inflates the reported test accuracy.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
# Work on independent copies so the in-place column assignments below do not
# touch (or warn about) the parent frame.
X_train = X_train.copy()
X_test = X_test.copy()

# Scale numerical features: fit on train, apply the same transform to test.
scaler = StandardScaler()
X_train[numerical_cols] = scaler.fit_transform(X_train[numerical_cols])
X_test[numerical_cols] = scaler.transform(X_test[numerical_cols])

print("\nShape of training set:", X_train.shape)
print("Shape of test set:", X_test.shape)

# ## Step 4: Train Initial Models
# Fit three untuned classifiers (Logistic Regression, k-NN, Random Forest)
# on the training split and record each one's accuracy on the test split.

# Dictionary to store model performance
model_scores = {}

# Baseline estimators with default hyperparameters. They keep their own
# module-level names so they stay available after the loop.
lr = LogisticRegression(random_state=42)
knn = KNeighborsClassifier()
rf = RandomForestClassifier(random_state=42)

# Same fit -> predict -> score sequence for every baseline, in the order
# Logistic Regression, k-NN, Random Forest.
baseline_predictions = {}
for label, estimator in (('Logistic Regression', lr),
                         ('k-NN', knn),
                         ('Random Forest', rf)):
    estimator.fit(X_train, y_train)
    baseline_predictions[label] = estimator.predict(X_test)
    model_scores[label] = accuracy_score(y_test, baseline_predictions[label])

# Expose the individual prediction arrays under their usual names.
y_pred_lr = baseline_predictions['Logistic Regression']
y_pred_knn = baseline_predictions['k-NN']
y_pred_rf = baseline_predictions['Random Forest']

# Display baseline accuracies
print("\nBaseline Model Accuracies:")
for model, score in model_scores.items():
    print(f"{model}: {score:.4f}")

# ## Step 5: Hyperparameter Tuning
# Tune models using GridSearchCV (5-fold cross-validation on the training
# split, scored by accuracy).


def _run_grid_search(label, estimator, param_grid, features, targets):
    """Grid-search one estimator and report its best configuration.

    Args:
        label: Human-readable model name used in the printed report.
        estimator: Unfitted scikit-learn estimator to tune.
        param_grid: Parameter grid passed to GridSearchCV.
        features: Training feature matrix.
        targets: Training labels.

    Returns:
        The fitted GridSearchCV object.
    """
    search = GridSearchCV(estimator, param_grid, cv=5, scoring='accuracy')
    search.fit(features, targets)
    print(f"\nBest {label} Parameters:", search.best_params_)
    print(f"Best {label} CV Accuracy:", search.best_score_)
    return search


# Logistic Regression Tuning
# The target has more than two classes, so only solvers that support the
# multinomial loss are searched. 'liblinear' is one-vs-rest only; recent
# scikit-learn releases deprecate it for multiclass targets and announce
# that it will raise an error in a later release.
lr_params = {
    'C': [0.01, 0.1, 1, 10],
    'solver': ['lbfgs', 'newton-cg'],
    'max_iter': [1000]
}
lr_grid = _run_grid_search('Logistic Regression', LogisticRegression(random_state=42),
                           lr_params, X_train, y_train)
model_scores['Tuned Logistic Regression'] = lr_grid.best_score_

# k-NN Tuning
knn_params = {
    'n_neighbors': [3, 5, 7, 9],
    'weights': ['uniform', 'distance'],
    'metric': ['euclidean', 'manhattan']
}
knn_grid = _run_grid_search('k-NN', KNeighborsClassifier(),
                            knn_params, X_train, y_train)
model_scores['Tuned k-NN'] = knn_grid.best_score_

# Random Forest Tuning
rf_params = {
    'n_estimators': [50, 100, 200],
    'max_depth': [None, 10, 20],
    'min_samples_split': [2, 5],
    'min_samples_leaf': [1, 2]
}
rf_grid = _run_grid_search('Random Forest', RandomForestClassifier(random_state=42),
                           rf_params, X_train, y_train)
model_scores['Tuned Random Forest'] = rf_grid.best_score_

# ## Step 6: Evaluate Models
# Pick the tuned model with the best cross-validated accuracy and evaluate
# it once on the held-out test set.

# Select the best model
# Only the tuned searches are compared, and only by their CV accuracy.
# Mixing in the baseline entries of model_scores would compare test-set
# accuracies against CV accuracies, and a winning baseline name would not
# correspond to any of the tuned estimators below.
tuned_searches = {
    'Tuned Logistic Regression': lr_grid,
    'Tuned k-NN': knn_grid,
    'Tuned Random Forest': rf_grid,
}
best_model_name = max(tuned_searches, key=lambda name: tuned_searches[name].best_score_)
best_cv_accuracy = tuned_searches[best_model_name].best_score_
print(f"\nBest Model: {best_model_name} with CV Accuracy: {best_cv_accuracy:.4f}")

# Get the best model (refit on the whole training split by GridSearchCV)
best_model = tuned_searches[best_model_name].best_estimator_

# Evaluate on test set
y_pred_best = best_model.predict(X_test)
# Pass the full label set explicitly so the matrix/report always have one
# row per category, even if some category is absent from the test split
# (otherwise target_names would not line up with the labels found).
all_labels = list(range(len(le_category.classes_)))
print("\nTest Set Evaluation for Best Model:")
print("Accuracy:", accuracy_score(y_test, y_pred_best))
print("\nConfusion Matrix:\n", confusion_matrix(y_test, y_pred_best, labels=all_labels))
print("\nClassification Report:\n",
      classification_report(y_test, y_pred_best, labels=all_labels,
                            target_names=le_category.classes_, zero_division=0))

# ## Step 7: Save and Predict
# Persist the chosen model together with the fitted preprocessing objects,
# then run one prediction as a smoke test.

# Save objects: output filename -> object, written in this order.
artifacts = {
    'best_category_model.pkl': best_model,
    'scaler.pkl': scaler,
    'le_gender.pkl': le_gender,
    'le_category.pkl': le_category,
}
for filename, artifact in artifacts.items():
    joblib.dump(artifact, filename)

print("\nModel and preprocessing objects saved.")

# Test prediction on the first test row (kept as a one-row DataFrame so the
# model receives the same column layout it was trained on).
sample = X_test.head(1).copy()
print("\nSample Input Features:")
print(sample)
sample_pred = best_model.predict(sample)
# Map the encoded class index back to its category name.
decoded_categories = le_category.inverse_transform(sample_pred)
sample_category = decoded_categories[0]
print(f"Predicted Category for Sample: {sample_category}")

Missing columns: ['tech_spend', 'music_freq', 'sports_hours', 'edu_freq', 'health_priority']. Generating synthetic dataset.
First 5 rows of the dataset:
   age  gender  tech_spend  music_freq  sports_hours  business_interest  \
0   56    Male           1           5             2                  2   
1   46   Other           5           3             4                  1   
2   32  Female           5           5             1                  5   
3   60    Male           4           5             5                  1   
4   25   Other           2           5             2                  2   

   edu_freq  food_interest  health_priority preferred_category  
0         4              5                5               Food  
1         3              2                5             Health  
2         5              5                2               Food  
3         2              3                5               Tech  
4         2              5                4               Food  

Missi