In [1]:
import model_api as dl
import numpy as np
import pandas as pd
import xgboost as xgb

In [2]:
sample_model = dl.NN_classifier(10, 10, verbose=True)
sample_cnn = dl.CNN_classifier(10, 10, verbose=True)
sample_xgboost = dl.XGB_classifier(n_estimators=100)

Model: "model"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_1 (InputLayer)        [(None, 10, 3)]           0         
                                                                 
 flatten (Flatten)           (None, 30)                0         
                                                                 
 dense (Dense)               (None, 64)                1984      
                                                                 
 dense_1 (Dense)             (None, 10)                650       
                                                                 
Total params: 2,634
Trainable params: 2,634
Non-trainable params: 0
_________________________________________________________________
Model: "model_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_2 (InputLayer)        [(None, 10, 3

In [6]:
data_path = "data/df_top_20_movies_customers_reviewed_all.csv"
movie_20 = pd.read_csv(data_path)
display(movie_20.head())

unique_customers = movie_20['Cust_Id'].unique()
print("Number of Unique Customers: ", len(unique_customers))

# The small number of datapoints calls for XGBoost.

Unnamed: 0,Movie_Id,Cust_Id,Rating,Date,Movie_Year,Name,Average_Rating,Review_Count
0,571,1844276,5.0,2002-03-05,1999,American Beauty,3.962585,154832
1,571,2422606,1.0,2001-11-20,1999,American Beauty,3.962585,154832
2,571,1515501,3.0,2002-11-25,1999,American Beauty,3.962585,154832
3,571,181323,4.0,2002-01-22,1999,American Beauty,3.962585,154832
4,571,243963,4.0,2002-02-26,1999,American Beauty,3.962585,154832


Number of Unique Customers:  4481


In [27]:
# Find the unique movie names
unique_movies = movie_20['Name'].unique()
#print(unique_movies)
print("Number of Unique Movies: ", len(unique_movies))

# Split data into examples and labels.
example_movies = unique_movies[:10]
label_movies = unique_movies[10:]

print("Number of Example Movies: ", len(example_movies))
print("Number of Label Movies: ", len(label_movies))

# Pivot the dataset on unique customers, and add movies to the lists based on their type
raw_X_list = []
raw_Y_list = []

example_features = ["Rating", "Movie_Year", "Average_Rating", "Review_Count"]

for customer in unique_customers:
    customer_data = movie_20.loc[movie_20['Cust_Id'] == customer]
    
    # Ensure movies are in the same order every time
    example_data = customer_data.loc[customer_data['Name'].isin(example_movies)].copy()
    label_data = customer_data.loc[customer_data['Name'].isin(label_movies)].copy()
    
    example_data = example_data.sort_values(by='Name')
    label_data = label_data.sort_values(by='Name')
    
    # Create example matrix.. flatten it
    example_data = np.array(example_data[example_features]).flatten()
    
    # Identify the movie with the highest rating from label_movies
    highest_rating_index = label_data['Rating'].idxmax()
    highest_rating_movie = label_data.loc[highest_rating_index, 'Name']
    
    # Find the index of the highest rated movie within label_movies
    label_index = list(label_movies).index(highest_rating_movie)
    
    
    raw_X_list.append(example_data)
    raw_Y_list.append(label_index)

Number of Unique Movies:  20
Number of Example Movies:  10
Number of Label Movies:  10


In [39]:
# Convert lists to DataFrames
num_example_movies = len(example_movies)
columns = []
for i in example_movies:
    columns.extend([f"Rating_{i}", f"Movie_Year_{i}", f"Average_Rating_{i}", f"Review_Count_{i}"])

# Convert lists to DataFrames
X = pd.DataFrame(raw_X_list, columns=columns)
Y = pd.DataFrame(raw_Y_list, columns=["Movie Label"])

print("Example Data Shape: ", X.shape)
print("Label Data Shape: ", Y.shape)

# Display first few rows of the DataFrames
display(X.head())
display(Y.head())

Example Data Shape:  (4481, 40)
Label Data Shape:  (4481, 1)


Unnamed: 0,Rating_American Beauty,Movie_Year_American Beauty,Average_Rating_American Beauty,Review_Count_American Beauty,Rating_The Wedding Planner,Movie_Year_The Wedding Planner,Average_Rating_The Wedding Planner,Review_Count_The Wedding Planner,Rating_Man on Fire,Movie_Year_Man on Fire,...,Average_Rating_The Bourne Supremacy,Review_Count_The Bourne Supremacy,Rating_Lord of the Rings: The Fellowship of the Ring,Movie_Year_Lord of the Rings: The Fellowship of the Ring,Average_Rating_Lord of the Rings: The Fellowship of the Ring,Review_Count_Lord of the Rings: The Fellowship of the Ring,Rating_Braveheart,Movie_Year_Braveheart,Average_Rating_Braveheart,Review_Count_Braveheart
0,2.0,2004.0,3.750569,145519.0,5.0,1999.0,3.962585,154832.0,5.0,1995.0,...,3.909958,137170.0,3.0,2001.0,3.183726,140154.0,5.0,2000.0,3.425322,162597.0
1,5.0,2004.0,3.750569,145519.0,1.0,1999.0,3.962585,154832.0,5.0,1995.0,...,3.909958,137170.0,3.0,2001.0,3.183726,140154.0,4.0,2000.0,3.425322,162597.0
2,2.0,2004.0,3.750569,145519.0,3.0,1999.0,3.962585,154832.0,4.0,1995.0,...,3.909958,137170.0,2.0,2001.0,3.183726,140154.0,2.0,2000.0,3.425322,162597.0
3,3.0,2004.0,3.750569,145519.0,4.0,1999.0,3.962585,154832.0,5.0,1995.0,...,3.909958,137170.0,4.0,2001.0,3.183726,140154.0,5.0,2000.0,3.425322,162597.0
4,5.0,2004.0,3.750569,145519.0,4.0,1999.0,3.962585,154832.0,5.0,1995.0,...,3.909958,137170.0,4.0,2001.0,3.183726,140154.0,5.0,2000.0,3.425322,162597.0


Unnamed: 0,Movie Label
0,5
1,3
2,6
3,4
4,6


In [40]:
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score

# Split the data into training and testing sets
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=42)

# Set up the hyperparameter grid
param_grid = {
    'n_estimators': [100, 200],
    'learning_rate': [0.01, 0.1, 0.2],
    'max_depth': [3, 4, 5],
    'subsample': [0.8, 0.9, 1.0],
    'colsample_bytree': [0.8, 0.9, 1.0]
}

# Initialize the XGBoost classifier
xgb = XGBClassifier(use_label_encoder=False, eval_metric='mlogloss')

# Set up the GridSearchCV object with K-fold cross-validation
grid_search = GridSearchCV(estimator=xgb, param_grid=param_grid, cv=5, n_jobs=-1, verbose=2)

# Perform the grid search and fit to the training data
grid_search.fit(X_train, Y_train.values.ravel())

# Retrieve the best model from the grid search
best_model = grid_search.best_estimator_

# Print the best parameters
print("Best Parameters:", grid_search.best_params_)

# Predict on the test set with the best model
Y_pred = best_model.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(Y_test, Y_pred)
print("Accuracy: %.2f%%" % (accuracy * 100.0))

# Create a DataFrame for feature importances
feature_importances = pd.DataFrame({
    'Feature': columns,
    'Importance': best_model.feature_importances_
})

Fitting 5 folds for each of 162 candidates, totalling 810 fits
Best Parameters: {'colsample_bytree': 0.8, 'learning_rate': 0.01, 'max_depth': 3, 'n_estimators': 200, 'subsample': 0.8}
Accuracy: 54.52%


In [41]:
# Display the DataFrame sorted by importance
feature_importances = feature_importances.sort_values(by='Importance', ascending=False)
display(feature_importances)

Unnamed: 0,Feature,Importance
0,Rating_American Beauty,0.238479
24,Rating_What Women Want,0.12693
36,Rating_Braveheart,0.120547
20,Rating_50 First Dates,0.090802
32,Rating_Lord of the Rings: The Fellowship of th...,0.089614
16,Rating_Pirates of the Caribbean: The Curse of ...,0.082111
28,Rating_The Bourne Supremacy,0.074972
4,Rating_The Wedding Planner,0.062323
12,Rating_S.W.A.T.,0.060102
8,Rating_Man on Fire,0.05412
