In [3]:
import warnings
import pandas as pd
import os

from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.ensemble import AdaBoostRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.datasets import make_regression
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.linear_model import LinearRegression
from sklearn.svm import SVR

# Silence every warning for the rest of the session.
# NOTE(review): this also hides deprecation warnings raised by the libraries used
# below - consider narrowing the filter to specific categories.
warnings.filterwarnings("ignore")

In [4]:
# Locations of the three source tables, relative to the working directory.
ratings_path = "ratings.csv"
movies_path = "movies.csv"
users_path = "users.csv"

# Read each table into its own frame (ratings first, then movies, then users).
ratings, movies, users = (
    pd.read_csv(csv_path) for csv_path in (ratings_path, movies_path, users_path)
)

# One row per rating, enriched first with the movie's columns and then with the user's.
dataframe = pd.merge(
    pd.merge(ratings, movies, on='MovieID'),
    users,
    on='UserID',
)

# Assume the History, Foreign and TV Movie genres do not exist (remove them from the
# dataset) - they cannot appear.
dataframe.head()

Unnamed: 0,UserID,MovieID,Rating,Timestamp,Title,Action,Adventure,Animation,Children's,Comedy,...,Mystery,Romance,Sci-Fi,Thriller,War,Western,Gender,Age,Occupation,Zip-code
0,1,1193,5,978300760,One Flew Over the Cuckoo's Nest (1975),0,0,0,0,0,...,0,0,0,0,0,0,F,1,10,48067
1,1,661,3,978302109,James and the Giant Peach (1996),0,0,1,1,0,...,0,0,0,0,0,0,F,1,10,48067
2,1,914,3,978301968,My Fair Lady (1964),0,0,0,0,0,...,0,1,0,0,0,0,F,1,10,48067
3,1,3408,4,978300275,Erin Brockovich (2000),0,0,0,0,0,...,0,0,0,0,0,0,F,1,10,48067
4,1,2355,5,978824291,"Bug's Life, A (1998)",0,0,1,1,1,...,0,0,0,0,0,0,F,1,10,48067


In [5]:
# Preview the first rows of the movies table.
movies.head()

Unnamed: 0,MovieID,Title,Action,Adventure,Animation,Children's,Comedy,Crime,Documentary,Drama,Fantasy,Film-Noir,Horror,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
0,1,Toy Story (1995),0,0,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0
1,2,Jumanji (1995),0,1,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0
2,3,Grumpier Old Men (1995),0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0
3,4,Waiting to Exhale (1995),0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0
4,5,Father of the Bride Part II (1995),0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0


In [6]:
# Harmonise the three genre labels that differ from the names used in the rest of the
# notebook. Renaming by name (instead of assigning a positional list to
# `dataframe.columns`) leaves every other column untouched, does not depend on the
# column order of the CSV files, and keeps this cell safe to re-run after later cells
# have appended columns to `dataframe`.
dataframe = dataframe.rename(columns={
    "Children's": "Family",
    "Musical": "Music",
    "Sci-Fi": "Science Fiction",
})

In [7]:
def build_user_profile(data, genre_columns=None):
    """Compute one genre profile per user.

    For every user, each profile value is the mean of that genre's column over all
    of the user's rows in ``data``.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain a ``'UserID'`` column and every column named in
        ``genre_columns``.
    genre_columns : sequence of str, optional
        Genre columns to average. Defaults to the 18 genre names used throughout
        this notebook.

    Returns
    -------
    pandas.DataFrame
        Indexed by ``UserID``, with one column per genre.
    """
    if genre_columns is None:
        genre_columns = ['Action', 'Adventure', 'Animation', "Family", 'Comedy', 'Crime',
                         'Documentary', 'Drama', 'Fantasy', 'Film-Noir', 'Horror', 'Music',
                         'Mystery', 'Romance', 'Science Fiction', 'Thriller', 'War', 'Western']
    # list(...) so that tuples or other sequences select columns rather than a single key.
    return data.groupby('UserID')[list(genre_columns)].mean()

# Build one profile row per user from the merged frame and preview the first few.
user_profiles = build_user_profile(dataframe)
user_profiles.head()

Unnamed: 0_level_0,Action,Adventure,Animation,Family,Comedy,Crime,Documentary,Drama,Fantasy,Film-Noir,Horror,Music,Mystery,Romance,Science Fiction,Thriller,War,Western
UserID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
1,0.09434,0.09434,0.339623,0.377358,0.264151,0.037736,0.0,0.396226,0.056604,0.0,0.0,0.264151,0.0,0.113208,0.056604,0.056604,0.037736,0.0
2,0.434109,0.147287,0.0,0.0,0.193798,0.093023,0.0,0.612403,0.007752,0.007752,0.015504,0.0,0.023256,0.186047,0.131783,0.24031,0.116279,0.023256
3,0.45098,0.490196,0.058824,0.058824,0.588235,0.0,0.0,0.156863,0.039216,0.0,0.058824,0.019608,0.019608,0.098039,0.117647,0.098039,0.039216,0.117647
4,0.904762,0.285714,0.0,0.047619,0.0,0.047619,0.0,0.285714,0.095238,0.0,0.142857,0.0,0.0,0.095238,0.428571,0.190476,0.142857,0.095238
5,0.156566,0.045455,0.020202,0.030303,0.282828,0.106061,0.030303,0.525253,0.0,0.015152,0.050505,0.015152,0.040404,0.151515,0.075758,0.19697,0.030303,0.005051


In [8]:
# Turn the Gender variable into binary indicator columns (one per gender value).

gender_dummies = pd.get_dummies(dataframe['Gender'])
# Drop indicator columns left over from a previous run of this cell before
# concatenating, so that re-running it does not create duplicate columns.
dataframe = pd.concat(
    [dataframe.drop(columns=gender_dummies.columns, errors='ignore'), gender_dummies],
    axis=1,
)

In [9]:
# Preview the frame after the gender indicator columns were appended.
dataframe.head()

Unnamed: 0,UserID,MovieID,Rating,Timestamp,Title,Action,Adventure,Animation,Family,Comedy,...,Science Fiction,Thriller,War,Western,Gender,Age,Occupation,Zip-code,F,M
0,1,1193,5,978300760,One Flew Over the Cuckoo's Nest (1975),0,0,0,0,0,...,0,0,0,0,F,1,10,48067,True,False
1,1,661,3,978302109,James and the Giant Peach (1996),0,0,1,1,0,...,0,0,0,0,F,1,10,48067,True,False
2,1,914,3,978301968,My Fair Lady (1964),0,0,0,0,0,...,0,0,0,0,F,1,10,48067,True,False
3,1,3408,4,978300275,Erin Brockovich (2000),0,0,0,0,0,...,0,0,0,0,F,1,10,48067,True,False
4,1,2355,5,978824291,"Bug's Life, A (1998)",0,0,1,1,1,...,0,0,0,0,F,1,10,48067,True,False


In [10]:
# Add the user profiles to the dataset

def merge_user_profiles(data, user_profiles):
    """Return a copy of ``data`` with each user's profile values added as columns.

    For every column ``c`` of ``user_profiles`` a column ``c + '_user_pref'`` is
    added, looked up through the row's ``'UserID'``. The input frame is not
    modified; existing columns with the same name are overwritten in the copy.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain a ``'UserID'`` column.
    user_profiles : pandas.DataFrame
        Indexed by user id, one column per profile feature.

    Returns
    -------
    pandas.DataFrame
        New frame with the extra ``*_user_pref`` columns. Users missing from
        ``user_profiles`` get NaN in those columns.
    """
    preference_columns = {
        col + '_user_pref': data['UserID'].map(user_profiles[col])
        for col in user_profiles.columns
    }
    # assign() builds a new frame, so the caller's object is left untouched.
    return data.assign(**preference_columns)

# Attach each user's `<genre>_user_pref` columns to the ratings frame.
dataframe = merge_user_profiles(dataframe, user_profiles)

In [11]:
# Split the data into the feature matrix X and the target Y.
movie_genre_columns = [
    'Action', 'Adventure', 'Animation', "Family", 'Comedy', 'Crime',
    'Documentary', 'Drama', 'Fantasy', 'Film-Noir', 'Horror', 'Music',
    'Mystery', 'Romance', 'Science Fiction', 'Thriller', 'War', 'Western',
]
user_attribute_columns = ['F', 'M', 'Age', 'Occupation']
# One preference column per genre, in the same order as the genre columns.
user_preference_columns = [genre + '_user_pref' for genre in movie_genre_columns]

# Feature order: movie genres, user attributes, user genre preferences.
X = dataframe[movie_genre_columns + user_attribute_columns + user_preference_columns]
Y = dataframe['Rating']

In [12]:
# Split the data into a training set and a test set (20% held out, fixed random seed).
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=42)

In [15]:
# Instantiate the candidate regressors. Only the linear model is fitted in the
# following cells; the fits of the other three are commented out there.
linear_regression_model = LinearRegression()
random_forest_model = RandomForestRegressor(n_estimators=100, random_state=42)
ada_boost = AdaBoostRegressor(n_estimators=50, random_state=42)
gb_regressor = GradientBoostingRegressor(n_estimators=100, learning_rate=0.1, max_depth=3, random_state=42)

In [16]:
# Fit the linear model on the training split.
linear_regression_model = linear_regression_model.fit(X_train, Y_train)
# The remaining models are currently disabled:
# random_forest_model = random_forest_model.fit(X_train, Y_train)
# ada_boost_model = ada_boost.fit(X_train, Y_train)
# gradient_boosting = gb_regressor.fit(X_train, Y_train)

In [17]:
# Predict ratings for the held-out test split with the fitted linear model.
y_pred_linear_regression = linear_regression_model.predict(X_test)
# Predictions of the disabled models (they are not fitted above):
# y_pred_random_forest = random_forest_model.predict(X_test)
# y_pred_ada_boost = ada_boost_model.predict(X_test)
# y_pred_gradient_boosting = gradient_boosting.predict(X_test)


In [18]:
# Root mean squared error of the linear model on the test split. The square root is
# taken explicitly because the `squared=False` keyword of `mean_squared_error` is
# deprecated and no longer accepted by newer scikit-learn releases.
rmse_LR = mean_squared_error(Y_test, y_pred_linear_regression) ** 0.5
print(f"O RMSE é do LinearRegressor: {rmse_LR}")

O RMSE é do LinearRegressor: 1.091404262862537


### PREVISOR

In [19]:
# Genres of the movie whose rating is to be predicted.
genres_data_input = ['Action', 'Crime', 'Science Fiction']

# Attributes of the user to predict for. Age is a number (it was the string '22'),
# like every other value, so the feature row built from this dict is purely numeric.
# NOTE(review): the data preview shows Age = 1 for a user, so Age may be a category
# code rather than years - confirm that 22 is a valid value.
user_data_input = {
    'F': 0,
    'M': 1,
    'Age': 22,
    'Occupation': 1}

In [20]:
# Hard-coded value for each `<genre>_user_pref` feature, used as the preference part
# of the prediction input.
# NOTE(review): presumably the dataset-wide averages of the user profiles - confirm,
# and consider deriving them from `user_profiles` instead of hard-coding.
user_preferences_mean = {
    'Action_user_pref': 0.262920,
    'Adventure_user_pref': 0.136199,
    'Animation_user_pref': 0.045813,
    "Family_user_pref": 0.069750,
    'Comedy_user_pref': 0.348596,
    'Crime_user_pref': 0.090367,
    'Documentary_user_pref': 0.007849,
    'Drama_user_pref': 0.365773,
    'Fantasy_user_pref': 0.036633,
    # NOTE(review): this value has one more digit than the other entries - possible typo, verify.
    'Film-Noir_user_pref': 0.0222595,
    'Horror_user_pref': 0.071675,
    'Music_user_pref': 0.039808,
    'Mystery_user_pref': 0.041178,
    'Romance_user_pref': 0.147283,
    'Science Fiction_user_pref': 0.167730,
    'Thriller_user_pref': 0.195006,
    'War_user_pref': 0.078570,
    'Western_user_pref': 0.019571
}

In [21]:
def user_predictor(genres_data_input, user_data_input, user_preferences_mean, forecaster):
    """Predict a single rating from movie genres, user attributes and preferences.

    Parameters
    ----------
    genres_data_input : container of str
        Genres of the movie; each known genre found in it is encoded as 1, all
        other known genres as 0. Names outside the known genre list are ignored.
    user_data_input : dict
        User attribute features (column name -> value).
    user_preferences_mean : dict
        ``<genre>_user_pref`` features (column name -> value).
    forecaster : object
        Model exposing ``predict(DataFrame)``; the first element of its result is
        returned.

    Returns
    -------
    The first prediction returned by ``forecaster.predict``.
    """
    all_genres = ['Action', 'Adventure', 'Animation', "Family", 'Comedy', 'Crime', 'Documentary',
                  'Drama', 'Fantasy', 'Film-Noir', 'Horror', 'Music', 'Mystery', 'Romance',
                  'Science Fiction', 'Thriller', 'War', 'Western']

    genre_features = {genre: 1 if genre in genres_data_input else 0 for genre in all_genres}

    # Later dicts win on duplicate keys: genres, then user attributes, then preferences.
    input_data = {**genre_features, **user_data_input, **user_preferences_mean}
    input_df = pd.DataFrame([input_data], index=[0])

    # The column order above depends on the key order of the caller's dicts. When the
    # model records the feature names it was fitted with and they are exactly the
    # columns built here, present them in the fitted order. Any other mismatch is left
    # for the model itself to report.
    expected_columns = getattr(forecaster, 'feature_names_in_', None)
    if expected_columns is not None and set(expected_columns) == set(input_df.columns):
        input_df = input_df[list(expected_columns)]

    predicted_rating = forecaster.predict(input_df)[0]

    return predicted_rating

In [22]:
# Predict the rating for the example user and genres with the fitted linear model.
user_predictor(genres_data_input, user_data_input, user_preferences_mean, linear_regression_model)

3.4656863590717872