In [214]:
import mlflow
import mlflow.sklearn
import polars as pl
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score, train_test_split
import pandas as pd
from sklearn.metrics import mean_squared_error
import numpy as np

In [215]:
mlflow.set_tracking_uri("http://127.0.0.1:5000")
mlflow.set_experiment('movie-rating-classifier')


<Experiment: artifact_location='file:///home/temur/yyy/ml_flow_task/mlruns/941774096314097402', creation_time=1731320051489, experiment_id='941774096314097402', last_update_time=1731320051489, lifecycle_stage='active', name='movie-rating-classifier', tags={}>

In [216]:
# Load and preprocess user data
user_columns = ['user_id', 'age', 'gender', 'occupation', 'zip_code']
users = pl.read_csv('ml-100k/u.user', has_header=False, separator='|', new_columns=user_columns)
users = users.drop('zip_code')
# Encode gender and occupation as dummies
gender_dummies = users.select(pl.col("gender")).to_dummies()
occupation_dummies = users.select(pl.col("occupation")).to_dummies()

# Concatenate encoded features
users_encoded = pl.concat([users.drop(["gender", "occupation"]), gender_dummies, occupation_dummies], how="horizontal")
users_encoded.head()

user_id,age,gender_F,gender_M,occupation_administrator,occupation_artist,occupation_doctor,occupation_educator,occupation_engineer,occupation_entertainment,occupation_executive,occupation_healthcare,occupation_homemaker,occupation_lawyer,occupation_librarian,occupation_marketing,occupation_none,occupation_other,occupation_programmer,occupation_retired,occupation_salesman,occupation_scientist,occupation_student,occupation_technician,occupation_writer
i64,i64,u8,u8,u8,u8,u8,u8,u8,u8,u8,u8,u8,u8,u8,u8,u8,u8,u8,u8,u8,u8,u8,u8,u8
1,24,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0
2,53,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0
3,23,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
4,24,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0
5,33,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0


In [217]:
# Load and preprocess movie data
movie_columns = ["item_id", "movie_title", "release_date", "video_release_date", "IMDb_URL"] + [f"genre_{i}" for i in range(19)]
movies = pl.read_csv('ml-100k/u.item', separator="|", has_header=False, new_columns=movie_columns, encoding="iso-8859-1", ignore_errors=True)
movies = movies.drop('IMDb_URL', 'movie_title', 'release_date', 'video_release_date')
movies.head()

item_id,genre_0,genre_1,genre_2,genre_3,genre_4,genre_5,genre_6,genre_7,genre_8,genre_9,genre_10,genre_11,genre_12,genre_13,genre_14,genre_15,genre_16,genre_17,genre_18
i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64
1,0,0,0,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0
2,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0
3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0
4,0,1,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0
5,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,1,0,0


In [218]:

# Load ratings data
ratings = pl.read_csv('ml-100k/u.data', separator='\t', has_header=False, new_columns=['user_id', 'item_id', 'rating', 'timestamp'])
ratings.head()


user_id,item_id,rating,timestamp
i64,i64,i64,i64
196,242,3,881250949
186,302,3,891717742
22,377,1,878887116
244,51,2,880606923
166,346,1,886397596


In [230]:
# Join data
users_ratings = users_encoded.join(ratings, on="user_id", how="left")
combined_data = users_ratings.join(movies, on="item_id", how="left")
combined_data.tail()

user_id,age,gender_F,gender_M,occupation_administrator,occupation_artist,occupation_doctor,occupation_educator,occupation_engineer,occupation_entertainment,occupation_executive,occupation_healthcare,occupation_homemaker,occupation_lawyer,occupation_librarian,occupation_marketing,occupation_none,occupation_other,occupation_programmer,occupation_retired,occupation_salesman,occupation_scientist,occupation_student,occupation_technician,occupation_writer,item_id,rating,timestamp,genre_0,genre_1,genre_2,genre_3,genre_4,genre_5,genre_6,genre_7,genre_8,genre_9,genre_10,genre_11,genre_12,genre_13,genre_14,genre_15,genre_16,genre_17,genre_18
i64,i64,u8,u8,u8,u8,u8,u8,u8,u8,u8,u8,u8,u8,u8,u8,u8,u8,u8,u8,u8,u8,u8,u8,u8,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64
943,22,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,415,1,888640027,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,1
943,22,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,219,4,888639575,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0
943,22,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,796,3,888640311,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0
943,22,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,739,4,888639929,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0
943,22,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,391,2,888640291,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0


In [220]:
# Age categorization
def categorize_age(age):
    if age <= 20:
        return "age1"
    elif 20<age<=35:
        return "age2"
    elif 35<age<=50:
        return "age3"
    elif 50<age<=60:
        return "age3"
    else:
        return "age4"

In [221]:

# Prepare data for training
X = combined_data.select(pl.all().exclude(['rating', 'timestamp', 'user_id', 'item_id'])).to_pandas()
y = combined_data.select(['rating'])

X['age_category']=X['age'].apply(categorize_age)
X = X.drop(columns=['age'])

dummies = pd.get_dummies(X['age_category'], prefix='age')

# Join the dummies back to the original DataFrame
X = pd.concat([X, dummies], axis=1)

# Drop the original 'age_category' column if not needed
X = X.drop(columns=['age_category'])
X.head()

Unnamed: 0,gender_F,gender_M,occupation_administrator,occupation_artist,occupation_doctor,occupation_educator,occupation_engineer,occupation_entertainment,occupation_executive,occupation_healthcare,...,genre_13,genre_14,genre_15,genre_16,genre_17,genre_18,age_age1,age_age2,age_age3,age_age4
0,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,False,True,False,False
1,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,False,True,False,False
2,0,1,0,0,0,0,0,0,0,0,...,0,1,0,1,0,0,False,True,False,False
3,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,False,True,False,False
4,0,1,0,0,0,0,0,0,0,0,...,0,1,0,0,0,0,False,True,False,False


In [222]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, shuffle=True)  
random_state=42 

In [227]:
# I would use grid search, but to work with mlflow I am writing such code
with mlflow.start_run():
    estimators=100
    max_depth=10
    mlflow.log_param("model_type", "RandomForestClassifier")
    mlflow.log_param("n_estimators", estimators)
    mlflow.log_param("max depth", max_depth)
    model = RandomForestClassifier(n_estimators=estimators,max_depth=max_depth,random_state=random_state)  
    model.fit(X_train,y_train)
    mlflow.sklearn.log_model(model, "random_forest_model")
    y_test_pred = model.predict(X_test)    
    rmse = np.sqrt(mean_squared_error(y_test, y_test_pred))
    mlflow.log_metric("rmse", rmse)

  return fit_method(estimator, *args, **kwargs)
2024/11/11 16:50:59 INFO mlflow.tracking._tracking_service.client: 🏃 View run rumbling-wren-712 at: http://127.0.0.1:5000/#/experiments/941774096314097402/runs/59a56311591644fab7a020254afb3745.
2024/11/11 16:50:59 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: http://127.0.0.1:5000/#/experiments/941774096314097402.


In [228]:
import mlflow.sklearn

model_uri = "models:/with_least_rmse/3"  
model = mlflow.sklearn.load_model(model_uri)

In [229]:
import skl2onnx
import onnx
from skl2onnx import convert_sklearn
from skl2onnx.common.data_types import FloatTensorType

onnx_model = convert_sklearn(model, initial_types=[('input', FloatTensorType([None, X_train.shape[1]]))])

onnx.save_model(onnx_model, 'app/with_least_rmse.onnx')
