In [59]:
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import Lasso
from sklearn.metrics import mean_squared_error as MSE
from sklearn.model_selection import RandomizedSearchCV, train_test_split

In [60]:
# Load the rental records from the CSV in the working directory and
# preview the first five rows.
df = pd.read_csv("rental_info.csv")
df.head(n=5)

Unnamed: 0,rental_date,return_date,amount,release_year,rental_rate,length,replacement_cost,special_features,NC-17,PG,PG-13,R,amount_2,length_2,rental_rate_2
0,2005-05-25 02:54:33+00:00,2005-05-28 23:40:33+00:00,2.99,2005.0,2.99,126.0,16.99,"{Trailers,""Behind the Scenes""}",0,0,0,1,8.9401,15876.0,8.9401
1,2005-06-15 23:19:16+00:00,2005-06-18 19:24:16+00:00,2.99,2005.0,2.99,126.0,16.99,"{Trailers,""Behind the Scenes""}",0,0,0,1,8.9401,15876.0,8.9401
2,2005-07-10 04:27:45+00:00,2005-07-17 10:11:45+00:00,2.99,2005.0,2.99,126.0,16.99,"{Trailers,""Behind the Scenes""}",0,0,0,1,8.9401,15876.0,8.9401
3,2005-07-31 12:06:41+00:00,2005-08-02 14:30:41+00:00,2.99,2005.0,2.99,126.0,16.99,"{Trailers,""Behind the Scenes""}",0,0,0,1,8.9401,15876.0,8.9401
4,2005-08-19 12:30:04+00:00,2005-08-23 13:35:04+00:00,2.99,2005.0,2.99,126.0,16.99,"{Trailers,""Behind the Scenes""}",0,0,0,1,8.9401,15876.0,8.9401


In [61]:
# Print column dtypes and non-null counts to check for missing values
# and to spot columns that still need type conversion.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 15861 entries, 0 to 15860
Data columns (total 15 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   rental_date       15861 non-null  object 
 1   return_date       15861 non-null  object 
 2   amount            15861 non-null  float64
 3   release_year      15861 non-null  float64
 4   rental_rate       15861 non-null  float64
 5   length            15861 non-null  float64
 6   replacement_cost  15861 non-null  float64
 7   special_features  15861 non-null  object 
 8   NC-17             15861 non-null  int64  
 9   PG                15861 non-null  int64  
 10  PG-13             15861 non-null  int64  
 11  R                 15861 non-null  int64  
 12  amount_2          15861 non-null  float64
 13  length_2          15861 non-null  float64
 14  rental_rate_2     15861 non-null  float64
dtypes: float64(8), int64(4), object(3)
memory usage: 1.8+ MB


In [62]:
# Target variable: the whole number of days each movie was rented for.
# Both timestamp columns are parsed to datetimes first so that they can be
# subtracted; `.dt.days` keeps only the day component of the difference.
for timestamp_column in ("rental_date", "return_date"):
    df[timestamp_column] = pd.to_datetime(df[timestamp_column])

rental_duration = df["return_date"] - df["rental_date"]
df["rental_length"] = rental_duration.dt.days

In [63]:
# Age of each movie relative to a fixed reference year.
# The year is pinned instead of being read from the system clock so that the
# feature (and every model fitted on it) is identical on every re-run of the
# notebook. 2024 reproduces the values recorded in this notebook's output
# (release_year 2005 -> movie_age 19).
REFERENCE_YEAR = 2024
df["movie_age"] = REFERENCE_YEAR - df["release_year"]

In [64]:
# Binary indicator columns: 1 when the label appears in the row's
# `special_features` text, 0 otherwise.
special_feature_labels = {
    "deleted_scenes": "Deleted Scenes",
    "behind_the_scenes": "Behind the Scenes",
}
for flag_column, label in special_feature_labels.items():
    has_label = df["special_features"].str.contains(label)
    df[flag_column] = np.where(has_label, 1, 0)

In [65]:
# Remove columns that are not wanted as model inputs:
#   - special_features has already been encoded into the indicator columns;
#   - the *_2 columns look like squared copies of amount / length /
#     rental_rate (e.g. 2.99 -> 8.9401 in the preview) -- TODO confirm.
columns_to_remove = ["special_features", "amount_2", "length_2", "rental_rate_2"]
df = df.drop(columns_to_remove, axis=1)
df.head()

Unnamed: 0,rental_date,return_date,amount,release_year,rental_rate,length,replacement_cost,NC-17,PG,PG-13,R,rental_length,movie_age,deleted_scenes,behind_the_scenes
0,2005-05-25 02:54:33+00:00,2005-05-28 23:40:33+00:00,2.99,2005.0,2.99,126.0,16.99,0,0,0,1,3,19.0,0,1
1,2005-06-15 23:19:16+00:00,2005-06-18 19:24:16+00:00,2.99,2005.0,2.99,126.0,16.99,0,0,0,1,2,19.0,0,1
2,2005-07-10 04:27:45+00:00,2005-07-17 10:11:45+00:00,2.99,2005.0,2.99,126.0,16.99,0,0,0,1,7,19.0,0,1
3,2005-07-31 12:06:41+00:00,2005-08-02 14:30:41+00:00,2.99,2005.0,2.99,126.0,16.99,0,0,0,1,2,19.0,0,1
4,2005-08-19 12:30:04+00:00,2005-08-23 13:35:04+00:00,2.99,2005.0,2.99,126.0,16.99,0,0,0,1,4,19.0,0,1


In [66]:
# Separate the feature matrix from the target.
# The two raw timestamps are excluded because the target (rental_length) is
# computed directly from them; the target itself is excluded from X.
# The column list is defined once and reused so the two cannot drift apart.
drop = ["rental_date", "return_date", "rental_length"]
X = df.drop(columns=drop)
y = df["rental_length"]

In [67]:
# Hold out 30% of the rows as a test set; the seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=43,
)

In [68]:
# Fit an L1-regularised linear model on the training set. Features whose
# coefficient is shrunk to exactly zero are treated as unimportant.
# Note: the features are passed in on their raw scales (no standardisation
# step precedes this cell).
lasso = Lasso(random_state=43, alpha=0.2)
lasso.fit(X=X_train, y=y_train)

In [69]:
# Keep a handle on the fitted coefficient vector (one entry per column of X_train).
lasso_coef = lasso.coef_

In [70]:
# Names of the features the Lasso kept (non-zero coefficient).
# This list is informational here: the later cells fit on all of X_train.
kept_by_lasso = lasso.coef_ != 0
selected_features = X.columns[kept_by_lasso]
print("Selected features:", selected_features)

Selected features: Index(['amount', 'rental_rate', 'length'], dtype='object')


In [71]:
# Randomised hyper-parameter search for the random forest.
# `rf` is only built in a later cell (from the search results), so the search
# gets its own seeded base estimator here; this keeps the notebook runnable
# top-to-bottom on a fresh kernel.
param_dist = {
    'n_estimators': np.arange(1, 101, 1),      # number of trees: 1..100
    'max_depth': np.arange(1, 11, 1),          # tree depth limit: 1..10
    'min_samples_leaf': np.arange(1, 11, 1),   # minimum samples per leaf: 1..10
}
randomizedsearch = RandomizedSearchCV(
    RandomForestRegressor(random_state=43),
    param_distributions=param_dist,
    cv=5,             # 5-fold cross-validation on the training set
    random_state=43,  # fixes which parameter combinations are sampled
)

In [72]:
# Run the cross-validated search on the training data and report the
# best-scoring parameter combination (fit() returns the search object itself).
fitted_search = randomizedsearch.fit(X_train, y_train)
best_parameters = fitted_search.best_params_
print(best_parameters)

{'n_estimators': 71, 'min_samples_leaf': 1, 'max_depth': 10}


In [73]:
# Final random forest, configured with the parameters chosen by the search.
# `best_parameters` holds exactly the three keys searched over
# (n_estimators, max_depth, min_samples_leaf), so it can be unpacked directly.
rf = RandomForestRegressor(**best_parameters, random_state=43)

In [74]:
# Train on the full training set, then predict rental length for the held-out
# rows (fit() returns the estimator, so the calls can be chained).
y_pred = rf.fit(X_train, y_train).predict(X_test)

In [75]:
# Evaluate the forest on the held-out test set.
# mean_squared_error expects (y_true, y_pred) in that order; MSE happens to be
# symmetric, but the documented order keeps the call correct if the metric is
# ever swapped for one that is not.
mse = MSE(y_test, y_pred)
rmse = mse ** 0.5  # RMSE is in the same unit as the target (days)
print(f"Mean Squared Error: {mse}, Root Mean Squared Error: {rmse}")

Mean Squared Error: 2.198215077617341, Root Mean Squared Error: 1.482637878113648
