In [2]:
# Import libraries
import re
import sys
from hashlib import sha1
from pandas_profiling import ProfileReport
import altair as alt

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

from sklearn.dummy import DummyClassifier
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression

# train test split and cross validation
from sklearn.model_selection import (
    train_test_split,
)

########

import os

%matplotlib inline
import string
from collections import deque

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import re

# data
from sklearn import datasets
from sklearn.compose import ColumnTransformer, make_column_transformer
from sklearn.dummy import DummyClassifier, DummyRegressor
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.feature_extraction.text import CountVectorizer

# Feature selection
from sklearn.feature_selection import RFE, RFECV
from sklearn.impute import SimpleImputer

# classifiers / models
from sklearn.linear_model import LinearRegression, LogisticRegression, Ridge, RidgeCV

# other
from sklearn.metrics import accuracy_score, log_loss, make_scorer, mean_squared_error
from sklearn.model_selection import (
    GridSearchCV,
    RandomizedSearchCV,
    ShuffleSplit,
    cross_val_score,
    cross_validate,
    train_test_split,
)
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.preprocessing import (
    OneHotEncoder,
    OrdinalEncoder,
    PolynomialFeatures,
    StandardScaler,
)
from sklearn.svm import SVC, SVR
from sklearn.metrics import (
    accuracy_score,
    f1_score
)

### --------------------------------------------------------------------------------------------------------------------------------------------

### Below is the modeling part with RandomForest

In [3]:
# Reading in the training data from our repo
train_df = pd.read_csv("https://raw.githubusercontent.com/UBC-MDS/group_26/main/data/processed/train.csv")
train_df

Unnamed: 0,Id,Text,Author,Rating,n_words,sentiment
0,9794,Bill Murray is a wonderful comic who has troub...,Rhodes,7.0,854,compound
1,6767,"(Universal) Starring: John Cleese, Jamie Lee C...",Renshaw,5.0,1041,compound
2,5073,"Cast: Michael J. Nelson, Trace Beaulieu, Kevin...",Berardinelli,6.0,769,compound
3,28773,Marzieh Meshkini's THE DAY I BECAME A WOMAN (R...,Rhodes,6.0,437,compound
4,22863,MAGNOLIA (director/writer: Paul Thomas Anderso...,Schwartz,7.0,1642,neu
...,...,...,...,...,...,...
3999,4149,"Cast: Patrick Swayze, Mary Elizabeth Mastranto...",Berardinelli,6.2,917,compound
4000,22385,Since TUMBLEWEEDS opened right on the heels of...,Rhodes,6.0,530,compound
4001,2547,Consider this premise: a private investigator ...,Berardinelli,5.1,680,compound
4002,14956,What's left for peasants to do when the owner ...,Rhodes,4.0,636,neu


In [4]:
# Reading in the test data from our repo
test_df = pd.read_csv("https://raw.githubusercontent.com/UBC-MDS/group_26/main/data/processed/test.csv")
test_df

Unnamed: 0,Id,Text,Author,Rating,n_words,sentiment
0,22754,MARIE BAIE DES ANGES (ANGEL SHARKS)(director/w...,Schwartz,5.0,1150,neu
1,2437,"Starring: Wesley Snipes, Michael Wright, There...",Berardinelli,5.3,758,neu
2,2578,"Starring: Billy Crystal, Daniel Stern, Jon Lov...",Berardinelli,4.7,841,compound
3,2144,"Starring: Jason London, Wiley Wiggins, Rory Co...",Renshaw,7.0,793,compound
4,2098,"Starring: Brad Pitt, Juliette Lewis, David Duc...",Berardinelli,5.9,912,neu
...,...,...,...,...,...,...
997,13974,"BLADE (New Line) Starring: Wesley Snipes, Step...",Renshaw,5.0,898,neu
998,29393,"TOGETHER (TILLSAMMANS), by writer and director...",Rhodes,5.0,403,compound
999,24842,MIFUNE (Mifunes Sidste Sang)(director/writer: ...,Schwartz,5.0,1256,compound
1000,8340,"Belgium/France, 1996 U.S. Release Date: beginn...",Berardinelli,8.0,1004,compound


In [5]:
# Creating X-train, y_train and X_test , y_test
X_train, y_train = train_df.drop(columns=['Rating']), train_df['Rating']
X_test, y_test = test_df.drop(columns=['Rating']), test_df['Rating']
X_train

Unnamed: 0,Id,Text,Author,n_words,sentiment
0,9794,Bill Murray is a wonderful comic who has troub...,Rhodes,854,compound
1,6767,"(Universal) Starring: John Cleese, Jamie Lee C...",Renshaw,1041,compound
2,5073,"Cast: Michael J. Nelson, Trace Beaulieu, Kevin...",Berardinelli,769,compound
3,28773,Marzieh Meshkini's THE DAY I BECAME A WOMAN (R...,Rhodes,437,compound
4,22863,MAGNOLIA (director/writer: Paul Thomas Anderso...,Schwartz,1642,neu
...,...,...,...,...,...
3999,4149,"Cast: Patrick Swayze, Mary Elizabeth Mastranto...",Berardinelli,917,compound
4000,22385,Since TUMBLEWEEDS opened right on the heels of...,Rhodes,530,compound
4001,2547,Consider this premise: a private investigator ...,Berardinelli,680,compound
4002,14956,What's left for peasants to do when the owner ...,Rhodes,636,neu


In [6]:
# Identifying features
numeric_features = ['n_words']
text_feature = 'Text'
ordinal_features = ['sentiment']
drop_features = ['Id', 'Author']
target = 'Rating'

In [7]:
# Creating transformers
preprocessor = ColumnTransformer(
        transformers=[
            ('text', CountVectorizer(max_features=20_000, max_df=0.6), text_feature),
            ('num', StandardScaler(), numeric_features),
            ('ord', OrdinalEncoder(categories=[['neg', 'compound', 'neu', 'pos']]), ordinal_features)
        ]
    )

### Modeling with RandomForestRegressor

In [8]:
# Creating RandomForest pipeline

pipe_rf = make_pipeline(preprocessor, RandomForestRegressor(random_state=26))

In [9]:
# Fitting pipe_rf
# pipe_rf.fit(X_train, y_train)

In [28]:
# Defining the param_grid
param_grid1 = {
        'randomforestregressor__max_depth': [int(x) for x in np.linspace(5, 30, num = 6)]
    }

In [29]:
param_grid

{'randomforestregressor__max_depth': [5, 10, 15, 20, 25, 30]}

In [30]:
hyper_parameters_search = GridSearchCV(pipe_rf, param_grid=param_grid1, n_jobs=-1, scoring='r2')
hyper_parameters_search.fit(X_train, y_train)
print(f'R2 score for best model: {hyper_parameters_search.best_score_}')

R2 score for best model: 0.359660014327661


In [31]:
hyper_parameters_search.best_params_

{'randomforestregressor__max_depth': 30}

In [11]:
# Defining the param_grid
param_grid2 = {
        'randomforestregressor__n_estimators': [1, 10, 100]
    }

In [None]:
hyper_parameters_search = GridSearchCV(pipe_rf, param_grid=param_grid2, n_jobs=-1, scoring='r2')
hyper_parameters_search.fit(X_train, y_train)
print(f'R2 score for best model: {hyper_parameters_search.best_score_}')

In [45]:
hyper_parameters_search.best_params_

{'randomforestregressor__n_estimators': 100}

In [None]:
hyper_parameters_search = GridSearchCV(pipe_rf, param_grid=param_grid2, n_jobs=-1, scoring='neg_root_mean_squared_error')
hyper_parameters_search.fit(X_train, y_train)
print(f'Negative RMSE for best model: {hyper_parameters_search.best_score_}')

### -----------------------------------------------------------------------------------------------------------------------------------------------