In [None]:
from __future__ import annotations

# Download and install English language model for spaCy

In [None]:
! python -m spacy download en_core_web_sm

# Import all libraries

In [106]:
import spacy
nlp = spacy.load('en_core_web_sm')
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import OrdinalEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import FeatureUnion
from sklearn.preprocessing import FunctionTransformer
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import make_pipeline
from sklearn.metrics import accuracy_score
from sklearn.model_selection import RandomizedSearchCV

# Import data

In [None]:
df = pd.read_csv('starter/data/reviews.csv')
df.head()

# DF stats

In [None]:
df.describe()

# Check Nulls

In [None]:
df.isnull().sum()

# Separate features from labels

In [None]:

X = df.drop('Recommended IND' , axis= 1)
Y = df['Recommended IND'].copy()

X_train, X_test, y_train, y_test = train_test_split( x, y, test_size=0.2, random_state=27, shuffle=True)

# Split data into numerical, categorical, and text features

In [None]:
# Select columns that are numeric 
num_features = X.select_dtypes(exclude=['object']).columns
print('Numerical features:', num_features)

# Select columns that are categorical 
cat_features = X.select_dtypes(include=['object']).columns
print('Categorical features:', cat_features)

# Select column with review text
text_features = X[['Review Text']].columns
print('Review Text features:', text_features)

# Show first rows of the dataset to check
df.head()

# Pipeline for num features

In [None]:
num_pipeline = Pipeline([
    # fill missing values with most frequent value
    ('imputer', SimpleImputer(strategy='most_frequent')),  
    
    # scale numbers to range 0-1
    ('scaler', MinMaxScaler())
])

num_pipeline

# Pipeline for cat features

In [None]:
# Pipeline for cat features
cat_pipeline = Pipeline([     
    # fill missing values with most frequent value
    ('imputer', SimpleImputer(strategy='most_frequent')),
    
    # convert cat to num (each unique category -> a number)
    ('ordinal_encoder', OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1)),
    
    # create one-hot columns (0/1) for each category
    ('cat_encoder', OneHotEncoder(sparse_output=False, handle_unknown='ignore')),
])

cat_pipeline

# Text features, Count Characters Class

In [None]:
class CountCharacter(BaseEstimator, TransformerMixin):
    def __init__(self, character: str):
        # Character we want to count
        self.character = character

    def fit(self, X, y=None):
        # Just return self
        return self

    def transform(self, X):
        # Count how many times the character appears in each text
        # (text or "") -> handle None values as empty string
        return [[(text or "").count(self.character)] for text in X]

# Preprocess text to make it 1D for transformers

In [None]:
initial_text_preprocess = Pipeline([
    (
        'dimension_reshaper',
        FunctionTransformer(
            np.reshape,          # reshape array
            kw_args={'newshape':-1},  # make 1D
        ),
    ),
])

# Create features by counting specific characters
feature_engineering = FeatureUnion([
    ('count_spaces', CountCharacter(character=' ')),      # count spaces
    ('count_exclamations', CountCharacter(character='!')), # count exclamation marks
    ('count_question_marks', CountCharacter(character='?')), # count question marks
])

# Combine preprocessing and feature engineering into one pipeline
character_counts_pipeline = Pipeline([
    ('initial_text_preprocess', initial_text_preprocess), # reshape text
    ('feature_engineering', feature_engineering),         # count characters
])

character_counts_pipeline

# custom transformer for lemmatizing text and removing stopwords

In [97]:
class SpacyLemmatizer(BaseEstimator, TransformerMixin):
    def __init__(self, nlp):
        # Store the spaCy model (nlp) to use for processing text
        self.nlp = nlp

    def fit(self, X, y=None):
        # just return self
        return self

    def transform(self, X):
        # Process each text in X
        lemmatized = [
            ' '.join(
                token.lemma_ for token in doc  # take the base form of each word
                if not token.is_stop           # skip stopwords like "the", "and", "is"
            )
            for doc in self.nlp.pipe(X)        # use spaCy to process all texts 
        ]
        return lemmatized                     # return list of cleaned, lemmatized texts

# TF-IDF features

In [None]:
tfidf_pipeline = Pipeline([
    # reshape input to 1D array 
    (
        'dimension_reshaper',
        FunctionTransformer(
            np.reshape,
            kw_args={'newshape':-1},
        ),
    ),
    # lemmatize text and remove stopwords
    (
        'lemmatizer',
        SpacyLemmatizer(nlp=nlp),
    ),
    # convert text into TF-IDF numeric features
    (
        'tfidf_vectorizer',
        TfidfVectorizer(
            stop_words='english',  # remove common English words
        ),
    ),
])

tfidf_pipeline

# Combine all feature processing steps into one transformer

In [None]:
feature_engineering = ColumnTransformer([
    # process num features with num_pipeline
    ('num', num_pipeline, num_features),
    
    # process cat features with cat_pipeline
    ('cat', cat_pipeline, cat_features),
    
    # count special characters in text (spaces, !, ?)
    ('character_counts', character_counts_pipeline, text_features),
    
    # process text into TF-IDF features
    ('tfidf_text', tfidf_pipeline, text_features),
])

feature_engineering

# Train Model

In [104]:
model_pipeline = make_pipeline(
    feature_engineering,
    RandomForestClassifier(random_state=27),
)

model_pipeline.fit(X_train, y_train)

# Testing accuracy

In [107]:
y_pred_forest_pipeline = model_pipeline.predict(X_test)
accuracy_forest_pipeline = accuracy_score(y_test, y_pred_forest_pipeline)

print('Accuracy:', accuracy_forest_pipeline)



Accuracy: 0.8487394957983193


# Fine-Tune Model

In [108]:
# define the parameter grid to search over
my_distributions = dict(
    randomforestclassifier__max_features=[  # number of features considered for each split
        100,
        150,
        250,
    ],
    randomforestclassifier__n_estimators=[  # number of trees in the forest
        150,
        200,
    ],
)

# create RandomizedSearchCV to find the best hyperparameters
param_search = RandomizedSearchCV(
    estimator=model_pipeline,      # the pipeline including preprocessing + RandomForest
    param_distributions=my_distributions,  # the parameters to try
    n_iter=6,                      # try 6 random combinations of parameters
    cv=5,                          # 5-fold cross-validation
    n_jobs=-1,                     # use all available CPU cores
    refit=True,                     # after search, refit pipeline with the best parameters
    verbose=3,                     # print progress and scores
    random_state=27,               # for reproducibility
)

# fit the RandomizedSearchCV on the training data
param_search.fit(X_train, y_train)

# retrieve the best combination of parameters
param_search.best_params_

Fitting 5 folds for each of 6 candidates, totalling 30 fits


{'randomforestclassifier__n_estimators': 200,
 'randomforestclassifier__max_features': 250}

In [None]:
model_best = param_search.best_estimator_
model_best

In [110]:
y_pred_forest_pipeline = model_best.predict(X_test)
accuracy_forest_pipeline = accuracy_score(y_test, y_pred_forest_pipeline)

print('Accuracy:', accuracy_forest_pipeline)



Accuracy: 0.8517213336947682
