In [None]:
from __future__ import annotations

# Download and install English language model for spaCy

In [None]:
! python -m spacy download en_core_web_sm

# Import all libraries

In [None]:
import spacy
nlp = spacy.load('en_core_web_sm')
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import OrdinalEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import FeatureUnion
from sklearn.preprocessing import FunctionTransformer
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import make_pipeline
from sklearn.metrics import accuracy_score
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import classification_report, confusion_matrix

# Import data

In [None]:
# Read the reviews CSV into a DataFrame and preview its first five rows
df = pd.read_csv(filepath_or_buffer='starter/data/reviews.csv')
df.head(5)

# DataFrame summary statistics

In [None]:
# Display the summary statistics that DataFrame.describe() computes for df
df.describe()

# Check for null values

In [None]:
# Number of missing values in each column
df.isna().sum()

# Separate features from labels

In [None]:
# The recommendation flag is the label; every remaining column is a predictor
Y = df['Recommended IND'].copy()
X = df.drop(columns='Recommended IND')

# Hold out 20% of the rows for testing (shuffled, fixed seed for repeatability)
X_train, X_test, y_train, y_test = train_test_split(
    X,
    Y,
    test_size=0.2,
    shuffle=True,
    random_state=27,
)

# Split data into numerical, categorical, and text features

In [None]:
# Column holding the free-text review; it is handled by the text pipelines
text_features = X[['Review Text']].columns

# Numeric columns: everything that is not stored with object dtype
num_features = X.select_dtypes(exclude=['object']).columns
print('Numerical features:', num_features)

# Categorical columns: object-dtype columns, minus the review text.
# Without this exclusion the review text would be selected here too and
# passed to the categorical (one-hot) pipeline in addition to the text
# pipelines.
# NOTE(review): any other free-text object column would still be treated as
# categorical - confirm against the dataset's columns.
object_columns = X.select_dtypes(include=['object']).columns
cat_features = object_columns[~object_columns.isin(text_features)]
print('Categorical features:', cat_features)

print('Review Text features:', text_features)

# Show first rows of the dataset to check
df.head()

# Pipeline for num features

In [None]:
# Numeric preprocessing: impute, then rescale
num_pipeline = Pipeline(steps=[
    # replace missing values with the column median
    ('imputer', SimpleImputer(strategy='median')),
    # rescale each feature with MinMaxScaler (default output range 0-1)
    ('scaler', MinMaxScaler()),
])

num_pipeline

# Pipeline for cat features

In [None]:
# Categorical preprocessing: impute, then one-hot encode
cat_pipeline = Pipeline(steps=[
    # replace missing values with the most frequent category of the column
    ('imputer', SimpleImputer(strategy='most_frequent')),
    # one dense 0/1 column per category; categories not seen during fit are
    # ignored at transform time instead of raising
    ('cat_encoder', OneHotEncoder(handle_unknown='ignore', sparse_output=False)),
])

cat_pipeline

# Text features, Count Characters Class

In [None]:
class CountCharacter(BaseEstimator, TransformerMixin):
    """Stateless transformer that counts a character in each input text.

    Parameters
    ----------
    character : str
        The character (or substring) whose occurrences are counted with
        ``str.count``.
    """

    def __init__(self, character: str):
        # Character we want to count
        self.character = character

    def fit(self, X, y=None):
        """Nothing is learned from the data; return self unchanged."""
        return self

    def transform(self, X):
        """Return one single-element row ``[count]`` per item of ``X``.

        Any item that is not a ``str`` is counted as 0. This covers ``None``
        and also NaN, which is truthy, so the previous ``text or ""`` guard
        let it through and ``.count`` then raised ``AttributeError``.
        """
        return [
            [text.count(self.character) if isinstance(text, str) else 0]
            for text in X
        ]

# Preprocess text to make it 1D for transformers

In [None]:
# Flatten the selected single-column frame into a 1D array of texts.
# np.ravel replaces the former lambda: it yields the same flattened values,
# accepts plain ndarrays as well as DataFrames, and - unlike a lambda - can be
# pickled, so the fitted pipeline can be saved with pickle/joblib.
initial_text_preprocess = Pipeline([
    ('dimension_reshaper', FunctionTransformer(np.ravel)),
])

# Create features by counting specific characters
feature_engineering = FeatureUnion([
    ('count_spaces', CountCharacter(character=' ')),          # count spaces
    ('count_exclamations', CountCharacter(character='!')),    # count exclamation marks
    ('count_question_marks', CountCharacter(character='?')),  # count question marks
])

# Combine preprocessing and feature engineering into one pipeline
character_counts_pipeline = Pipeline([
    ('initial_text_preprocess', initial_text_preprocess),  # flatten text to 1D
    ('feature_engineering', feature_engineering),          # count characters
])

character_counts_pipeline

# Custom transformer for lemmatizing text and removing stopwords

In [None]:
class SpacyLemmatizer(BaseEstimator, TransformerMixin):
    """Transformer that lemmatizes texts with spaCy and drops stopwords.

    Parameters
    ----------
    nlp
        The loaded spaCy pipeline used to process the texts.
    """

    def __init__(self, nlp):
        # Store the spaCy model (nlp) to use for processing text
        self.nlp = nlp

    def fit(self, X, y=None):
        """Nothing is learned from the data; return self unchanged."""
        return self

    def transform(self, X):
        """Return one string per input: its non-stopword lemmas joined by spaces.

        Missing entries (``None`` or float NaN) are processed as empty
        strings, so a missing review yields an empty document instead of
        making ``nlp.pipe`` fail on a non-text value.
        """
        # `value != value` is only true for NaN; the isinstance guard keeps
        # the comparison away from arbitrary objects.
        texts = (
            "" if text is None or (isinstance(text, float) and text != text) else text
            for text in X
        )
        return [
            ' '.join(
                token.lemma_          # base form of each word
                for token in doc
                if not token.is_stop  # skip stopwords like "the", "and", "is"
            )
            # process the texts in batches of 50 for performance
            for doc in self.nlp.pipe(texts, batch_size=50)
        ]

# TF-IDF features

In [None]:
tfidf_pipeline = Pipeline([
    # Flatten the input to a 1D array of texts. np.ravel is used instead of
    # np.reshape(..., newshape=-1) because the `newshape` keyword is
    # deprecated in NumPy 2.1+, while np.ravel behaves the same across versions.
    ('dimension_reshaper', FunctionTransformer(np.ravel)),

    # Lemmatize the text using spaCy
    ('lemmatizer', SpacyLemmatizer(nlp=nlp)),

    # Convert processed text into a TF-IDF matrix
    ('tfidf_vectorizer', TfidfVectorizer()),
])
tfidf_pipeline

# Combine all feature processing steps into one transformer

In [None]:
full_preprocessor = ColumnTransformer([
    # Numeric pipeline: imputes and scales the numerical features
    ('num', num_pipeline, num_features),

    # Categorical pipeline: imputes and one-hot encodes the categorical features
    ('cat', cat_pipeline, cat_features),

    # Character counts pipeline: counts of spaces, '!' and '?' in the review text
    ('character_counts', character_counts_pipeline, text_features),

    # TF-IDF text pipeline: transforms the review text into TF-IDF vectors
    ('tfidf_text', tfidf_pipeline, text_features),
])

# Display the combined transformer built in this cell (the cell previously
# displayed the unrelated `feature_engineering` union by mistake)
full_preprocessor

# Train Model

In [None]:
# Create a complete machine learning pipeline
# End-to-end estimator: all feature preprocessing followed by a random forest.
# make_pipeline names each step after its lowercased class name, which is why
# the tuning cell addresses the forest as `randomforestclassifier__...`.
model_pipeline = make_pipeline(
    full_preprocessor,
    RandomForestClassifier(class_weight='balanced', random_state=27),
)

# Train preprocessing and classifier together on the training split
model_pipeline.fit(X_train, y_train)

# Evaluate

In [None]:
# Make predictions on the test set using the trained pipeline
# Predict labels for the held-out test split with the fitted pipeline
y_pred = model_pipeline.predict(X_test)

# Report, in order: overall accuracy, the per-class precision/recall/F1
# report, and the confusion matrix of true vs. predicted labels
for heading, metric in (
    ('Accuracy:', accuracy_score),
    ('\nClassification Report:\n', classification_report),
    ('\nConfusion Matrix:\n', confusion_matrix),
):
    print(heading, metric(y_test, y_pred))

# Fine-Tune Model

In [None]:
# Define the hyperparameter search space for RandomForestClassifier
# Hyperparameter search space for the RandomForestClassifier step
param_distributions = {
    'randomforestclassifier__max_features': ['sqrt', 'log2'],
    'randomforestclassifier__n_estimators': [150, 200],
}

# Every entry above is a finite list, so the space has a fixed number of
# combinations. Asking RandomizedSearchCV for more iterations than that makes
# scikit-learn warn and silently run only the available combinations, so cap
# the requested iterations (at most 6) at the size of the space.
n_combinations = int(np.prod([len(values) for values in param_distributions.values()]))

# Randomized search with 5-fold cross-validation over the whole pipeline
param_search = RandomizedSearchCV(
    estimator=model_pipeline,
    param_distributions=param_distributions,
    n_iter=min(6, n_combinations),
    cv=5,
    n_jobs=-1,   # use all available cores
    refit=True,  # refit the best configuration on the full training data
    verbose=3,
    random_state=27,
)

# Perform the randomized search on training data
param_search.fit(X_train, y_train)


print('Best Params:', param_search.best_params_)

In [None]:
# Extract the best pipeline (with the best hyperparameters) from the search
# Best pipeline found by the search (refit=True was set on the search above)
model_best = param_search.best_estimator_

# Predict labels for the held-out test split with the tuned pipeline
y_pred_best = model_best.predict(X_test)

In [None]:
# Print accuracy score of the tuned model
# Overall fraction of correct predictions for the tuned model
tuned_accuracy = accuracy_score(y_test, y_pred_best)
print('Tuned Accuracy:', tuned_accuracy)

# Per-class precision, recall and F1-score
tuned_report = classification_report(y_test, y_pred_best)
print('\nTuned Classification Report:\n', tuned_report)

# True vs. predicted label counts
tuned_confusion = confusion_matrix(y_test, y_pred_best)
print('\nTuned Confusion Matrix:\n', tuned_confusion)
