# In [None]:
import pandas as pd
import numpy as np
import spacy
import tensorflow as tf
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from transformers import BertTokenizer, TFBertModel
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error


# Read the raw review data from disk.
file_path = 'smallest training.csv'
data = pd.read_csv(file_path)

# Columns excluded from modelling (IDs, timestamps and user-profile fields,
# judging by their names).
columns_to_drop = [
    'beer/beerId',
    'beer/brewerId',
    'review/timeStruct',
    'review/timeUnix',
    'user/ageInSeconds',
    'user/birthdayRaw',
    'user/birthdayUnix',
    'user/gender',
    'user/profileName',
    'index',
]
data = data.drop(columns=columns_to_drop)

# 'review/overall' is the regression target; every other remaining column is
# a feature.
Y = data['review/overall']
X = data.drop(columns='review/overall')

# Hold out 20% of the rows for testing; the fixed seed keeps the split
# reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    Y,
    test_size=0.2,
    random_state=42,
)

# spaCy's small English pipeline; used below for tokenization and for the
# per-token punctuation / stop-word flags.
nlp = spacy.load('en_core_web_sm')


def preprocess_text(text):
    """Return *text* lower-cased with punctuation and stop words removed.

    The text is run through the module-level spaCy pipeline ``nlp``. Every
    token that is neither punctuation nor a stop word is kept, lower-cased,
    and the surviving tokens are joined with single spaces.
    """
    kept = []
    for token in nlp(text):
        if token.is_punct or token.is_stop:
            continue
        kept.append(token.text.lower())
    return ' '.join(kept)

# Pre-trained 'bert-base-uncased' tokenizer and TensorFlow BERT encoder from
# the Hugging Face transformers library.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = TFBertModel.from_pretrained('bert-base-uncased')


def tokenize_and_vectorize(text):
    """Encode *text* with BERT and mean-pool the final hidden states.

    The text is tokenized (special tokens added, truncation enabled) into
    TensorFlow tensors and passed through the module-level ``model``. The
    model's ``last_hidden_state`` is averaged over axis 1 and returned as a
    NumPy array.
    """
    encoded = tokenizer.encode_plus(
        text,
        add_special_tokens=True,
        return_tensors='tf',
        padding=True,
        truncation=True,
    )
    hidden_states = model(encoded).last_hidden_state
    # NOTE(review): axis 1 is presumably the token/sequence axis -- confirm
    # against the transformers output documentation.
    pooled = tf.reduce_mean(hidden_states, axis=1)
    return pooled.numpy()

# Free-text columns; each one is replaced by BERT embeddings in the final
# feature matrices.
text_columns = ['beer/name', 'beer/style', 'review/text']


def _clean_column(frame, column):
    """Return the spaCy-preprocessed strings of one text column as a list."""
    return [preprocess_text(text) for text in frame[column].astype(str)]


def _embed(texts):
    """Stack the BERT embedding of every string in *texts* into one array."""
    return np.array([tokenize_and_vectorize(text) for text in texts])


def _flatten(embeddings):
    """Collapse every axis after the first so each sample is one flat row."""
    return embeddings.reshape(embeddings.shape[0], -1)


# Training split: spaCy preprocessing first, then BERT embeddings.
all_texts_name = _clean_column(X_train, 'beer/name')
all_texts_style = _clean_column(X_train, 'beer/style')
all_texts_review = _clean_column(X_train, 'review/text')

X_train_bert_name = _embed(all_texts_name)
X_train_bert_style = _embed(all_texts_style)
X_train_bert_review = _embed(all_texts_review)

# Test split: exactly the same treatment.
all_texts_name_test = _clean_column(X_test, 'beer/name')
all_texts_style_test = _clean_column(X_test, 'beer/style')
all_texts_review_test = _clean_column(X_test, 'review/text')

X_test_bert_name = _embed(all_texts_name_test)
X_test_bert_style = _embed(all_texts_style_test)
X_test_bert_review = _embed(all_texts_review_test)

# Flatten the embeddings to two dimensions so they can be concatenated
# column-wise with the remaining features.
X_train_bert_name_flat = _flatten(X_train_bert_name)
X_train_bert_style_flat = _flatten(X_train_bert_style)
X_train_bert_review_flat = _flatten(X_train_bert_review)

X_test_bert_name_flat = _flatten(X_test_bert_name)
X_test_bert_style_flat = _flatten(X_test_bert_style)
X_test_bert_review_flat = _flatten(X_test_bert_review)

# Whatever columns remain once the text columns are removed
# (presumably all numeric -- verify against the dataset).
dx_train = X_train.drop(columns=text_columns)
dx_test = X_test.drop(columns=text_columns)

# Final design matrices: name, style and review embeddings followed by the
# remaining features.
X_train_combined = np.concatenate(
    [
        X_train_bert_name_flat,
        X_train_bert_style_flat,
        X_train_bert_review_flat,
        dx_train.values,
    ],
    axis=1,
)
X_test_combined = np.concatenate(
    [
        X_test_bert_name_flat,
        X_test_bert_style_flat,
        X_test_bert_review_flat,
        dx_test.values,
    ],
    axis=1,
)

# Hyper-parameter grid for the Random Forest. Keys carry the pipeline step
# name as a prefix ('<step>__<parameter>').
param_grid_rf = {
    'randomforestregressor__n_estimators': [50, 100, 150],
    'randomforestregressor__max_depth': [None, 10, 20],
    'randomforestregressor__min_samples_split': [2, 5, 10],
    'randomforestregressor__min_samples_leaf': [1, 2, 4],
}

# Hyper-parameter grid for the Support Vector Regressor.
param_grid_svr = {
    'svr__C': [0.1, 1, 10],
    'svr__epsilon': [0.1, 0.2, 0.5],
    'svr__kernel': ['linear', 'rbf'],
}

# The forest runs on the features as they are; the SVR is preceded by a
# standardization step.
rf_pipe = make_pipeline(RandomForestRegressor(random_state=42))
svr_pipe = make_pipeline(StandardScaler(), SVR())

# Both searches share the same protocol: 5-fold CV, negative MSE as the
# score, and all available cores.
_search_options = dict(cv=5, scoring='neg_mean_squared_error', n_jobs=-1)
grid_search_rf = GridSearchCV(rf_pipe, param_grid_rf, **_search_options)
grid_search_svr = GridSearchCV(svr_pipe, param_grid_svr, **_search_options)

# Run both searches on the training matrix.
grid_search_rf.fit(X_train_combined, y_train)
grid_search_svr.fit(X_train_combined, y_train)

# Retrieving the best pipeline found by each search.
best_model_rf = grid_search_rf.best_estimator_
best_model_svr = grid_search_svr.best_estimator_

# Predictions on the held-out test matrix.
predictions_rf = best_model_rf.predict(X_test_combined)
predictions_svr = best_model_svr.predict(X_test_combined)

# Model validation metrics on the held-out test set.
# Random Forest Regressor:
mse_rf = mean_squared_error(y_test, predictions_rf)
mae_rf = mean_absolute_error(y_test, predictions_rf)
r2_rf = r2_score(y_test, predictions_rf)

# Support Vector Regressor. The MAE must be computed from the SVR
# predictions and stored in its own variable; the original recomputed
# `mae_rf` here and never defined `mae_svr`.
mse_svr = mean_squared_error(y_test, predictions_svr)
mae_svr = mean_absolute_error(y_test, predictions_svr)
r2_svr = r2_score(y_test, predictions_svr)


# Report the selected hyper-parameters and the test metrics of each model.
print("Random Forest Regressor:")
print(f'Best parameters: {grid_search_rf.best_params_}')
print(f'Mean Squared Error: {mse_rf}')
# Fixed: the original referenced the undefined name `msae_rf` (NameError).
print(f'Mean Absolute Error: {mae_rf}')
print(f'R-squared Score: {r2_rf}')
print("\nSupport Vector Regressor:")
print(f'Best parameters: {grid_search_svr.best_params_}')
print(f'Mean Squared Error: {mse_svr}')
print(f'Mean Absolute Error: {mae_svr}')
print(f'R-squared Score: {r2_svr}')










