In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.linear_model import Ridge
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

# Load data
data = pd.read_csv('train.csv')

# 1. Data Cleaning: Handle missing values
data = data.drop(columns=['index', 'review/timeStruct', 'user/birthdayRaw', 'user/birthdayUnix'])
data['review/text'] = data['review/text'].fillna('')  # Filling missing review texts with empty strings
data['user/gender'] = data['user/gender'].fillna('Unknown')

# Filling missing values for user-related columns with median
data['user/ageInSeconds'] = data['user/ageInSeconds'].fillna(data['user/ageInSeconds'].median())

# 2. Feature Engineering: Process text and numeric columns
# TfidfVectorizer for review text
tfidf = TfidfVectorizer(max_features=100, stop_words='english')

# Combine numerical features
numeric_features = ['beer/ABV', 'review/appearance', 'review/aroma', 
                    'review/palate', 'review/taste', 'user/ageInSeconds']

# Scaling numerical features
scaler = StandardScaler()

# 3. Split the data
X = data.drop(columns=['review/overall'])
y = data['review/overall']

# Preprocessing pipeline: Combine Tfidf and scaling
preprocessor = ColumnTransformer(
    transformers=[
        ('num', scaler, numeric_features),
        ('tfidf', tfidf, 'review/text')
    ])

# Train-Test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# 4. Modeling: RandomForest and Ridge Regression
models = {
    "RandomForest": RandomForestRegressor(n_estimators=100, random_state=42),
    "Ridge": Ridge(alpha=1.0)
}

# Evaluate both models
for model_name, model in models.items():
    # Create a pipeline with preprocessing and model
    pipeline = Pipeline(steps=[('preprocessor', preprocessor), ('model', model)])
    pipeline.fit(X_train, y_train)

    # Predictions and Evaluation
    y_pred = pipeline.predict(X_test)
    mse = mean_squared_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)
    
    print(f"{model_name} Model")
    print(f"Mean Squared Error: {mse}")
    print(f"R^2 Score: {r2}\n")


RandomForest Model
Mean Squared Error: 0.15949145
R^2 Score: 0.6757760735456895

Ridge Model
Mean Squared Error: 0.1559721131984372
R^2 Score: 0.6829303955881428

