In [1]:
import pandas as pd
import numpy as np
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import cross_val_score, KFold, GridSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer
from scipy.sparse import hstack, csr_matrix
import nltk
import warnings
from pandarallel import pandarallel 
warnings.filterwarnings("ignore")

# Fetch the NLTK 'punkt' resource
nltk.download('punkt')

# One shared VADER analyzer, plus the pandarallel workers that
# `parallel_apply` relies on during feature extraction
analyzer = SentimentIntensityAnalyzer()
pandarallel.initialize(progress_bar=True)

# Read the complete training set
print("Loading full training data...")
train_data_full = pd.read_csv('train.csv')

# Read the test set
print("Loading test data...")
test_data = pd.read_csv('test.csv')

# Seeded shuffle of the training rows. With frac=1 every row is kept, so
# nothing is dropped here; lower frac to actually subsample.
print("Reducing the training dataset size for memory constraints...")
train_data = train_data_full.sample(frac=1, random_state=42)
train_data = train_data.reset_index(drop=True)

# Guarantee a string-typed 'Text' column, with missing reviews as ''
if 'Text' not in train_data:
    print("Warning: 'Text' column not found in train data.")
    train_data['Text'] = ''
train_data['Text'] = train_data['Text'].fillna('').astype(str)

# Apply feature extraction on training data
def extract_features(df, is_test=False, product_avg_score=None, user_avg_score=None,
                     default_score=None):
    """Derive the numeric model features from raw review rows.

    Adds VADER sentiment scores, simple text statistics, a helpfulness ratio,
    two interaction terms and per-product / per-user average scores, then
    fills remaining gaps with neutral defaults.

    Relies on the module-level ``analyzer`` and on pandarallel having been
    initialised (``Series.parallel_apply`` is used for the sentiment pass).

    Args:
        df: Review rows. 'Text', 'HelpfulnessNumerator',
            'HelpfulnessDenominator', 'ProductId', 'UserId' and 'Score' are
            used when present; absent ones are replaced by defaults and a
            warning is printed. The caller's frame is not modified.
        is_test: When False and ``df`` has a 'Score' column, the average-score
            tables are computed from ``df`` itself. Otherwise the tables
            passed in are merged.
        product_avg_score: DataFrame with 'ProductId' and 'ProductAvgScore'
            columns, merged on 'ProductId' on the precomputed path.
        user_avg_score: DataFrame with 'UserId' and 'UserAvgScore' columns,
            merged on 'UserId' on the precomputed path.
        default_score: Value used for 'ProductAvgScore' / 'UserAvgScore' when
            no average is available for a row. When None, the mean 'Score' of
            ``df`` is used on the training path and the mean 'Score' of the
            module-level ``train_data`` on the precomputed path.

    Returns:
        A new DataFrame with a fresh 0..n-1 index holding the input columns
        plus the feature columns.
    """
    # Re-index into a new frame first: the caller's frame stays untouched and
    # the sentiment columns line up positionally in the concat below.
    df = df.reset_index(drop=True)

    # Ensure 'Text' column exists and is string type
    if 'Text' not in df.columns:
        print("Warning: 'Text' column not found. Creating empty 'Text' column.")
        df['Text'] = ''
    df['Text'] = df['Text'].fillna('').astype(str)

    # Apply sentiment analysis and extract all scores
    print("Applying sentiment analysis...")

    def sentiment_scores(text):
        # Returning a Series makes parallel_apply yield one column per score.
        scores = analyzer.polarity_scores(text)
        return pd.Series({
            'Sentiment_Neg': scores['neg'],
            'Sentiment_Neu': scores['neu'],
            'Sentiment_Pos': scores['pos'],
            'Sentiment_Compound': scores['compound']
        })

    sentiment_df = df['Text'].parallel_apply(sentiment_scores)
    df = pd.concat([df, sentiment_df.reset_index(drop=True)], axis=1)

    # Simple surface statistics of the review text
    df['TextLength'] = df['Text'].str.len()
    df['ExclamationCount'] = df['Text'].str.count("!")
    df['QuestionCount'] = df['Text'].str.count(r"\?")
    # +1 keeps the ratio defined for empty reviews
    df['CapitalRatio'] = df['Text'].str.count(r'[A-Z]') / (df['TextLength'] + 1)
    # Words written entirely in capitals (two or more letters)
    df['UppercaseWords'] = df['Text'].str.count(r'\b[A-Z]{2,}\b')

    # Map sentiment to a 1-5 range
    df['SentimentMapped'] = ((df['Sentiment_Compound'] + 1) * 2) + 1  # Maps range -1 to 1 onto 1 to 5

    # Bucket the compound score into levels 1..5. side='left' reproduces the
    # inclusive upper bounds: x <= -0.6 -> 1, x <= -0.2 -> 2, ..., else 5.
    level_bounds = [-0.6, -0.2, 0.2, 0.6]
    compound = df['Sentiment_Compound'].to_numpy(dtype=float)
    df['SentimentLevel'] = np.searchsorted(level_bounds, compound, side='left') + 1

    # Compute HelpfulnessRatio if columns are present
    if 'HelpfulnessNumerator' in df.columns and 'HelpfulnessDenominator' in df.columns:
        print("Computing HelpfulnessRatio...")
        # +1 avoids division by zero for reviews nobody voted on
        df['HelpfulnessRatio'] = df['HelpfulnessNumerator'] / (df['HelpfulnessDenominator'] + 1)
    else:
        print("Warning: 'HelpfulnessNumerator' or 'HelpfulnessDenominator' column not found. Default 'HelpfulnessRatio' will be used.")
        df['HelpfulnessRatio'] = 0.0

    # Interaction features
    df['Sentiment_Helpfulness'] = df['Sentiment_Compound'] * df['HelpfulnessRatio']
    df['Sentiment_TextLength'] = df['Sentiment_Compound'] * df['TextLength']

    # Product/User average score as a feature
    if not is_test and 'Score' in df.columns:
        # Ensure 'Score' is numeric
        df['Score'] = pd.to_numeric(df['Score'], errors='coerce')
        # Fallback for rows without a group average: this frame's own mean
        fallback_score = df['Score'].mean() if default_score is None else default_score
        # Only labelled rows contribute to the averages
        labelled = df.dropna(subset=['Score'])
        print("Computing product and user average scores...")
        # NOTE(review): each labelled row's own Score is part of the group
        # mean merged back onto it, so these two features carry label
        # information on the training path (a single-review user gets
        # UserAvgScore == Score).
        if 'ProductId' in df.columns:
            product_avg_score = (
                labelled.groupby('ProductId')['Score'].mean()
                .reset_index()
                .rename(columns={'Score': 'ProductAvgScore'})
            )
            # Merge these scores back into the main DataFrame
            df = df.merge(product_avg_score, on='ProductId', how='left')
        else:
            print("Warning: 'ProductId' column not found. Skipping 'ProductAvgScore' computation.")
            df['ProductAvgScore'] = fallback_score

        if 'UserId' in df.columns:
            user_avg_score = (
                labelled.groupby('UserId')['Score'].mean()
                .reset_index()
                .rename(columns={'Score': 'UserAvgScore'})
            )
            # Merge these scores back into the main DataFrame
            df = df.merge(user_avg_score, on='UserId', how='left')
        else:
            print("Warning: 'UserId' column not found. Skipping 'UserAvgScore' computation.")
            df['UserAvgScore'] = fallback_score

    else:
        print("Using precomputed average scores for 'ProductAvgScore' and 'UserAvgScore'.")
        # Without an explicit default, keep the previous behaviour of reading
        # the mean from the module-level training frame.
        fallback_score = train_data['Score'].mean() if default_score is None else default_score
        if 'ProductId' in df.columns and product_avg_score is not None:
            df = df.merge(product_avg_score, on='ProductId', how='left')
        else:
            print("Warning: 'ProductId' column not found or 'product_avg_score' not provided. Using default.")
            df['ProductAvgScore'] = fallback_score

        if 'UserId' in df.columns and user_avg_score is not None:
            df = df.merge(user_avg_score, on='UserId', how='left')
        else:
            print("Warning: 'UserId' column not found or 'user_avg_score' not provided. Using default.")
            df['UserAvgScore'] = fallback_score

    # Handle missing values in the features (e.g. products/users that have no
    # entry in the average-score tables after the left merges)
    print("Handling missing values...")
    df.fillna({
        'SentimentMapped': 3, 'SentimentLevel': 3, 'HelpfulnessRatio': 0.0, 'TextLength': 0,
        'ExclamationCount': 0, 'QuestionCount': 0, 'CapitalRatio': 0.0, 'UppercaseWords': 0,
        'Sentiment_Helpfulness': 0.0, 'Sentiment_TextLength': 0.0,
        'ProductAvgScore': fallback_score, 'UserAvgScore': fallback_score,
        'Sentiment_Neg': 0.0, 'Sentiment_Neu': 0.0, 'Sentiment_Pos': 0.0, 'Sentiment_Compound': 0.0
    }, inplace=True)

    return df

# Build the feature columns for the training rows
print("Extracting features from training data...")
train_data = extract_features(train_data)
print("Feature extraction on training data complete.")

# Keep only labelled rows and store the label as an integer
print("Preparing training data...")
train_data = train_data.dropna(subset=['Score'])
train_data['Score'] = train_data['Score'].astype(int)

# 'Text' must be a plain string column for the vectorizer
train_data['Text'] = train_data['Text'].fillna('').astype(str)

# Lookup tables of mean Score per product / per user, later merged onto the
# test rows; left as None when the id column is absent
print("Computing overall product and user average scores from training data...")
product_avg_score = None
if 'ProductId' in train_data:
    product_avg_score = (
        train_data.groupby('ProductId')['Score']
        .mean()
        .rename('ProductAvgScore')
        .reset_index()
    )

user_avg_score = None
if 'UserId' in train_data:
    user_avg_score = (
        train_data.groupby('UserId')['Score']
        .mean()
        .rename('UserAvgScore')
        .reset_index()
    )

# Numeric model inputs and the target
feature_columns = [
    'SentimentMapped', 'SentimentLevel', 'HelpfulnessRatio', 'TextLength',
    'ExclamationCount', 'QuestionCount', 'CapitalRatio', 'UppercaseWords',
    'Sentiment_Helpfulness', 'Sentiment_TextLength', 'ProductAvgScore', 'UserAvgScore',
    'Sentiment_Neg', 'Sentiment_Neu', 'Sentiment_Pos', 'Sentiment_Compound',
]
X_numeric = train_data[feature_columns]
y = train_data['Score']

# Compute TF-IDF features
print("Computing TF-IDF features...")
tfidf_vectorizer = TfidfVectorizer(max_features=5000, stop_words='english')

# Fit the vectorizer on the training data
X_text_tfidf = tfidf_vectorizer.fit_transform(train_data['Text'])
print("TF-IDF feature computation complete.")

# Combine numeric and text features
print("Combining numeric and text features...")
# Keep the fitted scaler: the test rows must later be standardised with the
# training mean/variance, not with statistics of their own.
numeric_scaler = StandardScaler()
X_numeric_scaled = numeric_scaler.fit_transform(X_numeric)
# CSR output supports the row indexing done by cross-validation directly
X_combined = hstack([csr_matrix(X_numeric_scaled), X_text_tfidf], format='csr')

# Train the logistic regression model with best parameters
print("Training logistic regression model with best hyperparameters...")
log_reg_model = LogisticRegression(
    max_iter=3000,
    solver='newton-cg',
    C=0.5,
    n_jobs=-1
)
log_reg_model.fit(X_combined, y)
print("Model training complete.")

# Evaluate model with cross-validation
# NOTE(review): the scaler, the TF-IDF vocabulary and the average-score
# features were all fitted on the whole training set before the folds are
# split, so the accuracy printed here is an optimistic estimate.
print("Evaluating model with cross-validation...")
cv = KFold(n_splits=5, shuffle=True, random_state=42)
cv_scores = cross_val_score(log_reg_model, X_combined, y, cv=cv, scoring='accuracy', n_jobs=-1)
print(f"Cross-validation accuracy: {cv_scores.mean():.4f}")

# Prepare test data: the test file is joined to the full training file on
# 'Id' to pick up the review text and metadata columns
print("Merging test data with full train data to get 'Text' column...")
# Ensure 'Id's in train_data_full are unique so the left merge cannot add rows
train_data_full_unique = train_data_full.drop_duplicates(subset='Id')
test_data = test_data.merge(train_data_full_unique[['Id', 'Text', 'ProductId', 'UserId', 'HelpfulnessNumerator', 'HelpfulnessDenominator']], on='Id', how='left')

# Check if any 'Text' values are missing after the merge
missing_text = test_data['Text'].isnull().sum()
if missing_text > 0:
    print(f"Warning: {missing_text} entries in test data do not have corresponding 'Text' in train data.")
    # Fill 'Text' with empty strings so downstream string operations work
    test_data['Text'] = test_data['Text'].fillna('')

# Apply feature extraction on test data and handle missing values
print("Extracting features from test data...")
test_data = extract_features(test_data, is_test=True, product_avg_score=product_avg_score, user_avg_score=user_avg_score)
print("Feature extraction for test data complete.")

# Ensure 'Text' is string type
test_data['Text'] = test_data['Text'].fillna('').astype(str)

# Prepare test features
print("Preparing test features...")
X_test_numeric = test_data[feature_columns]
# transform only: reuse the scaling learned from the training rows so the
# test features are on the scale the model was fitted on
X_test_numeric_scaled = numeric_scaler.transform(X_test_numeric)

X_test_text_tfidf = tfidf_vectorizer.transform(test_data['Text'])
print("Combining test numeric and text features...")
X_test_combined = hstack([csr_matrix(X_test_numeric_scaled), X_test_text_tfidf], format='csr')

# Predict scores on test data
print("Predicting scores on test data...")
test_data['Score'] = log_reg_model.predict(X_test_combined)

# Clip predictions to be within the valid range and save output to CSV
test_data['Score'] = test_data['Score'].round().clip(1, 5).astype(int)
print("Saving predictions to CSV...")
test_data[['Id', 'Score']].to_csv('option2.csv', index=False)

print("Score prediction complete; file saved as 'option2.csv'.")


[nltk_data] Downloading package punkt to /Users/mariohysa/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


INFO: Pandarallel will run on 11 workers.
INFO: Pandarallel will use standard multiprocessing data transfer (pipe) to transfer data between the main process and workers.
Loading full training data...
Loading test data...
Reducing the training dataset size for memory constraints...
Extracting features from training data...
Applying sentiment analysis...


VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=154322), Label(value='0 / 154322')…

Computing HelpfulnessRatio...
Computing product and user average scores...
Handling missing values...
Feature extraction on training data complete.
Preparing training data...
Computing overall product and user average scores from training data...
Computing TF-IDF features...
TF-IDF feature computation complete.
Combining numeric and text features...
Training logistic regression model with best hyperparameters...
Model training complete.
Evaluating model with cross-validation...
Cross-validation accuracy: 0.6642
Merging test data with full train data to get 'Text' column...
Extracting features from test data...
Applying sentiment analysis...


VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=19291), Label(value='0 / 19291')))…

Computing HelpfulnessRatio...
Using precomputed average scores for 'ProductAvgScore' and 'UserAvgScore'.
Handling missing values...
Feature extraction for test data complete.
Preparing test features...
Combining test numeric and text features...
Predicting scores on test data...
Saving predictions to CSV...
Score prediction complete; file saved as 'option2.csv'.
