# Libraries

In [None]:
import pickle
import pandas as pd
import seaborn as sns
from os.path import exists
import matplotlib.pyplot as plt
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix

from textblob import TextBlob
from imblearn.over_sampling import SMOTE
import numpy as np
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from nltk.sentiment.vader import SentimentIntensityAnalyzer
import nltk
nltk.download('vader_lexicon')

from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import GradientBoostingClassifier





# Loading the Files

Download the CSV files (`train.csv` and `test.csv`) into the `data/` directory.

In [None]:
# Read the raw train/test CSV files from the local data directory.
trainingSet = pd.read_csv("./data/train.csv")
testingSet = pd.read_csv("./data/test.csv")

# Report the dimensions of both frames.
print("train.csv shape is ", trainingSet.shape)
print("test.csv shape is ", testingSet.shape)
print()

# Preview the first rows of each frame, separated by a blank line.
print(trainingSet.head(), testingSet.head(), sep="\n\n")
print()

# Summary statistics of the numeric training columns.
print(trainingSet.describe())

# Bar chart of how many training rows carry each Score value.
score_counts = trainingSet['Score'].value_counts()
score_counts.plot(kind='bar', legend=True, alpha=.5)
plt.show()

print()
print("EVERYTHING IS PROPERLY SET UP! YOU ARE READY TO START")

# Adding Features

In [3]:
def filter_and_normalize(df, exclude_columns=('Score', 'Id', 'Time'), method='minmax'):
    """Scale the numeric columns of ``df`` while passing some columns through.

    Args:
        df: Input DataFrame. It is not modified.
        exclude_columns: Column name(s) that must not be scaled. Those that
            exist in ``df`` are copied unchanged to the front of the result;
            names that are absent from ``df`` are ignored.
        method: ``'minmax'`` to use ``MinMaxScaler`` or ``'zscore'`` to use
            ``StandardScaler``.

    Returns:
        A new DataFrame with a fresh 0..n-1 index containing the present
        excluded columns followed by the scaled numeric columns. Non-numeric
        columns that are not excluded are dropped.

    Raises:
        ValueError: If ``method`` is not one of the supported names.
    """
    # Validate up front so a bad argument fails before any work is done.
    if method not in ('minmax', 'zscore'):
        raise ValueError(f"Unknown method '{method}'")

    # Accept a single name as well as any iterable of names.
    if isinstance(exclude_columns, str):
        exclude_columns = [exclude_columns]
    else:
        exclude_columns = list(exclude_columns)

    # Keep every excluded column that is actually present, instead of
    # discarding all of them as soon as one is missing.
    present = [col for col in exclude_columns if col in df.columns]
    excluded_df = df[present].reset_index(drop=True)

    # Only numeric, non-excluded columns are candidates for scaling.
    numeric_df = df.drop(columns=exclude_columns, errors='ignore').select_dtypes(include=['number'])

    if numeric_df.empty:
        # Nothing to scale (no numeric columns or no rows); the scalers
        # reject such input, so pass the empty frame through.
        numeric_df_scaled = numeric_df.reset_index(drop=True)
    else:
        scaler = MinMaxScaler() if method == 'minmax' else StandardScaler()
        # fit_transform returns a plain array, so rebuild the frame with the
        # original column names (and a default 0..n-1 index).
        numeric_df_scaled = pd.DataFrame(
            scaler.fit_transform(numeric_df), columns=numeric_df.columns
        )

    # Both parts share a 0..n-1 index, so a column-wise concat lines rows up.
    return pd.concat([excluded_df, numeric_df_scaled], axis=1)


In [None]:
def add_features_to(df):
    """Derive the model features from the raw review columns.

    Reads ``HelpfulnessNumerator``, ``HelpfulnessDenominator``, ``Text``,
    ``Summary``, ``ProductId``, ``UserId`` and ``Score`` from ``df``.

    Added columns:
        Helpfulness: numerator / denominator, 0 where the ratio is undefined.
        ReviewLengthWords: whitespace-separated token count of ``Text``.
        Sentiment, Subjectivity, VaderSentiment: TextBlob polarity and
            subjectivity and the VADER compound score of ``Text``.
        SummarySentiment, SummarySubjectivity, SummaryVaderSentiment: the
            same three scores for ``Summary``.
        meanProductScore, meanUserScore: mean ``Score`` per ``ProductId`` and
            per ``UserId``.

    Args:
        df: Raw review DataFrame. The per-row feature columns are added to it
            in place.

    Returns:
        A new DataFrame (the result of the final merges) holding every
        original column plus all added features.

    NOTE(review): the group means are computed over every row of ``df`` that
    has a Score, including the row they are merged back onto, so a hold-out
    split taken from this frame sees its own target through these features.
    """
    # Initialize VADER Sentiment Analyzer
    sia = SentimentIntensityAnalyzer()

    print("Starting feature extraction...")

    print("Calculating Helpfulness...")
    denominator = df['HelpfulnessDenominator']
    # Mask zero denominators before dividing: n/0 would otherwise yield inf,
    # which fillna(0) does not catch. Masked and missing ratios become 0.
    ratio = df['HelpfulnessNumerator'] / denominator.where(denominator != 0)
    df['Helpfulness'] = ratio.fillna(0)

    print("Calculating ReviewLengthWords...")
    df['ReviewLengthWords'] = df['Text'].apply(lambda x: len(x.split()) if isinstance(x, str) else 0)

    def get_sentiment_subjectivity_vader(text, index):
        """Return (polarity, subjectivity, vader_compound) for one text."""
        if isinstance(text, str):
            blob = TextBlob(text)
            sentiment = blob.sentiment.polarity
            subjectivity = blob.sentiment.subjectivity
            vader_compound = sia.polarity_scores(text)['compound']
        else:
            # Null or non-string values get neutral scores.
            sentiment = 0.0
            subjectivity = 0.0
            vader_compound = 0.0

        if index % 100000 == 0:  # Print progress every 100000 rows
            print(f"Processed {index} rows for sentiment, subjectivity, and VADER sentiment...")
        return sentiment, subjectivity, vader_compound

    def add_text_scores(source_column, target_columns):
        """Score every value of ``source_column`` and store the three results."""
        # Iterate the single text column rather than applying row-wise over
        # the whole frame; the row position drives progress reporting so it
        # works for any index type.
        scores = [
            get_sentiment_subjectivity_vader(text, position)
            for position, text in enumerate(df[source_column])
        ]
        for offset, target in enumerate(target_columns):
            df[target] = [row[offset] for row in scores]

    print("Calculating Sentiment, Subjectivity, and VADER Sentiment...")
    add_text_scores('Text', ['Sentiment', 'Subjectivity', 'VaderSentiment'])

    print("Calculating Summary Sentiment, Subjectivity, and VADER Sentiment...")
    add_text_scores('Summary', ['SummarySentiment', 'SummarySubjectivity', 'SummaryVaderSentiment'])

    print("Calculating meanProductScore and meanUserScore...")

    # mean() skips missing scores, so unlabeled rows do not affect the means.
    mean_product_score = df.groupby('ProductId')['Score'].mean()
    mean_user_score = df.groupby('UserId')['Score'].mean()

    # Left merges keep every row; groups without any score get NaN.
    df = df.merge(mean_product_score.rename('meanProductScore'), on='ProductId', how='left')
    df = df.merge(mean_user_score.rename('meanUserScore'), on='UserId', how='left')

    print("Feature extraction complete.")
    return df


# Load the feature-extracted files if BOTH have already been generated;
# otherwise rebuild both. Checking the two files together guarantees that
# X_train and X_submission are always defined after this cell, even when only
# one of the cached files is present.
if exists('./data/X_train.csv') and exists('./data/X_submission.csv'):
    X_train = pd.read_csv("./data/X_train.csv")
    X_submission = pd.read_csv("./data/X_submission.csv")
else:
    # Process the DataFrame
    train = add_features_to(trainingSet)
    # train = filter_and_normalize(train)
    # print("normalized & filtered train = ")
    print(train.head())
    print()
    print(train.describe())

    # Merge on Id so that the submission set can have feature columns as well.
    # Both frames carry a Score column, so pandas suffixes them _x / _y; keep
    # the one coming from the test file under the plain name.
    X_submission = pd.merge(train, testingSet, left_on='Id', right_on='Id')
    X_submission = X_submission.drop(columns=['Score_x'])
    X_submission = X_submission.rename(columns={'Score_y': 'Score'})

    # The training set is where the score is not null
    X_train = train[train['Score'].notnull()]

    # Cache both frames so later runs can skip feature extraction.
    X_submission.to_csv("./data/X_submission.csv", index=False)
    X_train.to_csv("./data/X_train.csv", index=False)

In [29]:
def AnalyzeFeature(df, feature):
    """Print and plot how ``feature`` relates to ``Score``.

    Prints the mean of ``feature`` for each Score value, shows that as a bar
    chart, then prints the correlation matrix of ``feature`` and ``Score``.

    Args:
        df: DataFrame containing a ``Score`` column and the ``feature`` column.
        feature: Name of the column to analyse.
    """
    # Mean of the feature within each score group.
    per_score_mean = df.groupby('Score')[feature].mean()
    print(f"Average {feature} by Score:")
    print(per_score_mean)

    # Bar chart of those group means.
    plt.figure(figsize=(8, 6))
    per_score_mean.plot(kind='bar')
    plt.title(f'Average {feature} by Score')
    plt.xlabel('Score')
    plt.ylabel(f'Average {feature}')
    plt.show()

    # Pairwise correlation between the feature and the score.
    pair = df[[feature, 'Score']]
    correlation = pair.corr()
    print(f"Correlation between {feature} and Score:")
    print(correlation)

In [None]:
# Every numeric column to inspect against Score (raw and engineered).
Features = [
    "Time",
    "HelpfulnessNumerator",
    "HelpfulnessDenominator",
    "Helpfulness",
    "ReviewLengthWords",
    "Sentiment",
    "Subjectivity",
    "VaderSentiment",
    'SummarySentiment',
    'SummarySubjectivity',
    'SummaryVaderSentiment',
    'meanProductScore',
    'meanUserScore',
]
# Print the per-score means and correlation, and show the chart, for each one.
for ft in Features:
    AnalyzeFeature(X_train, ft)

# Sample + Split into training and testing set

In [4]:
# Keep only the numeric model features (non-numeric columns are filtered out).
Features = [
    "HelpfulnessNumerator",
    "HelpfulnessDenominator",
    "Helpfulness",
    "ReviewLengthWords",
    "Sentiment",
    "Subjectivity",
    "VaderSentiment",
    'SummarySentiment',
    'SummarySubjectivity',
    'SummaryVaderSentiment',
    'meanProductScore',
    'meanUserScore',
]

# Restrict to features + label and replace any remaining NaN with 0
# (or use any other appropriate method to handle NaNs).
X_train = X_train[[*Features, "Score"]].fillna(0)

# Hold out a quarter of the labelled rows for evaluation.
feature_frame = X_train.drop(columns=['Score'])
label_series = X_train['Score']
X_train, X_test, Y_train, Y_test = train_test_split(
    feature_frame,
    label_series,
    test_size=1/4.0,
    random_state=0,
)

# Print the initial distribution of Y_train
print("Original class distribution:")
print(Y_train.value_counts())

# Apply SMOTE for oversampling the minority classes
# smote = SMOTE(random_state=0)
# X_train_balanced, Y_train_balanced = smote.fit_resample(X_train, Y_train)

# # Print the new distribution of Y_train after balancing
# print("Balanced class distribution:")
# print(Y_train_balanced.value_counts())

# # Reduce the size of X_train and Y_train (for faster testing)
# X_train_balanced = X_train_balanced.sample(frac=0.1, random_state=0)  # Use 10% of the balanced training set
# Y_train_balanced = Y_train_balanced.loc[X_train_balanced.index]  # Ensure the labels match the reduced training set

# X_train = X_train.sample(frac=0.25, random_state=0)  # Use 25% of the training set
# Y_train = Y_train.loc[X_train.index]  # Ensure the labels match the reduced training set

# Print the size of the training split (the reduction steps above are disabled)
print(f"Reduced size: {X_train.shape}")

# Feature Selection

In [5]:
# Columns fed to the model; narrow this list to experiment with subsets.
features = [
    "HelpfulnessNumerator",
    "HelpfulnessDenominator",
    "Helpfulness",
    "ReviewLengthWords",
    "Sentiment",
    "Subjectivity",
    "VaderSentiment",
    'SummarySentiment',
    'SummarySubjectivity',
    'SummaryVaderSentiment',
    'meanProductScore',
    'meanUserScore',
]
# features = ['Sentiment', 'VaderSentiment', 'SummaryVaderSentiment', 'SummarySentiment',"HelpfulnessNumerator"]

# Training inputs and labels (the unbalanced split is used).
X_train_select = X_train[features]
Y_train_select = Y_train
# X_train_select = X_train_balanced[features]
# Y_train_select = Y_train_balanced

# The same column selection for the hold-out and submission frames.
X_test_select = X_test[features]
X_submission_select = X_submission[features]



# Model Creation

In [27]:
def predict_with_custom_thresholds(model, X_test, thresholds):
    """Predict 1-based class numbers using per-class probability thresholds.

    For every sample, the classes are scanned in order and the first class
    whose predicted probability is at least its threshold is chosen. If no
    threshold is met, the class with the highest probability is used.

    Args:
        model: Object exposing ``predict_proba(X_test)``.
        X_test: Samples passed through to ``model.predict_proba``.
        thresholds: Sequence where ``thresholds[j]`` is the minimum
            probability for the class in probability column ``j``.

    Returns:
        A list with one integer per sample: the chosen column index plus one.
    """
    class_probabilities = model.predict_proba(X_test)

    results = []
    for sample_probs in class_probabilities:
        # Fallback: the most probable class.
        chosen = int(np.argmax(sample_probs))

        # The first class that clears its own threshold wins.
        for class_idx, min_prob in enumerate(thresholds):
            if sample_probs[class_idx] >= min_prob:
                chosen = class_idx
                break

        # Shift from 0-based column index to 1-based class number.
        results.append(chosen + 1)

    return results

In [46]:
# Fit the classifier. Earlier candidates are kept below for reference.
# model = KNeighborsClassifier(n_neighbors=9).fit(X_train_select, Y_train_select)
# m2 = LogisticRegression(max_iter=300, solver='saga').fit(X_train_select, Y_train_select)
# m4 = RandomForestClassifier(n_estimators=200, n_jobs=2).fit(X_train_select, Y_train_select)
# m5 = SVC(C=4, kernel='sigmoid').fit(X_train_select, Y_train_select)
m7 = GradientBoostingClassifier(
    learning_rate=0.50,
    n_estimators=300,
    random_state=0,
)
m7.fit(X_train_select, Y_train_select)


# Score predictions for the hold-out split.
# Y_test_predictions = model.predict(X_test_select)
# y2 = m2.predict(X_test_select)
# y4 = m4.predict(X_test_select)
# y5 = m5.predict(X_test_select)
y7 = m7.predict(X_test_select)
# Custom thresholds for classes 1, 2, 3, 4, 5
# custom_thresholds = [0.3, 0.2, 0.2, 0.3, 0.45]  # Adjust as necessary
# Y_test_predictions_custom = predict_with_custom_thresholds(model, X_test_select, custom_thresholds)




# Model Evaluation

In [52]:
# custom_thresholds = [0.5, 0.5, 0.5, 0.5, 0.1]  # Adjust as necessary
# y7b = predict_with_custom_thresholds(m7, X_test_select, custom_thresholds)

In [53]:
# # Evaluate your model on the testing set
# accuracy = accuracy_score(Y_test, Y_test_predictions)
# print("Accuracy on testing set = ", accuracy)
# print()
# i = 1
# accs = [accuracy]
# for y in [y2, y4, y5, y7]:
#   a = accuracy_score(Y_test, y)
#   accs.append(a)
#   i+=1
# print(accs)

# # Plot a confusion matrix
# cm = confusion_matrix(Y_test, Y_test_predictions, normalize='true')
# sns.heatmap(cm, annot=True)
# plt.title(f'Confusion matrix of the classifier\nAccuracy: {accuracy:.3f}')
# plt.xlabel('Predicted')
# plt.ylabel('True')
# plt.show()

# Accuracy of the gradient-boosting model on the hold-out split.
# a4 = accuracy_score(Y_test, y4)
a7 = accuracy_score(Y_test, y7)
# a7b = accuracy_score(Y_test, y7b)

# # Plot a confusion matrix
# cm = confusion_matrix(Y_test, y4, normalize='true')
# sns.heatmap(cm, annot=True)
# plt.title(f'Confusion matrix of the classifier\nAccuracy: {a4:.3f}')
# plt.xlabel('Predicted')
# plt.ylabel('True')
# plt.show()

# Plot a row-normalised confusion matrix (each row sums to 1 over the
# predictions for that true score). The label list is the sorted union of the
# true and predicted values, which is the order confusion_matrix uses by
# default; passing it to the heatmap makes the axes show the actual scores
# instead of positional indices 0..n-1.
score_labels = np.union1d(Y_test, y7)
cm = confusion_matrix(Y_test, y7, labels=score_labels, normalize='true')
sns.heatmap(cm, annot=True, xticklabels=score_labels, yticklabels=score_labels)
plt.title(f'Confusion matrix of the GB classifier\nAccuracy: {a7:.3f}')
plt.xlabel('Predicted')
plt.ylabel('True')
plt.show()

# cm = confusion_matrix(Y_test, y7b, normalize='true')
# sns.heatmap(cm, annot=True)
# plt.title(f'Confusion matrix of the GB classifier\nAccuracy: {a7b:.3f}')
# plt.xlabel('Predicted')
# plt.ylabel('True')
# plt.show()

# Create submission file

In [17]:
# Products/users without any labelled review have no mean score. Fill only
# those two columns with 4.1 (the stand-in average used for missing means);
# any other missing feature gets 0, matching how the training features were
# filled before fitting.
MISSING_MEAN_SCORE = 4.1
sub = X_submission_select.fillna(
    {'meanProductScore': MISSING_MEAN_SCORE, 'meanUserScore': MISSING_MEAN_SCORE}
).fillna(0)

# Create the submission file: one predicted Score per Id.
X_submission['Score'] = m7.predict(sub)
submission = X_submission[['Id', 'Score']]
submission.to_csv("./data/submission.csv", index=False)