In [None]:
import pandas as pd

# Step 0: Data preparation

In [None]:
# Load the review export and give the two columns used downstream shorter names.
raw_reviews = pd.read_csv('reviews2.csv')
renamed_reviews = raw_reviews.rename(columns={'Rating': 'Score', 'Reviews': 'Text'})

# Turn the row index into a regular column and call it 'Id'.
df = renamed_reviews.reset_index()
df = df.rename(columns={'index': 'Id'})

print(df.describe())
df.head()

In [None]:
# Keep only reviews that are non-neutral (score other than 3), shorter than
# 500 characters and have more than 10 review votes.
is_polar = df['Score'] != 3
is_short = df['Text'].str.len() < 500
is_voted = df['Review Votes'] > 10
df = df[is_polar & is_short & is_voted]

# Of several reviews with identical text, keep the first occurrence.
df = df.drop_duplicates(subset=['Text'], keep='first')

len(df)

In [None]:
# Rows without a rating must be dropped *before* labelling: a missing score is
# neither equal to 3 nor below 3, so it would otherwise slip through the filter
# above and be labelled 'pos'. The copy makes the frame independent of the
# filtered slice it came from, so the column assignment is unambiguous.
df = df.dropna(subset=['Score']).copy()

# Scores below 3 are negative, everything else (4 and 5 after the filter) positive.
df['Score'] = df['Score'].lt(3).map({True: 'neg', False: 'pos'})
df = df[['Id', 'Text', 'Score']]
df.head()

In [None]:
# Report missing values in the two columns the rest of the notebook relies on.
null_counts = df[['Score', 'Text']].isnull().sum()
score_null_count = null_counts['Score']
text_null_count = null_counts['Text']

for column, null_count in null_counts.items():
    if null_count > 0:
        print(f"Number of null values in '{column}' column: {null_count}")
    else:
        print(f"No null values found in '{column}' column")

# Drop incomplete rows and renumber from 0. drop=True discards the old index
# instead of inserting it as an extra 'index' column (the stable identifier
# is already stored in 'Id').
df = df.dropna().reset_index(drop=True)

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

# Tally how many reviews carry each sentiment label.
score_counts = df['Score'].value_counts()

# Draw the tallies as a bar chart on an explicitly created axes.
fig, ax = plt.subplots(figsize=(8, 6))
sns.barplot(x=score_counts.index, y=score_counts.values, ax=ax)
ax.set_xlabel('Score')
ax.set_ylabel('Count')
ax.set_title('Count of Scores')
plt.show()

In [None]:
from sklearn.preprocessing import LabelEncoder

# Encode the string labels in 'Score' as integers; the regression cells use
# the resulting 'sentiment_encoded' column as their target.
label_encoder = LabelEncoder()
df = df.dropna().assign(
    sentiment_encoded=lambda frame: label_encoder.fit_transform(frame['Score'])
)

# Step 1: Sentiment analysis

### VADER approach

In [None]:
import nltk
from nltk.sentiment import SentimentIntensityAnalyzer

# Download the 'vader_lexicon' resource before the analyzer is constructed in the next cell.
nltk.download('vader_lexicon')

In [None]:
# Single analyzer instance, reused for the example below and for scoring every review.
sid = SentimentIntensityAnalyzer()

In [None]:
# Spot-check: the review text stored under index label 10.
example = df.loc[10, 'Text']
example

In [None]:
# VADER output for the example; the scoring cell below reads its 'pos', 'neg', 'neu' and 'compound' entries.
sid.polarity_scores(example)

In [None]:
# Label of the same review, for comparison with the scores above.
df.loc[10, 'Score']

In [None]:
# Score every review once, then spread the resulting dicts into one column per key.
vader_res = df['Text'].apply(sid.polarity_scores)
for key in ('pos', 'neg', 'neu', 'compound'):
    # key=key binds the current key to the lambda at definition time.
    df[f'vader_{key}'] = vader_res.apply(lambda scores, key=key: scores[key])

df.head()

In [None]:
# VADER compound score per sentiment label, negative bar first.
fig, ax = plt.subplots()
sns.barplot(data=df, x='Score', y='vader_compound', order=['neg', 'pos'], ax=ax)
ax.set_title('Compound score by stars')
plt.show()

In [None]:
def plot_sentiment_results(model):
    """Show one sentiment model's positive / neutral / negative scores per label.

    Draws three side-by-side bar plots from the global ``df``, each with the
    'neg' bar before the 'pos' bar.

    Parameters
    ----------
    model : str
        Column prefix of the sentiment model; ``df`` must contain
        ``{model}_pos``, ``{model}_neu`` and ``{model}_neg``.
    """
    panels = (('pos', 'Positive'), ('neu', 'Neutral'), ('neg', 'Negative'))
    fig, axs = plt.subplots(1, 3, figsize=(15, 5))
    for axis, (suffix, title) in zip(axs, panels):
        sns.barplot(data=df, x='Score', y=f'{model}_{suffix}', ax=axis, order=['neg', 'pos'])
        axis.set_title(title)
    plt.show()

plot_sentiment_results('vader')

### RoBERTa pretrained model

In [None]:
from scipy.special import softmax
from transformers import AutoTokenizer, AutoModelForSequenceClassification

import torch

MODEL = 'cardiffnlp/twitter-roberta-base-sentiment'

tokenizer = AutoTokenizer.from_pretrained(MODEL)
# The transformer gets its own name because later cells rebind the generic
# global `model` to regression estimators; the scoring function below would
# otherwise stop working as soon as one of those cells has run.
roberta_model = AutoModelForSequenceClassification.from_pretrained(MODEL)
model = roberta_model  # alias kept so code written against the old name still runs

def polarity_scores_roberta(text):
    """Score one text with the pretrained RoBERTa sentiment model.

    Parameters
    ----------
    text : str
        Text to classify.

    Returns
    -------
    dict
        Softmax probabilities under the keys 'roberta_neg', 'roberta_neu'
        and 'roberta_pos' (logit positions 0, 1 and 2 respectively).
    """
    # Truncate explicitly so an unusually long text cannot exceed the model's
    # input length (512 tokens for RoBERTa-base; see the tokenizer docs).
    encoded_inputs = tokenizer(text, return_tensors='pt', truncation=True, max_length=512)

    # Inference only: skip building the autograd graph.
    with torch.no_grad():
        output = roberta_model(**encoded_inputs)

    # output[0] holds the logits; [0] selects the single sequence in the batch.
    scores = softmax(output[0][0].detach().numpy())
    return {
        'roberta_neg': scores[0],
        'roberta_neu': scores[1],
        'roberta_pos': scores[2]
    }

In [None]:
from tqdm.notebook import tqdm

# Run RoBERTa over every review, keyed by the review's 'Id' so the scores
# can be joined back onto df afterwards.
roberta_res = {
    record['Id']: polarity_scores_roberta(record['Text'])
    for _, record in tqdm(df.iterrows(), total=len(df))
}

In [None]:
# One row per review: the dict keys (review ids) become the index after the
# transpose and are then exposed as an 'Id' column.
results_df = pd.DataFrame(roberta_res).T
results_df = results_df.reset_index().rename(columns={'index': 'Id'})

# Join on 'Id' explicitly instead of on whatever column names the two frames
# happen to share.
df = df.merge(results_df, how='left', on='Id')
df.head()

In [None]:
# Same three-panel comparison as for VADER, now using the roberta_* columns merged in above.
plot_sentiment_results('roberta')

# Step 2: Regression
### Linear regression


In [None]:
from sklearn.model_selection import train_test_split

# Fixed seed so the 70/30 split, and every metric computed from it below,
# is the same on each run of the notebook.
df_train, df_test = train_test_split(df, test_size=0.3, random_state=42)

# Later cells add prediction columns to both frames; work on independent
# copies rather than on selections taken from df.
df_train, df_test = df_train.copy(), df_test.copy()

In [None]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score

In [None]:
def linear_reg(df, model_name):
    """Fit a linear regression from one sentiment model's scores to the encoded label.

    Parameters
    ----------
    df : pandas.DataFrame
        Training rows; must contain ``{model_name}_pos``, ``{model_name}_neu``,
        ``{model_name}_neg`` and ``sentiment_encoded``.
    model_name : str
        Column prefix of the sentiment model (e.g. 'vader' or 'roberta').

    Returns
    -------
    LinearRegression
        The fitted estimator.
    """
    feature_columns = [f'{model_name}_{suffix}' for suffix in ('pos', 'neu', 'neg')]
    regressor = LinearRegression()
    regressor.fit(df[feature_columns], df['sentiment_encoded'])
    return regressor

In [None]:
def predict_reg(df, model, model_name):
    """Predict with ``model`` from the pos / neu / neg score columns of ``model_name``.

    The columns are passed in the order pos, neu, neg, which is the order
    ``linear_reg`` uses when fitting.
    """
    feature_columns = [f'{model_name}_{suffix}' for suffix in ('pos', 'neu', 'neg')]
    features = df[feature_columns]
    return model.predict(features)

In [None]:
def test_reg(model_name, y_test, y_pred):
    """Print mean squared error and R-squared of ``y_pred`` against ``y_test``.

    ``model_name`` only appears in the printed labels.
    """
    metrics = (
        ('Mean Squared Error', mean_squared_error),
        ('R-squared', r2_score),
    )
    # Evaluate both metrics before printing anything.
    results = [(title, metric(y_test, y_pred)) for title, metric in metrics]
    for title, value in results:
        print(f'{title} ({model_name}):', value)

In [None]:
# Fit on the training split, then store predictions (clipped to the [0, 1]
# label range) on both splits and evaluate on the test split.
model = linear_reg(df_train, 'vader')
for split in (df_train, df_test):
    split['vader_linear_reg'] = predict_reg(split, model, 'vader').clip(min=0, max=1)
test_reg('vader', df_test['sentiment_encoded'], df_test['vader_linear_reg'])

In [None]:
# Fit on the training split, then store predictions (clipped to the [0, 1]
# label range) on both splits and evaluate on the test split.
model = linear_reg(df_train, 'roberta')
for split in (df_train, df_test):
    split['roberta_linear_reg'] = predict_reg(split, model, 'roberta').clip(min=0, max=1)
test_reg('roberta', df_test['sentiment_encoded'], df_test['roberta_linear_reg'])

### Polynomial regression

In [None]:
from sklearn.preprocessing import PolynomialFeatures

def poly_reg(df, model_name, degree):
    """Fit a linear regression on polynomial expansions of one model's scores.

    Parameters
    ----------
    df : pandas.DataFrame
        Training rows; must contain ``{model_name}_pos``, ``{model_name}_neu``,
        ``{model_name}_neg`` and ``sentiment_encoded``.
    model_name : str
        Column prefix of the sentiment model.
    degree : int
        Degree passed to ``PolynomialFeatures``.

    Returns
    -------
    LinearRegression
        Estimator fitted on the expanded features. The feature expander is
        not returned, so callers must expand their inputs themselves before
        predicting.
    """
    feature_columns = [f'{model_name}_{suffix}' for suffix in ('pos', 'neu', 'neg')]
    expander = PolynomialFeatures(degree=degree)
    expanded_features = expander.fit_transform(df[feature_columns])

    regressor = LinearRegression()
    regressor.fit(expanded_features, df['sentiment_encoded'])
    return regressor

In [None]:
def predict_poly_reg(df, model, model_name, degree):
    """Predict with a model trained on polynomial features of ``model_name``'s scores.

    A new ``PolynomialFeatures(degree)`` is built here to expand the pos / neu /
    neg columns, so ``degree`` has to match the one used when ``model`` was
    fitted.
    """
    feature_columns = [f'{model_name}_{suffix}' for suffix in ('pos', 'neu', 'neg')]
    expander = PolynomialFeatures(degree=degree)
    expanded_features = expander.fit_transform(df[feature_columns])
    return model.predict(expanded_features)

In [None]:
# Degree-2 polynomial regression on the VADER scores: fit on train, store
# clipped predictions on both splits, evaluate on test.
model = poly_reg(df_train, 'vader', 2)
for split in (df_train, df_test):
    split['vader_poly_reg'] = predict_poly_reg(split, model, 'vader', 2).clip(min=0, max=1)
test_reg('vader', df_test['sentiment_encoded'], df_test['vader_poly_reg'])

In [None]:
# Degree-2 polynomial regression on the RoBERTa scores: fit on train, store
# clipped predictions on both splits, evaluate on test.
model = poly_reg(df_train, 'roberta', 2)
for split in (df_train, df_test):
    split['roberta_poly_reg'] = predict_poly_reg(split, model, 'roberta', 2).clip(min=0, max=1)
test_reg('roberta', df_test['sentiment_encoded'], df_test['roberta_poly_reg'])

### Logistic regression

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler

def log_reg(df, model_name):
    x_train = df[[f'{model_name}_pos', f'{model_name}_neu', f'{model_name}_neg']]
    y_train = df['sentiment_encoded']

    # Scale the features
    scaler = StandardScaler()
    x_train_scaled = scaler.fit_transform(x_train)

    # Train SVR regression model
    model = LogisticRegression()
    model.fit(x_train_scaled, y_train)

    return model

In [None]:
def predict_scaled_reg(df, model, model_name):
    x_train = df[[f'{model_name}_pos', f'{model_name}_neu', f'{model_name}_neg']]

    scaler = StandardScaler()
    x_train_scaled = scaler.fit_transform(x_train)

    return model.predict(x_train_scaled)

In [None]:
# Logistic regression on the VADER scores: fit on train, store predicted
# classes on both splits, evaluate on test.
model = log_reg(df_train, 'vader')
for split in (df_train, df_test):
    split['vader_log_reg'] = predict_scaled_reg(split, model, 'vader')
test_reg('vader', df_test['sentiment_encoded'], df_test['vader_log_reg'])

In [None]:
# Logistic regression on the RoBERTa scores: fit on train, store predicted
# classes on both splits, evaluate on test.
model = log_reg(df_train, 'roberta')
for split in (df_train, df_test):
    split['roberta_log_reg'] = predict_scaled_reg(split, model, 'roberta')
test_reg('roberta', df_test['sentiment_encoded'], df_test['roberta_log_reg'])

# Step 3: Classification

In [None]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, precision_score, f1_score, recall_score
from sklearn.ensemble import VotingClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from scipy.sparse import hstack

In [None]:
# Numeric inputs for the classifiers: the raw RoBERTa scores plus the three
# regression outputs derived from them.
numeric_feature_columns = [
    'roberta_neg', 'roberta_neu', 'roberta_pos',
    'roberta_linear_reg', 'roberta_poly_reg', 'roberta_log_reg',
]
x_train = df_train[numeric_feature_columns]
x_test = df_test[numeric_feature_columns]

# Review text and string labels for each split.
train_texts, train_labels = df_train['Text'], df_train['Score']
test_texts, test_labels = df_test['Text'], df_test['Score']

### Ensemble

In [None]:
# Convert the review text into a sparse matrix of TF-IDF weights. The
# vocabulary is learned from the training texts only and reused for the test texts.
vectorizer = TfidfVectorizer()
train_features = vectorizer.fit_transform(train_texts)
test_features = vectorizer.transform(test_texts)

# Append the numeric sentiment/regression features as extra columns next to
# the TF-IDF columns, producing CSR matrices.
train_combined_features = hstack((train_features, x_train), format='csr')
test_combined_features = hstack((test_features, x_test), format='csr')

# Define the individual classifiers (the tree is seeded so repeated runs on
# the same split give the same ensemble).
classifier1 = MultinomialNB()
classifier2 = DecisionTreeClassifier(random_state=42)
classifier3 = SVC()

# Create a VotingClassifier that takes the majority vote of the three classifiers
voting_classifier = VotingClassifier(
    estimators=[('nb', classifier1), ('dtc', classifier2), ('svm', classifier3)],
    voting='hard'
)

# Train the VotingClassifier
voting_classifier.fit(train_combined_features, train_labels)

# Make predictions on the test data
predictions = voting_classifier.predict(test_combined_features)
df_test['voting_classifier'] = predictions

# Calculate and print accuracy, precision, recall and F1, with 'pos' as the positive class
accuracy = accuracy_score(test_labels, predictions)
precision = precision_score(test_labels, predictions, pos_label='pos')
recall = recall_score(test_labels, predictions, pos_label='pos')
f1 = f1_score(test_labels, predictions, pos_label='pos')

print(f"Accuracy: {accuracy:.2f}")
print(f"Precision: {precision:.2f}")
print(f"Recall: {recall:.2f}")
print(f"F1-score: {f1:.2f}")

In [None]:
from sklearn.metrics import silhouette_score, adjusted_rand_score

# Clustering-style diagnostics for the classifier output: silhouette of the
# predicted groups in the numeric feature space, and agreement between
# predicted and true labels.
silhouette = silhouette_score(x_test, predictions)
adjusted_rand = adjusted_rand_score(test_labels, predictions)

for metric_title, metric_value in (
    ('Silhouette Score:', silhouette),
    ('Adjusted Rand Index:', adjusted_rand),
):
    print(metric_title, metric_value)

In [None]:
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA

def plot_2d_projection(label):
    """Scatter the test rows on their first two principal components, coloured by ``label``.

    The projection is computed from the RoBERTa score columns and
    ``roberta_poly_reg`` of the global ``df_test``.

    Parameters
    ----------
    label : str
        Column of ``df_test`` holding 'neg' / 'pos' values used for the colours.
    """
    feature_columns = ['roberta_neg', 'roberta_neu', 'roberta_pos', 'roberta_poly_reg']
    projected = PCA(n_components=2).fit_transform(df_test[feature_columns])

    df_plot = pd.DataFrame({
        'PC1': projected[:, 0],
        'PC2': projected[:, 1],
        'Cluster': df_test[label],
    })

    sns.set(style='whitegrid')

    sns.scatterplot(
        data=df_plot,
        x='PC1',
        y='PC2',
        hue='Cluster',
        palette='Set1',
        hue_order=['neg', 'pos'],
    )
    plt.xlabel('PC1')
    plt.ylabel('PC2')
    plt.title(f'2D Projection of Classification Features model - Clustering based on {label}')
    plt.show()

# True labels first, then the ensemble's predictions, for visual comparison.
for label_column in ('Score', 'voting_classifier'):
    plot_2d_projection(label_column)

In [None]:
from sklearn.metrics import confusion_matrix

def plot_confusion_matrix(label):
    """Draw the confusion matrix of a prediction column against the true labels.

    Parameters
    ----------
    label : str
        Column of the global ``df_test`` holding the predicted 'neg' / 'pos'
        labels; the true labels are read from ``df_test['Score']``.
    """
    class_labels = ['neg', 'pos']

    # confusion_matrix expects (y_true, y_pred): rows are true classes and
    # columns are predicted classes, matching the axis labels set below.
    # Passing `labels` pins the row/column order to the tick labels.
    cm = confusion_matrix(df_test['Score'], df_test[label], labels=class_labels)

    sns.heatmap(cm, annot=True, cmap='Blues', fmt='d', xticklabels=class_labels, yticklabels=class_labels)

    plt.xlabel('Predicted Labels')
    plt.ylabel('True Labels')
    plt.title(f'Confusion Matrix ({label})')

    plt.show()

plot_confusion_matrix('voting_classifier')