In [1]:
!pip install vaderSentiment


Collecting vaderSentiment
  Downloading vaderSentiment-3.3.2-py2.py3-none-any.whl (125 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/126.0 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m126.0/126.0 kB[0m [31m3.7 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: vaderSentiment
Successfully installed vaderSentiment-3.3.2


In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from gensim.models import Word2Vec
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
import re
import nltk
from nltk.corpus import stopwords
import time


In [3]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [4]:
def preprocess_text_for_embedding(text):
    text = text.lower()
    text = re.sub(r'\W', ' ', text)
    text = re.sub(r'\s+', ' ', text).strip()
    tokens = text.split()
    filtered_tokens = [token for token in tokens if token not in stopwords.words('english')]
    return filtered_tokens

In [5]:
train_df = pd.read_csv('train.csv')
test_df = pd.read_csv('test.csv')

In [6]:
train_df['tokens'] = train_df['review'].apply(preprocess_text_for_embedding)
test_df['tokens'] = test_df['review'].apply(preprocess_text_for_embedding)


In [7]:
analyzer = SentimentIntensityAnalyzer()


In [8]:
train_df['vader_score'] = train_df['tokens'].apply(lambda tokens: analyzer.polarity_scores(' '.join(tokens))['compound'])
test_df['vader_score'] = test_df['tokens'].apply(lambda tokens: analyzer.polarity_scores(' '.join(tokens))['compound'])


In [9]:
word2vec_model = Word2Vec(sentences=train_df['tokens'], vector_size=300, window=8, min_count=5, workers=4, sg=1, epochs=20)

def document_vector(word2vec_model, doc):
    doc = [word for word in doc if word in word2vec_model.wv.index_to_key]
    if not doc:
        return np.zeros(word2vec_model.vector_size)
    return np.mean(word2vec_model.wv[doc], axis=0)


X_train_embeddings = np.array([document_vector(word2vec_model, words) for words in train_df['tokens']])
X_test_embeddings = np.array([document_vector(word2vec_model, words) for words in test_df['tokens']])



In [10]:
X_train_vader = train_df['vader_score'].values.reshape(-1, 1)
X_test_vader = test_df['vader_score'].values.reshape(-1, 1)

X_train_combined = np.hstack((X_train_embeddings, X_train_vader))
X_test_combined = np.hstack((X_test_embeddings, X_test_vader))


In [11]:
y_train = train_df['sentiment']
y_test = test_df['sentiment']


In [12]:
from sklearn.preprocessing import LabelEncoder

label_encoder = LabelEncoder()

label_encoder.fit(y_train)

y_train_encoded = label_encoder.transform(y_train)

In [13]:
y_test_encoded = label_encoder.transform(y_test)


In [15]:
!pip install lightgbm
import lightgbm as lgb



In [16]:

model = lgb.LGBMClassifier()
model.fit(X_train_combined, y_train_encoded)



predictions = model.predict(X_test_combined)




accuracy = accuracy_score(y_test_encoded, predictions)
print("Accuracy on test set:", accuracy)

[LightGBM] [Info] Number of positive: 14935, number of negative: 15065
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.331743 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 76755
[LightGBM] [Info] Number of data points in the train set: 30000, number of used features: 301
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.497833 -> initscore=-0.008667
[LightGBM] [Info] Start training from score -0.008667
Accuracy on test set: 0.87045
