In [2]:
import numpy as np
import pandas as pd
from gensim.models import Word2Vec
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import MLPClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
import lightgbm as lgb
import xgboost as xgb

# Load and preprocess the text data
train_df = pd.read_csv("/kaggle/input/embedding/train.csv/train.csv")  # Replace "/path/to/training/dataset.csv" with the actual path to your training dataset file
test_df = pd.read_csv("/kaggle/input/embedding/test.csv/test.csv")  # Replace "/path/to/testing/dataset.csv" with the actual path to your testing dataset file

# Tokenize the text data
train_texts = train_df['review'].apply(lambda x: x.split())
test_texts = test_df['review'].apply(lambda x: x.split())

# Train Word2Vec model
word2vec_model = Word2Vec(sentences=train_texts, vector_size=100, window=5, min_count=1, workers=4)

# Transform text data into numerical vectors using Word2Vec embeddings
def word_embeddings(text, model):
    embeddings = []
    for word in text:
        if word in model.wv:
            embeddings.append(model.wv[word])
    if len(embeddings) == 0:
        embeddings = [np.zeros(model.vector_size)]
    return np.mean(embeddings, axis=0)

X_train = np.array([word_embeddings(text, word2vec_model) for text in train_texts])
X_test = np.array([word_embeddings(text, word2vec_model) for text in test_texts])

# Encode labels
label_map = {'positive': 1, 'negative': 0}
train_labels = train_df['sentiment'].map(label_map)
test_labels = test_df['sentiment'].map(label_map)

# Initialize classifiers
classifiers = {
    "Decision Tree": DecisionTreeClassifier(),
    "Random Forest": RandomForestClassifier(n_estimators=100, random_state=42),
    "Logistic Regression": LogisticRegression(),
    "LightGBM": lgb.LGBMClassifier(),
    "Gradient Boosting": GradientBoostingClassifier(),
    "Neural Network": MLPClassifier(hidden_layer_sizes=(100,), max_iter=500),
    "k-NN": KNeighborsClassifier(),
    "XGBoost": xgb.XGBClassifier()
}

# Train and evaluate classifiers
for name, classifier in classifiers.items():
    classifier.fit(X_train, train_labels)
    predictions = classifier.predict(X_test)
    accuracy = accuracy_score(test_labels, predictions)
    report = classification_report(test_labels, predictions, target_names=label_map.keys())
    print(f"{name} Accuracy:", accuracy)
    print(f"{name} Classification Report:\n{report}")


Decision Tree Accuracy: 0.6451
Decision Tree Classification Report:
              precision    recall  f1-score   support

    positive       0.65      0.63      0.64      9935
    negative       0.64      0.66      0.65     10065

    accuracy                           0.65     20000
   macro avg       0.65      0.65      0.64     20000
weighted avg       0.65      0.65      0.65     20000

Random Forest Accuracy: 0.75375
Random Forest Classification Report:
              precision    recall  f1-score   support

    positive       0.76      0.74      0.75      9935
    negative       0.75      0.77      0.76     10065

    accuracy                           0.75     20000
   macro avg       0.75      0.75      0.75     20000
weighted avg       0.75      0.75      0.75     20000



STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Logistic Regression Accuracy: 0.80365
Logistic Regression Classification Report:
              precision    recall  f1-score   support

    positive       0.81      0.79      0.80      9935
    negative       0.80      0.82      0.81     10065

    accuracy                           0.80     20000
   macro avg       0.80      0.80      0.80     20000
weighted avg       0.80      0.80      0.80     20000

[LightGBM] [Info] Number of positive: 14935, number of negative: 15065
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.035625 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 25500
[LightGBM] [Info] Number of data points in the train set: 30000, number of used features: 100
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.497833 -> initscore=-0.008667
[LightGBM] [Info] Start training from score -0.008667
LightGBM Accuracy: 0.78095
LightGBM Classification Report:
              precision    recall  f1

In [3]:
# Add the necessary imports for hyperparameter tuning
from sklearn.model_selection import GridSearchCV
import numpy as np
import pandas as pd
from gensim.models import Word2Vec
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import MLPClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
import lightgbm as lgb
import xgboost as xgb

# Load and preprocess the text data
train_df = pd.read_csv("/kaggle/input/embedding/train.csv/train.csv")  # Replace "/path/to/training/dataset.csv" with the actual path to your training dataset file
test_df = pd.read_csv("/kaggle/input/embedding/test.csv/test.csv")  # Replace "/path/to/testing/dataset.csv" with the actual path to your testing dataset file

# Tokenize the text data
train_texts = train_df['review'].apply(lambda x: x.split())
test_texts = test_df['review'].apply(lambda x: x.split())

# Train Word2Vec model
word2vec_model = Word2Vec(sentences=train_texts, vector_size=100, window=5, min_count=1, workers=4)

# Transform text data into numerical vectors using Word2Vec embeddings
def word_embeddings(text, model):
    embeddings = []
    for word in text:
        if word in model.wv:
            embeddings.append(model.wv[word])
    if len(embeddings) == 0:
        embeddings = [np.zeros(model.vector_size)]
    return np.mean(embeddings, axis=0)

X_train = np.array([word_embeddings(text, word2vec_model) for text in train_texts])
X_test = np.array([word_embeddings(text, word2vec_model) for text in test_texts])

# Encode labels
label_map = {'positive': 1, 'negative': 0}
train_labels = train_df['sentiment'].map(label_map)
test_labels = test_df['sentiment'].map(label_map)
# Define the classifiers with default hyperparameters
classifiers = {
    "Decision Tree": DecisionTreeClassifier(),
    "Random Forest": RandomForestClassifier(n_estimators=100, random_state=42),
    "Logistic Regression": LogisticRegression(max_iter=1000),
    "LightGBM": lgb.LGBMClassifier(),
    "Gradient Boosting": GradientBoostingClassifier(),
    "Neural Network": MLPClassifier(hidden_layer_sizes=(100,), max_iter=500),
    "k-NN": KNeighborsClassifier(),
    "XGBoost": xgb.XGBClassifier()
}

# Define hyperparameter grids for hyperparameter tuning
param_grids = {
    "Decision Tree": {"max_depth": [None, 10, 20, 30]},
    "Random Forest": {"n_estimators": [100, 200, 300], "max_depth": [None, 10, 20, 30]},
    "Logistic Regression": {"C": [0.1, 1, 10]},
    "LightGBM": {"num_leaves": [31, 50, 100], "max_depth": [-1, 10, 20, 30]},
    "Gradient Boosting": {"n_estimators": [100, 200, 300], "max_depth": [3, 5, 7]},
    "Neural Network": {"hidden_layer_sizes": [(50,), (100,), (200,)], "alpha": [0.0001, 0.001, 0.01]},
    "k-NN": {"n_neighbors": [3, 5, 7, 9]},
    "XGBoost": {"n_estimators": [100, 200, 300], "max_depth": [3, 5, 7]}
}

# Perform hyperparameter tuning and train classifiers
for name, classifier in classifiers.items():
    param_grid = param_grids.get(name, {})  # Get the corresponding hyperparameter grid
    grid_search = GridSearchCV(classifier, param_grid, cv=3, scoring="accuracy")
    grid_search.fit(X_train, train_labels)
    
    # Train the classifier with the best hyperparameters
    best_classifier = grid_search.best_estimator_
    best_classifier.fit(X_train, train_labels)
    
    # Evaluate the classifier
    predictions = best_classifier.predict(X_test)
    accuracy = accuracy_score(test_labels, predictions)
    report = classification_report(test_labels, predictions, target_names=label_map.keys())
    print(f"{name} Accuracy:", accuracy)
    print(f"{name} Classification Report:\n{report}")

Decision Tree Accuracy: 0.66285
Decision Tree Classification Report:
              precision    recall  f1-score   support

    positive       0.66      0.67      0.66      9935
    negative       0.67      0.66      0.66     10065

    accuracy                           0.66     20000
   macro avg       0.66      0.66      0.66     20000
weighted avg       0.66      0.66      0.66     20000

Random Forest Accuracy: 0.76105
Random Forest Classification Report:
              precision    recall  f1-score   support

    positive       0.77      0.73      0.75      9935
    negative       0.75      0.79      0.77     10065

    accuracy                           0.76     20000
   macro avg       0.76      0.76      0.76     20000
weighted avg       0.76      0.76      0.76     20000

Logistic Regression Accuracy: 0.80655
Logistic Regression Classification Report:
              precision    recall  f1-score   support

    positive       0.81      0.79      0.80      9935
    negative      



Neural Network Accuracy: 0.80535
Neural Network Classification Report:
              precision    recall  f1-score   support

    positive       0.79      0.83      0.81      9935
    negative       0.83      0.78      0.80     10065

    accuracy                           0.81     20000
   macro avg       0.81      0.81      0.81     20000
weighted avg       0.81      0.81      0.81     20000

k-NN Accuracy: 0.7159
k-NN Classification Report:
              precision    recall  f1-score   support

    positive       0.70      0.75      0.72      9935
    negative       0.73      0.68      0.71     10065

    accuracy                           0.72     20000
   macro avg       0.72      0.72      0.72     20000
weighted avg       0.72      0.72      0.72     20000

XGBoost Accuracy: 0.7923
XGBoost Classification Report:
              precision    recall  f1-score   support

    positive       0.80      0.77      0.79      9935
    negative       0.78      0.81      0.80     10065

    a