#  Etapa de entrenamiento y testeo de un modelo de análisis de sentimiento

In [1]:
# Import the libraries 
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from collections import Counter
from sklearn.datasets import make_classification
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from xgboost import XGBClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
import string
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

In [3]:
df = pd.read_csv('final.csv')

In [3]:
df.head()

Unnamed: 0,overall,reviewText
0,4,"['pretty', 'good', 'game', 'daughter', 'love',..."
1,5,"['first', 'let', 'clear', 'one', 'thing', 'rem..."
2,1,"['blue', 'button', 'stopped', 'working', 'quic..."
3,2,"['never', 'played', 'game', 'sc', 'series', 't..."
4,2,"['no', 'no']"


In [4]:
# map star ratings to sentiment labels (0 = negative, 1 = positive)
etiqueta = {1: 0, 2: 0, 4: 1, 5: 1}
df['etiqueta'] = df['overall'].map(etiqueta)

X = df['reviewText']
y = df['etiqueta']

# Split the data into training and testing 

In [5]:
# Split the data into training data (80%) and testing data (20%)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=42)

# Term Frequency-Inverse Document Frequency

Explanation of the choice of parameters for tfidfVectorizer:


*   min_df : set min_df to a higher value to exclude some of the less frequent words and potentially reduce noise in the data 
*   max_df : since we have seen that the average review is quite verbose, we decided to avoid using words that might be considered context-specific stopwords. 
*   ngram_range : we have set this feature to a serie of different values from up to 5 and tried it on a basic model. This value seems to be the best fit. 
* max_feature : with a cardinality of more than 40 000 this parameter had to be set quite high. We settled on 2000 since it had better result than at 500 (81% F1-score) and 1000 (83% F1-score). Setting it higher seemed to create too much computational demand and might as well lead to overfitting. 



In [6]:
vectorizer = TfidfVectorizer(min_df=50, max_df=0.90, ngram_range=(1,4), max_features=2000)
vectorizer.fit(X) 
X_train_tf = vectorizer.transform(X_train)  # Taking X as input and converting into feature matrix(numerical values)
X_train_tf = X_train_tf.todense()
X_test_tf = vectorizer.transform(X_test)  # Taking X as input and converting into feature matrix(numerical values)
X_test_tf = X_test_tf.todense()

# First model: SVC

In [7]:
# Instantiate SVM model with basic hyperparameters
svc = SVC(kernel='linear', C=1, random_state=42)

# Train SVM model on the training set
svc.fit(X_train_tf, y_train)

# Use the trained SVM model to predict sentiment of test set
y_pred = svc.predict(X_test_tf)

# Evaluate performance of SVM model using accuracy, precision, recall, and F1-score
acc = accuracy_score(y_test, y_pred)
prec = precision_score(y_test, y_pred)
rec = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)

print("Accuracy:", acc)
print("Precision:", prec)
print("Recall:", rec)
print("F1-score:", f1)



Accuracy: 0.8435
Precision: 0.8508728179551123
Recall: 0.839153959665519
F1-score: 0.844972758791481


In [11]:
# Creating a function to find the best parameters
def svc_tuning(X_train, y_train, X_test, y_test):
    # Set the parameters for grid search
    param_grid = {'C': [0.1, 1, 10, 100], 'gamma': [0.001, 0.01, 0.1, 1, 10], 'kernel': ['poly', 'rbf', 'sigmoid', 'linear']}
    
    # Create a SVM classifier object
    svc = SVC()
    
    # Create a GridSearchCV object
    grid_search = GridSearchCV(svc, param_grid, cv=5, scoring='accuracy', n_jobs=-1)
    
    # Fit the GridSearchCV object to the training data
    grid_search.fit(X_train, y_train)
    
    # Use the best model from the grid search to make prediticons on the test data
    y_pred = grid_search.predict(X_test)
    
    # Calculate and print the accuracy, precision, recall, and F1-score
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)
    
    print('Best parameters:', grid_search.best_params_)
    print('Accuracy:', accuracy)
    print('Precision:', precision)
    print('Recall:', recall)
    print('F1-score:', f1)

In [12]:
svc_tuning(X_train_tf[0:2000], y_train[0:2000], X_test_tf, y_test)



Best parameters: {'C': 1, 'gamma': 1, 'kernel': 'rbf'}
Accuracy: 0.81525
Precision: 0.8355809128630706
Recall: 0.7924249877029022
F1-score: 0.8134309517798536


In [13]:
# Instantiate SVM model with best hyperparameters
svc = SVC(kernel='rbf', C=1, gamma=1, random_state=42)

# Train SVM model on the training set
svc.fit(X_train_tf, y_train)

# Use the trained SVM model to predict sentiment of test set
y_pred = svc.predict(X_test_tf)

# Evaluate performance of SVM model using accuracy, precision, recall, and F1-score
acc = accuracy_score(y_test, y_pred)
prec = precision_score(y_test, y_pred)
rec = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)

print("Accuracy:", acc)
print("Precision:", prec)
print("Recall:", rec)
print("F1-score:", f1)



Accuracy: 0.85275
Precision: 0.8553149606299213
Recall: 0.8548942449581899
F1-score: 0.8551045510455104


# Second model: XGBoost

In [None]:
# Define the XGBoost model
xgb_model = XGBClassifier()

# Define the parameter grid
params = {
    "learning_rate": [0.1, 0.01, 0.001],
    "max_depth": [3, 5, 7],
    "subsample": [0.5, 0.8, 1.0],
    "colsample_bytree": [0.5, 0.8, 1.0],
    "gamma": [0.01, 0.1, 1.0]
}

# Define the grid search
grid_search = GridSearchCV(
    estimator=xgb_model,
    param_grid=params,
    cv=5,
    n_jobs=-1,
    verbose=3
)

# Fit the grid search to the training data
grid_search.fit(X_train_tf[:2000], y_train[:2000])

# Print the best parameters and the corresponding score
print("Best parameters:", grid_search.best_params_)
print("Best score:", grid_search.best_score_)

Fitting 5 folds for each of 243 candidates, totalling 1215 fits
Best parameters: {'colsample_bytree': 0.5, 'gamma': 0.1, 'learning_rate': 0.1, 'max_depth': 7, 'subsample': 0.5}
Best score: 0.7799999999999999


In [19]:
# Define XGBoost model
xgb_model = XGBClassifier(
    n_estimators=1000,
    max_depth=7,
    learning_rate=0.1,
    subsample=0.5,
    colsample_bytree=0.5,
    gamma = 0.1, 
    random_state=42
)

# Train XGBoost model
xgb_model.fit(X_train_tf, y_train)

# Evaluate XGBoost model
y_pred = xgb_model.predict(X_test_tf)
accuracy = accuracy_score(y_test, y_pred)

In [20]:
print(accuracy)

0.853
