In [None]:
import random
import re

import nltk
import numpy as np
import spacy
from bs4 import BeautifulSoup as bs
from sklearn import metrics
from sklearn.cluster import KMeans
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from tabulate import tabulate

# Load the spaCy pipeline named 'es_core_news_sm'.
# NOTE(review): `nlp` is not referenced by any other cell visible in this notebook -- confirm it is still needed.
nlp = spacy.load('es_core_news_sm')


: 

In [None]:
def extract_lines(corpus, first_id=2, last_id=4380):
    """Load review texts and their integer ranks from a numbered corpus.

    For every id in ``first_id..last_id`` (inclusive) two files are read,
    both decoded as latin-1:

    * ``<corpus><id>.xml`` -- parsed with BeautifulSoup/lxml; the ``rank``
      attribute of the first ``<review>`` element becomes the label.
    * ``<corpus><id>.review.pos`` -- each line is split on whitespace and
      its second column is kept; the kept values are joined with single
      spaces to form the document text.

    A review is added only when BOTH files could be read, so ``X`` and ``y``
    always have the same length and stay index-aligned.

    Args:
        corpus: Path prefix that is concatenated directly with the id, so a
            directory must end with a path separator.
        first_id: First review id to try (default 2).
        last_id: Last review id to try, inclusive (default 4380).

    Returns:
        Tuple ``(X, y)``: list of document strings and list of int ranks.
    """
    X = list()
    y = list()
    for i in range(first_id, last_id + 1):
        try:
            with open(corpus + str(i) + '.xml', 'r', encoding='latin-1') as rfile:
                review = bs(rfile.read(), "lxml").find("review")
            rank = int(review.get("rank"))

            with open(corpus + str(i) + '.review.pos', 'r', encoding='latin-1') as rfile:
                rows = [line.split() for line in rfile]
        except IOError:
            # Ids with a missing/unreadable file are skipped as a whole.
            continue

        # Blank rows and rows without a second column are ignored instead of
        # raising IndexError.
        tokens = [columns[1] for columns in rows if len(columns) > 1]

        # Append text and label together, only after both reads succeeded.
        X.append(' '.join(tokens))
        y.append(rank)

    return X, y

: 

In [None]:
def tokenize_lines_by_words(lines):
    """Lower-case every line and split it into tokens with ``nltk.word_tokenize``.

    Args:
        lines: Iterable of strings.

    Returns:
        List with one token list per input line, in input order.
    """
    return [nltk.word_tokenize(text.lower()) for text in lines]

: 

In [None]:
def clean_alphabetic_text_lines(lines):
    """Keep only lower-case Spanish letters in every token.

    Each token is stripped of every character that is not one of
    ``a-z``, ``á é í ó ú``, ``ñ`` or ``ü``; tokens that end up empty are
    dropped. Upper-case letters are removed too, so lower-case the text
    beforehand if they should be kept.

    The previous pattern ``[a-záéíóúñü+$]`` listed ``+`` and ``$`` inside the
    character class, where they are literals, so those two symbols were
    kept as if they were letters.

    Args:
        lines: Iterable of token lists (one list of strings per document).

    Returns:
        List of token lists with the same outer length as ``lines``.
    """
    # Compiled once per call instead of one regex lookup per character.
    non_letters = re.compile(r'[^a-záéíóúñü]')

    new_lines = list()
    for line in lines:
        stripped = (non_letters.sub('', word) for word in line)
        new_lines.append([token for token in stripped if token != ''])

    return new_lines

: 

In [None]:
def remove_stop_words(lines, stopwords=None):
    """Drop stop words from tokenized lines and re-join each line.

    Args:
        lines: Iterable of token lists.
        stopwords: Optional iterable of words to remove. When omitted, the
            NLTK Spanish stop-word list is used.

    Returns:
        List of strings, one per input line, with the surviving tokens
        joined by single spaces (an empty string if nothing survives).
    """
    if stopwords is None:
        stopwords = nltk.corpus.stopwords.words('spanish')

    # A set makes each membership test O(1); the NLTK call returns a list.
    stopword_set = frozenset(stopwords)

    return [
        ' '.join(word for word in line if word not in stopword_set)
        for line in lines
    ]

: 

In [None]:
def get_X_y(lines):
    """Split rows into features and tag, where the tag is the last element.

    The input rows are left untouched: the feature part is returned as a new
    list instead of popping the tag off the caller's row.

    Args:
        lines: Iterable of non-empty sequences.

    Returns:
        ``[X, y]`` where ``X[i]`` is row ``i`` without its last element and
        ``y[i]`` is that last element.

    Raises:
        IndexError: If a row is empty.
    """
    X = list()
    y = list()
    for line in lines:
        tag = line[-1]
        X.append(list(line[:-1]))
        y.append(tag)
    return [X, y]

: 

In [None]:
def transform_tag(y):
    """Encode tags as a NumPy array: 1 for ``'spam'``, 0 for anything else."""
    encoded = [1 if tag == 'spam' else 0 for tag in y]
    return np.array(encoded)

: 

In [None]:
# Load the corpus: X receives one text per review, y the matching integer ranks.
# NOTE(review): absolute, machine-specific path -- consider a configurable data directory.
X, y = extract_lines('/Users/vaps/Downloads/corpusCriticasCine/corpusCriticasCine/')

: 

In [None]:
# Lower-case and word-tokenize every review text.
tokenized_X = tokenize_lines_by_words(X)

: 

In [None]:
# Filter each token character by character and drop tokens that end up empty.
new_X = clean_alphabetic_text_lines(tokenized_X)

: 

In [None]:
# Remove Spanish stop words; each review becomes a single space-joined string.
clean_X = remove_stop_words(new_X)

: 

In [None]:
# Pair every cleaned text with its label so both can be shuffled together.
data = list(zip(clean_X, y))

: 

In [None]:
# Shuffle with a dedicated, seeded generator so the document order (and every
# result derived from it) is reproducible, without touching the global
# `random` state.
random.Random(42).shuffle(data)

: 

In [None]:
# Unzip the shuffled pairs back into parallel sequences (both become tuples).
clean_X, y = zip(*data)

: 

In [None]:
# Bag-of-words counter and TF-IDF re-weighting, both with default settings.
count_vect = CountVectorizer()
tfidf_transformer = TfidfTransformer()

: 

In [None]:
# Term counts per document, then the TF-IDF weighted version of those counts.
# NOTE(review): `X_tfidf` is not read by any later cell visible here; the split
# uses the differently spelled `X_tdidf` -- confirm which one is intended.
X_counts = count_vect.fit_transform(clean_X)
X_tfidf = tfidf_transformer.fit_transform(X_counts)

: 

In [None]:
# Convert the label tuple into a NumPy array.
y = np.array(y)


: 

In [None]:
# NOTE(review): rebinds `tfidf_transformer` to a fresh instance although one was
# already created and fitted in an earlier cell -- looks redundant; confirm.
tfidf_transformer = TfidfTransformer()

: 

In [None]:
# TF-IDF matrix that the train/test split consumes.
# NOTE(review): the name is spelled `X_tdidf` (not `X_tfidf`) and is computed
# from the same `X_counts` as the earlier `X_tfidf` -- confirm the duplication.
X_tdidf = tfidf_transformer.fit_transform(X_counts)

: 

In [None]:
# Stratified split with a fixed seed; 1% of the documents are held out for testing.
# NOTE(review): a 1% test set is very small, so test metrics may be noisy -- confirm this is intentional.
X_train, X_test, y_train, y_test = train_test_split(X_tdidf, y, test_size = 0.01, random_state = 42, stratify = y)

: 

In [None]:
# Logistic Regression
# NOTE(review): recent scikit-learn releases deprecate `multi_class` -- confirm against the installed version.
logreg = LogisticRegression(penalty='l2', solver='lbfgs', multi_class='multinomial', max_iter=10000)

logreg.fit(X_train, y_train)

# Test set
y_pred = logreg.predict(X_test)

print('Accuracy of logistic regression classifier on test set: {:.2f}'.format(logreg.score(X_test, y_test)))

# Confusion matrix. Call the function through the already imported `metrics`
# module and store the result under its own name, so the function is never
# shadowed by its result and no re-import is needed.
cm_test = metrics.confusion_matrix(y_test, y_pred)
print(cm_test)

print(metrics.classification_report(y_test, y_pred))

# Training set
y_pred_train = logreg.predict(X_train)

print('Accuracy of logistic regression classifier on train set: {:.2f}'.format(logreg.score(X_train, y_train)))

# Confusion matrix
cm_train = metrics.confusion_matrix(y_train, y_pred_train)
print(cm_train)

print(metrics.classification_report(y_train, y_pred_train))

: 

In [None]:
# Multinomial Naive Bayes

clf = MultinomialNB().fit(X_train, y_train)

# Test set
y_pred = clf.predict(X_test)

print('Accuracy of Multinomial Naive Bayes classifier on test set: {:.2f}'.format(clf.score(X_test, y_test)))

# Confusion matrix. Call the function through the already imported `metrics`
# module and store the result under its own name, so the function is never
# shadowed by its result and no re-import is needed.
cm_test = metrics.confusion_matrix(y_test, y_pred)
print(cm_test)

print(metrics.classification_report(y_test, y_pred))

# Training set
y_pred_train = clf.predict(X_train)

print('Accuracy of Multinomial Naive Bayes classifier on train set: {:.2f}'.format(clf.score(X_train, y_train)))

# Confusion matrix
cm_train = metrics.confusion_matrix(y_train, y_pred_train)
print(cm_train)

print(metrics.classification_report(y_train, y_pred_train))



: 

In [None]:
# SVM

from sklearn import svm

# Linear-kernel support vector classifier; note that this rebinds `clf`.
clf = svm.SVC(kernel='linear', C=1).fit(X_train, y_train)

# Test set
y_pred = clf.predict(X_test)

print('Accuracy of SVM classifier on test set: {:.2f}'.format(clf.score(X_test, y_test)))

# Confusion matrix. Call the function through the already imported `metrics`
# module and store the result under its own name, so the function is never
# shadowed by its result and no re-import is needed.
cm_test = metrics.confusion_matrix(y_test, y_pred)
print(cm_test)

print(metrics.classification_report(y_test, y_pred))

# Training set
y_pred_train = clf.predict(X_train)

print('Accuracy of SVM classifier on train set: {:.2f}'.format(clf.score(X_train, y_train)))

# Confusion matrix
cm_train = metrics.confusion_matrix(y_train, y_pred_train)
print(cm_train)

print(metrics.classification_report(y_train, y_pred_train))



: 

In [None]:
# Obtain F1 score of .70 or higher (weighted F1 on the test set, per model)

# Logistic Regression

y_pred = logreg.predict(X_test)

print('F1 score of logistic regression classifier on test set: {:.2f}'.format(metrics.f1_score(y_test, y_pred, average='weighted')))

# Multinomial Naive Bayes
# `clf` was rebound to the SVM in the previous cell, so predicting with it here
# would report the SVM score under the Naive Bayes label. Fit a dedicated
# Naive Bayes model instead.

nb_clf = MultinomialNB().fit(X_train, y_train)

y_pred = nb_clf.predict(X_test)

print('F1 score of Multinomial Naive Bayes classifier on test set: {:.2f}'.format(metrics.f1_score(y_test, y_pred, average='weighted')))

# SVM (`clf` is the linear SVC fitted in the previous cell)

y_pred = clf.predict(X_test)

print('F1 score of SVM classifier on test set: {:.2f}'.format(metrics.f1_score(y_test, y_pred, average='weighted')))

: 

In [None]:
from sklearn.model_selection import KFold, GridSearchCV

# Define the parameter grid
param_grid = {'C': [0.1, 1, 10, 100], 'penalty': ['l2']}

# Create a cross-validation object
kfold = KFold(n_splits=5, shuffle=True, random_state=1)

# Create a grid search object (tunes the `logreg` estimator defined earlier)
grid = GridSearchCV(estimator=logreg, param_grid=param_grid, cv=kfold, scoring='f1_micro')

# Fit the grid search object to the training data
grid.fit(X_train, y_train)

# Select the best hyperparameters
best_C = grid.best_params_['C']
best_penalty = grid.best_params_['penalty']

# Final model: GridSearchCV (refit=True by default) has already refitted the
# best configuration on the full training data. Re-using it keeps the solver
# and iteration budget identical to what was cross-validated, instead of
# training a differently configured model with the tuned C.
logreg_best = grid.best_estimator_

# Evaluate the final model on the test set
y_pred = logreg_best.predict(X_test)
f1 = metrics.f1_score(y_test, y_pred, average='micro')
print('Final f1-score on test set: {:.2f}'.format(f1))


: 

In [None]:
# SVC baseline on raw term counts (no TF-IDF), fitted on the whole corpus.

vectorizer = CountVectorizer()
# `clean_X` already holds one space-joined string per review, so it is only
# copied into a list; joining again would insert a space between every
# character of each string.
X_clean = list(clean_X)
X_vectorized = vectorizer.fit_transform(X_clean)

model = SVC()
model.fit(X_vectorized, y)

# Print scores
print('Training set score: {:.2f}'.format(model.score(X_vectorized, y)))

# Print the number of features. The fitted vocabulary has one entry per
# feature and is available in every scikit-learn version, unlike
# `get_feature_names()`, which newer releases removed.
print('Number of features: {}'.format(len(vectorizer.vocabulary_)))

: 

In [None]:
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline


param_grid = {
    'vectorizer__min_df': [1, 2, 3],
    'model__C': [0.1, 1, 10],
    'model__kernel': ['linear', 'rbf']
}

pipeline = Pipeline([
    ('vectorizer', CountVectorizer()),
    ('model', SVC())
])

grid_search = GridSearchCV(pipeline, param_grid, cv=5, scoring='accuracy')

grid_search.fit(X_clean, y)

# Print the RESULTS

print('Best parameters: {}'.format(grid_search.best_params_))
print('Best cross-validation score: {:.2f}'.format(grid_search.best_score_))

: 

: 