In [1]:
import pandas as pd

X_df = pd.read_feather('./preprocessed_data/x_new.feather')
y = pd.read_csv('./preprocessed_data/y_new.csv')
y = y['score']

In [2]:
from sklearn.linear_model import SGDClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

X_train, X_test, y_train, y_test = train_test_split(
    X_df, y, test_size=0.2, stratify=y
)
X_train = X_train.dropna()
X_test = X_test.dropna()

import random

sgd = SGDClassifier(
    loss='log_loss',
    warm_start=True,
    n_jobs=-1
)
# sgd.partial_fit(X_train, y_train, classes=[0,1])
# for k in range(1000):
#     sgd.partial_fit(X_train.sample(100, random_state=k), y_train.sample(100, random_state=k), classes=[0,1,2])

sgd.fit(X_train, y_train)

y_pred = sgd.predict(X_test)
accuracy_score(y_test, y_pred)

0.39416058394160586

In [4]:
# from sklearn.ensemble import VotingClassifier
# from sklearn.ensemble import RandomForestClassifier
# from sklearn.linear_model import SGDClassifier
# from sklearn.naive_bayes import MultinomialNB

# # Define individual models
# sgd = SGDClassifier(loss='log_loss', penalty='l2', max_iter=1000, tol=1e-3, random_state=42, n_jobs=-1)
# rf = RandomForestClassifier(n_estimators=100, random_state=42, n_jobs=-1)
# nb = MultinomialNB()

# # Create an ensemble model
# ensemble = VotingClassifier(estimators=[
#     ('sgd', sgd),
#     ('rf', rf),
#     ('nb', nb)
# ], voting='hard', n_jobs=-1)

# # Train the ensemble model
# ensemble.fit(X_train, y_train)

# # Evaluate ensemble model
# y_pred = ensemble.predict(X_test)
# accuracy = accuracy_score(y_test, y_pred)
# print(f'Ensemble model accuracy: {accuracy:.2f}')

In [4]:
from sklearn.ensemble import VotingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import SGDClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import accuracy_score

# Define individual models
sgd = SGDClassifier(loss='log_loss', penalty='l2', max_iter=1000, tol=1e-3, random_state=42, n_jobs=-1)
rf = RandomForestClassifier(n_estimators=100, random_state=42, n_jobs=-1)
nb = MultinomialNB()

# Create an ensemble model
ensemble = VotingClassifier(estimators=[
    ('sgd', sgd),
    ('rf', rf),
    ('nb', nb)
], voting='hard', n_jobs=-1)

# Define parameter grids for each model
param_grid = {
    'sgd__alpha': [0.0001, 0.001, 0.01, 0.1],
    'sgd__learning_rate': ['constant', 'optimal', 'invscaling', 'adaptive'],
    'sgd__eta0': [0.001, 0.01, 0.1],
    'rf__n_estimators': [100, 200, 300],
    'rf__max_depth': [10, 20, 30, None],
    'rf__max_features': ['sqrt', 'log2'],
    'rf__min_samples_split': [2, 5, 10],
    'rf__min_samples_leaf': [1, 2, 4],
    'nb__alpha': [0.5, 1.0, 1.5, 2.0]
}

# Initialize GridSearchCV
grid_search = RandomizedSearchCV(estimator=ensemble, param_distributions=param_grid, n_iter=250,
                           cv=5, n_jobs=-1, verbose=2, scoring='accuracy')

# Fit the model
grid_search.fit(X_train, y_train)

# Best parameters and estimator
best_params = grid_search.best_params_
best_ensemble = grid_search.best_estimator_

# Evaluate the model
y_pred = best_ensemble.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f'Ensemble model accuracy after tuning: {accuracy:.2f}')
print(f'Best parameters: {best_params}')

Fitting 5 folds for each of 250 candidates, totalling 1250 fits
[CV] END nb__alpha=0.5, rf__max_depth=20, rf__max_features=sqrt, rf__min_samples_leaf=4, rf__min_samples_split=2, rf__n_estimators=100, sgd__alpha=0.1, sgd__eta0=0.01, sgd__learning_rate=invscaling; total time=   1.8s
[CV] END nb__alpha=0.5, rf__max_depth=20, rf__max_features=sqrt, rf__min_samples_leaf=4, rf__min_samples_split=2, rf__n_estimators=100, sgd__alpha=0.1, sgd__eta0=0.01, sgd__learning_rate=invscaling; total time=   1.8s
[CV] END nb__alpha=0.5, rf__max_depth=20, rf__max_features=sqrt, rf__min_samples_leaf=4, rf__min_samples_split=2, rf__n_estimators=100, sgd__alpha=0.1, sgd__eta0=0.01, sgd__learning_rate=invscaling; total time=   1.9s
[CV] END nb__alpha=1.5, rf__max_depth=10, rf__max_features=sqrt, rf__min_samples_leaf=1, rf__min_samples_split=2, rf__n_estimators=200, sgd__alpha=0.1, sgd__eta0=0.01, sgd__learning_rate=constant; total time=   2.6s
[CV] END nb__alpha=1.5, rf__max_depth=10, rf__max_features=sqrt, r

In [None]:
from transformers import pipeline

dssc = pipeline(
    model="lxyuan/distilbert-base-multilingual-cased-sentiments-student", 
    return_all_scores=True
)

dssc(['A3 é como se fosse terceira divisão?', 'Vai tomar no cu'])

[[{'label': 'positive', 'score': 0.24669073522090912},
  {'label': 'neutral', 'score': 0.2767774760723114},
  {'label': 'negative', 'score': 0.47653183341026306}],
 [{'label': 'positive', 'score': 0.2579444646835327},
  {'label': 'neutral', 'score': 0.4240570366382599},
  {'label': 'negative', 'score': 0.3179984390735626}]]