In [1]:
import pandas as pd
import os

from imblearn.under_sampling import RandomUnderSampler
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
# from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression
# from sklearn.model_selection import GridSearchCV
from sklearn.metrics import confusion_matrix
from nltk.corpus import stopwords
from pickle import dump, load

In [2]:
# Load the labelled review corpus (version 1.0.0) from the local storage folder.
dataset_path = os.path.join('storage', 'base_sentiment.1.0.0.csv')
df = pd.read_csv(dataset_path)

In [3]:
# Preview the raw frame; pandas elides the middle rows in the rendered output.
df

Unnamed: 0,_id,text,created_at,sentiment
0,64f7bba758ec2b013e226a67,"Mais uma vez, o Sr. Costner arrumou um filme p...",2023-09-05T20:37:08.869Z,negative
1,64f7bba758ec2b013e226a68,Este é um exemplo do motivo pelo qual a maiori...,2023-09-05T20:37:08.869Z,negative
2,64f7bba758ec2b013e226a69,"Primeiro de tudo eu odeio esses raps imbecis, ...",2023-09-05T20:37:08.869Z,negative
3,64f7bba758ec2b013e226a6a,Nem mesmo os Beatles puderam escrever músicas ...,2023-09-05T20:37:08.869Z,negative
4,64f7bba758ec2b013e226a6b,Filmes de fotos de latão não é uma palavra apr...,2023-09-05T20:37:08.869Z,negative
...,...,...,...,...
49454,64f7bba758ec2b013e232b95,"Como a média de votos era muito baixa, e o fat...",2023-09-05T20:37:09.428Z,positive
49455,64f7bba758ec2b013e232b96,O enredo teve algumas reviravoltas infelizes e...,2023-09-05T20:37:09.428Z,positive
49456,64f7bba758ec2b013e232b97,Estou espantado com a forma como este filme e ...,2023-09-05T20:37:09.428Z,positive
49457,64f7bba758ec2b013e232b98,A Christmas Together realmente veio antes do m...,2023-09-05T20:37:09.428Z,positive


In [4]:
# Split the reviews by label, then stack them back together with the
# positive rows first and the negative rows second. Rows carrying any
# other sentiment value are left out of the combined frame.
is_positive = df['sentiment'].eq('positive')
is_negative = df['sentiment'].eq('negative')

df_positive = df.loc[is_positive]
df_negative = df.loc[is_negative]
df_review_imb = pd.concat([df_positive, df_negative])

In [5]:
# Inspect the re-stacked frame: positive rows come first, negative rows follow.
df_review_imb

Unnamed: 0,_id,text,created_at,sentiment
12389,64f7bba758ec2b013e229acc,Eu fui e vi este filme ontem à noite depois de...,2023-09-05T20:37:09.010Z,positive
12390,64f7bba758ec2b013e229acd,"O diretor do ator, Bill Paxton, segue sua prom...",2023-09-05T20:37:09.010Z,positive
12391,64f7bba758ec2b013e229ace,Como um jogador de recreio com algum conhecime...,2023-09-05T20:37:09.010Z,positive
12392,64f7bba758ec2b013e229acf,"Eu vi esse filme em uma prévia, e é delicioso....",2023-09-05T20:37:09.010Z,positive
12393,64f7bba758ec2b013e229ad0,Bill Paxton levou a verdadeira história do gol...,2023-09-05T20:37:09.010Z,positive
...,...,...,...,...
37113,64f7bba758ec2b013e22fb60,"No final do filme, senti que era muito técnico...",2023-09-05T20:37:09.284Z,negative
37114,64f7bba758ec2b013e22fb61,Este é o tipo de filme que meus inimigos me as...,2023-09-05T20:37:09.284Z,negative
37115,64f7bba758ec2b013e22fb62,Eu vi Descent na noite passada no Stockholm Fi...,2023-09-05T20:37:09.284Z,negative
37116,64f7bba758ec2b013e22fb63,Alguns filmes que você escolhe por um quilo sã...,2023-09-05T20:37:09.284Z,negative


In [6]:
# Randomly undersample (seeded for repeatability) using only the review text
# as the feature column and the sentiment label as the target.
rus = RandomUnderSampler(random_state=0)
text_bal, sentiment_bal = rus.fit_resample(
    df_review_imb[['text']],
    df_review_imb['sentiment'],
)

# Re-attach the resampled labels to the resampled texts (aligned on the index).
df_review_bal = text_bal.assign(sentiment=sentiment_bal)
df_review_bal

Unnamed: 0,text,sentiment
34593,"Bem, não há trama real para falar, é apenas um...",negative
8119,Eu estava assistindo TV um dia com um amigo e ...,negative
2931,"Westerns estilizados de Hollywood, cheios de c...",negative
28793,"Como um turco que agora vive na Suécia, devo c...",negative
34643,Este filme foi o pior filme já feito no planet...,negative
...,...,...
49454,"Como a média de votos era muito baixa, e o fat...",positive
49455,O enredo teve algumas reviravoltas infelizes e...,positive
49456,Estou espantado com a forma como este filme e ...,positive
49457,A Christmas Together realmente veio antes do m...,positive


In [7]:
# Hold out 33% of the balanced reviews for evaluation; the split is seeded.
train, test = train_test_split(df_review_bal, test_size=0.33, random_state=42)

# Features are the raw review texts, targets are the sentiment labels.
train_x, test_x = train['text'], test['text']
train_y, test_y = train['sentiment'], test['sentiment']

In [8]:
# Learn the TF-IDF vocabulary and weights from the training texts only,
# ignoring NLTK's Portuguese stop words.
pt_stopwords = stopwords.words('portuguese')
tfidf = TfidfVectorizer(stop_words=pt_stopwords)
train_x_vector = tfidf.fit_transform(train_x)

# Preview the sparse document-term matrix: one row per training review,
# one column per vocabulary term.
vocabulary = tfidf.get_feature_names_out()
pd.DataFrame.sparse.from_spmatrix(train_x_vector, index=train_x.index, columns=vocabulary)

Unnamed: 0,00,000,0000000000001,0000001,00001,00015,000dm,000wtf,001,003830,...,úteis,útero,útil,über,übermensch,übermenschlich,četvorka,œaberrações,ʻo,ʻos
24137,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
42296,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
36365,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
13094,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
49303,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5705,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
44803,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
38229,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
28071,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [9]:
# Project the held-out texts with the vectorizer already fitted on the training
# texts; only transform here (no re-fit), so nothing is learned from test data.
test_x_vector = tfidf.transform(test_x)

In [10]:
# Candidate classifiers, keyed by the short name that is later used as the
# prefix of the saved model file.
# The decision tree is seeded: without random_state its fitted tree (and
# therefore its test score) can change from one run to the next.
models = dict(
    dec=DecisionTreeClassifier(random_state=42),
    reg=LogisticRegression(),
)

# Disabled candidates, kept for reference:
# svc = SVC(kernel='linear')
# dec = DecisionTreeClassifier()
# gnb = GaussianNB()
# reg = LogisticRegression()


In [11]:
# svc.fit(train_x_vector, train_y)
# dec.fit(train_x_vector, train_y)
# reg.fit(train_x_vector, train_y)


# Fit every candidate on the TF-IDF training matrix and pickle it to
# models/<key>.1.0.0.sav.
# Create the target directory up front so a fresh checkout does not fail on open().
os.makedirs('models', exist_ok=True)

for key, estimator in models.items():
    estimator.fit(train_x_vector, train_y)
    print(f"Model {key} finished!")

    print(f"Saving model {key} local")
    # The context manager flushes and closes the file even if pickling raises.
    with open(os.path.join('models', f'{key}.1.0.0.sav'), 'wb') as model_file:
        dump(estimator, model_file)

Model dec finished!
Saving model dec local
Model reg finished!
Saving model reg local


In [21]:
# Smoke-test every saved model on a few hand-written sentences.
sentences = ['Um bom filme!', 'eu não gostei do que a Rebecca fez hoje', "que merda!", "estou bem mal"]

# Keep only the pickled models (*.sav) and sort them: os.listdir returns names
# in arbitrary order and would also pick up any unrelated file in the folder.
local_models = sorted(name for name in os.listdir('models') if name.endswith('.sav'))

# The TF-IDF features do not depend on the classifier, so vectorize each
# sentence once instead of once per model.
sentence_vectors = [tfidf.transform([sentence]) for sentence in sentences]

for model in local_models:
    # NOTE: unpickling can execute arbitrary code; only load files that were
    # written by this notebook's training cell.
    with open(os.path.join('models', model), 'rb') as model_file:
        model_loaded = load(model_file)

    for sentence, input_text in zip(sentences, sentence_vectors):
        predict = model_loaded.predict(input_text)[0]
        predict_proba = model_loaded.predict_proba(input_text)
        print(f'model:"{model}" predict:"{predict}" text:"{sentence}" score:"{predict_proba}"')
    print()


model:"dec.1.0.0.sav" predict:"positive" text:"Um bom filme!" score:"[[0. 1.]]"
model:"dec.1.0.0.sav" predict:"positive" text:"eu não gostei do que a Rebecca fez hoje" score:"[[0. 1.]]"
model:"dec.1.0.0.sav" predict:"positive" text:"que merda!" score:"[[0. 1.]]"
model:"dec.1.0.0.sav" predict:"positive" text:"estou bem mal" score:"[[0. 1.]]"

model:"reg.1.0.0.sav" predict:"positive" text:"Um bom filme!" score:"[[0.03378358 0.96621642]]"
model:"reg.1.0.0.sav" predict:"positive" text:"eu não gostei do que a Rebecca fez hoje" score:"[[0.03790792 0.96209208]]"
model:"reg.1.0.0.sav" predict:"negative" text:"que merda!" score:"[[0.8429579 0.1570421]]"
model:"reg.1.0.0.sav" predict:"negative" text:"estou bem mal" score:"[[0.9501437 0.0498563]]"



In [26]:
# svc_score = svc.score(test_x_vector, test_y)
# dec_score = dec.score(test_x_vector, test_y)
# gnb_score = gnb.score(test_x_vector.toarray(), test_y)
# log_reg_score = reg.score(test_x_vector, test_y)

# Report each saved model's score on the held-out TF-IDF matrix.
for model in local_models:
    # Close the pickle file as soon as the model is loaded instead of leaking the handle.
    # NOTE: unpickling can execute arbitrary code; only load trusted, locally produced files.
    with open(os.path.join('models', model), 'rb') as model_file:
        model_loaded = load(model_file)
    print(f'model:{model}: score: {model_loaded.score(test_x_vector, test_y)}')

model:dec.1.0.0.sav: score: 0.705810172403215
model:reg.1.0.0.sav: score: 0.8876618197435425


In [20]:
# svc_score, dec_score,log_reg_score

(0.8887661819743543, 0.7103503282409964, 0.8876618197435425)

In [17]:
# conf_mat = confusion_matrix(test_y,
#                             svc.predict(test_x_vector),
#                             labels=['positive', 'negative'])

In [19]:
# conf_mat

array([[7314,  811],
       [1002, 7172]])

In [21]:
# The GridSearchCV import in the first cell is commented out, so bring the
# name into scope here; otherwise this cell raises NameError on a fresh kernel.
from sklearn.model_selection import GridSearchCV

# Search grid: five regularisation strengths crossed with two kernels,
# each combination evaluated with 5-fold cross-validation.
parameters = {'C': [1,4,8,16,32] ,'kernel':['linear', 'rbf']}
svc = SVC()
svc_grid = GridSearchCV(svc, parameters, cv=5)
svc_grid

In [22]:
# Run the cross-validated parameter search on the TF-IDF training matrix.
# NOTE(review): this trains an SVC for every parameter combination and fold,
# which is likely slow on the full matrix — confirm the runtime before "Run All".
svc_grid.fit(train_x_vector, train_y)

In [None]:
# Show the estimator selected by the grid search; this is only available
# after svc_grid.fit(...) has completed.
print(svc_grid.best_estimator_)

In [None]:
# Train a linear-kernel SVC with C=1 on the TF-IDF training matrix.
# NOTE(review): these values are hard-coded rather than read from svc_grid —
# confirm they match the grid search result.
svc2_params = dict(C=1, kernel='linear')
svc2 = SVC(**svc2_params)
svc2.fit(train_x_vector, train_y)

In [None]:
# Confusion matrix of svc2 on the held-out reviews, with rows/columns ordered
# as ['positive', 'negative'].
svc2_predictions = svc2.predict(test_x_vector)
label_order = ['positive', 'negative']
conf_mat2 = confusion_matrix(test_y, svc2_predictions, labels=label_order)

In [27]:
# Scratch check: splitting a versioned file name on '.' yields the name,
# the three version fields and the extension as separate strings.
'name.0.0.0.save'.split('.')

['name', '0', '0', '0', 'save']

In [30]:
# Scratch check: re-join fields 1..3 of the split file name to recover the
# dotted version string.
'.'.join('name.0.0.0.save'.split('.')[1:4])

'0.0.0'