In [None]:
import pandas as pd
import numpy as np

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Embedding, Dropout, LSTM
from tensorflow.keras.callbacks import EarlyStopping

In [18]:
data = pd.read_csv('Data/dacy_sentiment.csv')
sentiment_map = {'positive': 1, 'neutral': 0, 'negative': -1}
data['sentiment'] = data['sentiment'].replace(sentiment_map)
data

Unnamed: 0,title,body,rating,sentiment,predicted,prob_pos,prob_neu,prob_neg
0,Der var service på i dag,"Der var service på i dag, det var dejligt. Så...",5,1,positive,0.999,0.001,0.001
1,Et af verdens ringeste kurér firmaer,Hvorfor i alverden man vælger at drive et verd...,2,-1,neutral,0.003,0.994,0.003
2,Rigtig god information om levering😃,Rigtig god information om levering😃,5,1,positive,0.927,0.072,0.001
3,Meget hjælpsom chauffør,Meget hjælpsom chauffør. Pakken kom til den af...,5,1,positive,0.981,0.017,0.003
4,Pakken kom fuldstændig som forventet.,Pakken kom fuldstændig som forventet.,3,0,neutral,0.023,0.964,0.014
...,...,...,...,...,...,...,...,...
195,Ren og pæn og hurtig levering…,Ren og pæn og hurtig levering indstillet ved d...,5,1,positive,0.998,0.001,0.002
196,Hurtig og god behandling,Intet at udsætte !,5,1,negative,0.002,0.002,0.996
197,Dårligt firma,Fik en pakke afhentet 28/9.Nu er pakken forsvu...,1,-1,neutral,0.022,0.651,0.327
198,Hurtig levering,God forsendelse,5,1,positive,0.995,0.005,0.001


In [29]:
train_data = data.sample(frac=0.8, random_state=42)
test_data = data.drop(train_data.index)

vectorizer = CountVectorizer(token_pattern=r'\b\w+\b', max_features=1000)
train_vectors = vectorizer.fit_transform(train_data['body'])
test_vectors = vectorizer.transform(test_data['body'])

clf = LogisticRegression(random_state=42)
clf.fit(train_vectors, train_data['sentiment'])

new_review = data.loc[1, 'body']
#new_review = "langsom og dårlig levering "
new_review_vector = vectorizer.transform([new_review])
predicted_sentiment = clf.predict(new_review_vector)[0]

#Tjekker hvor mange af hver der
sentiment_counts = data['sentiment'].value_counts()
print(sentiment_counts)

if predicted_sentiment == 1:
    print("Positive sentiment")
elif predicted_sentiment == 0:
    print("Neutral sentiment")
else:
    print("Negative sentiment")
    print("Actual: ", data.loc[1,'body'])

sentiment
 1    118
-1     76
 0      6
Name: count, dtype: int64
Negative sentiment
Actual:  Hvorfor i alverden man vælger at drive et verdens omspændende kurér/fragt firma når man så tydeligt ikke kan foredrage eller finde ud af at levere pakker til kunderne det forstår jeg simpelthen ikke.Har nu 4 gange forsøgt at få leveret en pakke ud til min kærestes adresse, og de 1-2 gange afleverer de ikke nogen seddel eller andet om at de har været der. Den 3 gang aftales det at pakken istedet skal afleveres hos hendes forældre (hvor der er nogen hjemme hele dagen) OG at pakken må stilles ved trappen HVIS der ikke var nogen hjemme: alligevel bliver pakken IKKE leveret denne gang heller. Nu prøver vi så igen på mandag, hvor der nu også skal hænge en seddel på døren til chaufføren om at pakken må stilles ved trappen (jeg ved så ikke hvorfor jeg allerede HAR givet den oplysning til fedEx manden) Det er kraftedeme for ringe at man ikke kan få dem til at ringe eller sms inden de ankommer, men "f

In [32]:
y_true = test_data['sentiment']
y_pred = clf.predict(test_vectors)

report = classification_report(y_true, y_pred, target_names=['Negative', 'Neutral', 'Positive'])
print(report)

              precision    recall  f1-score   support

    Negative       0.93      0.88      0.90        16
     Neutral       0.00      0.00      0.00         1
    Positive       0.92      1.00      0.96        23

    accuracy                           0.93        40
   macro avg       0.62      0.62      0.62        40
weighted avg       0.90      0.93      0.91        40



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [36]:
data = pd.read_csv('Data/dacy_sentiment.csv')

X_train, X_test, y_train, y_test = train_test_split(data['body'], data['sentiment'], test_size=0.2, random_state=42)

# trying to converting the text data to sequences
max_sequence_length = 1000
tokenizer = Tokenizer(num_words=10000)
tokenizer.fit_on_texts(data['body'])
sequences_train = tokenizer.texts_to_sequences(X_train)
sequences_test = tokenizer.texts_to_sequences(X_test)
padded_sequences_train = pad_sequences(sequences_train, maxlen=max_sequence_length)
padded_sequences_test = pad_sequences(sequences_test, maxlen=max_sequence_length)

# making a architecture model for the process
model = Sequential()
model.add(Embedding(10000, 128, input_length=max_sequence_length))
model.add(Dropout(0.2))
model.add(LSTM(128))
model.add(Dense(1, activation='sigmoid'))

model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

# training part of the model
early_stopping = EarlyStopping(patience=3, restore_best_weights=True)
model.fit(padded_sequences_train, y_train, epochs=10, batch_size=64, validation_split=0.2, callbacks=[early_stopping])

y_pred = model.predict(padded_sequences_test)
y_pred = (y_pred > 0.5).astype(int)
print('Accuracy:', accuracy_score(y_test, y_pred))
print('Precision:', precision_score(y_test, y_pred))
print('Recall:', recall_score(y_test, y_pred))
print('F1 Score:', f1_score(y_test, y_pred))
