In [1]:
import pandas as pd

from ocrfixr import spellcheck

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Location of the dialogue CSV, relative to the notebook's working directory.
DATASET_CSV = '../Dataset/EDOS 1M.csv'

# Read the whole file into a DataFrame and show the first rows as a sanity check.
dataset = pd.read_csv(DATASET_CSV)
dataset.head()

Unnamed: 0,dialogue_id,turn,uttr,eb+_emot,label_confidence
0,97,1,You moron ! What fool washes diapers by the we...,angry,0.437522
1,97,2,You useless fool !,furious,0.731564
2,99,1,How dare you sleep !,furious,0.605636
3,99,2,Up ! Go and clean the house .,prepared,0.650449
4,100,1,Clean the kitchen .,prepared,0.742187


In [3]:
# Model input: the utterance text column.
X = dataset["uttr"]
# Model target: the emotion label column.
y = dataset["eb+_emot"]
# Preview the first five entries of each series, one after the other.
print(X.head(5), y.head(5), sep="\n")

0    You moron ! What fool washes diapers by the we...
1                                   You useless fool !
2                                 How dare you sleep !
3                        Up ! Go and clean the house .
4                                  Clean the kitchen .
Name: uttr, dtype: object
0       angry
1     furious
2     furious
3    prepared
4    prepared
Name: eb+_emot, dtype: object


# Preparation

In [4]:
# TODO: one-hot encode the labels and print the label distribution (note: train_y is only created by the split cell further down)

In [6]:
# Optional step: run ocrfixr's spellcheck over every utterance to repair OCR errors.
# Disabled by default; set `correct` to True to enable it.
correct = False


def _fix_ocr(utterance):
    """Return `utterance` after passing it through `spellcheck(...).fix()`."""
    return spellcheck(utterance).fix()


if correct:
    # Replaces X with the corrected series.
    X = X.apply(_fix_ocr)

In [7]:
# Fixed seed so that Restart & Run All yields the same partitions every time;
# without it each run shuffles differently and results are not comparable.
SPLIT_SEED = 42

# Step 1: hold out 15% of all rows as the final test set.
no_test_X, test_X, no_test_y, test_y = train_test_split(
    X, y, test_size=0.15, random_state=SPLIT_SEED
)
# Step 2: hold out 15% of the remaining 85% (about 12.75% of all rows) for
# validation; the rest (about 72.25% of all rows) is the training set.
# NOTE(review): the splits are not stratified by label; consider stratify=...
# once the label distribution has been inspected.
train_X, valid_X, train_y, valid_y = train_test_split(
    no_test_X, no_test_y, test_size=0.15, random_state=SPLIT_SEED
)

# Single utterance classification - Baseline

In [8]:
# Vectorize the text using word counts. The vocabulary is learned from the
# training utterances only.
# min_df=5 ignores terms that occur in fewer than 5 training utterances;
# stop_words='english' drops scikit-learn's built-in English stop-word list.
vectorizer = CountVectorizer(min_df=5, stop_words='english') #TODO: try without stopwords
vectorizer.fit(train_X)

# Fetch the feature names once and reuse them for both printouts.
vocabulary = vectorizer.get_feature_names_out()
print("Vocabulary length: " + str(len(vocabulary)))
# The feature names come back in feature-index order, which is alphabetical
# ('00', '000', '001', ...), not frequency order, so label the preview
# accordingly instead of calling it "most common words".
print("First 100 vocabulary terms (alphabetical, not by frequency): " + str(vocabulary[:100]))


Vocabulary length: 49175
Most common words: ['00' '000' '001' '007' '009' '00am' '00pm' '01' '010' '015' '02' '03'
 '04' '040' '05' '0500' '06' '0600' '07' '0700' '08' '0800' '09' '0900'
 '0f' '0h' '0k' '0kay' '0n' '0nce' '0ne' '0nly' '0r' '0th' '0ur' '10'
 '100' '1000' '10000' '1001' '100m' '100s' '100th' '101' '102' '103' '104'
 '104th' '105' '106' '107' '108' '109' '10am' '10pm' '10s' '10th' '11'
 '110' '1100' '111' '112' '113' '114' '115' '116' '117' '118' '119' '11am'
 '11th' '12' '120' '1200' '121' '122' '123' '124' '125' '127' '128' '129'
 '12th' '13' '130' '1300' '131' '132' '133' '134' '135' '13th' '14' '140'
 '1400' '141' '142' '143' '145' '147']


In [9]:
# Encode the training utterances as a sparse document-term count matrix using
# the vocabulary already fitted on train_X, then display its summary repr.
train_X_vector = vectorizer.transform(raw_documents=train_X)
train_X_vector

<2044260x49175 sparse matrix of type '<class 'numpy.int64'>'
	with 8825665 stored elements in Compressed Sparse Row format>

In [10]:
# Baseline classifier: logistic regression fitted on the word-count matrix.
# The SAG solver shuffles the samples while optimising, so pin random_state
# to make repeated fits give the same model.
# NOTE(review): the saved run of this cell ended in KeyboardInterrupt.
# scikit-learn documents that SAG's fast convergence is only guaranteed when
# features share approximately the same scale, and these are raw counts.
# Consider scaling the features or trying another solver.
MODEL_SEED = 42
model = LogisticRegression(solver="sag", random_state=MODEL_SEED).fit(train_X_vector, train_y)
print(model)

KeyboardInterrupt: 