# Sentiment Analysis

In this notebook we train an out-of-core learning algorithm on a database of car opinions written in Polish (see the `db_cars` folder).

## Data loading

In [1]:
import pandas as pd
import os
import re
# Root folder of the raw opinions; fetch_data() reads the 'neg' and 'pos' sub-folders of it.
basepath = './db_cars/data/'

# Collects one row per opinion: [text, label].
df = pd.DataFrame()
# Sub-folder name -> numeric sentiment label.
labels = {'pos': 1, 'neg': 0}

# Fetch all data

# for l in ('pos', 'neg'):
#     path = os.path.join(basepath, l)
#     for file in os.listdir(path):
#         print(l, file)
#         for line in open(os.path.join(path, file), 'r', encoding='utf-8'):
#             if line != '\n': # skip empty lines
#                 text = re.sub('\n$', '', line) # remove end line sign
#                 df = df.append([[text, labels[l]]], ignore_index=True)

# Fetch data evenly for positive and negative opinions.
# The raw opinions are imbalanced (the neg/pos ratio is approx. 5%), so for every
# file only as many positive opinions are read as there are negative ones.

# File name -> number of negative opinions read from that file.
neg_numbers = {}

def fetch_data(label, neg_numbers, df):
    """Append the opinions stored under ``basepath/label`` to ``df``.

    Every non-empty line of every file in that folder is one opinion and
    becomes one ``[text, labels[label]]`` row.

    For ``label == 'neg'`` all opinions are read and their number per file is
    recorded in ``neg_numbers``. For any other label at most
    ``neg_numbers[file]`` opinions are read from each file, so that both
    classes stay balanced; a file without a recorded (non-zero) negative
    count is skipped.

    Parameters
    ----------
    label : str
        Sub-folder of ``basepath`` and key of ``labels``.
    neg_numbers : dict
        File name -> number of negative opinions; updated in place for 'neg'.
    df : pandas.DataFrame
        Frame the new rows are appended to (the passed object is not modified).

    Returns
    -------
    (dict, pandas.DataFrame)
        ``neg_numbers`` and a frame holding the old rows followed by the new ones.
    """
    path = os.path.join(basepath, label)
    # Rows are collected in a plain list and concatenated once at the end:
    # DataFrame.append was removed in pandas 2.0 and copied the frame per row.
    rows = []
    for file in os.listdir(path):
        print(label, file)
        if label == 'neg':
            limit = None  # read everything
        else:
            limit = neg_numbers.get(file)
            if not limit:
                # no negative opinions for this file -> nothing to balance against
                continue
        number = 0
        with open(os.path.join(path, file), 'r', encoding='utf-8') as infile:
            for line in infile:
                if line == '\n':  # skip empty lines
                    continue
                number += 1
                text = re.sub('\n$', '', line)  # remove end line sign
                rows.append([text, labels[label]])
                if number == limit:
                    break
        if label == 'neg' and number:
            neg_numbers[file] = number
    if rows:
        df = pd.concat([df, pd.DataFrame(rows)], ignore_index=True)
    return neg_numbers, df

# Negative opinions have to be read first: their per-file counts limit how many
# positive opinions fetch_data() takes from the file of the same name.
for label in ('neg', 'pos'):
    neg_numbers, df = fetch_data(label, neg_numbers, df)

# Name the two columns of the assembled frame.
df.columns = ['review', 'sentiment']

neg renault
neg peugeot
neg lancia
neg mazda
neg hyundai
neg ssangyong
neg skoda
neg nissan
neg kia
neg mitsubishi
neg fiat
neg volkswagen
neg opel
neg citroen
neg ford
pos renault
pos peugeot
pos lancia
pos mazda
pos hyundai
pos ssangyong
pos skoda
pos nissan
pos kia
pos mitsubishi
pos fiat
pos volkswagen
pos opel
pos citroen
pos ford


Cross-check

In [2]:
# Both numbers must agree: every negative opinion is matched by one positive opinion.
print(len(df), 2 * sum(neg_numbers.values()))

1180 1180


Shuffling the DataFrame:

In [19]:
import numpy as np

# Seed NumPy's global random generator so that the shuffled order is reproducible.
np.random.seed(123)
# Reorder the rows by a random permutation of the index (index labels are kept).
df = df.reindex(np.random.permutation(df.index))

Saving the assembled data as a CSV file (the cells below read the data back from this file):

In [20]:
# Write the shuffled data set to disk; index=False leaves out the row-index column.
df.to_csv('./db_cars.csv', index=False)

In [21]:
import pandas as pd

# Read the data set back from the CSV file and preview its first five rows.
df = pd.read_csv('./db_cars.csv')
df.head(5)

Unnamed: 0,review,sentiment
0,jak najbardziej polecam,1
1,Polecam,1
2,"Peugeot to słaby ,,Partner"" gdybym miał wybrać...",0
3,.,1
4,jestem zadowolony z autka,1


## Data processing - test

In [22]:
def get_file_content(basepath, file):
    """Return the whitespace-separated tokens of the UTF-8 text file ``basepath/file``."""
    with open(os.path.join(basepath, file), encoding='utf-8') as handle:
        raw_text = handle.read()
    return raw_text.split()

# Folder with the auxiliary word lists. It gets its own name so that the global
# `basepath` read by fetch_data() still points at the opinion data afterwards.
tools_path = './processing_tools/'
stop_polish = get_file_content(tools_path, 'stopwords_polish')
stop_cars = get_file_content(tools_path, 'stopwords_cars')
# stop words: general Polish ones plus car-specific ones
stop = stop_polish + stop_cars
# Polish endings
endings = get_file_content(tools_path, 'endings_polish')

In [23]:
# Artificial opinion used to try out the text-processing functions: it contains
# emoticons, rates (8/30, 29%), terms with digits, Polish diacritics and 'nie'.
example = 'nie na 8/30, moglibysmy, oceniam na 29%. Jestem,naprawdę zadowolony i mimo, \
że już nie   chciałem kupować :p :D po45 767 raz kolejny nowego \
auta ze względu:-) na;( dużą utratę wartości, \
to Lancia bardzo sku11tecznie 100 tys km iii osładza świadomość utraty finansowej45%. :)'

# (Polish diacritic, ASCII replacement) pairs; lowercase letters only.
polish_letters = [
    ('ą','a'), ('ć','c'), ('ę','e'), ('ł','l'), ('ń','n'), 
    ('ó','o'), ('ś','s'), ('ź','z'), ('ż','z')]

def fetch_important(text):
    """Extract emoticons and rates from ``text``.

    Emoticons consist of eyes (``:``, ``;`` or ``=``), an optional nose ``-``
    and a mouth (``(``, ``)``, ``D``, ``P`` or ``p``). The nose is dropped, so
    ``:-)`` and ``:)`` yield the same token. Rates look like ``8/10`` or
    ``100%``.

    Returns a list with all emoticons first, followed by all rates, each group
    in order of appearance.
    """
    # fetch emoticons and normalise them by removing the nose
    emoticons = [e.replace('-', '') for e in re.findall(r'[:;=]-?[()DPp]', text)]
    # fetch rates (e.g. 8/10 or 100%); raw string so that '\d' is not an
    # invalid string escape
    rates = re.findall(r'\d+/\d+|\d+%', text)
    return emoticons + rates

def preprocessor(text):
    """Clean raw opinion text before it is split into words.

    Steps, in order: runs of non-word characters become a single space, terms
    containing digits are removed, the text is lowercased, the Polish
    diacritics listed in ``polish_letters`` are replaced by their ASCII
    counterparts, and the negation ``nie`` is glued to the following word
    (``nie chce`` -> ``niechce``).

    The result may contain leading, trailing and repeated spaces.
    """
    # replace every run of non-word characters with one space
    text = re.sub(r'\W+', ' ', text)
    # remove terms that contain digits
    text = re.sub(r'\w*\d+\w*', '', text)
    # to lower case (before the next step: polish_letters holds lowercase letters only)
    text = text.lower()
    # replace Polish diacritics with plain letters
    for diacritic, plain in polish_letters:
        text = text.replace(diacritic, plain)
    # join 'nie' with subsequent word
    text = re.sub(r'(^|\s)(nie)\s+', ' nie', text)
    return text

print(preprocessor(example))

 niena   moglibysmy oceniam na  jestem naprawde zadowolony i mimo ze juz niechcialem kupowac p d   raz kolejny nowego auta ze wzgledu na duza utrate wartosci to lancia bardzo   tys km iii osladza swiadomosc utraty  


In [24]:
def remove_endings(word):
    """Strip the endings listed in ``endings`` from the end of ``word``.

    The endings are tried one after another in list order, each on the result
    of the previous step, so more than one ending can be removed. Every ending
    is used as a regular expression anchored with ``$``.
    """
    stem = word
    for suffix in endings:
        stem = re.compile(suffix + '$').sub('', stem)
    return stem

def tokenizer(text):
    """Turn a raw opinion into a list of tokens.

    The list holds the cleaned words (longer than one character, not in
    ``stop``, endings removed) followed by the emoticons and rates found in
    the raw text.
    """
    # emoticons and rates are taken from the raw text, before cleaning
    special = fetch_important(text)
    stems = []
    for candidate in preprocessor(text).split():
        # drop one-letter words and stop words (Polish, car-specific)
        if len(candidate) <= 1 or candidate in stop:
            continue
        # remove Polish endings
        stems.append(remove_endings(candidate))
    return stems + special

In [25]:
example

'nie na 8/30, moglibysmy, oceniam na 29%. Jestem,naprawdę zadowolony i mimo, że już nie   chciałem kupować :p :D po45 767 raz kolejny nowego auta ze względu:-) na;( dużą utratę wartości, to Lancia bardzo sku11tecznie 100 tys km iii osładza świadomość utraty finansowej45%. :)'

In [26]:
print(tokenizer(example))

['niena', 'mogli', 'oceniam', 'naprawde', 'zadowolony', 'mimo', 'juz', 'niechcialem', 'kupowac', 'kolejny', 'nowego', 'auta', 'wzgledu', 'duza', 'utrate', 'wartosci', 'lancia', 'bardzo', 'osladza', 'swiadomosc', 'utraty', ':p', ':D', ':)', ';(', ':)', '8/30', '29%', '45%']


## Out-of-core learning

In [27]:
def stream_docs(path):
    """Lazily yield ``(text, label)`` pairs from the CSV file at ``path``.

    The first row is treated as a header and skipped. In every following row
    the last column is the integer label and the column before it is the
    text. The file is parsed with the ``csv`` module, so quoted fields
    (texts containing commas, quotes or line breaks) are returned unquoted.
    Blank rows are ignored.
    """
    import csv

    # utf-8: the opinions contain Polish letters; newline='' is what the csv
    # module expects so that it can handle line breaks inside quoted fields.
    with open(path, 'r', encoding='utf-8', newline='') as infile:
        reader = csv.reader(infile)
        next(reader, None)  # skip header (tolerates an empty file)
        for row in reader:
            if not row:
                continue
            yield row[-2], int(row[-1])

In [28]:
# Peek at the first (text, label) pair streamed from the CSV file.
db_example = next(stream_docs(path='./db_cars.csv'))
print(db_example)

('jak najbardziej polecam', 1)


In [29]:
print(tokenizer(db_example[0]))

['jak', 'najbardziej', 'polecam']


In [30]:
def get_minibatch(doc_stream, size):
    """Take up to ``size`` documents from the iterator ``doc_stream``.

    Parameters
    ----------
    doc_stream : iterator of (text, label) pairs
    size : int
        Maximum number of documents in the batch.

    Returns
    -------
    (list, list) or (None, None)
        The texts and their labels. If the stream runs out early, the shorter
        batch collected so far is returned instead of being thrown away;
        ``(None, None)`` is returned only when the stream had nothing left.
    """
    docs, y = [], []
    try:
        for _ in range(size):
            text, label = next(doc_stream)
            docs.append(text)
            y.append(label)
    except StopIteration:
        if not docs:
            return None, None
    return docs, y

In [31]:
from sklearn.feature_extraction.text import HashingVectorizer
from sklearn.linear_model import SGDClassifier

# Hashing needs no fitted vocabulary, so mini-batches can be transformed one by one.
# lowercase=False: the vectorizer would otherwise lowercase the text before
#   calling the tokenizer, and upper-case emoticons such as ':D' would no longer
#   be recognised; the tokenizer lowercases the words itself.
# token_pattern=None: the default pattern is unused because a tokenizer is given.
vect = HashingVectorizer(decode_error='ignore',
                         n_features=2**21,
                         preprocessor=None,
                         lowercase=False,
                         token_pattern=None,
                         tokenizer=tokenizer)

# Logistic regression trained with stochastic gradient descent.
# 'log_loss' is the name of the logistic loss since scikit-learn 1.1; the former
# spelling 'log' was removed in scikit-learn 1.3.
clf = SGDClassifier(loss='log_loss', random_state=1, max_iter=1)
doc_stream = stream_docs(path='./db_cars.csv')

In [32]:
# All class labels are handed to partial_fit explicitly.
classes = np.array([0, 1])
# Train incrementally on up to 11 mini-batches of 100 documents each.
for _ in range(11):
    X_train, y_train = get_minibatch(doc_stream, size=100)
    if not X_train:
        # no batch returned: the stream is exhausted
        break
    X_train = vect.transform(X_train)
    clf.partial_fit(X_train, y_train, classes=classes)

In [33]:
# Evaluate on the next 80 documents of the same stream, i.e. documents that
# come after the ones consumed by the training loop.
X_test, y_test = get_minibatch(doc_stream, size=80)
X_test = vect.transform(X_test)
print('Accuracy: %.3f' % clf.score(X_test, y_test))

Accuracy: 0.750


In [34]:
# Finally, update the model with the test batch as well.
# NOTE: after this step X_test is no longer held-out data for clf.
clf = clf.partial_fit(X_test, y_test)