# Sentiment Analysis

In this notebook we aim at training out-of-core algorithm by using database with opinions (in Polish) about cars - see db_cars folder.

## Data loading

In [1]:
import pandas as pd
import os
import re
from sklearn.utils import resample

basepath = './db_cars/data/'

labels = {'pos': 1, 'neg': 0}

# Opinions are imbalanced: neg/pos is approx. 6%
# We need to fetch data evenly for positive and negative opinions
# There are two options: 

# 1) Downsample majority class (positive opinions)

def fetch_data_downsample():
    df = pd.DataFrame()
    neg_numbers = {} # numbers of negative opinions in files
    for label in ['neg', 'pos']:
        path = os.path.join(basepath, label)
        for file in os.listdir(path):
            print(label, file)
            number = 0
            for line in open(os.path.join(path, file), 'r', encoding='utf-8'):
                if line != '\n': # skip empty lines
                    number += 1
                    text = re.sub('\n$', '', line) # remove end line sign
                    df = df.append([[text, labels[label]]], ignore_index=True)
                    if label == 'neg':
                        neg_numbers[file] = number
                    elif neg_numbers[file] == number:
                        break
    df.columns = ['review', 'sentiment']
    return df

# 2) Upsample minority class (negative opinions)

def fetch_data_upsample():
    df = pd.DataFrame()
    for label in ['neg', 'pos']:
        path = os.path.join(basepath, label)
        for file in os.listdir(path):
            print(label, file)
            for line in open(os.path.join(path, file), 'r', encoding='utf-8'):
                if line != '\n': # skip empty lines
                    text = re.sub('\n$', '', line) # remove end line sign
                    df = df.append([[text, labels[label]]], ignore_index=False)
    df.columns = ['review', 'sentiment']
    return upsample_minority(df)                

def upsample_minority(df):
    # Separate majority and minority classes
    df_minority = df[df.sentiment==0]
    df_majority = df[df.sentiment==1]

    # Upsample minority class
    majority_number = df['sentiment'].value_counts()[1] 
    df_minority_upsampled = resample(df_minority, 
                                     replace=True, # sample with replacement
                                     n_samples = majority_number, # to match majority class
                                     random_state=0)

    # Combine majority class with upsampled minority class
    return pd.concat([df_majority, df_minority_upsampled], ignore_index=True)

#df = fetch_data_downsample()
#db_path = './db_cars_downsampled.csv'

df = fetch_data_upsample()
db_path = './db_cars_upsampled.csv'

neg peugeot
neg kia
neg hyundai
neg mazda
neg opel
neg lancia
neg renault
neg citroen
neg volkswagen
neg ford
neg ssangyong
neg skoda
neg nissan
neg fiat
neg mitsubishi
pos peugeot
pos kia
pos hyundai
pos mazda
pos opel
pos lancia
pos renault
pos citroen
pos volkswagen
pos ford
pos ssangyong
pos skoda
pos nissan
pos fiat
pos mitsubishi


Class counts:

In [2]:
df['sentiment'].value_counts()

1    10156
0    10156
Name: sentiment, dtype: int64

Shuffling the DataFrame:

In [3]:
import numpy as np

np.random.seed(1)
df = df.reindex(np.random.permutation(df.index))

Optional: Saving the assembled data as CSV file:

In [4]:
# df.to_csv(db_path, index=False) # uncomment this !

In [5]:
import pandas as pd

df = pd.read_csv(db_path)
df.head(5)

Unnamed: 0,review,sentiment
0,"Auto udane, tanie w eksploatacji, bardzo trwał...",1
1,NIE POLECAM,0
2,"mialem takie autko peugeot 106 1,4D 1994 r , s...",1
3,1.7diesel w kombiaku wiekowy ale mnie nie zawo...,1
4,dosyć głośno wewnątrz,0


## Data processing - test

In [6]:
def get_file_content(basepath, file):
    path = os.path.join(basepath, file)
    with open(path, 'r', encoding='utf-8') as infile:
        return infile.read().split()

basepath = './processing_tools/'
stop_polish = get_file_content(basepath, 'stopwords_polish')
stop_cars = get_file_content(basepath, 'stopwords_cars')
# stop words
stop = stop_polish + stop_cars
# Polish endings
endings = get_file_content(basepath, 'endings_polish')

In [7]:
example = 'nie na 8/30, moglibysmy, oceniam na 29%. Jestem,naprawdę zadowolony i mimo, \
że już nie   chciałem kupować :p :D po45 767 raz kolejny nowego \
auta ze względu:-) na;( dużą utratę wartości, \
to Lancia bardzo sku11tecznie 100 tys km iii osładza świadomość utraty finansowej45%. :)'

polish_letters = [
    ('ą','a'), ('ć','c'), ('ę','e'), ('ł','l'), ('ń','n'), 
    ('ó','o'), ('ś','s'), ('ź','z'), ('ż','z')]

def fetch_important(text):
    # fetch emoticons
    emoticons = re.findall('[:;=]-?[()DPp]', text)
    emoticons = [e.replace('-','') for e in emoticons]
    # fetch rates (e.g. 8/10 or 100%)
    rates = re.findall('(\d+/\d+|\d+%)', text)
    return emoticons + rates

def preprocessor(text):
    # remove non-letter characters
    text = re.sub('\W+', ' ', text)
    # remove terms that contain digits
    text = re.sub('[\w]*\d+[\w]*', '', text)
    # to lower case
    text = text.lower()
    # remove Polish letters
    for (i, j) in polish_letters:
        text = re.sub(i, j, text)
    # join 'nie' with subsequent word
    text = re.sub('(^|\s)(nie)\s+', ' nie', text)
    return text

print(preprocessor(example))

 niena   moglibysmy oceniam na  jestem naprawde zadowolony i mimo ze juz niechcialem kupowac p d   raz kolejny nowego auta ze wzgledu na duza utrate wartosci to lancia bardzo   tys km iii osladza swiadomosc utraty  


In [8]:
def remove_endings(word):
    for ending in endings:
        word = re.sub(ending+'$','', word)
    return word

def tokenizer(text):
    # fetch important tokens (emoticons and rates)
    important = fetch_important(text)
    # clean text
    processed = preprocessor(text)
    # remove irrelevant words (one-letter, Polish, car-specific)
    words = [w for w in processed.split() if len(w) > 1 and w not in stop]
    # remove Polish endings
    tokens = [remove_endings(w) for w in words]
    return tokens + important

In [9]:
example

'nie na 8/30, moglibysmy, oceniam na 29%. Jestem,naprawdę zadowolony i mimo, że już nie   chciałem kupować :p :D po45 767 raz kolejny nowego auta ze względu:-) na;( dużą utratę wartości, to Lancia bardzo sku11tecznie 100 tys km iii osładza świadomość utraty finansowej45%. :)'

In [10]:
print(tokenizer(example))

['niena', 'mogli', 'oceniam', 'naprawde', 'zadowolony', 'mimo', 'juz', 'niechcialem', 'kupowac', 'kolejny', 'nowego', 'auta', 'wzgledu', 'duza', 'utrate', 'wartosci', 'lancia', 'bardzo', 'osladza', 'swiadomosc', 'utraty', ':p', ':D', ':)', ';(', ':)', '8/30', '29%', '45%']


## Out-of-core learning

In [11]:
def stream_docs(path):
    with open(path, 'r') as csv:
        next(csv) # skip header
        for line in csv:
            text, label = line[:-3], int(line[-2])
            yield text, label

In [12]:
db_example = next(stream_docs(path = db_path))
print(db_example)

('"Auto udane, tanie w eksploatacji, bardzo trwały, tanie części zamienne"', 1)


In [13]:
print(tokenizer(db_example[0]))

['auto', 'udane', 'tanie', 'eksploatacji', 'bardzo', 'trwaly', 'tanie', 'czesci', 'zamienne']


In [14]:
def get_minibatch(doc_stream, size):
    docs, y = [], []
    try:
        for _ in range(size):
            text, label = next(doc_stream)
            docs.append(text)
            y.append(label)
    except StopIteration:
        return None, None
    return docs, y

In [15]:
from sklearn.feature_extraction.text import HashingVectorizer
from sklearn.linear_model import SGDClassifier

vect = HashingVectorizer(decode_error='ignore', 
                         n_features=2**21,
                         preprocessor=None, 
                         tokenizer=tokenizer)

clf = SGDClassifier(loss='log', random_state=1, max_iter=1)
doc_stream = stream_docs(path= db_path)

In [16]:
classes = np.array([0, 1])
for _ in range(9):
    X_train, y_train = get_minibatch(doc_stream, size=2000)
    if not X_train:
        break
    X_train = vect.transform(X_train)
    clf.partial_fit(X_train, y_train, classes=classes)

In [17]:
X_test, y_test = get_minibatch(doc_stream, size=2312)
X_test = vect.transform(X_test)
print('Accuracy: %.3f' % clf.score(X_test, y_test))

Accuracy: 0.913


In [18]:
clf = clf.partial_fit(X_test, y_test)