<a href="https://colab.research.google.com/github/oaarnikoivu/dissertation/blob/master/Analysis.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Dataset Analysis

### Imports

In [103]:
import pandas as pd 
import numpy as np
import nltk
import re
import collections

nltk.download('stopwords')
from nltk.corpus import stopwords
from ast import literal_eval
from sklearn.model_selection import train_test_split

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [104]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


### Load Data

In [0]:
isear_data = open('/content/drive/My Drive/datasets/isear.csv')

text = []
labels = []
data = []

for line in isear_data:
  fields = line.split('|')
  text.append(fields[40])
  labels.append(fields[36])

text.pop(0)
labels.pop(0)

isear_data.close() 

data = {'Text': text, 'Emotion': labels}
df = pd.DataFrame(data)

In [106]:
df.head()

Unnamed: 0,Text,Emotion
0,"During the period of falling in love, each tim...",joy
1,When I was involved in a traffic accident.,fear
2,When I was driving home after several days of...,anger
3,When I lost the person who meant the most to me.,sadness
4,The time I knocked a deer down - the sight of ...,disgust


In [107]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7666 entries, 0 to 7665
Data columns (total 2 columns):
Text       7666 non-null object
Emotion    7666 non-null object
dtypes: object(2)
memory usage: 119.9+ KB


In [110]:
df = df[~df['Text'].str.contains('response')].reset_index(drop=True)

class_mapping = {label:idx for idx, label in enumerate(np.unique(df['Emotion']))}
print(class_mapping)

df['Emotion'] = df['Emotion'].map(class_mapping)

X_train, X_test, y_train, y_test = train_test_split(df['Text'], df['Emotion'], 
                                                    test_size=0.3, random_state=42, shuffle=True)

X_train[:3]

{0: 0, 1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6}


6314    A certain Area Party Secretary came home in th...
7418    I was appreciated by others, especially my fam...
151                                    Grovelling people.
Name: Text, dtype: object

### Data & Text Preprocessing

Let's have a look at the text in order to determine the necessary preprocessing steps. 

In [111]:
X_train.loc[2777]

'When you betray the trust you have been given. This is connected á with copying at a class-work at school.'

### Cleaning text data with Regular Expressions

In [0]:
REPLACE_BY_SPACE_RE = re.compile('[/(){}\[\]\|@,;]')
BAD_SYMBOLS_RE = re.compile('[^0-9a-z #+_]')
STOPWORDS = set(stopwords.words('english'))

def preprocessor(text):
    text = text.lower()
    text = re.sub(' +', ' ', text)
    text = re.sub('á', '', text)
    text = REPLACE_BY_SPACE_RE.sub('', text)
    text = BAD_SYMBOLS_RE.sub('', text)
    text = ' '.join([word for word in text.split() if word not in STOPWORDS])
    text = text.strip()
  
    return text

In [113]:
preprocessor("HELLO!!! [] this is a (:test!)")

'hello test'

### Apply the clean data preprocessor to the text

In [114]:
X_train = [preprocessor(x) for x in X_train]
X_test = [preprocessor(x) for x in X_test]

X_train[:3]

['certain area party secretary came home morning said leave home build somewhere reason wanted establish market chairman knew nothing angry',
 'appreciated others especially family members friends',
 'grovelling people']

### Finding the most popular emotions & words in the training data

In [115]:
# Dictionary of all emotions from train corpus with their counts.
emotions_counts = {}
words_count = {}

emotions_counts = collections.Counter(y_train)
words_count = collections.Counter([word for line in X_train for word in line.split(' ')])

#words_count.most_common()
emotions_counts

Counter({0: 779, 1: 778, 2: 755, 3: 752, 4: 739, 5: 750, 6: 755})

We see that the dataset is quite balanced.

## Transforming text to a vector

### Bag of words

In [0]:
DICT_SIZE = 2070

VOCAB = words_count.most_common(DICT_SIZE)
WORDS_TO_INDEX = {item[0]:ii for ii, item in enumerate(sorted(VOCAB, key=lambda x: x[1], reverse=True))}
INDEX_TO_WORDS = {ii:word for word, ii in WORDS_TO_INDEX.items()}
ALL_WORDS = WORDS_TO_INDEX.keys()

def bag_of_words(text, words_to_index, dict_size):
    """
        text: a string
        dict_size: size of the dictionary
        
        return a vector which is a bag-of-words representation of 'text'
    """
    result_vector = np.zeros(dict_size)

    for word in text.split(' '):
      if word in words_to_index:
        result_vector[words_to_index[word]] += 1
    return result_vector 

In [0]:
from scipy import sparse as sp_sparse

In [118]:
X_train_mybag = sp_sparse.vstack([sp_sparse.csr_matrix(bag_of_words(text, WORDS_TO_INDEX, DICT_SIZE)) for text in X_train])
X_test_mybag = sp_sparse.vstack([sp_sparse.csr_matrix(bag_of_words(text, WORDS_TO_INDEX, DICT_SIZE)) for text in X_test])

print('X_train shape', X_train_mybag.shape)
print('X_test shape', X_test_mybag.shape)

X_train shape (5308, 2070)
X_test shape (2276, 2070)


## TF-IDF

In [0]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [0]:
def tfidf_features(X_train, X_test):
  tfidf_vectorizer = TfidfVectorizer(ngram_range=(1,2), max_df=0.9, min_df=5, 
                                     token_pattern='(\S+)')
  
  X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)
  X_test_tfidf = tfidf_vectorizer.transform(X_test)

  return X_train_tfidf, X_test_tfidf, tfidf_vectorizer.vocabulary_

In [0]:
X_train_tfidf, X_test_tfidf, tfidf_vocab = tfidf_features(X_train, X_test)
tfidf_reversed_vocab = {i:word for word, i in tfidf_vocab.items()}

In [122]:
tfidf_vocab['alcohol']

67

## Train classifier

In [0]:
from sklearn.linear_model import LogisticRegression, RidgeClassifier
from sklearn.naive_bayes import MultinomialNB

In [0]:
def train_classifier(X_train, y_train):
    clf = MultinomialNB()
    clf.fit(X_train, y_train)
    return clf

In [0]:
clf_mybag = train_classifier(X_train_mybag, y_train)

In [126]:
clf_mybag.score(X_test_mybag, y_test)

0.5505272407732865