# Train and Test data creation

#### We are using the restaurant-domain data from the SemEval 2014 competition

In [1]:
import os
import sys
import pandas as pd
import numpy as np
import pickle
from collections import Counter, defaultdict
import re

# Show full cell contents (review text) instead of truncating them.
# pandas >= 1.0 expects None for "no limit"; the old -1 sentinel is deprecated there
# and rejected by newer releases. Older pandas only accepts an int, so fall back to -1.
try:
    pd.set_option('display.max_colwidth', None)
except ValueError:
    pd.set_option('display.max_colwidth', -1)

In [2]:
import xml.etree.ElementTree as ET
# Parse the training XML once and keep both the tree and its root element around.
train_xml_path = '../data/Restaurants_Train.xml'
tree = ET.parse(train_xml_path)
root = tree.getroot()

In [3]:
# Build one record per <sentence>: its text plus parallel lists of the aspect
# categories and polarities annotated on it.
labeled_reviews = []
for sentence in root.findall("sentence"):
    # Compare with None explicitly: an Element's truth value only says whether it has
    # children, and relying on it is deprecated in xml.etree.ElementTree.
    categories_node = sentence.find("aspectCategories")
    if categories_node is None:
        category_nodes = []
    else:
        category_nodes = categories_node.findall("aspectCategory")

    labeled_reviews.append({
        # NOTE(review): assumes the first child of <sentence> holds the review text - confirm against the XML schema.
        "text": sentence[0].text,
        "aspects": [node.get("category") for node in category_nodes],
        "polarities": [node.get("polarity") for node in category_nodes],
    })

multi_aspects = pd.DataFrame(labeled_reviews)
print("there are",len(labeled_reviews),"reviews in this training set")

there are 3044 reviews in this training set


In [4]:
# Preview the first five parsed reviews.
multi_aspects.head(n=5)

Unnamed: 0,aspects,polarities,text
0,[service],[negative],But the staff was so horrible to us.
1,"[food, anecdotes/miscellaneous]","[positive, negative]","To be completely fair, the only redeeming factor was the food, which was above average, but couldn't make up for all the other deficiencies of Teodora."
2,[food],[positive],"The food is uniformly exceptional, with a very capable kitchen which will proudly whip up whatever you feel like eating, whether it's on the menu or not."
3,[service],[positive],Where Gabriela personaly greets you and recommends you what to eat.
4,[anecdotes/miscellaneous],[positive],"For those that go once and don't enjoy it, all I can say is that they just don't get it."


#### Each review can have multiple aspects and sentiments. Create a stratified train/test split based on whether the sentiment changes within a review

For the test data we want to evaluate our model on statements whose sentiment changes within the same review. Therefore, for non-changing reviews (reviews with the same sentiment for all occurring aspects) we split train:test in the ratio 80:20, whereas for changing reviews (reviews whose sentiment differs across aspects) we split train:test in the ratio 20:80.

In [5]:
# Number of aspect categories annotated on each review.
multi_aspects['length'] = multi_aspects['aspects'].apply(len)
# Number of *distinct* polarities in the review; 1 means every aspect shares one sentiment.
multi_aspects['same_polarity'] = multi_aspects['polarities'].apply(lambda x: len(set(x)))
# Flatten the lists into space-separated strings so they can be searched with .str methods.
multi_aspects['polarities'] = multi_aspects['polarities'].apply(' '.join)
multi_aspects['aspects'] = multi_aspects['aspects'].apply(' '.join)
# Drop every review that carries a 'conflict' polarity. The .copy() makes the filtered
# frame independent of the original, so adding columns to it afterwards does not
# trigger pandas' SettingWithCopyWarning.
multi_aspects = multi_aspects[~multi_aspects['polarities'].str.contains('conflict')].copy()

We also keep an indicator for reviews whose sentiment changes within a single review. By "changing sentiment" we mean polarity combinations such as [positive negative], [negative neutral], [negative positive positive], etc.
Reviews containing the polarity [conflict] have already been dropped from our dataset.

In [6]:
# 1 when a review carries more than one distinct polarity, 0 otherwise.
multi_aspects['ind'] = multi_aspects['same_polarity'].gt(1).astype('int64')

In [7]:
from sklearn.model_selection import train_test_split

# Reviews whose aspects all share a single polarity.
is_single_polarity = multi_aspects['same_polarity'] == 1

# Single-polarity reviews: conventional 80% train / 20% test.
train1, test1 = train_test_split(
    multi_aspects[is_single_polarity], test_size=0.2, random_state=20
)
# Remaining reviews: the unpacking order is swapped on purpose, so the larger (80%)
# part returned first becomes the test portion and the 20% part becomes train.
test2, train2 = train_test_split(
    multi_aspects[~is_single_polarity], test_size=0.2, random_state=20
)

# Stack the two strata, drop the helper column and renumber the rows from zero.
train_df = (
    pd.concat([train1, train2], axis=0)
    .drop(columns=['same_polarity'])
    .reset_index(drop=True)
)
test_df = (
    pd.concat([test1, test2], axis=0)
    .drop(columns=['same_polarity'])
    .reset_index(drop=True)
)

train_df.shape, test_df.shape

((2206, 5), (649, 5))

In [8]:
def replace_brackets(string):
    """Return `string` with brackets blanked out and dollar signs spelled out.

    Each of '(', ')', '[' and ']' becomes a single space, and each '$' becomes
    the text 'dollar ' (with a trailing space). All other characters are kept.
    """
    substitutions = str.maketrans({
        '(': ' ',
        ')': ' ',
        '[': ' ',
        ']': ' ',
        '$': 'dollar ',
    })
    return string.translate(substitutions)

In [9]:
# Apply the same text clean-up to both splits (test first, then train).
for frame in (test_df, train_df):
    frame['text'] = frame['text'].apply(replace_brackets)

In [10]:
# Share of training reviews with (1) and without (0) a change in sentiment.
train_df['ind'].value_counts(normalize=True)

0    0.988214
1    0.011786
Name: ind, dtype: float64

In [11]:
# Share of test reviews with (1) and without (0) a change in sentiment.
test_df['ind'].value_counts(normalize=True)

0    0.841294
1    0.158706
Name: ind, dtype: float64

Because the two groups are split in deliberately different ratios, only about 1% of the training reviews contain a change in sentiment, compared with about 16% of the test reviews.

In [12]:
# Turn the space-separated strings back into lists, one element per aspect / polarity.
for list_column in ('aspects', 'polarities'):
    test_df[list_column] = test_df[list_column].map(lambda joined: joined.split(' '))

In [13]:
# Reshape the test set from one row per review to one row per (review, aspect) pair.
# Columns that describe the review itself and are repeated on every output row.
id_columns = ['text', 'length', 'ind']

# Spread each aspect list over numbered columns; shorter lists leave NaN in the
# unused positions, which dropna() removes again after melting to long form.
asp_wide = test_df.aspects.apply(pd.Series)
asp_df = (
    asp_wide.merge(test_df, left_index=True, right_index=True)
    .drop(['aspects', 'polarities'], axis=1)
    .melt(id_vars=id_columns)
    .drop(['variable'], axis=1)
    .dropna()
)

# Same reshaping for the polarity lists.
polarity_wide = test_df.polarities.apply(pd.Series)
polarity_df = (
    polarity_wide.merge(test_df, left_index=True, right_index=True)
    .drop(['aspects', 'polarities'], axis=1)
    .melt(id_vars=id_columns)
    .drop(['variable'], axis=1)
    .dropna()
)

# Both long frames keep the row labels produced by melt, so joining on the index
# pairs every aspect with the polarity taken from the same review and list position.
test_df_ungrp = asp_df.merge(
    polarity_df['value'],
    left_index=True,
    right_index=True,
    suffixes=('_aspects', '_polarities'),
)
test_df_ungrp.shape

(868, 5)

In [14]:
# Give the two merged 'value' columns their final names.
test_df_ungrp = test_df_ungrp.rename(
    columns={'value_aspects': 'aspects', 'value_polarities': 'polarities'}
)

In [15]:
# Turn the space-separated strings back into lists, one element per aspect / polarity.
for list_column in ('aspects', 'polarities'):
    train_df[list_column] = train_df[list_column].map(lambda joined: joined.split(' '))

In [16]:
# Reshape the training set from one row per review to one row per (review, aspect) pair.
# Columns that describe the review itself and are repeated on every output row.
id_columns = ['text', 'length', 'ind']

# Spread each aspect list over numbered columns; shorter lists leave NaN in the
# unused positions, which dropna() removes again after melting to long form.
asp_wide = train_df.aspects.apply(pd.Series)
asp_df = (
    asp_wide.merge(train_df, left_index=True, right_index=True)
    .drop(['aspects', 'polarities'], axis=1)
    .melt(id_vars=id_columns)
    .drop(['variable'], axis=1)
    .dropna()
)

# Same reshaping for the polarity lists.
polarity_wide = train_df.polarities.apply(pd.Series)
polarity_df = (
    polarity_wide.merge(train_df, left_index=True, right_index=True)
    .drop(['aspects', 'polarities'], axis=1)
    .melt(id_vars=id_columns)
    .drop(['variable'], axis=1)
    .dropna()
)

# Join on the melt-generated row labels so each aspect meets its own polarity,
# then give the two 'value' columns their final names.
train_df_ungrp = asp_df.merge(
    polarity_df['value'],
    left_index=True,
    right_index=True,
    suffixes=('_aspects', '_polarities'),
)
train_df_ungrp = train_df_ungrp.rename(
    columns={'value_aspects': 'aspects', 'value_polarities': 'polarities'}
)
train_df_ungrp.shape

(2604, 5)

In [17]:
# Persist both layouts of each split: one row per (review, aspect) pair, and the
# grouped form with one row per review. The row index is not written.
csv_outputs = [
    ('../data/resturant_test_stratified.csv', test_df_ungrp),
    ('../data/resturant_train_stratified.csv', train_df_ungrp),
    ('../data/resturant_test_stratified_grouped.csv', test_df),
    ('../data/resturant_train_stratified_grouped.csv', train_df),
]
for csv_path, frame in csv_outputs:
    frame.to_csv(csv_path, index=False)