# Cleaning Train Data

In [151]:
import pandas as pd
import xml.etree.ElementTree as ET
from lxml import etree
import sys
from random import shuffle

In [7]:
# Paths of the raw positive / negative review XML dumps for the training category.
positive_file, negative_file = (
    'data/train_data_raw/kitchen_housewares/reviews_positive.xml',
    'data/train_data_raw/kitchen_housewares/reviews_negative.xml',
)

In [8]:
# Module-level accumulator for cleaned review rows; add_to_review_dict appends to it.
reviews_list = list()

In [9]:
# Read both raw XML dumps fully into memory as text.
# Each `with` block closes its file on exit, so no explicit close() call is needed.
# NOTE(review): the files are decoded with the platform default encoding while the
# XML parsers below are configured for UTF-8 - confirm the data really is UTF-8 and
# consider passing encoding='utf-8' to open().
with open(positive_file, 'r') as pos_file:
    positive_xml_string = pos_file.read()

with open(negative_file, 'r') as neg_file:
    negative_xml_string = neg_file.read()

In [203]:
# One parser per input file, both built from the same options: UTF-8 input and
# recover=True so lxml keeps going past malformed markup.
parser_options = {"encoding": "UTF-8", "recover": True}
pos_parser = etree.XMLParser(**parser_options)
neg_parser = etree.XMLParser(**parser_options)

In [204]:
# Parse each raw XML string with its dedicated parser and keep the root element.
positive_root = etree.fromstring(positive_xml_string, pos_parser)
negative_root = etree.fromstring(negative_xml_string, neg_parser)

In [205]:
# Collect the direct <review> children of each root as plain lists.
review_tag = 'review'
positive_reviews = positive_root.findall(review_tag)
negative_reviews = negative_root.findall(review_tag)

In [206]:
def helpful_conv(text):
    """Convert an "X of Y" helpfulness string into an integer percentage.

    Args:
        text: Raw text such as "3 of 4". Newlines, tabs and spaces anywhere in
            either number are removed before parsing.

    Returns:
        int: ``X / Y`` expressed as a percentage and truncated toward zero
        (e.g. 75 for "3 of 4"), or 0 when the text is missing, malformed, or
        has a zero denominator.
    """
    drop_whitespace = str.maketrans('', '', '\n\t ')
    try:
        parts = text.replace('of', '|').split('|')
        helpful_votes = float(parts[0].translate(drop_whitespace))
        total_votes = float(parts[1].translate(drop_whitespace))
        # Multiply before dividing: 29 / 100 * 100 evaluates to 28.999999999999996
        # in floating point and would truncate to 28, whereas 2900 / 100 is exact.
        return int(helpful_votes * 100 / total_votes)
    except (AttributeError, TypeError, ValueError, IndexError,
            ZeroDivisionError, OverflowError):
        # Best-effort conversion: any unparsable input counts as "0% helpful".
        return 0

In [207]:
def add_to_review_dict(review, sentiment):
    """Flatten parsed review elements into rows appended to the global ``reviews_list``.

    Each appended row is ``[uniq_id, product_name, summary, review_text,
    reviewer, helpful, rating, sentiment]``. The values are read from fixed
    child positions of a review element (0, 2, 6, 10, 8, 4 and 5 respectively).

    Args:
        review: Indexable sequence of review elements, e.g. the list returned
            by ``findall('review')``.
        sentiment: Label stored unchanged in the last column of every row.

    Returns:
        None. Rows are appended to ``reviews_list`` as a side effect.
    """
    # NOTE(review): as in the original loop, element 0 of `review` is never read
    # (processing starts at index 1) - confirm whether skipping it is intentional.
    for position in range(1, len(review)):
        try:
            rev = review[position]
            uniq_id = rev[0].text.strip()
            product_name = rev[2].text.strip()
            helpful = helpful_conv(rev[4].text)
            rating = int(float(rev[5].text.replace("\n", "").replace("\t", "").replace(" ", "")))
            summary = rev[6].text.strip()
            review_text = rev[10].text.strip()
            reviewer = rev[8].text.strip()
        except Exception as exc:
            # Report the failure and move on to the next review. The previous
            # while-loop hit `continue` before incrementing its counter, so a
            # single malformed review made it spin forever.
            print("Unexpected error:", type(exc))
            continue

        reviews_list.append([uniq_id, product_name, summary, review_text, reviewer, helpful, rating, sentiment])

In [209]:
# Start from an empty accumulator so re-running this cell does not append the
# same reviews a second time (on a fresh Run All the list is already empty).
reviews_list.clear()

# Positive reviews are labelled "1", negative reviews "0".
add_to_review_dict(positive_reviews, "1")
add_to_review_dict(negative_reviews, "0")

In [210]:
# Shuffle in place so the positive rows (appended first) are mixed with the negative ones.
shuffle(reviews_list)

# Kept as the cell's last expression so the notebook actually displays the row
# count; as a non-final statement its value was silently discarded.
len(reviews_list)

In [211]:
# Column names follow the order of the values in each row built by add_to_review_dict.
review_columns = [
    'uniq_id',
    'product_name',
    'summary',
    'review_text',
    'reviewer',
    'helpful',
    'rating',
    'sentiment',
]
dataframe = pd.DataFrame(data=reviews_list, columns=review_columns)

In [212]:
# Preview the first four cleaned rows.
dataframe.head(n=4)

Unnamed: 0,uniq_id,product_name,summary,review_text,reviewer,helpful,rating,sentiment
0,B000CPZXGO:disappointing:dave_in_kansas,Cuisinart GR-1 Griddler Panini and Sandwich Pr...,Disappointing,This press doesn't seem to work well. The top...,Dave in Kansas,100,1,0
1,B0000VCXR2:too_damned_purple!_(aubergine):j._s...,"Chantal 48-Ounce Tea Steep, White: Kitchen & H...",Too damned purple! (Aubergine),"This is NOT Aubergine, a dark purple like the ...","J. Stein ""Tech Buddy""",33,1,0
2,B000CDHOKY:need_to_iron_it:s._patel_garfield,250-Thread-Count 100% Cotton Sateen Multi Stri...,Need to iron it,"Good for the price, I wish it didnt get crumbl...",S. Patel Garfield,0,4,1
3,B0000DI5G7:it_is_really_great_product:haranadh...,CorningWare French White 12-Piece Gift Bake an...,It is really great product,I like the amazon customer service as they rep...,"Haranadh B. Kanumuri ""Babu""",100,4,1


In [213]:
# Persist the cleaned training rows; the DataFrame index is written as the first,
# unnamed CSV column because to_csv is called with its default index setting.
dataframe.to_csv(path_or_buf=r'data/full_data/clean_data_kitchen_housewares.csv')

# Cleaning Test Data

In [214]:
import pandas as pd
import xml.etree.ElementTree as ET
from lxml import etree
import sys
from random import shuffle

In [275]:
# Product category to process; used to build the input and output file names below.
category = 'electronics'

In [276]:
# Raw test XML for the selected category.
file_name = ''.join(['data/test_data_raw/', category, '.xml'])

In [277]:
# Read the raw test XML fully into memory as text.
# The `with` block closes the file on exit, so no explicit close() call is needed.
with open(file_name, 'r') as file_data:
    data_xml_string = file_data.read()

In [278]:
# recover=True lets lxml keep parsing past malformed markup; input is treated as UTF-8.
xmltree_parser = etree.XMLParser(recover=True, encoding="UTF-8")

In [279]:
# Parse the raw XML text and keep the resulting root element.
data_root = etree.fromstring(data_xml_string, xmltree_parser)

In [280]:
# Collect the direct <review> children of the root as a plain list.
review_tag = 'review'
review_data = data_root.findall(review_tag)

In [281]:
def helpful_conversion(text):
    """Convert an "X of Y" helpfulness string into an integer percentage.

    Args:
        text: Raw text such as "3 of 4". Newlines, tabs and spaces anywhere in
            either number are removed before parsing.

    Returns:
        int: ``X / Y`` expressed as a percentage and truncated toward zero
        (e.g. 75 for "3 of 4"), or 0 when the text is missing, malformed, or
        has a zero denominator.
    """
    drop_whitespace = str.maketrans('', '', '\n\t ')
    try:
        parts = text.replace('of', '|').split('|')
        helpful_votes = float(parts[0].translate(drop_whitespace))
        total_votes = float(parts[1].translate(drop_whitespace))
        # Multiply before dividing: 29 / 100 * 100 evaluates to 28.999999999999996
        # in floating point and would truncate to 28, whereas 2900 / 100 is exact.
        return int(helpful_votes * 100 / total_votes)
    except (AttributeError, TypeError, ValueError, IndexError,
            ZeroDivisionError, OverflowError):
        # Best-effort conversion: any unparsable input counts as "0% helpful".
        return 0

In [282]:
# Module-level accumulator for the cleaned test rows; add_to_review_dict appends to it.
reviews = list()

In [283]:
def add_to_review_dict(review, limit=100):
    """Flatten up to ``limit`` review elements into rows appended to the global ``reviews``.

    Each appended row is ``[uniq_id, product_name, summary, review_text,
    reviewer, helpful]``. The values are read from fixed child positions of a
    review element (0, 2, 6, 10, 8 and 4 respectively).

    NOTE(review): this definition replaces the two-argument
    ``add_to_review_dict(review, sentiment)`` defined in the training section;
    re-running the training calls after this cell will fail.

    Args:
        review: Indexable sequence of review elements, e.g. the list returned
            by ``findall('review')``.
        limit: Highest index of ``review`` to read; indices ``1..limit`` are
            processed. Defaults to 100, the value that used to be hard-coded.

    Returns:
        None. Rows are appended to ``reviews`` as a side effect.
    """
    # NOTE(review): as in the original loop, element 0 of `review` is never read
    # (processing starts at index 1) - confirm whether skipping it is intentional.
    # Clamp to the available elements so a short input cannot raise IndexError.
    last_index = min(limit, len(review) - 1)

    for position in range(1, last_index + 1):
        try:
            rev = review[position]
            uniq_id = rev[0].text.strip()
            product_name = rev[2].text.strip()
            # Use this section's own converter rather than the training-section
            # helper, so the test cells do not depend on the earlier section.
            helpful = helpful_conversion(rev[4].text)
            summary = rev[6].text.strip()
            review_text = rev[10].text.strip()
            reviewer = rev[8].text.strip()
        except Exception as exc:
            # Report the failure and move on to the next review. The previous
            # while-loop hit `continue` before incrementing its counter, so a
            # single malformed review made it spin forever.
            print("Unexpected error:", type(exc))
            continue

        reviews.append([uniq_id, product_name, summary, review_text, reviewer, helpful])

In [284]:
# Start from an empty accumulator so re-running this cell does not append the
# same reviews a second time (on a fresh Run All the list is already empty).
reviews.clear()
add_to_review_dict(review_data)

In [285]:
# Shuffle the rows in place, wrap them in a DataFrame and write the per-category test CSV.
shuffle(reviews)

test_columns = ['uniq_id', 'product_name', 'summary', 'review_text', 'reviewer', 'helpful']
dataframe = pd.DataFrame(data=reviews, columns=test_columns)

test_csv_path = r'data/test_data/test_' + category + '.csv'
dataframe.to_csv(test_csv_path)

In [286]:
# Module-level accumulator for [uniq_id, rating] pairs; test_data_results appends to it.
test_review = list()

In [287]:
def test_data_results(review, limit=100):
    """Collect ``[uniq_id, rating]`` pairs for up to ``limit`` reviews into the global ``test_review``.

    ``uniq_id`` is the stripped text of child 0 of a review element; ``rating``
    is the text of child 5 with newlines, tabs and spaces removed, parsed as a
    float and truncated to an int.

    Args:
        review: Indexable sequence of review elements, e.g. the list returned
            by ``findall('review')``.
        limit: Highest index of ``review`` to read; indices ``1..limit`` are
            processed. Defaults to 100, the value that used to be hard-coded.

    Returns:
        None. Pairs are appended to ``test_review`` as a side effect.
    """
    # NOTE(review): as in the original loop, element 0 of `review` is never read
    # (processing starts at index 1) - confirm whether skipping it is intentional.
    # Clamp to the available elements so a short input cannot raise IndexError.
    last_index = min(limit, len(review) - 1)

    for position in range(1, last_index + 1):
        try:
            rev = review[position]
            uniq_id = rev[0].text.strip()
            rating = int(float(rev[5].text.replace("\n", "").replace("\t", "").replace(" ", "")))
        except Exception as exc:
            # Report the failure and move on to the next review. The previous
            # while-loop hit `continue` before incrementing its counter, so a
            # single malformed review made it spin forever.
            print("Unexpected error:", type(exc))
            continue

        test_review.append([uniq_id, rating])

In [288]:
# Start from an empty accumulator so re-running this cell does not append the
# same pairs a second time (on a fresh Run All the list is already empty).
test_review.clear()
test_data_results(review_data)

In [289]:
# Shuffle the [uniq_id, rating] pairs in place and write them as the per-category solution CSV.
shuffle(test_review)

solution_columns = ['uniq_id', 'rating']
dataframe = pd.DataFrame(data=test_review, columns=solution_columns)

solution_csv_path = r'data/solution_data/' + category + '.csv'
dataframe.to_csv(solution_csv_path)