In [339]:
import gzip, json, pandas as pd, re, numpy as np
from sklearn.model_selection import train_test_split
from collections import Counter
from sklearn_pandas import DataFrameMapper
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

import string, unidecode, html
import nltk
import contractions
import inflect
from bs4 import BeautifulSoup
from nltk.tokenize.treebank import TreebankWordDetokenizer
from nltk.corpus import stopwords
from textblob import TextBlob
from scipy.stats.stats import pearsonr

In [177]:
# Gzipped review dump; the loop below parses one JSON object per line.
# Named `input_path` so the builtin `input()` is not shadowed.
input_path = '../Data/reviews_Cell_Phones_and_Accessories_5.json.gz'
with gzip.open(input_path) as f:
    records = [json.loads(line.strip()) for line in f]

# Full table with every field of the dump.
data = pd.DataFrame(records)
# Working frame with just the text and the star rating. `.copy()` makes it
# independent of `data`, so later column assignments on `df` do not trigger
# SettingWithCopyWarning.
df = data[['reviewText', 'overall']].copy()

In [3]:
df.head()

Unnamed: 0,reviewText,overall
0,They look good and stick good! I just don't li...,4.0
1,These stickers work like the review says they ...,5.0
2,These are awesome and make my phone look so st...,5.0
3,Item arrived in great time and was in perfect ...,4.0
4,"awesome! stays on, and looks great. can be use...",5.0


In [10]:
df.shape

# TODO: filter by word count - a review needs more than 5 words to count as a valid review
# TODO: also filter out duplicate review texts

(194439, 2)

In [286]:
# Probes for noisy patterns in the raw review text. One probe is active at a
# time; the others are kept for reference.
# emoticons
# dirty = df[df['reviewText'].str.contains('\:\-\)')]
# dirty = df[df['reviewText'].str.contains('.-')]
# dirty = df[df['reviewText'].str.contains('\/n')]
# dirty = df[df['reviewText'].str.contains('&#8217')] #, e.g. 757 - done
# dirty = df[df['reviewText'].str.contains('00')] #, e.g. $20.00 - done
# dirty = df[df['reviewText'].str.contains('couldn\'t')]
# dirty = df[df['reviewText'].str.contains('JUNK')]
# dirty = df[df['reviewText'].str.contains('u"hello world')]

# Active probe: reviews containing an '@' sign.
has_at_sign = df['reviewText'].str.contains('@')
dirty = df[has_at_sign]
print(dirty)

                                               reviewText  overall
656     I bought a similar one @ a local store & was s...      5.0
834     My previous two PalmOS phones were a Kyocera 6...      4.0
855     I bought this item @ a local retailer, was hap...      1.0
1298    Update 11/11/2008My wife's died about two week...      5.0
2000    First, the ear clip snapped around the clip th...      2.0
...                                                   ...      ...
190717  Seemed to work well for a while (6 weeks or so...      5.0
191490  Companies are definitely stepping up to the pl...      5.0
193426  Lepow battery banks always impress me with the...      5.0
193748  A review of the "Bolse 25W (5V/5A) 4-Port USB ...      5.0
194298  In a crowded market of multiport USB chargers ...      5.0

[346 rows x 2 columns]


In [257]:
df.loc[1178]['reviewText']

"I have been itching for a Treo ever since the 600 came out and I noticed that more streamlined design.  When smartphones were first introduced, I bucked the trend of combining the two thinking that as often as I use my cell phone and my pda, if I bought one that integrated both I would wear it out twice as fast.  And in my humble opinion, it was a valid reason because back then, the original phone/pda's were impossibly large.  Well I have finally become a convert.I spent a great deal of time researching Blackberry's vs. the Treo's vs. Pocket PC's and came VERY close to grabbing a T-Mobile Dash (which I still believe is the all-around smallest model availabe as I write this, or at least it SEEMED to be), but I finally settled with the trusted Palm OS platform (one that I have become VERY familiar with over the years using pda's for quite some time now).  When I set out to get one, I really had my heart set on the Treo 680...but as I read the reviews for them, one thing kept creeping in

In [None]:
# Stop-word collection used by remove_stopwords().
# Stored as a set: remove_stopwords() tests every token with `in`, which is a
# linear scan on a list but a hash lookup on a set.
stops = set(stopwords.words('english'))
# NOTE(review): preprocess_reviews() lower-cases before removing stop words,
# so the capitalised 'I' only matters for text that was not lower-cased.
stops.add('I')
# Keep negation/contrast words in the text, since they can carry sentiment.
stops -= {'but', 'no', 'not'}

In [288]:
# Preprocessing
def strip_html(t):
    """Return the text of ``t`` as extracted by BeautifulSoup.

    ``t`` is parsed with the built-in ``html.parser`` backend and the result
    of ``get_text()`` is returned.
    """
    return BeautifulSoup(t, "html.parser").get_text()

def remove_between_square_brackets(t):
    """Delete every ``[...]`` span, brackets included, from ``t``.

    A span runs from ``[`` to the next ``]``; text without a closing bracket
    is left untouched.
    """
    # Raw string: in a plain literal '\[' is an invalid escape sequence, which
    # newer Python versions warn about.
    return re.sub(r'\[[^]]*\]', '', t)

def remove_between_angle_brackets(t):
    """Delete every ``<...>`` span that has at least one character inside."""
    tag_like = re.compile(r'<[^>]+>')
    return tag_like.sub('', t)

def remove_accented_chars(t):
    """Return ``unidecode.unidecode(t)``, used here to turn e.g. 'café' into 'cafe'."""
    transliterated = unidecode.unidecode(t)
    return transliterated

def replace_contractions(t):
    """Return ``t`` passed through ``contractions.fix`` (contraction expansion)."""
    expanded = contractions.fix(t)
    return expanded

def remove_numbers(t):
    """Strip digits, and some of the text attached to them, from ``t``.

    At each position the first matching alternative is deleted:

    1. a (possibly empty) run of ASCII letters directly followed by one digit,
    2. a run of non-space characters that contains a digit followed by one
       more non-space character,
    3. a single digit.
    """
    digit_chunks = re.compile(r'([a-zA-Z]*[0-9])|(\S*[0-9]\S)|([0-9])')
    return digit_chunks.sub('', t)

def remove_weblinks(t):
    """Delete every token that starts with ``http`` plus at least one further non-space character."""
    link = re.compile(r'http\S+')
    return link.sub('', t)

def unescape(t):
    """Convert HTML character references in ``t`` (e.g. ``&amp;``, ``&#8217;``) to their characters."""
    decoded = html.unescape(t)
    return decoded

def remove_punctuation(t):
    """Replace punctuation in ``t`` with a single space per match.

    A match is, in order of preference:

    * two or more repetitions of one character that is not a word character,
      ``!`` or ``)`` (this also collapses repeated whitespace),
    * a run of two or more underscores,
    * any single character that is neither a word character nor whitespace.

    Single underscores are word characters and therefore survive.
    """
    punctuation = re.compile(r'([^\w!)])\1{1,}|(_{2,})|([^\w\s])')
    return punctuation.sub(' ', t)

def remove_extra_whitespaces(t):
    """Collapse every run of two or more whitespace characters into one space.

    Leading and trailing whitespace is not stripped.
    """
    whitespace_run = re.compile(r'[\s]{2,}')
    return whitespace_run.sub(' ', t)

# experiment what is better 1. lowercase all 2. only lowercase if all caps
def to_lowercase(t):
    """Lower-case every token of ``t`` and join the tokens back into a string.

    The text is split with ``nltk.word_tokenize`` and re-assembled with
    ``TreebankWordDetokenizer``.

    Experiment note: an alternative that leaves tokens for which
    ``word.isupper()`` is true unchanged was tried; the current version
    lower-cases everything.
    """
    lowered_tokens = [token.lower() for token in nltk.word_tokenize(t)]
    return TreebankWordDetokenizer().detokenize(lowered_tokens)

def remove_stopwords(t):
    """Drop every token of ``t`` that appears in the module-level ``stops`` collection.

    Tokens come from ``nltk.word_tokenize``; the survivors are joined with
    ``TreebankWordDetokenizer``. The comparison is case-sensitive.
    """
    kept_tokens = [token for token in nltk.word_tokenize(t) if token not in stops]
    return TreebankWordDetokenizer().detokenize(kept_tokens)

def lemmatize_with_postag(sentence):
    """Lemmatize each word of ``sentence`` using its part-of-speech tag.

    The first letter of each tag reported by ``TextBlob(sentence).tags`` is
    mapped to the argument handed to ``Word.lemmatize``: J -> 'a', N -> 'n',
    V -> 'v', R -> 'r'; any other tag falls back to 'n'.

    Returns the lemmas joined by single spaces.
    """
    first_letter_to_pos = {"J": 'a', "N": 'n', "V": 'v', "R": 'r'}
    lemmas = []
    for word, tag in TextBlob(sentence).tags:
        lemmas.append(word.lemmatize(first_letter_to_pos.get(tag[0], 'n')))
    return " ".join(lemmas)

def preprocess_reviews(reviews):
    """Clean an iterable of raw review strings and return them as a list.

    Every review is passed through the steps below, in order:

    1. denoise: strip HTML, ``[...]`` spans, ``<...>`` spans and web links
    2. decode HTML character references, then transliterate to ASCII
    3. expand contractions
    4. remove numbers (and some attached text, see ``remove_numbers``)
    5. replace punctuation with spaces
    6. lower-case every token
    7. lemmatize with part-of-speech tags
    8. remove stop words
    9. collapse repeated whitespace

    Parameters
    ----------
    reviews : iterable of str

    Returns
    -------
    list of str
        One cleaned string per input review, in input order.
    """
    steps = (
        # -- denoise --
        strip_html,
        remove_between_square_brackets,
        remove_between_angle_brackets,
        remove_weblinks,
        # -- standardise lettering, e.g. cafe instead of café --
        # Decode entities before transliterating, so that characters produced
        # by a reference such as '&#233;' are ASCII-folded as well.
        unescape,
        remove_accented_chars,
        # -- normalise --
        replace_contractions,
        remove_numbers,
        remove_punctuation,
        to_lowercase,
        lemmatize_with_postag,
        remove_stopwords,
        remove_extra_whitespaces,
    )

    cleaned = []
    # Each review runs through all steps before the next one starts, which
    # avoids materialising one full intermediate list per step.
    for line in reviews:
        for step in steps:
            line = step(line)
        cleaned.append(line)
    return cleaned

reviews_clean = preprocess_reviews(df['reviewText'])
# - tokenize into single words or n-grams (more than 3 times frequency)
# - additional domain specific stopword list

# print(vectorizer.get_stop_words())
# stop_words = ["i", "me", "you"]
# final_words = []
# for word in vectorizer.get_feature_names():
#     if word not in stop_words:
#         final_words.append(word)

In [289]:
[k for k in reviews_clean if ' he ' in k]
# reviews_clean[82]

['connect also jabra simple version disconnect device complaint except connect variety beep tell go sometimes beep somewhat confusing boo beep mean disconnect phone connect disconnected power signal voice prompt connect remove ambiguity good not good stream music disconnect interrupt music copy least annoying tendency make connected break connection phone time settle but _does_ settle eventually problem go away otherwise would star unacceptable speakerphone edit well no not give much hassle connect ipod turn take short drive today phone turn _never_ stop alternate connect disconnect think happen eventually start ignore phone entirely but show bluetooth connect ipod still stream music maybe copy receive never similar problem jabra use phone would think two product company would equally effective make keep bluetooth connection connect two device simultaneously advertised mention though connect ipod touch turn painful initial pairing work but connecting not eventually find turn bluetooth 

In [276]:
# NOTE(review): this function is also defined, with the same body, in the
# preprocessing cell; whichever cell runs last wins.
def lemmatize_with_postag(sentence):
    """Lemmatize each word of ``sentence`` using its part-of-speech tag.

    Returns the lemmas joined by single spaces.
    """
    sent = TextBlob(sentence)
    # First letter of the tag -> pos argument passed to Word.lemmatize.
    tag_dict = {"J": 'a',
                "N": 'n',
                "V": 'v',
                "R": 'r'}
    # Tags whose first letter is not in tag_dict fall back to 'n'.
    words_and_tags = [(w, tag_dict.get(pos[0], 'n')) for w, pos in sent.tags]
    lemmatized_list = [wd.lemmatize(tag) for wd, tag in words_and_tags]
    return " ".join(lemmatized_list)

# Lemmatize
sentence='first, case looks ok. fairly easy open, etc . good using extended battery but not really real protection extended battery drop . mean sure, covered black thin rubber outer housing but it.the little locking tabs . tabs break easily especially open close inner case many times (take phone) proximity sensor - yes, confirmed due design, interfere proximity sensor . tested not face plate, making call putting phone close head still see screen go black.i tested making call barley moving front cover over\\near phone sensor activates shuts screen remove faceplate.the issue, front top area face plate . pictures, camera, sensors ear speaker, shows longish round area . one received like rectangle type shape . kind looks like boat shape . alone might sensor not working correctly.since item twenty dollars, rather trying modify case, returning it.over, think ok case not really think bad . could serve purpose . would not try putting drop test . even upon calling seller purchased, told not even drop test item selling it.i test front rear camera flash nothing case interferes camera\'s.i returning item seller agreed take item back refund purchase price including shipping costs . update later process.so, , not going bash seller item people may really like may work . not work . would given starts face plate not interfere proximity sensors area front face plate matched stock photo . issue, stock photo, round\\ovalish . one received boat shape.the give basic phone . case not interfere rear front camera\'s belt holster seems pretty sturdy strong . not seem like phone case easily pop not really difficult remove housing simply follow accommodate use extended screen protector works fine . received``tight screen". not find issue . find near bottom little airy like but no issues using touch screen no problems one stick screen protectors phone use conjunction faceplate screen protector . 
used faceplate screen protector clear bother hd picture display seller quick reply email, concerns request return front face plate interfere proximity locking tabs seem like sure break easily . therefore not good idea constantly open close\\take phone no real padding\\protection extended battery except outer thin rubber housing not easily remove screen protector glued inside faceplate . would cut screen protector faceplate not want use . therefore voiding warranty.'
lemmatize_with_postag(sentence)

"first case look ok. fairly easy open etc good use extended battery but not really real protection extend battery drop mean sure cover black thin rubber outer housing but it.the little locking tab tab break easily especially open close inner case many time take phone proximity sensor yes confirm due design interfere proximity sensor test not face plate make call put phone close head still see screen go black.i test make call barley move front cover over\\near phone sensor activate shuts screen remove faceplate.the issue front top area face plate picture camera sensor ear speaker show longish round area one receive like rectangle type shape kind look like boat shape alone might sensor not work correctly.since item twenty dollar rather try modify case return it.over think ok case not really think bad could serve purpose would not try put drop test even upon call seller purchase tell not even drop test item sell it.i test front rear camera flash nothing case interfere camera's.i return it

In [290]:
# Feature Generation & Selection
# ignore too infrequent words, e.g. vectorizer = CountVectorizer(min_df=30)
# POS  only adjectives and nouns and verbs?

# corpus = df.loc[:5]['reviewText']
# corpus = df['reviewText']    # this does not work because the resulting array is too large
# The vectorizer cells below read their documents from `corpus`.
corpus = reviews_clean
# corpus = data[data['overall']==1]['reviewText']
# print(corpus)

In [336]:
# vectorizer = CountVectorizer(ngram_range=(1,2), min_df=100)

# Count uni-, bi- and tri-grams that occur in at least 30 documents.
vectorizer = CountVectorizer(ngram_range=(1,3), min_df=30)
X = vectorizer.fit_transform(corpus)
# get_feature_names() was removed in scikit-learn 1.2. Prefer its replacement
# when available and fall back for older installs; `vocab` stays a plain list
# of str either way.
if hasattr(vectorizer, 'get_feature_names_out'):
    vocab = vectorizer.get_feature_names_out().tolist()
else:
    vocab = vectorizer.get_feature_names()
print(len(vocab))
# 43792 at min_df =50
# 24484 at min_df =100

52528


In [338]:
matching = [s for s in vocab if "good" in s]
print(matching)

['actually good', 'actually pretty good', 'adapter good', 'add good', 'allow good', 'almost good', 'already good', 'also good', 'also look good', 'although good', 'always good', 'amazon good', 'android good', 'another good', 'anyone look good', 'anything good', 'appear good', 'area good', 'around good', 'arrive good', 'arrive time good', 'ask good', 'audio quality good', 'away good', 'back good', 'bad good', 'base good', 'bass good', 'battery good', 'battery life good', 'battery not good', 'battery pack good', 'battery work good', 'believe good', 'big good', 'bit good', 'bluetooth good', 'box good', 'break good', 'build good', 'build quality good', 'bulky good', 'but could good', 'but far good', 'but get good', 'but good', 'but good case', 'but good enough', 'but good price', 'but good product', 'but good quality', 'but good thing', 'but look good', 'but much good', 'but not good', 'but overall good', 'but pretty good', 'but price good', 'but really good', 'but seem good', 'but sound g

In [337]:
print(vocab[:10])

['aa', 'aa battery', 'aaa', 'aaa battery', 'ab', 'abandon', 'abco', 'ability', 'ability change', 'ability charge']


In [38]:
# X.toarray() builds a dense (documents x vocabulary) array and was the source
# of the MemoryError noted for this cell, so the matrix is kept sparse.
print(X.shape)
print(type(X))
# Dense preview of the first rows only.
content = X[:5].toarray()
print(content)
# Sparse-backed frame: one column per vocabulary entry, no dense copy.
# NOTE(review): this rebinds `df`, which earlier cells use for the review
# table; cells that read df['reviewText'] must run before this one.
df = pd.DataFrame.sparse.from_spmatrix(X, columns=vocab)


(194439, 3506)

In [None]:
# One vectorizer per n-gram size; the Switcher class below picks one
# according to the number of words in a vocabulary entry.
uni_vect = CountVectorizer(analyzer='word')
bi_vect = CountVectorizer(ngram_range=(2,2))
tri_vect = CountVectorizer(ngram_range=(3,3))
# Maps vocabulary entry -> Pearson correlation with the label (filled below).
# NOTE(review): this name shadows the builtin `dict` for the rest of the notebook.
dict = {}

class Switcher(object):
    """Count one term across ``corpus`` with the vectorizer matching its n-gram size.

    The methods read these module-level names at call time: ``e`` (a
    one-element list holding the term), ``corpus`` (the documents), and
    ``uni_vect`` / ``bi_vect`` / ``tri_vect``.
    """

    def vec(self, i):
        """Dispatch to ``vec_<i>`` and return its result.

        Raises
        ------
        ValueError
            If no ``vec_<i>`` method exists. Previously the string 'Invalid'
            was returned, which only failed later, at the caller's
            ``.toarray()`` call.
        """
        method = getattr(self, 'vec_' + str(i), None)
        if method is None:
            raise ValueError('Invalid n-gram size: {!r}'.format(i))
        return method()

    def _count_with(self, vectorizer):
        """Fit ``vectorizer`` on the current term ``e`` and transform ``corpus`` with it."""
        vectorizer.fit(e)
        return vectorizer.transform(corpus)

    def vec_1(self):
        """Counts for a single-word term."""
        return self._count_with(uni_vect)

    def vec_2(self):
        """Counts for a two-word term."""
        return self._count_with(bi_vect)

    def vec_3(self):
        """Counts for a three-word term."""
        return self._count_with(tri_vect)
s = Switcher()
# prob = vocab[:10]
# Label vector for the correlation. It is bound to `l` because the later
# `np.corrcoef(c, l)` cell reads that name; nothing else in the notebook
# assigns it a label vector.
l = data['overall'].values
# X = vectorizer.fit_transform(corpus) already holds, in column j, the
# per-document count of vocab[j], so no per-term refit is needed.
assert X.shape == (len(l), len(vocab)), \
    "X must be the document-term matrix that `vocab` was taken from"
# CSC layout makes column slicing cheap.
X_by_term = X.tocsc()
for j, e in enumerate(vocab):
    # Dense 1-D count vector for this term across all documents.
    c = X_by_term[:, j].toarray().ravel()
    correlation, p_value = pearsonr(c, l)
    # print(e, " : ", str(correlation))
    dict[e] = correlation

In [None]:
{k: v for k, v in sorted(dict.items(), key=lambda item: item[1])}

In [None]:
# Number of vocabulary entries to keep.
amount_of_features = 5000
# (term, correlation) pairs in ascending order of correlation.
els = sorted(dict.items(), key=lambda item: item[1])
sorted_dict = {term: corr for term, corr in els}
# Keep the terms at the top end of the ascending order.
# NOTE(review): the order is by signed correlation, so strongly negative
# terms are dropped - confirm that this is intended.
new_vocab = [term for term, _corr in els][-amount_of_features:]
print(new_vocab)

In [None]:
# 10 Features with lowest Correlation to the Label
# `els` is sorted ascending, so the lowest values are the first ten entries
# (els[10:] returned everything except those).
els[:10]

In [None]:
# 10 Features with highest Correlation to the Label
els[-10:]


In [245]:
from scipy.stats import pearsonr
np.corrcoef(c,l)[1,0]



0.01847548316904673

In [None]:
# eliminate all the words which are not part of the new vocab.
# or use the new vocab
## order by frequency?

In [20]:
print(df)

00           False
00 for       False
000          False
000 mah      False
000mah       False
             ...  
zero         False
zerolemon    False
zip          False
zone         False
zoom         False
Length: 24484, dtype: bool


In [None]:
# Character length of every review.
df['char_count'] = df['reviewText'].str.len()
vectorizer = CountVectorizer()
vectorizer.fit(df['reviewText'])
# transform() expects an iterable of documents and rejects a bare string, so
# each review is wrapped in a one-element list. Each cell then holds a
# one-row sparse count matrix.
df['word_vec'] = df['reviewText'].apply(lambda review: vectorizer.transform([review]))

In [52]:
from nltk.corpus import stopwords
stop_words = set(stopwords.words('english'))
print(stop_words)

{'very', "isn't", 'about', "wasn't", 'you', 'were', 'does', 'i', 'theirs', 'while', 'then', 'has', 'nor', 'our', "don't", 'from', 'in', 'herself', 'haven', "that'll", 'how', 'them', "didn't", 'of', "should've", 'me', "hadn't", 'being', 'but', 'this', 'she', 'my', 't', 'hers', 'into', "mustn't", 'aren', 'will', 'any', 'a', 'hadn', 'shan', "you'll", "it's", 'after', 'too', 'his', 'down', 'do', "weren't", 'most', 'before', 'won', 'off', 'have', 'weren', 'o', 're', 'and', 'same', 'mustn', 'wouldn', "she's", 'which', "couldn't", 'under', 'hasn', "you're", 'only', "hasn't", 'll', 've', 'yourselves', 'now', 'again', 'doing', 'own', 'ma', "won't", 'through', 'such', 'just', 'it', 'needn', 'out', "needn't", 'him', 'or', 'why', 'few', 'y', 'above', 'had', 'did', 'at', 'shouldn', 'himself', 'for', 'both', 'those', "you've", 'don', 'whom', "shan't", "wouldn't", 'was', 'where', "shouldn't", "haven't", 'if', 'no', 'mightn', 'these', 'they', 'is', 'what', 'between', "you'd", 'am', 'because', 'that', 

In [None]:
df.loc[2]['word_vec']

In [None]:
ratings = ['1.0', '2.0', '3.0', '4.0', '5.0']
for r in ratings:
    print(r)

In [None]:
# Oversampling the text and target and put them into a new df
# RandomOversampler and Undersampler only work in feature space -> compose the undersampling manually
# Split into classes -> take only 90000 random rows from each class
# data.describe()
# One list of row-lists per star rating, 1.0 through 5.0.
df_1, df_2, df_3, df_4, df_5 = [
    data[data['overall'] == stars].values.tolist()
    for stars in (1.0, 2.0, 3.0, 4.0, 5.0)
]
print(len(df_1),len(df_2), len(df_3), len(df_4), len(df_5))
print(type(df_1))
# data = random.sample(data, 200000)

In [64]:
import random

# Upper bound on rows drawn per rating class.
SAMPLES_PER_CLASS = 90000
# Dedicated, seeded generator so the draw is reproducible on re-run.
rng = random.Random(42)

# random.sample raises ValueError when asked for more items than exist, so the
# request is capped at the class size.
df1 = rng.sample(df_1, min(SAMPLES_PER_CLASS, len(df_1)))
df2 = rng.sample(df_2, min(SAMPLES_PER_CLASS, len(df_2)))
df3 = rng.sample(df_3, min(SAMPLES_PER_CLASS, len(df_3)))
df4 = rng.sample(df_4, min(SAMPLES_PER_CLASS, len(df_4)))
df5 = rng.sample(df_5, min(SAMPLES_PER_CLASS, len(df_5)))
# Frame of the sampled 1-star rows (columns are positional: 0, 1, 2, ...).
anh = pd.DataFrame(df1)

In [71]:
anh.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8
0,A2FL0EKQ2EQ9E6,0790743221,"a viewer ""a viewer""","[9, 67]",First let me say this: I AM NOT A KATHARINE HE...,1.0,OVERRATED FILM!!,1091577600,"08 4, 2004"
1,A3HM5G9IV1FTFP,B000FS9UK8,Douglas King,"[3, 7]","I loved the original ""Basic Instinct"". Sure, i...",1.0,What a shame!,1144022400,"04 3, 2006"
2,A4Z9NHOW5LT0M,B000EHQTT0,JoB,"[0, 0]",The movie starts off super slow and basically ...,1.0,Really Bad!,1384819200,"11 19, 2013"
3,A30Q6TIQMXGATR,B000056T5E,pablo,"[2, 4]",here we are to rate this sort of horror film.i...,1.0,horrible!,1002240000,"10 5, 2001"
4,A207IYGZTAF0DS,B0001CVB0U,NoWireHangers,"[1, 8]","From the description of this movie, it sounds ...",1.0,Very bad movie. Not what you expected.,1151193600,"06 25, 2006"


In [None]:
# Two side-by-side copies of the first 50000 entries of `text`, one column
# per vectorizer in the mapper below.
# NOTE(review): `text` is only bound in a cell further down
# (text = data['reviewText']); on a fresh kernel this cell fails unless that
# cell has run first. This also rebinds `df`.
df = pd.concat([text[:50000], text[:50000]], axis=1)
print(df.shape)
df.columns=['count', 'tfidf']
print(df.describe())
# Applies CountVectorizer to the 'count' column and TfidfVectorizer to the
# 'tfidf' column.
mapper = DataFrameMapper([
    ('count', CountVectorizer()),
    ('tfidf', TfidfVectorizer())
])


In [6]:
df.info(memory_usage='deep')

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50000 entries, 0 to 49999
Data columns (total 2 columns):
count    50000 non-null object
tfidf    50000 non-null object
dtypes: object(2)
memory usage: 98.8 MB


In [None]:
features = mapper.fit_transform(df)

In [None]:
# Characters deleted outright: . ; : ! ' ? , " ( ) [ ] and runs of digits.
# Raw strings avoid the invalid escape sequences ('\.', '\;', ...) that a
# plain literal would contain.
REPLACE_NO_SPACE = re.compile(r"[.;:!'?,\"()\[\]]|\d+")
# Replaced by one space: any run of <br/> tags (one or more), '-' and '/'.
# Matching a single <br /> as well keeps its '/' from being handled separately
# and leaving '<br  >' behind.
REPLACE_WITH_SPACE = re.compile(r"(?:<br\s*/>)+|[-/]")
NO_SPACE = ""
SPACE = " "
# NOTE(review): this shares its name with the larger preprocessing pipeline
# defined earlier in the notebook; whichever cell runs last wins.
def preprocess_reviews(reviews):
    """Lower-case each review, delete punctuation/digits, and turn <br/> tags, '-' and '/' into spaces.

    Parameters
    ----------
    reviews : iterable of str

    Returns
    -------
    list of str
        One cleaned string per input review, in input order.
    """
    reviews = [REPLACE_NO_SPACE.sub(NO_SPACE, line.lower()) for line in reviews]
    reviews = [REPLACE_WITH_SPACE.sub(SPACE, line) for line in reviews]
    return reviews


In [None]:
import sklearn
print('The scikit-learn version is {}.'.format(sklearn.__version__))

In [None]:
# Total number of distinct whitespace-separated "words" in the review texts
# (for the Amazon cell-phone set and for the movie set).
results = set()
for review_words in data['reviewText'].str.split():
    results.update(review_words)
print(len(results))

In [None]:
import sklearn
print('The scikit-learn version is {}.'.format(sklearn.__version__))


In [9]:
import sklearn
print('The scikit-learn version is {}.'.format(sklearn.__version__))

SyntaxError: invalid syntax (<ipython-input-9-27cd4a422c43>, line 3)

In [11]:
from sklearn.feature_extraction.text import CountVectorizer
# Documents to vectorize. A one-sentence toy list was used while exploring:
# text = ["The quick brown fox jumped over the lazy dog. The cat isn't keen about it. It just wants to eat"]
text = data['reviewText']
# Create the vectorizer, then tokenize and build the vocabulary in one chained
# call (fit() returns the vectorizer itself).
vectorizer = CountVectorizer().fit(text)
# Mapping of term -> column index learned by fit().
vocab = vectorizer.vocabulary_

IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)



In [12]:
print(len(vocab))

815663


In [6]:
# Encode the documents in `text` with the fitted vectorizer (sparse count matrix).
vector = vectorizer.transform(text)
# Summarize the encoded result: shape, container type and dense contents.
print(vector.shape)
print(type(vector))
# NOTE(review): toarray() densifies the whole matrix. The output shown for
# this cell is for a single toy document; confirm the size before running it
# on the full review column.
print(vector.toarray())

(1, 17)
<class 'scipy.sparse.csr.csr_matrix'>
[[1 1 1 1 1 1 1 2 1 1 1 1 1 1 3 1 1]]


In [7]:
print(vector)




  (0, 0)	1
  (0, 1)	1
  (0, 2)	1
  (0, 3)	1
  (0, 4)	1
  (0, 5)	1
  (0, 6)	1
  (0, 7)	2
  (0, 8)	1
  (0, 9)	1
  (0, 10)	1
  (0, 11)	1
  (0, 12)	1
  (0, 13)	1
  (0, 14)	3
  (0, 15)	1
  (0, 16)	1
