In [2]:
%pylab inline
import pandas as pd
import csv
import numpy as np
import re
import nltk
import matplotlib.pyplot as plt
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import TruncatedSVD
from sklearn.model_selection import train_test_split
from xgboost import XGBClassifier
import string

Populating the interactive namespace from numpy and matplotlib


In [3]:
#read csv
amazon_df = pd.read_csv('/media/backup/Data/Amazon/amazon_male.csv', sep='|',encoding='utf8',quoting=csv.QUOTE_NONE, dtype={'Overall_Rating':float, 'Timestamp': int})

In [4]:
#arranging row index
amazon_df.index = amazon_df.index+1

In [5]:
amazon_df.head()

Unnamed: 0,Reviewer_Id,Product_Id,Name,Gender,Helpfulness,Review,Overall_Rating,Timestamp
1,A00000262KYZUE4J55XGL,B003UYU16G,Steven N Elich,male,0,It is and does exactly what the description sa...,5.0,1353456000
2,A00000922W28P2OCH6JSE,B000VEBG9Y,Gabriel Merrill,male,0,Very mobile product. Efficient. Easy to use; h...,3.0,1395619200
3,A00000922W28P2OCH6JSE,B001EJMS6K,Gabriel Merrill,male,0,Easy to use a mobile. If you're taller than 4f...,4.0,1395619200
4,A00000922W28P2OCH6JSE,B003XJCNVO,Gabriel Merrill,male,0,Love this feeder. Heavy duty & capacity. Best ...,4.0,1395619200
5,A00000922W28P2OCH6JSE,B00G3KN9LI,Gabriel Merrill,male,0,"Solid, stable mount. Holds iPhone with phone p...",4.0,1396051200


In [6]:
#amazon_df.Review.to_csv(Reviews.csv)
amazon_df.shape

(16950602, 8)

In [7]:
amazon_df.drop(['Reviewer_Id','Product_Id', 'Name', 'Helpfulness', 'Overall_Rating', 'Timestamp'], axis=1, inplace=True)    

In [8]:
amazon_df = amazon_df.iloc[:50000]

In [9]:
amazon_df.head()

Unnamed: 0,Gender,Review
1,male,It is and does exactly what the description sa...
2,male,Very mobile product. Efficient. Easy to use; h...
3,male,Easy to use a mobile. If you're taller than 4f...
4,male,Love this feeder. Heavy duty & capacity. Best ...
5,male,"Solid, stable mount. Holds iPhone with phone p..."


# Female Data

In [10]:
amazon_df_f = pd.read_csv('/media/backup/Data/Amazon/amazon_female.csv', sep='|',encoding='utf8',quoting=csv.QUOTE_NONE, dtype={'Overall_Rating':float, 'Timestamp': int})

In [11]:
#arranging row index
amazon_df_f.index = amazon_df_f.index+1

In [12]:
amazon_df_f.head()

Unnamed: 0,Reviewer_Id,Product_Id,Name,Gender,Helpfulness,Review,Overall_Rating,Timestamp
1,A00001362Q1PGIX2FYSSH,B00125OS3C,Pamela Bellamy,female,0,Beautiful photos/film with wonderful music. G...,5.0,1377907200
2,A00001362Q1PGIX2FYSSH,B002IGHX40,Pamela Bellamy,female,11,My idea of Colorado is &#34;Mountains&#34;. C...,2.0,1377907200
3,A00003262KNLZOSMMMFVV,B002Y2U8MC,Harue Rojas,female,11,"It is not a sticker, it is a Chritsmas story ...",5.0,1361145600
4,A00003262KNLZOSMMMFVV,B004P598FY,Harue Rojas,female,11,"LOve the size and the details, and its very co...",5.0,1361145600
5,A00003262KNLZOSMMMFVV,B005MU3UE6,Harue Rojas,female,11,Its very colorful and the image once you put...,5.0,1361145600


In [13]:
amazon_df_f.drop(['Reviewer_Id','Product_Id', 'Name', 'Helpfulness', 'Overall_Rating', 'Timestamp'], axis=1, inplace=True)    

In [14]:
amazon_df_f = amazon_df_f.iloc[:50000]

In [15]:
amazon_df_f.shape

(50000, 2)

In [16]:
amazon_df_f.head()

Unnamed: 0,Gender,Review
1,female,Beautiful photos/film with wonderful music. G...
2,female,My idea of Colorado is &#34;Mountains&#34;. C...
3,female,"It is not a sticker, it is a Chritsmas story ..."
4,female,"LOve the size and the details, and its very co..."
5,female,Its very colorful and the image once you put...


In [17]:
data = amazon_df.append(amazon_df_f, ignore_index= True)

In [18]:
data.index = data.index + 1

In [19]:
data.shape

(100000, 2)

In [20]:
data = data.sample(frac = 1).reset_index(drop=True)

In [21]:
data.index = data.index +1

In [22]:
data.head()

Unnamed: 0,Gender,Review
1,female,This nose ring is too short. It hurt my nose a...
2,male,A great concept in mult-generational protagoni...
3,female,Wow. Wasn't expecting to receive the package s...
4,female,I will continue using this product because it ...
5,female,i was a bit dissapointed that the jersey did n...


# Data Preprocessing

In [23]:
#data preprocessing
porter_stemmer = nltk.PorterStemmer()
words = stopwords.words("english")

In [24]:
data['Review'] = data['Review'].apply(str)

In [25]:
#remove punctuation for each word
#maketrans() method returns a translation table that maps each character in the 
#intab string into the character at the same position in the outtab string
table = str.maketrans('','', string.punctuation)

In [26]:
#data cleaning and forming a new column
#remove words that are not alphanumeric eg. remove s of what's 

data['Cleaned_Reviews'] = data['Review'].apply(lambda x: " ".join([i.translate(table) for i in x.split() if i.isalpha() if i not in words]).lower())

In [27]:
data.head()

Unnamed: 0,Gender,Review,Cleaned_Reviews
1,female,This nose ring is too short. It hurt my nose a...,this nose ring it hurt nose cute post needs bi...
2,male,A great concept in mult-generational protagoni...,a great concept protagonist gameplay makes har...
3,female,Wow. Wasn't expecting to receive the package s...,expecting receive package i ordered friday aft...
4,female,I will continue using this product because it ...,i continue using product fragrance definitely ...
5,female,i was a bit dissapointed that the jersey did n...,bit dissapointed jersey tracking number took t...


In [28]:
#1 for female and 0 for male
data['Gender_conv'] = pd.get_dummies(data.Gender)['female']

In [29]:
data.head()

Unnamed: 0,Gender,Review,Cleaned_Reviews,Gender_conv
1,female,This nose ring is too short. It hurt my nose a...,this nose ring it hurt nose cute post needs bi...,1
2,male,A great concept in mult-generational protagoni...,a great concept protagonist gameplay makes har...,0
3,female,Wow. Wasn't expecting to receive the package s...,expecting receive package i ordered friday aft...,1
4,female,I will continue using this product because it ...,i continue using product fragrance definitely ...,1
5,female,i was a bit dissapointed that the jersey did n...,bit dissapointed jersey tracking number took t...,1


In [30]:
#improvements
#stemming the reviews

data['Cleaned_Reviews'] = data['Cleaned_Reviews'].apply(lambda x: " ".join([porter_stemmer.stem(word) for word in x.split()]))

In [31]:
data.head()

Unnamed: 0,Gender,Review,Cleaned_Reviews,Gender_conv
1,female,This nose ring is too short. It hurt my nose a...,thi nose ring it hurt nose cute post need bit ...,1
2,male,A great concept in mult-generational protagoni...,a great concept protagonist gameplay make hard...,0
3,female,Wow. Wasn't expecting to receive the package s...,expect receiv packag i order friday after i or...,1
4,female,I will continue using this product because it ...,i continu use product fragranc definit reorder...,1
5,female,i was a bit dissapointed that the jersey did n...,bit dissapoint jersey track number took till r...,1


In [32]:
data['Word_Count'] = data['Cleaned_Reviews'].str.split().str.len()

In [33]:
data.head()

Unnamed: 0,Gender,Review,Cleaned_Reviews,Gender_conv,Word_Count
1,female,This nose ring is too short. It hurt my nose a...,thi nose ring it hurt nose cute post need bit ...,1,14
2,male,A great concept in mult-generational protagoni...,a great concept protagonist gameplay make hard...,0,19
3,female,Wow. Wasn't expecting to receive the package s...,expect receiv packag i order friday after i or...,1,31
4,female,I will continue using this product because it ...,i continu use product fragranc definit reorder...,1,8
5,female,i was a bit dissapointed that the jersey did n...,bit dissapoint jersey track number took till r...,1,19


In [34]:
X = data[['Cleaned_Reviews', 'Word_Count']]
Y = data['Gender_conv']

#test train split
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.2, random_state = 0)

In [35]:
from sklearn.base import BaseEstimator, TransformerMixin

class TextSelector(BaseEstimator, TransformerMixin):
    def __init__(self, field):
        self.field = field
    def fit(self, X, y=None):
        return self
    def transform(self, X):
        return X[self.field]
    
    
class NumberSelector(BaseEstimator, TransformerMixin):
    def __init__(self, field):
        self.field = field
    def fit(self, X, y=None):
        return self
    def transform(self, X):
        return X[[self.field]]

In [36]:
classifier = Pipeline([
    ('features', FeatureUnion([
        ('text', Pipeline([
            ('colext', TextSelector('Cleaned_Reviews')),
            ('tfidf', TfidfVectorizer( stop_words="english",
                     min_df=.0025, max_df=0.25, ngram_range=(1,3))),
            ('svd', TruncatedSVD(algorithm='randomized', n_components=300))
        ])),
        ('words', Pipeline([
            ('wordext', NumberSelector('Word_Count')),
            ('wscaler', StandardScaler()),
        ])),
    ])),
    ('clf', XGBClassifier(max_depth=3, n_estimators=300, learning_rate=0.1))
    ])

In [37]:
classifier.fit(X_train,y_train)

Pipeline(memory=None,
     steps=[('features', FeatureUnion(n_jobs=1,
       transformer_list=[('text', Pipeline(memory=None,
     steps=[('colext', TextSelector(field='Cleaned_Reviews')), ('tfidf', TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', inp...
       reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
       silent=True, subsample=1))])

In [38]:
preds = classifier.predict(X_test)

  if diff:


In [39]:
from sklearn.metrics import accuracy_score, precision_score, classification_report, confusion_matrix
print("Accuracy:", accuracy_score(y_test, preds))
print("Precision:", precision_score(y_test, preds))
print(classification_report(y_test, preds))
print(confusion_matrix(y_test, preds))

# [[ true_negatives   false_positives]
#  [ false_negatives  true_positives]]

Accuracy: 0.6744
Precision: 0.6792626728110599
             precision    recall  f1-score   support

          0       0.67      0.69      0.68      9987
          1       0.68      0.66      0.67     10013

avg / total       0.67      0.67      0.67     20000

[[6855 3132]
 [3380 6633]]
