In [1]:
import pandas as pd
import numpy as np
import nltk
from nltk import word_tokenize, FreqDist

In [3]:
df = pd.read_csv('Womens Clothing E-Commerce Reviews.csv')
df.head()

Unnamed: 0.1,Unnamed: 0,Clothing ID,Age,Title,Review Text,Rating,Recommended IND,Positive Feedback Count,Division Name,Department Name,Class Name
0,0,767,33,,Absolutely wonderful - silky and sexy and comf...,4,1,0,Initmates,Intimate,Intimates
1,1,1080,34,,Love this dress! it's sooo pretty. i happene...,5,1,4,General,Dresses,Dresses
2,2,1077,60,Some major design flaws,I had such high hopes for this dress and reall...,3,0,0,General,Dresses,Dresses
3,3,1049,50,My favorite buy!,"I love, love, love this jumpsuit. it's fun, fl...",5,1,0,General Petite,Bottoms,Pants
4,4,847,47,Flattering shirt,This shirt is very flattering to all due to th...,5,1,6,General,Tops,Blouses


In [4]:
df = df.replace(np.nan, "")
df.head()

Unnamed: 0.1,Unnamed: 0,Clothing ID,Age,Title,Review Text,Rating,Recommended IND,Positive Feedback Count,Division Name,Department Name,Class Name
0,0,767,33,,Absolutely wonderful - silky and sexy and comf...,4,1,0,Initmates,Intimate,Intimates
1,1,1080,34,,Love this dress! it's sooo pretty. i happene...,5,1,4,General,Dresses,Dresses
2,2,1077,60,Some major design flaws,I had such high hopes for this dress and reall...,3,0,0,General,Dresses,Dresses
3,3,1049,50,My favorite buy!,"I love, love, love this jumpsuit. it's fun, fl...",5,1,0,General Petite,Bottoms,Pants
4,4,847,47,Flattering shirt,This shirt is very flattering to all due to th...,5,1,6,General,Tops,Blouses


In [5]:
df.shape

(23486, 11)

In [6]:
df.dtypes

Unnamed: 0                  int64
Clothing ID                 int64
Age                         int64
Title                      object
Review Text                object
Rating                      int64
Recommended IND             int64
Positive Feedback Count     int64
Division Name              object
Department Name            object
Class Name                 object
dtype: object

In [7]:
tokens = word_tokenize(df['Review Text'].to_string())

In [8]:
df['Review Text'].astype(str)

0        Absolutely wonderful - silky and sexy and comf...
1        Love this dress!  it's sooo pretty.  i happene...
2        I had such high hopes for this dress and reall...
3        I love, love, love this jumpsuit. it's fun, fl...
4        This shirt is very flattering to all due to th...
                               ...                        
23481    I was very happy to snag this dress at such a ...
23482    It reminds me of maternity clothes. soft, stre...
23483    This fit well, but the top was very see throug...
23484    I bought this dress for a wedding i have this ...
23485    This dress in a lovely platinum is feminine an...
Name: Review Text, Length: 23486, dtype: object

In [10]:
df.dtypes

Unnamed: 0                  int64
Clothing ID                 int64
Age                         int64
Title                      object
Review Text                object
Rating                      int64
Recommended IND             int64
Positive Feedback Count     int64
Division Name              object
Department Name            object
Class Name                 object
dtype: object

In [11]:
len(tokens)

282097

In [12]:
len(set(tokens))

31274

In [13]:
WNlemma = nltk.WordNetLemmatizer()
from nltk.corpus import wordnet
from nltk import pos_tag


In [14]:
def get_wordnet_pos(treebank_tag):
    if treebank_tag.startswith('V'):
        return wordnet.VERB
    elif treebank_tag.startswith('J'):
        return wordnet.ADJ
    elif treebank_tag.startswith('R'):
        return wordnet.ADV
    else:
        return wordnet.NOUN


In [15]:
def pre_process_with_pos_tag(text):
    text = text.lower()
    tokens = nltk.word_tokenize(text)
    tokens = [t for t in tokens if len(t) > 2]
    tokens = [WNlemma.lemmatize(t, get_wordnet_pos(pos_tag(word_tokenize(t))[0][1])) for t in tokens]
    text_after_process = " ".join(tokens)
    return text_after_process

In [16]:
review_text_processed = df['Review Text'].apply(pre_process_with_pos_tag)
review_text_processed

0        absolutely wonderful silky and sexy and comfor...
1        love this dress sooo pretty happen find store ...
2        have such high hope for this dress and really ...
3        love love love this jumpsuit fun flirty and fa...
4        this shirt very flatter all due the adjustable...
                               ...                        
23481    be very happy snag this dress such great price...
23482    reminds maternity clothes soft stretchy shiny ...
23483    this fit well but the top be very see through ...
23484    bought this dress for wedding have this summer...
23485    this dress lovely platinum feminine and fit pe...
Name: Review Text, Length: 23486, dtype: object

In [35]:
review_text_processed_df = pd.DataFrame(
    {
        'review_text_processed':review_text_processed,
        'recommended':df['Recommended IND']
    },
    columns = ['review_text_processed','recommended']
)
review_text_processed_df.head()

Unnamed: 0,review_text_processed,recommended
0,absolutely wonderful silky and sexy and comfor...,1
1,love this dress sooo pretty happen find store ...,1
2,have such high hope for this dress and really ...,0
3,love love love this jumpsuit fun flirty and fa...,1
4,this shirt very flatter all due the adjustable...,1


In [36]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(review_text_processed_df.review_text_processed,
                                                    review_text_processed_df.recommended,
                                                    test_size = 0.2,
                                                    random_state = 5205
                                                   )

In [37]:
from sklearn.feature_extraction.text import CountVectorizer
count_vect = CountVectorizer()

X_train_counts = count_vect.fit_transform(X_train)
X_train_counts.shape

(18788, 10511)

In [38]:
from sklearn.feature_extraction.text import TfidfTransformer
tf_transformer = TfidfTransformer().fit(X_train_counts)
X_train_tf = tf_transformer.transform(X_train_counts)
X_train_tf.shape

(18788, 10511)

In [39]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn import metrics

nb_clf = Pipeline([('vect', CountVectorizer()),
                   ('tfidf', TfidfTransformer()),
                   ('clf', MultinomialNB())
                  ])
nb_clf.fit(X_train, y_train)
nb_predicted = nb_clf.predict(X_test)

print(metrics.confusion_matrix(y_test, nb_predicted))
print(np.mean(nb_predicted==y_test))
print(metrics.classification_report(y_test, nb_predicted))

[[  21  800]
 [   0 3877]]
0.8297147722435079
              precision    recall  f1-score   support

           0       1.00      0.03      0.05       821
           1       0.83      1.00      0.91      3877

    accuracy                           0.83      4698
   macro avg       0.91      0.51      0.48      4698
weighted avg       0.86      0.83      0.76      4698



In [18]:
review_text = np.array(df['Review Text'])

In [19]:
for i in range(len(review_text)):
    review_text[i] = review_text[i].lower()

In [20]:
review_text_combined = " ".join(review for review in review_text)
print ("There are {} words in the combination of all reviews.".format(len(review_text_combined)))

There are 7012488 words in the combination of all reviews.


In [23]:
department = np.array(df['Department Name'])

In [24]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(review_text, department, test_size=0.2, random_state=42)

In [25]:
from sklearn.feature_extraction.text import CountVectorizer
#For the purpose of this exercise keep stop words
count_vect = CountVectorizer()

In [26]:
X_train_counts = count_vect.fit_transform(X_train)
#See shape
X_train_counts.shape

(18788, 12877)

In [27]:
#Fix weight of words based on length of text
from sklearn.feature_extraction.text import TfidfTransformer
tfidf_transformer = TfidfTransformer()

In [28]:
X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)
X_train_tfidf.shape

(18788, 12877)

In [29]:
from sklearn.linear_model import SGDClassifier

In [30]:
#Create the classifer
clf_sgd= SGDClassifier(loss= 'hinge', penalty='l2', alpha= 1e-4, max_iter= 5, learning_rate='optimal')
#Fit to training data
clf_sgd.fit(X_train_tfidf, y_train)



SGDClassifier(alpha=0.0001, average=False, class_weight=None,
              early_stopping=False, epsilon=0.1, eta0=0.0, fit_intercept=True,
              l1_ratio=0.15, learning_rate='optimal', loss='hinge', max_iter=5,
              n_iter_no_change=5, n_jobs=None, penalty='l2', power_t=0.5,
              random_state=None, shuffle=True, tol=0.001,
              validation_fraction=0.1, verbose=0, warm_start=False)

In [32]:
#Predict using SVM
X_test_counts = count_vect.transform(X_test)
#Normalize test set
X_test_tfidf = tfidf_transformer.transform(X_test_counts)
X_test_tfidf.shape
sgd_predict = clf_sgd.predict(X_test_tfidf)

In [33]:
np.mean(sgd_predict == y_test)

0.8263090676883781

In [34]:
cv = CountVectorizer()
X = cv.fit_transform(X) # Fit the Data
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2,random_state=5205)
#Naive Bayes Classifier
from sklearn.naive_bayes import MultinomialNB
clf = MultinomialNB()
clf.fit(X_train,y_train)
clf.score(X_test,y_test)

NameError: name 'X' is not defined