In [1]:
import pandas as pd
import numpy as np

np.random.seed(0)

def read_text_file(f):
    """Load a reviews CSV and keep only complete (Text, Score) rows.

    Parameters
    ----------
    f : str, path-like or file-like
        Anything ``pandas.read_csv`` accepts; the data must contain the
        columns ``"Text"`` and ``"Score"``.

    Returns
    -------
    pandas.DataFrame
        Two columns, ``Text`` then ``Score``. Rows where either value is
        missing are removed; the surviving rows keep their original index
        labels (the index is not renumbered here).
    """
    wanted_columns = ["Text", "Score"]
    # Select first, then drop: only gaps in the two kept columns should
    # disqualify a row.
    return pd.read_csv(f).loc[:, wanted_columns].dropna(how="any")

# Load the raw reviews and preview the first rows.
# NOTE(review): the path is an absolute, machine-specific Windows location;
# the notebook will not run elsewhere until it is pointed at a local copy.
df = read_text_file(r"C:\Users\HPPC\Reviews.csv")
print (df.head())

                                                Text  Score
0  I have bought several of the Vitality canned d...      5
1  Product arrived labeled as Jumbo Salted Peanut...      1
2  This is a confection that has been around a fe...      4
3  If you are looking for the secret ingredient i...      2
4  Great taffy at a great price.  There was a wid...      5


In [2]:
def sampling_dataset(df, count=5000):
    """Draw a class-balanced random sample: ``count`` rows per distinct Score.

    Parameters
    ----------
    df : pandas.DataFrame
        Must have a ``Score`` column; rows are selected by index label.
    count : int, default 5000
        Number of rows drawn (without replacement) for every distinct
        ``Score`` value.

    Returns
    -------
    pandas.DataFrame
        The sampled rows, grouped by score in order of first appearance,
        with ``Score`` and ``Text`` as the leading columns. Original index
        labels and column dtypes are preserved. An input with no rows
        yields an empty frame with columns ``["Score", "Text"]``.

    Raises
    ------
    ValueError
        Raised by ``np.random.choice`` when any class has fewer than
        ``count`` rows.
    """
    leading_columns = ["Score", "Text"]

    per_class_samples = []
    for score in df.Score.unique():
        class_indexes = df[df.Score == score].index
        # Draws from NumPy's global RNG, so np.random.seed(...) set by the
        # caller makes the sample reproducible.
        random_indexes = np.random.choice(class_indexes, count, replace=False)
        per_class_samples.append(df.loc[random_indexes])

    if not per_class_samples:
        # pd.concat rejects an empty list; return an empty, correctly
        # labelled frame instead.
        return pd.DataFrame(columns=leading_columns)

    # One concat over frames that share the same columns: linear cost, no
    # column-alignment warning, and Score keeps its numeric dtype instead of
    # being upcast to object by an empty placeholder frame.
    sampled = pd.concat(per_class_samples, axis=0)
    trailing_columns = [c for c in sampled.columns if c not in leading_columns]
    return sampled.reindex(columns=leading_columns + trailing_columns)

# Replace the full frame with the per-score sample, then renumber the rows
# 0..n-1. label_sentences builds its 'SENT_<label>' tags from these index
# labels, so the reset gives every review a simple consecutive tag.
# NOTE(review): `df` is rebound here, so re-running this cell samples from
# the already-sampled frame rather than from the raw data.
df = sampling_dataset(df)
df.reset_index(drop=True,inplace=True)
print (df.head())
print (df.shape)

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  # This is added back by InteractiveShellApp.init_path()


  Score                                               Text
0     5  ...at least I hope it remains available. It is...
1     5  My grandson is a beef jerky addict. We don't h...
2     5  This coffee has a really great taste asnd if v...
3     5  Healthy edibles are a great treat for our 5 ye...
4     5  I have never been one to buy chicken in a can ...
(25000, 2)


In [19]:
from gensim.models.doc2vec import LabeledSentence
from gensim.models import Doc2Vec
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.stem.wordnet import WordNetLemmatizer
import re

# NOTE(review): lmtzr is not referenced by any other visible cell; tokens are
# fed to Doc2Vec without lemmatization.
lmtzr = WordNetLemmatizer()
# Token pattern: runs of word characters. Written as a raw string because
# "\w" in a normal string literal is an invalid escape sequence (warned about
# by recent Python versions); the compiled pattern is unchanged.
w = re.compile(r"\w+",re.I)

def label_sentences(df):
    """Turn every row of ``df`` into a tagged document for Doc2Vec.

    For each row, the ``"Text"`` value is lower-cased and split into tokens
    with the module-level pattern ``w``; the document is tagged
    ``'SENT_<index label>'`` using the row's DataFrame index label.

    Parameters
    ----------
    df : pandas.DataFrame
        Must have a ``"Text"`` column whose values support ``.lower()``.

    Returns
    -------
    list
        One ``LabeledSentence`` per row, in row order.
    """
    # NOTE(review): LabeledSentence appears to be the legacy gensim name for
    # TaggedDocument -- confirm against the installed gensim version.
    return [
        LabeledSentence(
            words=w.findall(row["Text"].lower()),
            tags=['SENT_%s' % row_label],
        )
        for row_label, row in df.iterrows()
    ]

def train_doc2vec_model(labeled_sentences):
    """Build and train a Doc2Vec model on the given tagged documents.

    The model is created with ``alpha`` and ``min_alpha`` both at 0.025, the
    vocabulary is built from ``labeled_sentences``, and ``train`` is then
    called 10 times. Each call runs ``model.epochs`` epochs; after each call
    ``alpha`` is lowered by 0.002 and ``min_alpha`` is pinned to the new
    ``alpha``.

    Parameters
    ----------
    labeled_sentences : list
        Tagged documents, e.g. the output of ``label_sentences``. The
        sequence is traversed more than once, so it must be re-iterable.

    Returns
    -------
    Doc2Vec
        The trained model.
    """
    # NOTE(review): gensim's guidance, as far as I recall, is a single
    # train() call with an explicit epochs value rather than manual alpha
    # stepping -- confirm before changing, since that alters the learned
    # vectors.
    initial_rate = 0.025
    rate_step = 0.002
    training_rounds = 10

    d2v = Doc2Vec(alpha=initial_rate, min_alpha=initial_rate)
    d2v.build_vocab(labeled_sentences)

    for _ in range(training_rounds):
        d2v.train(labeled_sentences, total_examples=d2v.corpus_count, epochs=d2v.epochs)
        # Step the learning rate down and pin min_alpha to it.
        d2v.alpha -= rate_step
        d2v.min_alpha = d2v.alpha

    return d2v



In [17]:

# Build the tagged corpus from the sampled frame.
# NOTE(review): this cell's execution count (17) is lower than that of the
# cell defining label_sentences (19), so the saved state comes from an
# out-of-order run.
sen = label_sentences(df)


  from ipykernel import kernelapp as app


In [20]:
# Train Doc2Vec on the tagged corpus; `model` is later passed to
# vectorize_comments to look up one vector per review.
model = train_doc2vec_model(sen)

In [21]:
def vectorize_comments(df,d2v_model):
    """Attach each row's trained document vector to ``df``.

    For every row, the tag ``'SENT_<index label>'`` is looked up in
    ``d2v_model.docvecs`` and the results are stored in a new
    ``'vectorized_comments'`` column.

    Parameters
    ----------
    df : pandas.DataFrame
        The frame whose rows were tagged for training. It is modified in
        place.
    d2v_model : object
        A trained model exposing ``docvecs[tag]`` lookup.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, now carrying the ``'vectorized_comments'``
        column.

    Raises
    ------
    KeyError
        Expected when a row's tag is missing from the model; the exact
        exception comes from the ``docvecs`` lookup.
    """
    # Look tags up by index label rather than by 0..n-1 position:
    # label_sentences derives its tags from the index labels, so this stays
    # correct even when the index is not a fresh RangeIndex. For a 0..n-1
    # index the generated tags are identical to the positional ones.
    comments = [d2v_model.docvecs['SENT_%s' % row_label] for row_label in df.index]
    df['vectorized_comments'] = comments

    return df

# Add the per-review document vectors as a new column and preview two rows.
df = vectorize_comments(df,model)
print (df.head(2))

  Score                                               Text  \
0     5  ...at least I hope it remains available. It is...   
1     5  My grandson is a beef jerky addict. We don't h...   

                                 vectorized_comments  
0  [-0.2687241, 0.39540887, -1.1514533, 1.1078795...  
1  [-0.52334917, 0.07890725, -0.48016855, 1.73405...  


In [47]:
from sklearn.model_selection import train_test_split,GridSearchCV
from sklearn.ensemble import RandomForestClassifier
import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning)
import pickle




# Series.astype returns a new Series and leaves the frame untouched, so the
# result must be assigned back for the integer cast to take effect.
df['Score'] = df['Score'].astype('int')
df['Score'].shape

(25000,)

In [51]:
def train_classifier(X,y,random_state=None):
    """Grid-search a random forest over the number of trees and return the fit.

    Parameters
    ----------
    X : sequence of feature vectors
        Training features, one vector per sample.
    y : sequence
        Training labels, aligned with ``X``.
    random_state : int or None, default None
        Forwarded to ``RandomForestClassifier``. ``None`` keeps the previous
        unseeded behaviour; pass an int for repeatable forests.

    Returns
    -------
    GridSearchCV
        The fitted search object (4-fold cross-validation) over
        ``n_estimators`` in {200, 400} with ``min_samples_leaf=1`` and
        ``min_samples_split=2``.
    """
    # The grid only varies n_estimators; the other two entries are single
    # values. bootstrap is not part of the grid and stays at the estimator's
    # default.
    parameters = {
        'n_estimators': [200, 400],
        'min_samples_leaf': [1],
        'min_samples_split': [2],
    }

    # Use the class under the name it is imported as in this notebook;
    # the alias `RFC` is never defined in the visible cells.
    forest = RandomForestClassifier(verbose=1, n_jobs=4, random_state=random_state)
    clf = GridSearchCV(forest, cv=4, param_grid=parameters)
    clf.fit(X, y)
    return clf

# Convert the vector and label columns to plain Python lists (Series.T is a
# no-op) and hold out 2% of the rows for testing with a fixed split seed.
X_train, X_test, y_train, y_test = train_test_split(df["vectorized_comments"].T.tolist(), df["Score"].T.tolist(), test_size=0.02, random_state=17)
# Grid-search the random forest on the training portion.
classifier = train_classifier(X_train,y_train)
# Report the best cross-validated score, then the score on the held-out rows.
print (classifier.best_score_, "----------------Best Accuracy score on Cross Validation Sets")
print (classifier.score(X_test,y_test))

[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:  1.4min
[Parallel(n_jobs=4)]: Done 192 tasks      | elapsed:  2.9min
[Parallel(n_jobs=4)]: Done 200 out of 200 | elapsed:  2.9min finished
[Parallel(n_jobs=4)]: Using backend ThreadingBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    1.5s
[Parallel(n_jobs=4)]: Done 192 tasks      | elapsed:    2.9s
[Parallel(n_jobs=4)]: Done 200 out of 200 | elapsed:    2.9s finished
[Parallel(n_jobs=4)]: Using backend ThreadingBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    0.7s
[Parallel(n_jobs=4)]: Done 192 tasks      | elapsed:    3.8s
[Parallel(n_jobs=4)]: Done 200 out of 200 | elapsed:    3.9s finished
[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:   22.9s
[Parallel(n_jobs=4)]: Done 192 tasks      | elapsed: 

0.36248979591836733 ----------------Best Accuracy score on Cross Validation Sets


[Parallel(n_jobs=4)]: Done 192 tasks      | elapsed:    0.2s


0.378


[Parallel(n_jobs=4)]: Done 400 out of 400 | elapsed:    0.5s finished


In [46]:
# y_train comes from train_test_split on list inputs, so it is a plain list
# with no .isnull() method; pd.isnull accepts lists and Series alike.
pd.isnull(y_train).sum()

0