In [4]:
import pandas as pd
from sklearn import metrics

In [5]:
def readPrintTSV(dir):
  """Load a delimited text file into a DataFrame and print a short summary.

  Despite the name, the file is parsed with the defaults of ``pd.read_csv``,
  i.e. as comma-separated values.

  Args:
    dir: Path of the file to read; passed unchanged to ``pd.read_csv``.

  Returns:
    The loaded ``pandas.DataFrame``.
  """
  frame = pd.read_csv(dir)
  n_rows, n_cols = frame.shape
  # Summary line followed by one blank line, then a preview of the first rows.
  print(f"imported {n_rows} rows of {n_cols} columns", end="\n\n")
  print(frame.head(5))
  return frame

In [6]:
salminen = readPrintTSV('Salminen Dataset.csv')

imported 40432 rows of 4 columns

             category  rating label  \
0  Home_and_Kitchen_5     5.0    CG   
1  Home_and_Kitchen_5     5.0    CG   
2  Home_and_Kitchen_5     5.0    CG   
3  Home_and_Kitchen_5     1.0    CG   
4  Home_and_Kitchen_5     5.0    CG   

                                               text_  
0  Love this!  Well made, sturdy, and very comfor...  
1  love it, a great upgrade from the original.  I...  
2  This pillow saved my back. I love the look and...  
3  Missing information on how to use it, but it i...  
4  Very nice set. Good quality. We have had the s...  


In [7]:
def checkremovenull(data):
  """Drop, in place, every row of ``data`` that contains a null value.

  Prints how many rows were removed, or a notice when none were.

  Args:
    data: ``pandas.DataFrame`` to clean. It is modified in place.

  Returns:
    The same ``data`` object after cleaning, so the call can be chained or
    assigned. ``DataFrame.dropna`` returns ``None`` when ``inplace=True``, so
    its result is deliberately not used as the return value.
  """
  initialrows = data.shape[0]
  data.dropna(inplace=True)
  removedrows = initialrows - data.shape[0]
  if removedrows > 0:
    # The number reported is the count of rows dropped.
    print(f"{removedrows} nulls removed")
  else:
    print("no nulls detected")
  return data

def checkremoveblankstr(data):
  """Drop, in place, every row of ``data`` holding an empty or whitespace-only string.

  A row is dropped when any of its column values is a ``str`` that is either
  empty (``""``) or consists solely of whitespace. Prints how many rows were
  removed, or a notice when none were.

  Args:
    data: ``pandas.DataFrame`` to clean. It is modified in place.

  Returns:
    None. The cleaned rows are available through ``data`` itself.
  """
  toDrop = []
  for row in data.itertuples():
    # row[0] is the index label; only the column values (row[1:]) are checked.
    for entry in row[1:]:
      if isinstance(entry, str) and (not entry or entry.isspace()):
        toDrop.append(row[0])  # index label of the offending row
        break
  if not toDrop:
    print("no empty strings detected")
    return None
  print(f"{len(toDrop)} rows with empty strings removed")
  data.drop(toDrop, inplace=True)
  return None


# Clean the loaded data. Both helpers call pandas with inplace=True, so
# `salminen` itself is modified and the return values are not used here.
checkremovenull(salminen)
checkremoveblankstr(salminen)

no nulls detected
no empty strings detected


In [8]:
from sklearn.model_selection import train_test_split

# Inputs are the review texts; targets are the values of the `label` column.
X, y = salminen['text_'], salminen['label']

# Hold out 33% of the rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.33,
    random_state=0,
)

In [9]:
# Preview the first ten training texts.
X_train.head(10)

16635    Worked fine.  Some post commented on foot peda...
14695    This is a wonderfully insightful and thought p...
29502    Im a firm believer in a workbook for every kid...
38668    Wow, it really is thin and low profile. The mo...
21585    Good for labs, beagles...all furr kiddos with ...
30157    These novella add clarity and an additional la...
4615     Great water bottle.. Although the cap is cheap...
69       My hot cocoa is finally perfect! This product ...
4807     It's perfect, covers my beach time perfectly, ...
21445    For the price, this hutch is excellent. Not su...
Name: text_, dtype: object

In [10]:
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import LinearSVC

# TF-IDF features feed a linear SVM. The classifier is seeded (same seed as the
# train/test split) so that refitting reproduces the same model.
model = Pipeline([
    ('vectorizer', TfidfVectorizer()),
    ('classifier', LinearSVC(random_state=0)),
])

In [11]:
# Fit the vectorizer and classifier steps of the pipeline on the training split only.
model.fit(X_train, y_train)

In [12]:
# Save the fitted pipeline to 'model.pkl' so it can be reloaded with joblib.load.
import joblib
joblib.dump(model, 'model.pkl')

['model.pkl']

# Model Loading and Running

Pass the path to the saved model file to the constructor. Pass a one-dimensional collection of review texts (for example, a pandas Series) to `predict` (or just call the object like a function) to get an array of predicted labels.

The model accepts only a one-dimensional input containing just the review text.

In [13]:
import joblib
class Model:
  """Thin wrapper around a fitted estimator that was saved with joblib.

  An instance can be used through ``predict`` or called directly like a
  function; both give the same result.
  """

  def __init__(self, dir):
    """Load the saved estimator.

    Args:
      dir: Path of the joblib file to load.
    """
    # NOTE(review): joblib.load unpickles the file, so only load files you trust.
    loaded = joblib.load(dir)
    self.model = loaded

  def predict(self, inputs):
    """Return the wrapped estimator's predictions for ``inputs``."""
    estimator = self.model
    return estimator.predict(inputs)

  def __call__(self, inputs):
    """Calling the instance is shorthand for ``predict``."""
    return self.predict(inputs)

In [14]:
# Score the reloaded model on the full dataset. These rows include the ones
# the model was trained on, so expect higher scores than on the test split alone.
print(
    metrics.classification_report(
        salminen['label'],
        Model('model.pkl').predict(salminen['text_']),
    )
)

              precision    recall  f1-score   support

          CG       0.95      0.95      0.95     20216
          OR       0.95      0.95      0.95     20216

    accuracy                           0.95     40432
   macro avg       0.95      0.95      0.95     40432
weighted avg       0.95      0.95      0.95     40432



In [15]:
# Reload the saved model and evaluate it on the held-out test split.
# NOTE: this rebinds `model` from the training pipeline to the loaded wrapper.
model = Model('model.pkl')
outputs = model.predict(X_test)

print(metrics.confusion_matrix(y_test, outputs))
print(metrics.classification_report(y_test, outputs))

[[6082  621]
 [ 640 6000]]
              precision    recall  f1-score   support

          CG       0.90      0.91      0.91      6703
          OR       0.91      0.90      0.90      6640

    accuracy                           0.91     13343
   macro avg       0.91      0.91      0.91     13343
weighted avg       0.91      0.91      0.91     13343

