In [6]:
import pandas as pd
import numpy as np

In [7]:
data = pd.read_csv("data.csv")

In [23]:
data['Positively Rated'] = np.where(data['Rating'] > 3, 1, 0)

In [25]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(data['Reviews'], data['Positively Rated'], random_state = 0)

In [32]:
# CountVectorizer
from sklearn.feature_extraction.text import CountVectorizer

vect = CountVectorizer().fit(X_train)

In [47]:
vect.get_feature_names()

X_train_vectorized = vect.transform(X_train)

In [49]:
from sklearn.linear_model import LogisticRegression

model = LogisticRegression()
model.fit(X_train_vectorized, y_train)



LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=None, solver='warn',
          tol=0.0001, verbose=0, warm_start=False)

In [50]:
from sklearn.metrics import roc_auc_score

predictions = model.predict(vect.transform(X_test))

print('AUC: ', roc_auc_score(y_test, predictions))

AUC:  0.5625


In [51]:
# get the feature names as numpy array
feature_names = np.array(vect.get_feature_names())

# Sort the coefficients from the model
sorted_coef_index = model.coef_[0].argsort()

# Find the 10 smallest and 10 largest coefficients
# The 10 largest coefficients are being indexed using [:-11:-1] 
# so the list returned is in order of largest to smallest
print('Smallest Coefs: \n{}\n'.format(feature_names[sorted_coef_index[:10]]))
print('Largest Coefs: \n{}\n'.format(feature_names[sorted_coef_index[:-11:-1]]))

Smallest Coefs: 
['have' 'not' 'but' 'mom' 'so' 'loves' 'she' 'network' 'cannot' 'connect']

Largest Coefs: 
['excelente' 'great' 'excellent' 'and' 'perfect' 'price' 'recommend' 'my'
 'other' 'at']



In [54]:
# These reviews are treated the same by our current model

print(model.predict(vect.transform(['network', 
                                   'an issue, phone is not working'])))

[1 1]
