# Logistic Regression Baseline
* assume data has already been cleaned

## To Do:
- run classifier
- return metrics
- consolidate code into methods so it can be called on multiple datasets

In [1]:
import pandas as pd
import numpy as np
import string
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.model_selection import GridSearchCV, PredefinedSplit
from sklearn.linear_model import LogisticRegression, LogisticRegressionCV # CV is cross validation
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.metrics import accuracy_score
from sklearn import svm

In [2]:
import urllib.request
import os.path

year = '2014'
# The trailing '' makes os.path.join end the directory with the platform's
# separator, so `path + filename` concatenation still yields a valid path.
path = os.path.join('data', 'processed', '')


def fetch_split(split, year=year, directory=path):
    """Return ``(local_csv_path, headers)`` for one split, downloading it if absent.

    Parameters
    ----------
    split : str
        Suffix of the file name: 'train', 'dev' or 'test'.
    year : str
        Year embedded in both the remote folder and the file name.
    directory : str
        Local folder that caches the CSV files; created when missing.

    Returns
    -------
    tuple
        ``(path, headers)``. ``headers`` is the value returned by
        ``urllib.request.urlretrieve`` after a download, or ``None`` when the
        file was already cached. Callers read the path from index 0.
    """
    filename = 'yelp_' + year + '_' + split + '.csv'
    local_path = os.path.join(directory, filename)
    if os.path.exists(local_path):
        return os.path.abspath(local_path), None

    if directory:
        os.makedirs(directory, exist_ok=True)
    url = 'https://www.cs.mcgill.ca/~glluch/yelp_' + year + '/' + filename
    # Download under a temporary name and rename afterwards, so an interrupted
    # transfer is never mistaken for a complete cached file on the next run.
    partial_path = local_path + '.part'
    _, headers = urllib.request.urlretrieve(url, partial_path)
    os.replace(partial_path, local_path)
    return local_path, headers


train = fetch_split('train')
valid = fetch_split('dev')
test = fetch_split('test')

In [3]:
# Load data: train/test/valid each arrive as a tuple whose first item is a CSV path.
train, test, valid = (pd.read_csv(split[0]) for split in (train, test, valid))

# Separate each frame into its 'text' column (inputs) and 'rating' column (labels).
Xtrain, y_train = train['text'], train['rating']
Xtest, y_test = test['text'], test['rating']
Xvalid, y_valid = valid['text'], valid['rating']

# Rebind the frame names to empty lists so they no longer reference the DataFrames.
train, valid, test = [], [], []

In [4]:
def logReg(X_train, y_train, X_test, y_test, cv=5):
    """Fit a cross-validated multinomial logistic regression and print its scores.

    The model is a ``LogisticRegressionCV`` built with ``cv`` folds and
    ``multi_class='multinomial'``, fitted on ``(X_train, y_train)`` and then
    used to predict ``X_test``.

    Printed, in order: the confusion matrix, a blank separator, the
    classification report, and the accuracy, all computed against ``y_test``.

    Returns the dict produced by the fitted estimator's ``get_params()``.
    """
    classifier = LogisticRegressionCV(cv=cv, multi_class='multinomial')
    classifier.fit(X_train, y_train)
    predictions = classifier.predict(X_test)

    matrix = confusion_matrix(y_test, predictions)
    print(matrix)
    print('\n')
    report = classification_report(y_test, predictions)
    print(report)

    accuracy = accuracy_score(y_test, predictions)
    print(accuracy)
    return classifier.get_params()

In [5]:
# Cap on the vocabulary size, passed to CountVectorizer as max_features.
max_vocab = 50000
# Upper bound of the n-gram range; the n-gram vectorizer uses ngram_range=(1, n).
n = 2

In [6]:
# Binary bag-of-words features (token present / absent), vocabulary capped at max_vocab.
# .values.astype('U') coerces every entry to a unicode string so that non-string
# entries (e.g. NaN from empty CSV fields) do not break the tokenizer.
bow_transformer = CountVectorizer(max_features=max_vocab, binary=True)
# fit_transform learns the vocabulary and builds the training matrix in one pass;
# it is equivalent to fit() followed by transform() on the same documents, but
# avoids tokenizing and string-converting the training corpus twice.
X_train = bow_transformer.fit_transform(Xtrain.values.astype('U'))
X_valid = bow_transformer.transform(Xvalid.values.astype('U'))
X_test = bow_transformer.transform(Xtest.values.astype('U'))

In [9]:
# Score the current feature matrices (binary bag-of-words if cells run top to bottom).
# NOTE(review): X_valid / y_valid are prepared but scoring here uses the test split -
# confirm this is intended before comparing feature variants.
k = logReg(X_train, y_train, X_test, y_test)



[[ 8063  1513   476   469   557]
 [ 2362  3602  2504  1244   562]
 [  697  1730  6301  6135  1514]
 [  269   380  2825 19007 11734]
 [  296   116   445  8061 31687]]


              precision    recall  f1-score   support

           1       0.69      0.73      0.71     11078
           2       0.49      0.35      0.41     10274
           3       0.50      0.38      0.44     16377
           4       0.54      0.56      0.55     34215
           5       0.69      0.78      0.73     40605

   micro avg       0.61      0.61      0.61    112549
   macro avg       0.58      0.56      0.57    112549
weighted avg       0.60      0.61      0.60    112549

0.6100454024469342


In [10]:
# L1-normalise each row of the count matrices; IDF re-weighting is switched off.
# TODO: check settings (smooth_idf presumably has no effect while use_idf=False).
tf_transformer = TfidfTransformer(norm='l1', use_idf=False, smooth_idf=False)
X_train = tf_transformer.fit_transform(X_train)
X_valid, X_test = (tf_transformer.transform(split) for split in (X_valid, X_test))

In [11]:
# Score the current feature matrices (L1-normalised bag-of-words if cells run top to bottom).
# NOTE(review): X_valid / y_valid are prepared but scoring here uses the test split -
# confirm this is intended before comparing feature variants.
k = logReg(X_train, y_train, X_test, y_test)



[[ 8088  1563   489   405   533]
 [ 2407  3375  2611  1331   550]
 [  774  1580  5866  6780  1377]
 [  375   383  2496 20246 10715]
 [  452   136   453  9118 30446]]


              precision    recall  f1-score   support

           1       0.67      0.73      0.70     11078
           2       0.48      0.33      0.39     10274
           3       0.49      0.36      0.41     16377
           4       0.53      0.59      0.56     34215
           5       0.70      0.75      0.72     40605

   micro avg       0.60      0.60      0.60    112549
   macro avg       0.57      0.55      0.56    112549
weighted avg       0.60      0.60      0.60    112549

0.6043678753254138


In [6]:
# Binary presence features over 1..n-grams, vocabulary capped at max_vocab.
# .values.astype('U') coerces every entry to a unicode string so that non-string
# entries (e.g. NaN from empty CSV fields) do not break the tokenizer.
ngram_transformer = CountVectorizer(max_features=max_vocab, ngram_range=(1, n), binary=True)

# fit_transform learns the vocabulary and builds the training matrix in one pass;
# it is equivalent to fit() followed by transform() on the same documents, but
# avoids tokenizing and string-converting the training corpus twice.
X_train = ngram_transformer.fit_transform(Xtrain.values.astype('U'))
X_valid = ngram_transformer.transform(Xvalid.values.astype('U'))
X_test = ngram_transformer.transform(Xtest.values.astype('U'))

In [7]:
# Score the current feature matrices (binary 1..n-gram features if cells run top to bottom).
# NOTE(review): X_valid / y_valid are prepared but scoring here uses the test split -
# confirm this is intended before comparing feature variants.
k = logReg(X_train, y_train, X_test, y_test)



[[ 8182  1650   445   358   443]
 [ 2243  4136  2534   938   423]
 [  644  1835  6984  5650  1264]
 [  238   329  2858 19634 11156]
 [  225    92   385  7924 31979]]


              precision    recall  f1-score   support

           1       0.71      0.74      0.72     11078
           2       0.51      0.40      0.45     10274
           3       0.53      0.43      0.47     16377
           4       0.57      0.57      0.57     34215
           5       0.71      0.79      0.74     40605

   micro avg       0.63      0.63      0.63    112549
   macro avg       0.61      0.59      0.59    112549
weighted avg       0.62      0.63      0.62    112549

0.6300811202231916


In [8]:
# L1-normalise each row of the n-gram matrices; IDF re-weighting is switched off.
# TODO: check settings (smooth_idf presumably has no effect while use_idf=False).
tf_transformer = TfidfTransformer(norm='l1', use_idf=False, smooth_idf=False)
X_train = tf_transformer.fit_transform(X_train)
X_valid, X_test = (tf_transformer.transform(split) for split in (X_valid, X_test))

In [9]:
# Score the current feature matrices (L1-normalised 1..n-gram features if cells run top to bottom).
# NOTE(review): X_valid / y_valid are prepared but scoring here uses the test split -
# confirm this is intended before comparing feature variants.
k = logReg(X_train, y_train, X_test, y_test)



[[ 8146  1567   542   383   440]
 [ 2260  3565  2812  1180   457]
 [  692  1430  6527  6611  1117]
 [  320   294  2435 21100 10066]
 [  385   111   410  9074 30625]]


              precision    recall  f1-score   support

           1       0.69      0.74      0.71     11078
           2       0.51      0.35      0.41     10274
           3       0.51      0.40      0.45     16377
           4       0.55      0.62      0.58     34215
           5       0.72      0.75      0.74     40605

   micro avg       0.62      0.62      0.62    112549
   macro avg       0.60      0.57      0.58    112549
weighted avg       0.62      0.62      0.62    112549

0.621622582164213
