In [1]:
# imports and settings
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
import re
from funs import *
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score
from sklearn.svm import SVC


pd.set_option('display.max_rows', 5)

In [2]:
# Reading data
file_path = './train.csv'
DF = pd.read_csv(file_path, lineterminator='\n')

In [3]:
DF.head()
# DF.shape
# DF.columns

X = DF['review']
y = DF['label'] == 'Positive'

X_train, X_test, y_train, y_test = train_test_split(X, y)

In [4]:
# data cleaning
num_reviews = X_train.size
clean_train_reviews = []
for i in range(num_reviews):
    if i % 1000 == 0:
        print('Processing review %d of %f' % (i, num_reviews))
    clean_train_reviews.append(process_review(X_train.iloc[i]))
len(clean_train_reviews)

Processing review 0 of 4746.000000
Processing review 1000 of 4746.000000
Processing review 2000 of 4746.000000
Processing review 3000 of 4746.000000
Processing review 4000 of 4746.000000


4746

In [5]:
# Creating bag of words
vectorizer = CountVectorizer(analyzer='word', max_features=500)

In [6]:
train_data_features = vectorizer.fit_transform(clean_train_reviews)
train_data_features = train_data_features.toarray()

In [7]:
train_data_features.shape

(4746, 500)

In [8]:
vocab = vectorizer.get_feature_names()# 1 * 10000 list
# vocab

In [9]:
# dist = np.sum(train_data_features, axis=0)
# for tag, count in zip(vocab, dist):
#     print(tag, count)

In [10]:
# testing on random forest
forest = RandomForestClassifier(n_estimators=100)
forest = forest.fit(train_data_features, y_train)

In [11]:
# testing AUC
num_reviews = X_test.size
clean_test_reviews = []
for i in range(num_reviews):
    clean_test_reviews.append(process_review(X_test.iloc[i]))

test_data_features = vectorizer.transform(clean_test_reviews)
test_data_features = test_data_features.toarray()

In [12]:
# train_predictions = forest.predict(train_data_features)
train_predictions = forest.predict_proba(train_data_features)
# test_predictions = forest.predict(test_data_features)
test_predictions = forest.predict_proba(test_data_features)
# print (test_predictions, test_predictions[:, 1])

In [13]:
train_score = roc_auc_score(y_train, train_predictions[:, 1])
test_score = roc_auc_score(y_test, test_predictions[:, 1])
print ('train auc: %.3f, test auc: %.3f' % (train_score, test_score))
# not good

train auc: 0.998, test auc: 0.771


In [16]:
# very slow
svc = SVC(probability=True)
svc.fit(train_data_features, y_train)



SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto_deprecated',
  kernel='rbf', max_iter=-1, probability=True, random_state=None,
  shrinking=True, tol=0.001, verbose=False)

In [22]:
train_predictions = svc.predict_proba(train_data_features)
train_predictions
test_predictions = svc.predict_proba(test_data_features)
train_score = roc_auc_score(y_train, train_predictions[:, 1])
test_score = roc_auc_score(y_test, test_predictions[:, 1])

In [21]:
print ('train auc: %.3f, test auc: %.3f' % (train_score, test_score))

train auc: 0.764, test auc: 0.771
