In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from tensorflow.python.keras import models, layers, optimizers
import tensorflow
from tensorflow.keras.preprocessing.text import Tokenizer, text_to_word_sequence
from tensorflow.keras.preprocessing.sequence import pad_sequences
import bz2
from sklearn.metrics import f1_score, roc_auc_score, accuracy_score
import re

%matplotlib inline

In [2]:
#Creating a function to load the text and labels from train and test set

def get_labels_and_texts(file):
    labels = []
    texts = []
    for line in bz2.BZ2File(file):
        x = line.decode("utf-8")
        labels.append(int(x[9]) - 1)
        texts.append(x[10:].strip())
    return np.array(labels), texts
file_path = 'E:/Backup/Project/Machine Learning/Natural language processing/'
train_labels, train_texts = get_labels_and_texts(file_path+'train.ft.txt.bz2')
test_labels, test_texts = get_labels_and_texts(file_path+'test.ft.txt.bz2')

In [3]:
print('label: ',train_labels[0])
train_texts[0]

label:  1


'Stuning even for the non-gamer: This sound track was beautiful! It paints the senery in your mind so well I would recomend it even to people who hate vid. game music! I have played the game Chrono Cross but out of all of the games I have ever played it has the best music! It backs away from crude keyboarding and takes a fresher step with grate guitars and soulful orchestras. It would impress anyone who cares to listen! ^_^'

In [4]:
# only using first 500 data
train_labels=train_labels[0:500]
train_texts=train_texts[0:500]

In [5]:
#text pre-processing
import re
NON_ALPHANUM = re.compile(r'[\W]')
NON_ASCII = re.compile(r'[^a-z0-1\s]')

def normalize_texts(texts):
    normalized_texts = []
    for text in texts:
        lower = text.lower()
        no_punctuation = NON_ALPHANUM.sub(r' ', lower)
        no_non_ascii = NON_ASCII.sub(r'', no_punctuation)
        normalized_texts.append(no_non_ascii)
    return normalized_texts
        
train_texts = normalize_texts(train_texts)
test_texts = normalize_texts(test_texts)

In [6]:
# countvectorizer : number of times a word occur.
from sklearn.feature_extraction.text import CountVectorizer

cv = CountVectorizer(binary=True)
cv.fit(train_texts)
X = cv.transform(train_texts)
X_test = cv.transform(test_texts)

In [7]:
print(X_test)

  (0, 213)	1
  (0, 282)	1
  (0, 398)	1
  (0, 503)	1
  (0, 506)	1
  (0, 511)	1
  (0, 525)	1
  (0, 561)	1
  (0, 761)	1
  (0, 1305)	1
  (0, 1698)	1
  (0, 1701)	1
  (0, 1857)	1
  (0, 1864)	1
  (0, 1973)	1
  (0, 2090)	1
  (0, 2156)	1
  (0, 2193)	1
  (0, 2279)	1
  (0, 2288)	1
  (0, 2331)	1
  (0, 2525)	1
  (0, 2678)	1
  (0, 2681)	1
  (0, 2689)	1
  :	:
  (399999, 3817)	1
  (399999, 3987)	1
  (399999, 4059)	1
  (399999, 4194)	1
  (399999, 4363)	1
  (399999, 4413)	1
  (399999, 4667)	1
  (399999, 4773)	1
  (399999, 4888)	1
  (399999, 5087)	1
  (399999, 5090)	1
  (399999, 5094)	1
  (399999, 5100)	1
  (399999, 5104)	1
  (399999, 5107)	1
  (399999, 5125)	1
  (399999, 5186)	1
  (399999, 5191)	1
  (399999, 5455)	1
  (399999, 5540)	1
  (399999, 5609)	1
  (399999, 5610)	1
  (399999, 5620)	1
  (399999, 5649)	1
  (399999, 5759)	1


In [8]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split


X_train, X_val, y_train, y_val = train_test_split(X, train_labels, train_size = 0.75)

for c in [0.01, 0.05, 0.25, 0.5, 1]:
    # C : how much you want to regularize the model on to the data.
    lr = LogisticRegression(C=c)
    lr.fit(X_train, y_train)
    print ("Accuracy for C=%s: %s" % (c, accuracy_score(y_val, lr.predict(X_val))))

Accuracy for C=0.01: 0.744
Accuracy for C=0.05: 0.776
Accuracy for C=0.25: 0.776
Accuracy for C=0.5: 0.784
Accuracy for C=1: 0.784


In [9]:
lr.predict(X_test[29])

array([0])

In [10]:
print('label: ',test_labels[29])
test_texts[29]

label:  0


'three days of use and it broke  very disappointed in this product  it worked perfectly for exactly three days and could not be resuscitated  it was very inexpensive so i did not want to pay half again the price to ship it back for an exchange  so the company would do nothing when they sent me an inquiry as to product satisfaction '