### Import the libraries

In [1]:
import numpy as np
import re
import pickle
import nltk
from nltk.corpus import stopwords
from sklearn.datasets import load_files
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\User\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

### Import datasets

In [2]:
reviews = load_files('path_to_file txt_sentoken')

In [3]:
X, y = reviews.data, reviews.target

In [4]:
y

array([0, 1, 1, ..., 1, 0, 0])

### Persist the dataset in pickle files
load_files method can take long time if we have lot of data in files, e.g. 50k data. So to quickly access the data we will store them in pickle files in bytes format

In [5]:
# Storing as pickle files

with open('X.pickle','wb') as f:
    pickle.dump(X, f)
with open('y.pickle','wb') as f:
    pickle.dump(y, f)

In [6]:
# Reading from pickle files
del X
del y

with open('X.pickle','rb') as f:
    X = pickle.load(f)
with open('y.pickle','rb') as f:
    y = pickle.load(f)

In [7]:
y

array([0, 1, 1, ..., 1, 0, 0])

In [8]:
X[0]

b"arnold schwarzenegger has been an icon for action enthusiasts , since the late 80's , but lately his films have been very sloppy and the one-liners are getting worse . \nit's hard seeing arnold as mr . freeze in batman and robin , especially when he says tons of ice jokes , but hey he got 15 million , what's it matter to him ? \nonce again arnold has signed to do another expensive blockbuster , that can't compare with the likes of the terminator series , true lies and even eraser . \nin this so called dark thriller , the devil ( gabriel byrne ) has come upon earth , to impregnate a woman ( robin tunney ) which happens every 1000 years , and basically destroy the world , but apparently god has chosen one man , and that one man is jericho cane ( arnold himself ) . \nwith the help of a trusty sidekick ( kevin pollack ) , they will stop at nothing to let the devil take over the world ! \nparts of this are actually so absurd , that they would fit right in with dogma . \nyes , the film is 

### Creating Corpus
Corpus is a list of documents

In [9]:
corpus = []
for i in range(0, len(X)):
    review = re.sub(r'\W', ' ', str(X[i])) # remove all punctuations or not word characters
    review = review.lower() # lowercase the string
    review = re.sub(r'\s+[a-z]\s+', ' ', review) # remove single charactes (like i, a as they do not contribute to meaning much) from middle of the sentence, so we are replacting ' a ' by ' '
    review = re.sub(r'^[a-z]\s+', ' ', review) # remove single character from start of the sentence, so we are replacing 'i ' with ' '
    review = re.sub(r'\s+', ' ', review) # replace multiple spaces with a single space
    review = review.strip() # remove extra spaces from start and end of the string
    corpus.append(review)
    

In [10]:
' '.join(str(X[0]).split('\\n')) # remove \n from strings

'b"arnold schwarzenegger has been an icon for action enthusiasts , since the late 80\'s , but lately his films have been very sloppy and the one-liners are getting worse .  it\'s hard seeing arnold as mr . freeze in batman and robin , especially when he says tons of ice jokes , but hey he got 15 million , what\'s it matter to him ?  once again arnold has signed to do another expensive blockbuster , that can\'t compare with the likes of the terminator series , true lies and even eraser .  in this so called dark thriller , the devil ( gabriel byrne ) has come upon earth , to impregnate a woman ( robin tunney ) which happens every 1000 years , and basically destroy the world , but apparently god has chosen one man , and that one man is jericho cane ( arnold himself ) .  with the help of a trusty sidekick ( kevin pollack ) , they will stop at nothing to let the devil take over the world !  parts of this are actually so absurd , that they would fit right in with dogma .  yes , the film is t

### Create BOW model

In [11]:
from sklearn.feature_extraction.text import CountVectorizer
## max_features = selects top 2000 words from the histogram
## min_df (minimum document frequency) = select words which is present in more than min_df documents.  
##    Here we will words which are present in more than3 documents and discard words which are present 3 or less documents
## max_df (maximum document frequency) = we will exclude all the words that appears >= max_df documnets
##                                       represented as percentage. example: 'the', 'that'
## stopwords = remove these stopwords

## So vectorized will select words after applying min_df, max_df, stopwords and after that select top 2000 most frequent words
vectorizer = CountVectorizer(max_features=2000, min_df=3, max_df=0.6, stop_words=stopwords.words('english'))

X = vectorizer.fit_transform(corpus).toarray()

In [12]:
X.shape # (documents, words)

(2000, 2000)

### Transform to TF-IDF model

In [13]:
from sklearn.feature_extraction.text import TfidfTransformer
transformer = TfidfTransformer()
X = transformer.fit_transform(X).toarray()

In [14]:
X

array([[0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.06887219, 0.        ,
        0.        ],
       ...,
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.12007883, 0.        , 0.06321361, ..., 0.        , 0.        ,
        0.        ]])

### Split dataset into training and test sets

In [15]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state = 0)

In [16]:
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((1600, 2000), (400, 2000), (1600,), (400,))

### Build Logistic Regression Model for sentiment classification

In [17]:
from sklearn.linear_model import LogisticRegression
model = LogisticRegression()
model.fit(X_train, y_train)

In [18]:
pred = model.predict(X_test)

In [19]:
len(pred)

400

In [20]:
from sklearn.metrics import confusion_matrix
cn = confusion_matrix(y_test, pred)
cn

array([[168,  40],
       [ 21, 171]], dtype=int64)

In [21]:
from sklearn.metrics import accuracy_score
accuracy_score(y_test, pred)

0.8475

In [22]:
from sklearn.metrics import classification_report
print(classification_report(y_test, pred))

              precision    recall  f1-score   support

           0       0.89      0.81      0.85       208
           1       0.81      0.89      0.85       192

    accuracy                           0.85       400
   macro avg       0.85      0.85      0.85       400
weighted avg       0.85      0.85      0.85       400



### Save our classifier and TfidfVectorizer

In [23]:
with open('classifier.pickle', 'wb') as f:
    pickle.dump(model, f)

In [28]:
### save the vectprizer

from sklearn.feature_extraction.text import TfidfVectorizer
vectorizer = TfidfVectorizer(max_features=2000, min_df=3, max_df=0.6, stop_words=stopwords.words('english'))

X = vectorizer.fit_transform(corpus).toarray()
X

array([[0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.06887219, 0.        ,
        0.        ],
       ...,
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.12007883, 0.        , 0.06321361, ..., 0.        , 0.        ,
        0.        ]])

In [29]:
with open('vectorizer.pickle', 'wb') as f:
    pickle.dump(vectorizer, f)

### Import classifier and vectorizer

In [30]:
with open('classifier.pickle', 'rb') as f:
    clf = pickle.load(f)
with open('vectorizer.pickle', 'rb') as f:
    tfidf = pickle.load(f)

In [32]:
sample = ["You are a nice person man, have a good life."]

In [33]:
sample = tfidf.transform(sample).toarray()

In [34]:
clf.predict(sample)

array([1])

In [36]:
sample2=["You are a bad bad person."]
sample2 = tfidf.transform(sample2).toarray()
clf.predict(sample2)

array([0])