In [154]:
import numpy as np
import re
import pickle
import nltk
from nltk.corpus import stopwords
from sklearn.datasets import load_files
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/rle0502/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

## Loading the dataset

##### The load files  function will loop through all the different directories contained in the given folder, and for each of the different subdirectories contained in this directory it will generate 2 classes so for neg folder it will generate a class 0 and for pos it will generate a class 1.
##### It will put all the files from the neg as class 0 and all the pos files as class 1


In [155]:
#Importing our review datasets
reviews = load_files('txt_sentoken/')


In [156]:
#Next will seperate the data and class from the reviews
X,y = reviews.data, reviews.target

## Persisiting datasets

#### This load_files function will be slower in case of the data is very large, hence to make this process faster we store the X and y as the pickle files which are byte type files.

In [157]:
#Storing as pickle files
with open('X.pickle','wb') as f:
    pickle.dump(X,f)

In [158]:
with open('y.pickle', 'wb') as f:
    pickle.dump(y,f)

In [159]:
#To unpickle the loaded data
with open('X.pickle', 'rb') as f:
    X = pickle.load(f)

In [160]:
with open('y.pickle', 'rb') as f:
    y = pickle.load(f)

## Prerocessing the data

##### We will create a corpus list of documents) list which  will contain all the preprocessed documents.

In [161]:
#Creating the corpus
corpus = []
for i in range(0, len(X)):
    #Removing all the punctuation and special characters
    review = re.sub(r'\W', ' ', str(X[i]))
    review = review.lower()
    #Removing single character as they are not important for text classfication
    review = re.sub(r'\s+[a-z]\s+', ' ',review)
    review = re.sub(r'^[a-z]\s+', ' ',review)
    #Removing extra spaces from the begining of the sentence
    review = re.sub(r'\s+', ' ', review)
    corpus.append(review)

## Transforming data into Bag of words model

In [162]:
# from sklearn.feature_extraction.text import CountVectorizer

In [163]:
# vectorizer = CountVectorizer(max_features=3000, min_df=3, max_df= 0.6, stop_words=stopwords.words('english'))
# #Bag of words model
# X = vectorizer.fit_transform(corpus).toarray()

## Transforming bag of words model into TF-IDF model

In [164]:
# from sklearn.feature_extraction.text import TfidfTransformer

In [165]:
# transformer = TfidfTransformer()
# X = transformer.fit_transform(X).toarray()

## Creating a TF-IDF vectorizer

In [166]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [167]:
vectorizer = TfidfVectorizer(max_features=2000, min_df=3, max_df= 0.6, stop_words=stopwords.words('english'))
X = vectorizer.fit_transform(corpus).toarray()

## Creating Training Set and Testing Set

In [168]:
from sklearn.model_selection import train_test_split

In [169]:
text_train, test_train, sent_train, sent_test=train_test_split(X, y, test_size=0.2, random_state=42)

## Training our classifier

In [170]:
from sklearn.linear_model import LogisticRegression
classifier = LogisticRegression()

In [171]:
classifier.fit(text_train, sent_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=None, solver='warn',
          tol=0.0001, verbose=0, warm_start=False)

## Testing Model performance

In [172]:
sent_pred = classifier.predict(test_train)

In [173]:
from sklearn.metrics import confusion_matrix,accuracy_score

In [174]:
cm = confusion_matrix(sent_test, sent_pred)

In [175]:
cm

array([[158,  32],
       [ 34, 176]])

In [176]:
ascore = accuracy_score(sent_test, sent_pred)

In [177]:
ascore

0.835

## Saving the model

In [178]:
#Pickling the classifier
# We need to save both the classifier and vectorizer for making future prediction
with open('classifier.pickle','wb') as f:
    pickle.dump(classifier,f)

In [179]:
with open('tfidfmode.pickle', 'wb') as f:
    pickle.dump(vectorizer, f)

## Importing and using our model

In [180]:
with open('classifier.pickle', 'rb') as f:
    clf = pickle.load(f)

In [181]:
with open('tfidfmode.pickle', 'rb') as f:
    tfidf = pickle.load(f)

In [203]:
sample = ["You are very bad person"]

In [204]:
sample = tfidf.transform(sample).toarray()

In [205]:
clf.predict(sample)

array([0])