In [1]:
import nltk

In [2]:
positiveReviewsFileName =  'C:/Users/Sheik/Documents/DataSets/rt-polaritydata/rt-polaritydata/rt-polarity.pos'
with open(positiveReviewsFileName,'r') as f:     positiveReviews = f.readlines()
    
negativeReviewsFileName =  'C:/Users/Sheik/Documents/DataSets/rt-polaritydata/rt-polaritydata/rt-polarity.neg'
with open(negativeReviewsFileName,'r') as f:     negativeReviews = f.readlines()

# Split data

In [4]:
testTrainingSplitIndex = 2500

testNegativeReviews = negativeReviews[testTrainingSplitIndex+1:]
testPositiveReviews = positiveReviews[testTrainingSplitIndex+1:]

trainingNegativeReviews = negativeReviews[:testTrainingSplitIndex]
trainingPositiveReviews = positiveReviews[:testTrainingSplitIndex]

In [6]:
len(testNegativeReviews)

2830

In [7]:
len(trainingPositiveReviews)

2500

# GetReviewSentiments

In [8]:
def getTestReviewSentiments(naiveBayesSentimentCalculator):
  testNegResults = [naiveBayesSentimentCalculator(review) for review in testNegativeReviews]
  testPosResults = [naiveBayesSentimentCalculator(review) for review in testPositiveReviews]
  labelToNum = {'positive':1,'negative':-1}
  numericNegResults = [labelToNum[x] for x in testNegResults]
  numericPosResults = [labelToNum[x] for x in testPosResults]
  return {'results-on-positive':numericPosResults, 'results-on-negative':numericNegResults}

In [9]:
def getTrainingData():
  negTaggedTrainingReviewList = [{'review':oneReview.split(),'label':'negative'} for oneReview in trainingNegativeReviews] 
  posTaggedTrainingReviewList = [{'review':oneReview.split(),'label':'positive'} for oneReview in trainingPositiveReviews] 
  fullTaggedTrainingData = [item for sublist in [negTaggedTrainingReviewList,posTaggedTrainingReviewList] for item in sublist]
  trainingData = [(review['review'],review['label']) for review in fullTaggedTrainingData]
  return trainingData


In [11]:
def getVocabulary():
  positiveWordList = [word for line in trainingPositiveReviews for word in line.split()]
  negativeWordList = [word for line in trainingNegativeReviews for word in line.split()]
  allWordList = [item for sublist in [positiveWordList,negativeWordList] for item in sublist]
  allWordSet = list(set(allWordList))
  vocabulary = allWordSet
  return vocabulary

# Extract Features from Review

In [15]:
def extract_features(review):
  review_words=set(review)
  features={}
  for word in vocabulary:
      features[word]=(word in review_words)
  return features

# Train Naive Bayes Classifier

In [20]:
def getTrainedNaiveBayesClassifier(extract_features, trainingData):
  trainingFeatures=nltk.classify.apply_features(extract_features, trainingData)
  trainedNBClassifier=nltk.NaiveBayesClassifier.train(trainingFeatures)
  return trainedNBClassifier

In [21]:
vocabulary = getVocabulary()
trainingData = getTrainingData()
trainedNBClassifier = getTrainedNaiveBayesClassifier(extract_features,trainingData)

In [22]:
trainedNBClassifier

<nltk.classify.naivebayes.NaiveBayesClassifier at 0x24854227240>

In [24]:
def naiveBayesSentimentCalculator(review):
  problemInstance = review.split()
  problemFeatures = extract_features(problemInstance)
  return trainedNBClassifier.classify(problemFeatures)

# Test Few Reviews and Classify Test dataset

In [26]:
naiveBayesSentimentCalculator("What an awesome movie")

'positive'

In [29]:
def runDiagnostics(reviewResult):
  positiveReviewsResult = reviewResult['results-on-positive']
  negativeReviewsResult = reviewResult['results-on-negative']
  numTruePositive = sum(x > 0 for x in positiveReviewsResult)
  numTrueNegative = sum(x < 0 for x in negativeReviewsResult)
  pctTruePositive = float(numTruePositive)/len(positiveReviewsResult)
  pctTrueNegative = float(numTrueNegative)/len(negativeReviewsResult)  
  totalAccurate = numTruePositive + numTrueNegative
  total = len(positiveReviewsResult) + len(negativeReviewsResult)
  print ("Accuracy on positive reviews = " +"%.2f" % (pctTruePositive*100) + "%")
  print ("Accurance on negative reviews = " +"%.2f" % (pctTrueNegative*100) + "%")
  print ("Overall accuracy = " + "%.2f" % (totalAccurate*100/total) + "%")

In [30]:
runDiagnostics(getTestReviewSentiments(naiveBayesSentimentCalculator))

Accuracy on positive reviews = 73.39%
Accurance on negative reviews = 77.07%
Overall accuracy = 75.23%
