In [1]:
import numpy as np
import pandas as pd
from nltk.tokenize import RegexpTokenizer
from nltk.stem.porter import PorterStemmer
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB

In [2]:
# Shared text-preprocessing helpers, built once and reused for every review.
ps = PorterStemmer()                            # stems individual tokens
en_stopwords = set(stopwords.words('english'))  # a set keeps membership tests fast
tokenizer = RegexpTokenizer(r'\w+')             # keeps runs of word characters only

In [3]:
def getCleannedReview(review) :
    """Normalise one raw review into a space-separated string of stemmed tokens.

    The text is lower-cased, HTML line-break tags are blanked out, the result
    is split with the module-level ``tokenizer``, tokens present in
    ``en_stopwords`` are dropped, and the remaining tokens are stemmed with
    ``ps``.

    Parameters
    ----------
    review : str
        Raw review text.

    Returns
    -------
    str
        The surviving stemmed tokens joined by single spaces (an empty string
        when no token survives).
    """
    review = review.lower()
    # Strip every line-break tag, not only the doubled "<br /><br />" form:
    # a lone tag would otherwise be split by the \w+ tokenizer and leave a
    # stray "br" token in the cleaned text.
    for br_tag in ("<br />", "<br/>", "<br>"):
        review = review.replace(br_tag, " ")
    # Tokenize, then filter stop words and stem in a single pass.
    tokens = tokenizer.tokenize(review)
    stemmed_tokens = [ps.stem(token) for token in tokens if token not in en_stopwords]

    return ' '.join(stemmed_tokens)

In [4]:
def getStemmedDocument(reviews) :
    """Clean every review in ``reviews`` and return the results as a list.

    Each element is passed through ``getCleannedReview``; the output list
    preserves the iteration order of ``reviews``.
    """
    return [getCleannedReview(raw_review) for raw_review in reviews]

In [5]:
# Sentiment label -> integer class id.
# NOTE: the name `Dict` is rebound to the inverse mapping further down the
# notebook, so cells that read it are sensitive to execution order.
Dict = dict(pos=1, neg=0)

In [6]:
# Load the training and test CSV files from the working directory.
train_data, test_data = pd.read_csv('Train.csv'), pd.read_csv('Test.csv')

In [7]:
train_data.columns

Index(['review', 'label'], dtype='object')

In [7]:
# Clean and stem every training review; the result is a list of strings.
clean_train_data = getStemmedDocument(train_data['review'])

In [10]:
clean_train_data[0]

'matur intellig highli charg melodrama unbelivebl film china 1948 wei wei stun perform catylast love triangl simpli stun oppurun see magnific film take'

In [8]:
type(clean_train_data)

list

In [14]:
# Encode the string labels as integer class ids.
# The mapping is defined locally rather than read from the global `Dict`:
# `Dict` is rebound to the inverse (id -> label) mapping later in the notebook,
# so re-running this cell after that point would otherwise raise KeyError.
label_to_id = {'pos': 1, 'neg': 0}
train_result = [label_to_id[label] for label in train_data['label']]

In [15]:
# Convert the encoded labels to a NumPy array (later passed to the classifier's fit).
train_result = np.array(train_result)

In [37]:
type(train_result)

numpy.ndarray

In [38]:
train_result.shape

(40000,)

In [16]:
# Apply the same cleaning and stemming to the test reviews.
clean_test_data = getStemmedDocument(test_data['review'])

In [21]:
len(clean_test_data)

10000

# Vectorization

In [17]:
# Bag-of-words counts over unigrams and bigrams (ngram_range=(1, 2)).
cv = CountVectorizer(ngram_range=(1,2))

In [18]:
# Learn the n-gram vocabulary from the training reviews and build their count matrix.
xtrain_vec = cv.fit_transform(clean_train_data)

In [19]:
print(xtrain_vec)

  (0, 1248488)	1
  (0, 1029431)	1
  (0, 941511)	1
  (0, 339466)	1
  (0, 1263578)	1
  (0, 2099263)	1
  (0, 743520)	2
  (0, 353383)	1
  (0, 7162)	1
  (0, 2191716)	2
  (0, 1932922)	2
  (0, 1477767)	1
  (0, 320321)	1
  (0, 1194723)	1
  (0, 2072109)	1
  (0, 1819408)	1
  (0, 1423833)	1
  (0, 1752554)	1
  (0, 1213598)	1
  (0, 1973513)	1
  (0, 1248604)	1
  (0, 1029728)	1
  (0, 941565)	1
  (0, 339670)	1
  (0, 1263816)	1
  :	:
  (39999, 1813539)	1
  (39999, 1813544)	1
  (39999, 1551937)	1
  (39999, 912814)	1
  (39999, 2211394)	1
  (39999, 908913)	1
  (39999, 1446628)	1
  (39999, 1476321)	1
  (39999, 854818)	1
  (39999, 464350)	1
  (39999, 1762344)	1
  (39999, 1187352)	1
  (39999, 1325729)	1
  (39999, 1742460)	1
  (39999, 796784)	1
  (39999, 1863717)	1
  (39999, 967952)	1
  (39999, 1283096)	1
  (39999, 1510213)	1
  (39999, 1367643)	1
  (39999, 958793)	1
  (39999, 1707148)	1
  (39999, 803296)	1
  (39999, 464364)	1
  (39999, 1862858)	1


In [20]:
# Encode the test reviews with the already-fitted vectorizer (transform only, no re-fit).
xtest_vec = cv.transform(clean_test_data)

# Multinomial Naive Bayes

In [21]:
# Multinomial naive Bayes classifier created with its default arguments.
mnb = MultinomialNB()

# Train on the n-gram count matrix and the integer-encoded labels.
mnb.fit(xtrain_vec,train_result)

In [23]:
# Predict an integer class id for every test review.
pred = mnb.predict(xtest_vec)

In [24]:
pred

array([0, 0, 0, ..., 1, 1, 0])

In [25]:
# Inverse mapping: integer class id -> sentiment label, used to decode predictions.
# NOTE: this rebinds `Dict`; any cell that still expects the label -> id mapping
# will raise KeyError if it is re-run after this one.
Dict = {
    1: 'pos',
    0: 'neg',
}

In [28]:
# Decode each predicted class id back into its string label.
final_pred = [Dict[class_id] for class_id in pred]

In [29]:
final_pred

['neg',
 'neg',
 'neg',
 'pos',
 'pos',
 'neg',
 'pos',
 'neg',
 'pos',
 'pos',
 'pos',
 'pos',
 'pos',
 'neg',
 'neg',
 'pos',
 'neg',
 'pos',
 'pos',
 'neg',
 'neg',
 'pos',
 'neg',
 'neg',
 'neg',
 'neg',
 'neg',
 'pos',
 'pos',
 'pos',
 'pos',
 'neg',
 'neg',
 'neg',
 'pos',
 'pos',
 'neg',
 'neg',
 'pos',
 'neg',
 'neg',
 'pos',
 'pos',
 'neg',
 'neg',
 'neg',
 'pos',
 'pos',
 'pos',
 'pos',
 'neg',
 'neg',
 'pos',
 'neg',
 'neg',
 'neg',
 'pos',
 'pos',
 'neg',
 'pos',
 'neg',
 'pos',
 'neg',
 'pos',
 'neg',
 'neg',
 'neg',
 'neg',
 'pos',
 'pos',
 'pos',
 'pos',
 'neg',
 'pos',
 'pos',
 'pos',
 'pos',
 'pos',
 'pos',
 'neg',
 'pos',
 'neg',
 'pos',
 'pos',
 'pos',
 'pos',
 'neg',
 'pos',
 'neg',
 'pos',
 'neg',
 'neg',
 'neg',
 'neg',
 'pos',
 'neg',
 'neg',
 'neg',
 'neg',
 'pos',
 'pos',
 'neg',
 'neg',
 'pos',
 'neg',
 'pos',
 'neg',
 'neg',
 'neg',
 'neg',
 'pos',
 'neg',
 'neg',
 'pos',
 'neg',
 'neg',
 'neg',
 'neg',
 'neg',
 'neg',
 'pos',
 'pos',
 'pos',
 'pos',
 'pos',


In [32]:
# Wrap the decoded predictions in a single-column DataFrame (the column is named in a later cell).
final_result = pd.DataFrame(final_pred)

In [33]:
final_result

Unnamed: 0,0
0,neg
1,neg
2,neg
3,pos
4,pos
...,...
9995,neg
9996,pos
9997,pos
9998,pos


In [34]:
# Name the single prediction column 'label'.
final_result.columns = ['label']

In [35]:
final_result

Unnamed: 0,label
0,neg
1,neg
2,neg
3,pos
4,pos
...,...
9995,neg
9996,pos
9997,pos
9998,pos


In [38]:
# Write the predictions to disk; the row index is exported as the 'Id' column.
final_result.to_csv(
    'Output.csv',
    index=True,
    index_label='Id',
)