In [1]:
import pandas as pd
import numpy as np

In [2]:
dfx=pd.read_csv("./Train.csv")
df_test=pd.read_csv("./Test.csv")

In [3]:
dfx.head()

Unnamed: 0,review,label
0,mature intelligent and highly charged melodram...,pos
1,http://video.google.com/videoplay?docid=211772...,pos
2,Title: Opera (1987) Director: Dario Argento Ca...,pos
3,I think a lot of people just wrote this off as...,pos
4,This is a story of two dogs and a cat looking ...,pos


In [4]:
df_test.head()

Unnamed: 0,review
0,Remember those old kung fu movies we used to w...
1,This movie is another one on my List of Movies...
2,How in the world does a thing like this get in...
3,"""Queen of the Damned"" is one of the best vampi..."
4,The Caprica episode (S01E01) is well done as a...


In [5]:
print(dfx.shape)
print(df_test.shape)

(40000, 2)
(10000, 1)


In [6]:
X_train=dfx.values[:,0]
Y_train=dfx.values[:,1]
X_test=df_test.values[:,0]

In [7]:
doc=["This is <h1>really a bad movie.I am sure nobody would<body> have appreaciated it",
     "<html>This is a good player<br> and had won<h1> so many titles"]

In [8]:
import re   # used for regular expression or you can use nltk RegexpTokenizer
from bs4 import BeautifulSoup as bs     # used to remove HTML tags

def clean(text):
    no_html = bs(text).get_text()
    clean = re.sub("[^a-z\s]+", " ", no_html, flags=re.IGNORECASE)
    return re.sub("(\s+)", " ", clean)

In [9]:
clean(doc[0])

'This is really a bad movie I am sure nobody would have appreaciated it'

In [10]:
bs(doc[0]).get_text()

'This is really a bad movie.I am sure nobody would have appreaciated it'

In [11]:
re.findall("[^a-z\s]+", doc[0])

['T', '<', '1>', '.I', '<', '>']

In [12]:
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
ps = PorterStemmer()
def tokenize(text):
    clean_ = clean(text).lower()
    stopwords_en = stopwords.words("english")
    stemmed_tokens = [ps.stem(token) for token in clean_.split()]
    return [w for w in stemmed_tokens if not w in stopwords_en]

In [13]:
tokenize(doc[0])

['thi', 'realli', 'bad', 'movi', 'sure', 'nobodi', 'would', 'appreaci']

In [14]:
classes=np.unique(Y_train)
def group_by_class(X,y):
    data = dict()
    for c in classes:
        data[c] = X[np.where(y == c)]   #X[y==c] provided X,y,c are np array
    return data

In [15]:
cs=group_by_class(X_train,Y_train)
print(len(cs['pos']),len(cs['neg']))

20011 19989


In [16]:
for c, data in cs.items():
    print(len(data))

19989
20011


In [17]:
from collections import Counter
import math
from collections import defaultdict
def fit(X, y):
    n_class_items = {}
    log_class_priors = {}
    word_counts = {}
    vocab = set()

    n = len(X)
    grouped_data = group_by_class(X, y)
    for c, data in grouped_data.items():
        
        n_class_items[c] = len(data)
        log_class_priors[c] = math.log(n_class_items[c] / n)
        word_counts[c] = defaultdict(lambda: 0)

        for text in data:
            counts = Counter(tokenize(text))
            for word, count in counts.items():
                vocab.add(word)
                word_counts[c][word] += count
    return(n_class_items,
            log_class_priors,
            word_counts,
            vocab)    

In [18]:
c,log,word_counts,vocab=fit(X_train,Y_train)

In [19]:
def laplace_smoothing(word, text_class):
    
    num = word_counts[text_class][word] + 1
    denom = c[text_class] + len(vocab)
    return math.log(num / denom)

def predict(X):
    result = []
    for text in X:

        class_scores = {t:log[t] for t in classes}
        words = set(tokenize(text))

        for word in words:
            
            if word not in vocab: continue

            for cl in classes:

                log_w_given_c = laplace_smoothing(word, cl)
                class_scores[cl] += log_w_given_c

        result.append(max(class_scores, key=class_scores.get))

    return result

In [20]:
prediction=predict(X_test)

In [21]:
Id=np.arange(0,X_test.shape[0])
dataframe=pd.DataFrame({"Id":Id,"label":prediction})
dataframe.to_csv("scratch.csv",index=False)

## 85%accuracy Not Bad(Without n-grams)