In [55]:
import re
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from nltk.stem import WordNetLemmatizer

In [15]:
reviews_train=[]
for line in open('/home/subhash/aclImdb/movie_data/full_train.txt','r'):
    reviews_train.append(line)

In [16]:
reviews_test=[]
for line in open('/home/subhash/aclImdb/movie_data/full_test.txt','r'):
    reviews_test.append(line)

# Cleaning data

In [17]:
REPLACE_NO_SPACE = re.compile("[.;:!\'?,\"()\[\]]")
REPLACE_WITH_SPACE = re.compile("(<br\s*/><br\s*/>)|(\-)|(\/)")

In [18]:
def preprcess_reviews(reviews):
    reviews=[REPLACE_NO_SPACE.sub("",line.lower()) for line in reviews]
    reviews=[REPLACE_WITH_SPACE.sub(" ",line.lower()) for line in reviews]
    return reviews

In [19]:
reviews_train_clean=preprcess_reviews(reviews_train)
reviews_test_clean=preprcess_reviews(reviews_test)

In [20]:
cv=CountVectorizer(binary=True)
cv.fit(reviews_train_clean)
X = cv.transform(reviews_train_clean)
X_val=cv.transform(reviews_test_clean)

In [21]:
target = [1 if i<12500 else 0 for i in range(25000)]

In [22]:
X_train,X_test,y_train,y_test=train_test_split(X,target,test_size=0.25)

In [23]:
for c in [0.01,0.05,0.25,0.5,1]:
    logmodel = LogisticRegression(C=c)
    logmodel.fit(X_train,y_train)
    print("Accuracy for c = %s : %s" %(c,accuracy_score(y_test,logmodel.predict(X_test))))

Accuracy for c = 0.01 : 0.86992
Accuracy for c = 0.05 : 0.88
Accuracy for c = 0.25 : 0.87904
Accuracy for c = 0.5 : 0.87584
Accuracy for c = 1 : 0.87248


In [24]:
final_model=LogisticRegression(C=0.05)
final_model.fit(X,target)

LogisticRegression(C=0.05, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [25]:
X.shape

(25000, 92715)

In [26]:
print("Final Accuracy = %s " %accuracy_score(target,final_model.predict(X_val)))

Final Accuracy = 0.88152 


In [44]:
english_stop_words=stopwords.words('english')
print(reviews_train_clean[0])

bromwell high is a cartoon comedy it ran at the same time as some other programs about school life such as teachers my 35 years in the teaching profession lead me to believe that bromwell highs satire is much closer to reality than is teachers the scramble to survive financially the insightful students who can see right through their pathetic teachers pomp the pettiness of the whole situation all remind me of the schools i knew and their students when i saw the episode in which a student repeatedly tried to burn down the school i immediately recalled  at  high a classic line inspector im here to sack one of your teachers student welcome to bromwell high i expect that many adults of my age think that bromwell high is far fetched what a pity that it isnt



# Remove Stopwords

In [63]:
def remove_stop_words(corpus):
    removed_stop_words=[]
    for review in corpus:
        removed_stop_words.append(' '.join([word for word in review.split() if word not in english_stop_words]))
    return removed_stop_words

In [64]:
reviews_train_clean = remove_stop_words(reviews_train_clean)
reviews_test_clean = remove_stop_words(reviews_test_clean)

# Normalization

### Stemming

In [65]:
def get_stemmed_text(corpus):
    stemmer=PorterStemmer()
    return[' '.join([stemmer.stem(word) for word in review.split()]) for review in corpus]

In [66]:
reviews_train_clean=get_stemmed_text(reviews_train_clean)
reviews_test_clean=get_stemmed_text(reviews_test_clean)

In [67]:
def get_lemmatized_text(corpus):
    lemmatizer=WordNetLemmatizer()
    return[' '.join([lemmatizer.lemmatize(word) for word in review.split()]) for review in corpus]

### Lemmatization

In [68]:
reviews_train_clean=get_lemmatized_text(reviews_train_clean)

In [80]:
ngram_vectorizer=CountVectorizer(binary=True,ngram_range=(1,2))
ngram_vectorizer.fit(reviews_train_clean)
X=ngram_vectorizer.transform(reviews_train_clean)
X_val=ngram_vectorizer.transform(reviews_test_clean)

# Training

In [81]:
X_train,X_test,y_train,y_test=train_test_split(X,target,test_size=0.25)

In [82]:
for c in [0.01,0.05,0.25,0.5,1.0]:
    logmodel=LogisticRegression(C=c)
    logmodel.fit(X_train,y_train)
    print("Accuracy for c = %s : %s" %(c,accuracy_score(y_test,(logmodel.predict(X_test)))))

Accuracy for c = 0.01 : 0.87504
Accuracy for c = 0.05 : 0.88384
Accuracy for c = 0.25 : 0.88448
Accuracy for c = 0.5 : 0.88464
Accuracy for c = 1.0 : 0.88432


In [83]:
final_ngram=LogisticRegression(C=0.5)
final_ngram.fit(X,target)

LogisticRegression(C=0.5, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [84]:
print("Final Accuracy = %s" %(accuracy_score(target,final_ngram.predict(X_val))))

Final Accuracy = 0.88536
