In [None]:
!wget 'https://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz' -P '/content/drive/My Drive/Datasets'

--2020-07-30 07:32:38--  https://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz
Resolving ai.stanford.edu (ai.stanford.edu)... 171.64.68.10
Connecting to ai.stanford.edu (ai.stanford.edu)|171.64.68.10|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 84125825 (80M) [application/x-gzip]
Saving to: ‘/content/drive/My Drive/Datasets/aclImdb_v1.tar.gz’


2020-07-30 07:32:43 (16.6 MB/s) - ‘/content/drive/My Drive/Datasets/aclImdb_v1.tar.gz’ saved [84125825/84125825]



In [None]:
import tarfile
fname = '/content/drive/My Drive/Datasets/aclImdb_v1.tar.gz'
with tarfile.open(fname) as tar:
    tar.extractall(path = '/content/drive/My Drive/Datasets')
    print('extracted')

In [1]:
base = "/content/drive/My Drive/Datasets/aclImdb/"
train_pos_path = base + "train/pos"
train_neg_path = base + "train/neg"
test_pos_path  = base + "test/pos"
test_neg_path  = base + "test/neg"
train_file = base + "train.csv"
test_file  = base + "test.csv"

In [None]:
import os
import csv

def create_data_file(pos_path , neg_path , output):
  with open(output, 'w') as outfile:
      csvout = csv.writer(outfile)
      csvout.writerow(['sentiment', 'review'])

      files = os.listdir(neg_path)

      for filename in files:
          with open(neg_path + '/' + filename) as review_file:
              csvout.writerow([0, review_file.read()])

      files = os.listdir(pos_path)

      for filename in files:
          with open(pos_path + '/' + filename) as review_file:
              csvout.writerow([1, review_file.read()])

create_data_file(train_pos_path , train_neg_path , train_file)
print("train.csv file created")
create_data_file(test_pos_path , test_neg_path , test_file)
print("test.csv file created")

train.csv file created
test.csv file created


In [2]:
def evaluate(name,actual_label , pred_label):
  score = accuracy_score(actual_label , pred_label)
  print(name + " accuracy :", score*100)

In [3]:
import pandas as pd
import numpy as np

In [4]:
train_df = pd.read_csv(train_file)
test_df = pd.read_csv(test_file)

In [5]:
train_df[:4]

Unnamed: 0,sentiment,review
0,0,It wasn't good. The characters were underdevel...
1,0,I wanted to see the movie because of an articl...
2,0,For me this is a story that starts with some f...
3,0,I Am Curious is really two films in one - half...


In [6]:
test_df[:4]

Unnamed: 0,sentiment,review
0,0,Two page boys working at a radio network go fr...
1,0,Rebecca De Mornay can be a fascinating beautif...
2,0,This film is slow. This film is cheap. This fi...
3,0,I saw this movie on the strength of the single...


In [7]:
train_df = train_df.sample(frac=1, random_state = 42).reset_index(drop=True)   #shuffle the data_frame rows
test_df  = test_df.sample(frac=1 , random_state = 42).reset_index(drop=True)

In [8]:
print(train_df.shape , test_df.shape)
train_df[:4]

(25000, 2) (25000, 2)


Unnamed: 0,sentiment,review
0,0,I almost burst into tears watching this movie....
1,1,"Well, it definitely is unlike anything else di..."
2,0,"This piece of crap, since I can't call it a mo..."
3,1,More a snapshot of the most popular pinup of a...


In [9]:
test_df[:4]

Unnamed: 0,sentiment,review
0,0,Asia Argento has never done a film (so far as ...
1,1,Meatballs is a classic comedy with so many lau...
2,0,Harry Knowles has a quote right on the front c...
3,1,"Before you watch this movie - clean your ears,..."


In [10]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

In [150]:
tfidf_model = TfidfVectorizer(strip_accents= 'ascii' ,analyzer = 'word',
                              stop_words= 'english' ,lowercase = True ,
                              max_features=20000 ,sublinear_tf = True ,norm = 'l2')
tfidf_train_reviews  = tfidf_model.fit_transform(train_df['review'])
tfidf_test_reviews = tfidf_model.transform(test_df['review'])

In [151]:
tfidf_train_reviews.shape

(25000, 20000)

In [152]:
tfidf_train_reviews

<25000x20000 sparse matrix of type '<class 'numpy.float64'>'
	with 2120077 stored elements in Compressed Sparse Row format>

In [93]:
from sklearn.model_selection import GridSearchCV
LR = LogisticRegression(random_state=42)
params = {'C':[1,2,3,5,10,50,100] , 'penalty' : ['l2'] , 'solver' : ['newton-cg' , 'lbfgs' , 'liblinear'] , 'max_iter' : [200,300,400,500,700]}
model = GridSearchCV(LR, params , n_jobs= -1 ,verbose = 10,cv = 3)
model = model.fit(tfidf_train_reviews , train_df['sentiment'])
model.best_params_

Fitting 3 folds for each of 105 candidates, totalling 315 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 tasks      | elapsed:    1.2s
[Parallel(n_jobs=-1)]: Done   4 tasks      | elapsed:    2.5s
[Parallel(n_jobs=-1)]: Done   9 tasks      | elapsed:    4.1s
[Parallel(n_jobs=-1)]: Done  14 tasks      | elapsed:    6.9s
[Parallel(n_jobs=-1)]: Done  21 tasks      | elapsed:   10.0s
[Parallel(n_jobs=-1)]: Done  28 tasks      | elapsed:   12.8s
[Parallel(n_jobs=-1)]: Done  37 tasks      | elapsed:   16.7s
[Parallel(n_jobs=-1)]: Done  46 tasks      | elapsed:   20.7s
[Parallel(n_jobs=-1)]: Done  57 tasks      | elapsed:   25.3s
[Parallel(n_jobs=-1)]: Done  68 tasks      | elapsed:   29.9s
[Parallel(n_jobs=-1)]: Done  81 tasks      | elapsed:   35.0s
[Parallel(n_jobs=-1)]: Done  94 tasks      | elapsed:   41.2s
[Parallel(n_jobs=-1)]: Done 109 tasks      | elapsed:   47.4s
[Parallel(n_jobs=-1)]: Done 124 tasks      | elapsed:   53.7s
[Parallel(n_jobs=-1)]: Done 141 tasks      | elapsed:  1

{'C': 2, 'max_iter': 200, 'penalty': 'l2', 'solver': 'lbfgs'}

In [94]:
LR_model = LogisticRegression(C=2,random_state=42 ,max_iter = 200 , penalty= 'l2') 
LR_model = LR_model.fit(tfidf_train_reviews , train_df['sentiment'])

In [95]:
test_pred = LR_model.predict(tfidf_test_reviews)
train_pred = LR_model.predict(tfidf_train_reviews)
evaluate('train',train_df["sentiment"],train_pred)
evaluate('test',test_df["sentiment"],test_pred)

train accuracy : 95.07600000000001
test accuracy : 88.52799999999999


In [103]:
!pip install unidecode



In [17]:
import re
import unidecode
def preprocessor(text):
  m_text = text.strip()
  m_text = unidecode.unidecode(m_text)
  m_text = re.sub(r'[^a-zA-Z_\s+0-9]','',m_text)
  return m_text

In [18]:
import spacy
nlp = spacy.load('en_core_web_sm')
def tokenizer(text):
    doc = nlp(text)
    doc = [token for token in doc if not (token.is_punct or token.is_stop)]
    doc = [token.lemma_ if token.pos_ is not 'PRON' else token.text for token in doc]
    return doc

In [20]:
tfidf_model = TfidfVectorizer(preprocessor= preprocessor,tokenizer=tokenizer
                             ,strip_accents= None ,analyzer = 'word',
                              stop_words= None ,lowercase = True
                              ,max_features = 20000 , sublinear_tf = True
                              )
tfidf_train_reviews  = tfidf_model.fit_transform(train_df['review'])
tfidf_test_reviews = tfidf_model.transform(test_df['review'])

In [21]:
import pickle
with open(base + 'train_tfidf_vec' , 'wb') as tfidfvec:
  pickle.dump(tfidf_train_reviews,tfidfvec)

with open(base + 'test_tfidf_vec' , 'wb') as tfidfvec:
  pickle.dump(tfidf_test_reviews , tfidfvec)

In [25]:
tfidf_train_reviews =  pickle.load(open(base + 'train_tfidf_vec' , 'rb'))
tfidf_test_reviews =  pickle.load(open(base + 'test_tfidf_vec' , 'rb'))

In [41]:
model = LogisticRegression(C=2, random_state=42, max_iter = 100)
model.fit(tfidf_train_reviews , train_df['sentiment'])

LogisticRegression(C=2, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=42, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [42]:
test_pred = model.predict(tfidf_test_reviews)
train_pred = model.predict(tfidf_train_reviews)
evaluate('train',train_df["sentiment"],train_pred)
evaluate('test',test_df["sentiment"],test_pred)

train accuracy : 94.732
test accuracy : 88.012


In [184]:
from xgboost.sklearn import XGBClassifier

model = XGBClassifier(learning_rate= .5, 
                  #max_depth= 1,
                  #min_child_weight= 16,
                  n_estimators= 500,
                  verbosity =1
                  #booster = 'gbtree'
                  )

model = model.fit(tfidf_train_reviews , train_df['sentiment'])

xg_model = xgboost()

In [185]:
test_pred = xg_model.predict(tfidf_test_reviews)
train_pred = xg_model.predict(tfidf_train_reviews)
evaluate('train',train_df["sentiment"],train_pred)
evaluate('test',test_df["sentiment"],test_pred)

train accuracy : 97.49600000000001
test accuracy : 85.47200000000001
