# 1. Loading the Dataset

In [1]:
import pandas as pd

# Load the labelled reviews: `text` holds the review body, `sentiment` the 0/1 label.
# NOTE(review): the displayed frame carries an 'Unnamed: 0' column, which suggests the
# CSV was written together with its index; index_col=0 would likely drop it -- confirm.
df = pd.read_csv('data/train.csv')
df.head(10)

Unnamed: 0,text,sentiment
0,For a movie that gets no respect there sure ar...,0
1,Bizarre horror movie filled with famous faces ...,0
2,"A solid, if unremarkable film. Matthau, as Ein...",0
3,It's a strange feeling to sit alone in a theat...,0
4,"You probably all already know this by now, but...",0
5,I saw the movie with two grown children. Altho...,0
6,You're using the IMDb. You've given some heft...,0
7,This was a good film with a powerful message o...,0
8,"Made after QUARTET was, TRIO continued the qua...",0
9,"For a mature man, to admit that he shed a tear...",0


In [2]:
# Full text of the review stored under row label 200.
df.loc[200, 'text']

'Wow, I forgot how great this movie was until I stumbled upon it while looking through the garage. It\'s a kind of strange combination of a bio of Michael Jackson, a collection of musical vignettes, and a story about a super hero fighting to save some little kids. The vignettes are good (especially Speed Demon), but the best part of this movie is the super hero segment, in which Michael Jackson turns into a car, a robot, and finally a spaceship (and it\'s just as weird as it sounds). Joe Pesci is hilarious, and has enough cool imagery and great music to entertain throughout!  The real gem however is the incredible "Smooth Criminal" video, which makes the movie worth owning for that part alone!'

In [3]:
# Label of the review stored under row label 200.
df.loc[200, 'sentiment']

0

In [4]:
# Label of the review stored under row label 4000.
df.loc[4000, 'sentiment']

0

In [5]:
# Label of the review stored under row label 17000.
df.loc[17000, 'sentiment']

1

# 2. Transforming documents into Feature Vectors

In [21]:
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
# Toy corpus of seven short documents used to illustrate the bag-of-words model.
docs = np.array([
    'Im not looking for somebody',
    'With some superhuman gifts',
    'Some superhero',
    'Some fairytale bliss',
    'Just something I can turn to',
    'Somebody I can kiss',
    'I want something just like this',
])

# Learn the vocabulary and build the sparse document-term count matrix in one pass.
count = CountVectorizer()
bag = count.fit_transform(docs)

In [22]:
# vocabulary_ maps each learned token to its column index in the count matrix.
print(count.vocabulary_)

{'im': 5, 'not': 10, 'looking': 9, 'for': 3, 'somebody': 12, 'with': 20, 'some': 11, 'superhuman': 15, 'gifts': 4, 'superhero': 14, 'fairytale': 2, 'bliss': 0, 'just': 6, 'something': 13, 'can': 1, 'turn': 18, 'to': 17, 'kiss': 7, 'want': 19, 'like': 8, 'this': 16}


In [23]:
# Dense view of the counts: one row per document, one column per vocabulary entry.
print(bag.toarray())

[[0 0 0 1 0 1 0 0 0 1 1 0 1 0 0 0 0 0 0 0 0]
 [0 0 0 0 1 0 0 0 0 0 0 1 0 0 0 1 0 0 0 0 1]
 [0 0 0 0 0 0 0 0 0 0 0 1 0 0 1 0 0 0 0 0 0]
 [1 0 1 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0]
 [0 1 0 0 0 0 1 0 0 0 0 0 0 1 0 0 0 1 1 0 0]
 [0 1 0 0 0 0 0 1 0 0 0 0 1 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 1 0 1 0 0 0 0 1 0 0 1 0 0 1 0]]


# 3. Word Relevancy using Term Frequency (tf) and Inverse Document Frequency (idf)

In [25]:
from sklearn.feature_extraction.text import TfidfTransformer
# Show two decimals when printing arrays (a global NumPy setting, so it also
# affects arrays printed by later cells).
np.set_printoptions(precision=2)

# Re-weight the raw counts in `bag` with smoothed IDF and L2-normalise each row.
tfidf = TfidfTransformer(norm='l2', use_idf=True, smooth_idf=True)
print(tfidf.fit_transform(bag).toarray())

[[0.   0.   0.   0.46 0.   0.46 0.   0.   0.   0.46 0.46 0.   0.38 0.
  0.   0.   0.   0.   0.   0.   0.  ]
 [0.   0.   0.   0.   0.53 0.   0.   0.   0.   0.   0.   0.38 0.   0.
  0.   0.53 0.   0.   0.   0.   0.53]
 [0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.58 0.   0.
  0.82 0.   0.   0.   0.   0.   0.  ]
 [0.63 0.   0.63 0.   0.   0.   0.   0.   0.   0.   0.   0.45 0.   0.
  0.   0.   0.   0.   0.   0.   0.  ]
 [0.   0.41 0.   0.   0.   0.   0.41 0.   0.   0.   0.   0.   0.   0.41
  0.   0.   0.   0.5  0.5  0.   0.  ]
 [0.   0.54 0.   0.   0.   0.   0.   0.65 0.   0.   0.   0.   0.54 0.
  0.   0.   0.   0.   0.   0.   0.  ]
 [0.   0.   0.   0.   0.   0.   0.4  0.   0.48 0.   0.   0.   0.   0.4
  0.   0.   0.48 0.   0.   0.48 0.  ]]


# 4. Data Preparation

In [28]:
# Last 50 characters of the first review, used below as a preprocessing example.
df.loc[0,'text'][-50:]

'h for Alan "The Skipper" Hale jr. as a police Sgt.'

In [31]:
import re
def preprocessor(text):
    """Normalise a raw review string for bag-of-words processing.

    Steps, in order:
      1. Remove anything that looks like an HTML/XML tag (``<...>``).
      2. Collect emoticons such as ``:)``, ``;-(``, ``=D`` or ``:P`` from the
         still-cased text (lower-casing first would miss ``D``/``P``).
      3. Lower-case the text and collapse every run of non-word characters
         into a single space.
      4. Append the collected emoticons, space-separated and with the ``-``
         "nose" removed, to the end of the text.

    Parameters
    ----------
    text : str
        Raw document text.

    Returns
    -------
    str
        Cleaned, lower-cased text followed by any emoticons that were found.
    """
    # Raw strings keep ``\)``, ``\(`` and ``\W`` from being parsed as (invalid)
    # string escape sequences before the regex engine sees them.
    text = re.sub(r'<[^>]*>', '', text)
    emoticons = re.findall(r'(?::|;|=)(?:-)?(?:\)|\(|D|P)', text)
    cleaned = re.sub(r'[\W]+', ' ', text.lower())
    if not emoticons:
        return cleaned
    # Without a separator the first emoticon would be glued onto the last word
    # whenever the cleaned text does not already end with a space.
    separator = '' if cleaned.endswith(' ') else ' '
    return cleaned + separator + ' '.join(emoticons).replace('-', '')

In [32]:
# Same 50-character snippet after cleaning: lower-cased, punctuation collapsed to spaces.
preprocessor(df.loc[0,'text'][-50:])

'h for alan the skipper hale jr as a police sgt '

In [33]:
# Markup is stripped and the emoticons are moved to the end (':-)' becomes ':)').
preprocessor("</a>This :) is a :( test :-)! ")

'this is a test :) :( :)'

In [34]:
# Clean every review. NOTE: this overwrites the raw `text` column in place, so the
# original reviews can only be recovered by re-reading the CSV.
df['text'] = df['text'].apply(preprocessor)

In [35]:
# First review after preprocessing.
df.loc[0, 'text']

'for a movie that gets no respect there sure are a lot of memorable quotes listed for this gem imagine a movie where joe piscopo is actually funny maureen stapleton is a scene stealer the moroni character is an absolute scream watch for alan the skipper hale jr as a police sgt '

In [36]:
# Review stored under row label 3000 after preprocessing.
df.loc[3000, 'text']

'one of the great mysteries of life suffered from daily is why nice girls so often are more interested in the jerks and heels than in the nice guys worse when the nice guys even want to marry those girls the girls still prefer the jerks and heels even after the jerks and heels have shown their contempt have shown they re just interested in using the girls stu erwin is the nice guy who continues to be nice after being lied to and cheated and even after losing the girl completely clark gable is the jerk and he is perfect in the role rather a sad note to his fans jean harlow comes across as a more slender mae west even sounding like la west in some of her cynical throwaway lines somewhat puzzling is that so many of the other characters intended to be bad guys i mean heck they re locked up so they must be are so obviously nice people in fact there are lots of nice people here people who in a lesser film or story would be snarling and back stabbing but here go out of their way to help someo

# 5. Tokenization of Documents

In [37]:
from nltk.stem.porter import PorterStemmer
# Shared stemmer instance used by tokenizer_porter.
porter = PorterStemmer()

In [38]:
def tokenizer(text):
    """Split ``text`` on runs of whitespace and return the resulting list of tokens."""
    tokens = text.split()
    return tokens

In [42]:
def tokenizer_porter(text):
    """Split ``text`` on whitespace and return the Porter stem of each token, in order."""
    stems = []
    for token in text.split():
        stems.append(porter.stem(token))
    return stems

In [40]:
# Plain whitespace tokenisation keeps every surface form as-is.
tokenizer('runners like running and thus they run')

['runners', 'like', 'running', 'and', 'thus', 'they', 'run']

In [43]:
# With stemming, 'runners'/'running' reduce to 'runner'/'run' (and 'thus' to 'thu').
tokenizer_porter('runners like running and thus they run')

['runner', 'like', 'run', 'and', 'thu', 'they', 'run']

In [44]:
import nltk
# Download NLTK's stop-word corpus so stopwords.words('english') is available below.
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/shrutika/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [45]:
from nltk.corpus import stopwords
stop = stopwords.words('english')
# Stem the sample sentence, keep (at most) its last ten tokens and drop the stop words.
list(filter(lambda w: w not in stop,
            tokenizer_porter('a runner likes the running and thus he runs a lot')[-10:]))


['runner', 'like', 'run', 'thu', 'run', 'lot']

# 6. Transforming Text Data into TF-IDF Vectors

In [61]:
from sklearn.feature_extraction.text import TfidfVectorizer
# Vectorise the cleaned reviews. Tokens come from tokenizer_porter (whitespace split +
# Porter stemming); lowercase=False presumably because preprocessor already lower-cased
# the words when it was applied to the `text` column.
# NOTE(review): this rebinds `tfidf`, which earlier held the TfidfTransformer demo object.
tfidf = TfidfVectorizer(strip_accents = None,
                        lowercase = False,
                        preprocessor = None,
                        tokenizer = tokenizer_porter,
                        use_idf = True,
                        norm = 'l2',
                        smooth_idf = True)
# Labels as a NumPy array, aligned row-for-row with the feature matrix built below.
y = df.sentiment.values
# NOTE(review): the vectoriser is fitted on the full corpus before the train/test split
# further down, so the vocabulary and IDF weights also see the held-out reviews --
# consider fitting on the training portion only.
X = tfidf.fit_transform(df.text)

# 7. Document Classification using Logistic Regression 

In [63]:
from sklearn.model_selection import train_test_split
# Hold out half of the reviews for evaluation; shuffled with a fixed seed so the
# split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.5,
    shuffle=True,
    random_state=1,
)

In [64]:
import pickle
from sklearn.linear_model import LogisticRegressionCV

# Fit a logistic regression on the training split, with 5-fold cross-validation
# scored on accuracy, using all available cores (n_jobs=-1).
classifier = LogisticRegressionCV(cv=5,
                                  scoring='accuracy',
                                  random_state=0,
                                  n_jobs=-1,
                                  verbose=3,
                                  max_iter=300).fit(X_train, y_train)

# Persist the fitted model. The context manager closes the file even if
# pickle.dump raises, which the manual open()/close() pair did not guarantee.
with open('saved_model.sav', 'wb') as saved_model:
    pickle.dump(classifier, saved_model)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 out of   5 | elapsed:  1.1min remaining:  1.6min
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:  1.4min finished


In [62]:
# Look at the label column: the displayed head is all 0 and the tail all 1, so the rows
# appear to be ordered by label, which is why shuffling in the train/test split matters.
df['sentiment']

0        0
1        0
2        0
3        0
4        0
        ..
24995    1
24996    1
24997    1
24998    1
24999    1
Name: sentiment, Length: 25000, dtype: int64

# 8. Model Evaluation 

In [65]:
filename = 'saved_model.sav'
# Reload the persisted model. The context manager closes the file handle, which
# the bare pickle.load(open(...)) form left open.
# NOTE: pickle.load can execute arbitrary code -- only load files this notebook wrote itself.
with open(filename, 'rb') as model_file:
    saved_classifier = pickle.load(model_file)

In [66]:
# Mean accuracy of the reloaded model on the held-out half of the data.
saved_classifier.score(X_test,y_test)

0.88664