In [0]:
import pandas as pd
import numpy as np
from nltk.tokenize import word_tokenize
from nltk import pos_tag
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.preprocessing import LabelEncoder
from collections import defaultdict
from nltk.corpus import wordnet as wn
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn import model_selection, naive_bayes, svm
from sklearn.metrics import accuracy_score

In [0]:
np.random.seed(500)  #random seed 

In [0]:
Corpus_csv = pd.read_csv("imdb_tr.csv",encoding='latin-1') #read csv file

In [0]:
Corpus_csv['text'].dropna(inplace=True)# All text to lower case. This is required as python interprets 'dog' and 'DOG' differently
Corpus_csv['text'] = [entry.lower() for entry in Corpus_csv['text']]# broken into words
Corpus_csv['text']= [word_tokenize(entry) for entry in Corpus_csv['text']]# Remove Stop words, Non-Numeric and perfom Word Stemming/Lemmenting.# WordNetLemmatizer requires Pos tags to understand if the word is noun or verb or adjective etc. By default it is set to Noun
tagmap = defaultdict(lambda : wn.NOUN) #default  dictionary
tagmap['J'] = wn.ADJ
tagmap['V'] = wn.VERB
tagmap['R'] = wn.ADV
for index,entry in enumerate(Corpus_csv['text']):
    # Declaring Empty List to store the words that follow the rules for this step
    Final_words = []
    # Initializing WordNetLemmatizer() converting to its base form
    word_Lemmatized = WordNetLemmatizer()
    # pos_tag function below will provide the 'tag' i.e if the word is Noun(N) or Verb(V) or something else.
    for word, tag in pos_tag(entry):
        # Below condition is to check for Stop words and consider only alphabets
        if word not in stopwords.words('english') and word.isalpha():
            word_Final = word_Lemmatized.lemmatize(word,tagmap[tag[0]])
            Final_words.append(word_Final)
    # The final processed set of words for each iteration will be stored in 'text_final'
    Corpus_csv.loc[index,'text_final'] = str(Final_words)

In [0]:
Train_X, Test_X, Train_Y, Test_Y = model_selection.train_test_split(Corpus_csv['text_final'],Corpus_csv['polarity'],test_size=0.3)

In [0]:
Encoder_le = LabelEncoder() #convert to machine readable form
Train_Y = Encoder_le.fit_transform(Train_Y) 
Test_Y = Encoder_le.fit_transform(Test_Y)

In [0]:
Tfidf_vect = TfidfVectorizer(max_features=5000) #to convert text into features
Tfidf_vect.fit(Corpus_csv['text_final']) 
Train_X_Tfidf = Tfidf_vect.transform(Train_X) #to transform and replace it with new 
Test_X_Tfidf = Tfidf_vect.transform(Test_X)  #to transform and replace it with new 

In [0]:
print(Tfidf_vect.vocabulary_) #print new features



In [0]:
print(Train_X_Tfidf) #print occurence

  (0, 4983)	0.0688707175636
  (0, 4965)	0.0536143018215
  (0, 4961)	0.0451777046432
  (0, 4954)	0.0338393780946
  (0, 4939)	0.0677421285491
  (0, 4938)	0.123819925129
  (0, 4928)	0.0747664497376
  (0, 4923)	0.0580398233481
  (0, 4897)	0.0883322164721
  (0, 4879)	0.0537354293088
  (0, 4822)	0.0538190922559
  (0, 4739)	0.0722635464183
  (0, 4738)	0.0561310261182
  (0, 4737)	0.056440178731
  (0, 4684)	0.0878176988072
  (0, 4651)	0.034262260861
  (0, 4608)	0.0737632198552
  (0, 4566)	0.122775709462
  (0, 4509)	0.0671495652615
  (0, 4508)	0.047391728512
  (0, 4501)	0.0382373531128
  (0, 4493)	0.0314654345785
  (0, 4469)	0.0720131132586
  (0, 4422)	0.047859288594
  (0, 4312)	0.0786764684533
  :	:
  (17499, 475)	0.0370312691311
  (17499, 440)	0.0488375375833
  (17499, 437)	0.0349771503792
  (17499, 431)	0.0472179832715
  (17499, 430)	0.0317707068144
  (17499, 397)	0.0295432300952
  (17499, 385)	0.068295893506
  (17499, 381)	0.0494355759187
  (17499, 333)	0.0160559525686
  (17499, 316)	0.02412