In [28]:
from sklearn.feature_extraction.text import CountVectorizer

import pandas as pd

#to avoid future warning
#import warnings
#warnings.simplefilter(action='ignore', category=FutureWarning)

In [29]:
vect = CountVectorizer()

In [30]:
messages = ["Hey hey hey lets go get lunch today..!",
            "Did you go home?..",
            "Hey!!! I need a favor"]

Using the fit method, CountVectorizer() will learn which tokens are used in our messages.

In [31]:
vect.fit(messages)

CountVectorizer()

In [32]:
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# get_feature_names_out() replaces it. .tolist() keeps the plain-list display.
vect.get_feature_names_out().tolist()

['did',
 'favor',
 'get',
 'go',
 'hey',
 'home',
 'lets',
 'lunch',
 'need',
 'today',
 'you']

In [33]:
# Re-learns the vocabulary from `messages` and counts each token per message;
# .toarray() turns the result into a dense NumPy array so it can be displayed.
x_new = vect.fit_transform(messages).toarray()

In [34]:
x_new

array([[0, 0, 1, 1, 3, 0, 1, 1, 0, 1, 0],
       [1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1],
       [0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0]], dtype=int64)

In [35]:
#x_new = vect.fit_transform(messages).toarray()

In [36]:
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# get_feature_names_out() replaces it. .tolist() keeps the plain-list display.
vect.get_feature_names_out().tolist()

['did',
 'favor',
 'get',
 'go',
 'hey',
 'home',
 'lets',
 'lunch',
 'need',
 'today',
 'you']

- By using the vectorizer's feature-name method (`get_feature_names()`, or `get_feature_names_out()` in scikit-learn 1.0 and later), we can see which features
  have been created from our messages (i.e. which tokens CountVectorizer has learned).

In [37]:
# Label each count column with the token it represents so the matrix is readable.
# get_feature_names() was removed in scikit-learn 1.2; get_feature_names_out() replaces it.
df = pd.DataFrame(x_new, columns=vect.get_feature_names_out())
print(df)

   did  favor  get  go  hey  home  lets  lunch  need  today  you
0    0      0    1   1    3     0     1      1     0      1    0
1    1      0    0   1    0     1     0      0     0      0    1
2    0      1    0   0    1     0     0      0     1      0    0


### Let's read text data from  a file and try to convert it into numeric form

In [38]:
# Load the tab-separated restaurant reviews (a Review text column and a Liked label).
# Note: this rebinds `df`, replacing the token-count frame built earlier.
df = pd.read_csv('Restaurant_Reviews.tsv',sep='\t')

In [39]:
df.head()

Unnamed: 0,Review,Liked
0,Wow... Loved this place.,1
1,Crust is not good.,0
2,Not tasty and the texture was just nasty.,0
3,Stopped by during the late May bank holiday of...,1
4,The selection on the menu was great and so wer...,1


In [40]:
df.shape

(1000, 2)

In [41]:
df.Liked.value_counts()  #to display no. of samples for each unique value

1    500
0    500
Name: Liked, dtype: int64

In [42]:
import re

In [43]:
def clean(x):
    """Normalise a raw review into lowercase, letters-only text.

    Steps, in order:
      1. replace HTML-style tags (``<...>``) with a space,
      2. replace every character that is not an ASCII letter with a space,
      3. collapse runs of whitespace into a single space,
      4. strip leading/trailing whitespace and lowercase the result.

    Stop-word removal, stemming and lemmatisation are NOT performed here.

    Parameters
    ----------
    x : str
        Raw text. A missing value (``None`` or a float NaN, which pandas uses
        for empty cells) is treated as empty text.

    Returns
    -------
    str
        The cleaned text; ``''`` for empty or missing input.
    """
    # A missing cell would otherwise make re.sub raise TypeError.
    if x is None or (isinstance(x, float) and x != x):
        return ''

    # Raw strings: '\s' in a normal literal is an invalid escape sequence.
    s = re.sub(r'<.*?>', ' ', x)       # drop html tags
    s = re.sub(r'[^A-Za-z]', ' ', s)   # digits, punctuation, symbols -> space
    s = re.sub(r'\s+', ' ', s)         # squeeze repeated whitespace

    return s.strip().lower()
    
    

In [44]:
clean('hello python 123hi #@ <html>ok</html>')

'hello python hi ok'

In [45]:
# Swap every raw review for its cleaned, lowercase form.
df['Review'] = df['Review'].map(clean)

In [46]:
df.head()

Unnamed: 0,Review,Liked
0,wow loved this place,1
1,crust is not good,0
2,not tasty and the texture was just nasty,0
3,stopped by during the late may bank holiday of...,1
4,the selection on the menu was great and so wer...,1


In [47]:
# Inputs are the review strings; targets are the Liked labels.
X, y = df['Review'].values, df['Liked'].values

In [48]:
#from nltk.corpus import stopwords
import nltk
# Fetch NLTK's stop-word corpus (reports "already up-to-date" when it is present locally).
# NOTE(review): nothing in this notebook uses the corpus yet; the stop-word cells are commented out.
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\risha\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [None]:
#words = stopwords.words('english')

In [None]:
#print(words)

In [None]:
#if 'not' in words:
 #   words.remove('not')

In [52]:
#stop.append()

In [53]:
from sklearn.model_selection import train_test_split

In [54]:
# Hold out 20% of the samples for evaluation; the fixed seed keeps the split repeatable.
xtrain, xtest, ytrain, ytest = train_test_split(
    X, y, test_size=0.2, random_state=12
)

In [55]:
cv = CountVectorizer()

In [56]:
# Learn the vocabulary from the training reviews only and encode them as token counts.
cv_train = cv.fit_transform(xtrain)  #for training data

# Encode the test reviews with that same vocabulary (no re-fitting), so both
# matrices have identical columns.
cv_test = cv.transform(xtest)        #for testing data

In [57]:
#cv.get_stop_words()

In [58]:
#for i in list(cv.get_stop_words()):
#    print(i, end=',')

In [59]:
#if 'not' in list(cv.get_stop_words()):
 #   print('Yes it exists!')

In [60]:
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# get_feature_names_out() replaces it. .tolist() keeps the plain-list display.
cv.get_feature_names_out().tolist()

['about',
 'above',
 'absolute',
 'absolutely',
 'absolutley',
 'accident',
 'accommodations',
 'accomodate',
 'accordingly',
 'accountant',
 'ache',
 'acknowledged',
 'across',
 'actual',
 'actually',
 'added',
 'affordable',
 'after',
 'afternoon',
 'again',
 'ago',
 'ahead',
 'airline',
 'airport',
 'ala',
 'albondigas',
 'all',
 'allergy',
 'almonds',
 'almost',
 'alone',
 'also',
 'although',
 'always',
 'am',
 'amazing',
 'ambiance',
 'ambience',
 'amount',
 'ample',
 'an',
 'and',
 'andddd',
 'angry',
 'annoying',
 'another',
 'anticipated',
 'any',
 'anyone',
 'anything',
 'anytime',
 'anyway',
 'anyways',
 'apart',
 'apologize',
 'apology',
 'app',
 'apparently',
 'appealing',
 'appetite',
 'appetizer',
 'appetizers',
 'apple',
 'approval',
 'are',
 'area',
 'aren',
 'arepas',
 'around',
 'array',
 'arrived',
 'arrives',
 'arriving',
 'article',
 'as',
 'ask',
 'asked',
 'asking',
 'assure',
 'at',
 'ate',
 'atmosphere',
 'attack',
 'attention',
 'attentive',
 'attitudes',
 'a

In [61]:
cv_train.shape

(800, 1776)

In [62]:
# Vocabulary size; should equal the column count of cv_train.
# get_feature_names() was removed in scikit-learn 1.2; get_feature_names_out() replaces it.
len(cv.get_feature_names_out())

1776

In [63]:
from sklearn.linear_model import LogisticRegression
#from sklearn.metrics import accuracy_score

In [64]:
log = LogisticRegression()

In [65]:
# Train on the count features, then label the held-out reviews.
pred = log.fit(cv_train, ytrain).predict(cv_test)

# Mean accuracy on the test split (accuracy_score(ytest, pred) is the equivalent call).
score = log.score(cv_test, ytest)
score

0.78

In [66]:
pred

array([0, 0, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 0, 1,
       1, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0, 0,
       1, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1,
       0, 1, 1, 0, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 0, 0, 0, 1,
       1, 1, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0, 1, 1,
       1, 0, 0, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, 0, 1, 1, 1, 0, 0, 0,
       1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 1,
       0, 0, 0, 0, 1, 0, 1, 1, 1, 0, 1, 1, 1, 0, 0, 1, 1, 0, 1, 1, 1, 1,
       1, 0, 0, 1, 1, 1, 0, 1, 0, 0, 0, 1, 1, 0, 1, 1, 1, 0, 0, 1, 1, 0,
       1, 1], dtype=int64)

In [67]:
test = ['food was 123@#$ good']

In [68]:
# Put every new sentence through the same clean() applied to the training text,
# so the vectorizer sees it in the form it was fitted on.
cleaned_data = [clean(sentence) for sentence in test]

In [69]:
cleaned_data

['food was good']

In [70]:
t1 = cv.transform(cleaned_data)

In [71]:
t1.shape

(1, 1776)

In [72]:
log.predict(t1)

array([1], dtype=int64)

In [73]:
########################################### Way 2: importing a text file and performing feature extraction ##########################

In [74]:
from sklearn.feature_extraction.text import CountVectorizer

import pandas as pd

In [75]:
df =pd.read_csv('IMDB Dataset.csv')

In [76]:
df.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [77]:
df.iloc[4,0]

'Petter Mattei\'s "Love in the Time of Money" is a visually stunning film to watch. Mr. Mattei offers us a vivid portrait about human relations. This is a movie that seems to be telling us what money, power and success do to people in the different situations we encounter. <br /><br />This being a variation on the Arthur Schnitzler\'s play about the same theme, the director transfers the action to the present time New York where all these different characters meet and connect. Each one is connected in one way, or another to the next person, but no one seems to know the previous point of contact. Stylishly, the film has a sophisticated luxurious look. We are taken to see how these people live and the world they live in their own habitat.<br /><br />The only thing one gets out of all these souls in the picture is the different stages of loneliness each one inhabits. A big city is not exactly the best place in which human relations find sincere fulfillment, as one discerns is the case wit

In [78]:
df.shape

(50000, 2)

In [79]:
df.sentiment.value_counts()

positive    25000
negative    25000
Name: sentiment, dtype: int64

In [80]:
import re

In [81]:
def clean(x):
    """Normalise a raw review into lowercase, letters-only text.

    Steps, in order:
      1. replace HTML-style tags (``<...>``, e.g. ``<br />``) with a space,
      2. replace every character that is not an ASCII letter with a space,
      3. collapse runs of whitespace into a single space,
      4. strip leading/trailing whitespace and lowercase the result.

    Stop-word removal, stemming and lemmatisation are NOT performed here.

    Parameters
    ----------
    x : str
        Raw text. A missing value (``None`` or a float NaN, which pandas uses
        for empty cells) is treated as empty text.

    Returns
    -------
    str
        The cleaned text; ``''`` for empty or missing input.
    """
    # A missing cell would otherwise make re.sub raise TypeError.
    if x is None or (isinstance(x, float) and x != x):
        return ''

    # Raw strings: '\s' in a normal literal is an invalid escape sequence.
    s = re.sub(r'<.*?>', ' ', x)       # drop html tags
    s = re.sub(r'[^A-Za-z]', ' ', s)   # digits, punctuation, symbols -> space
    s = re.sub(r'\s+', ' ', s)         # squeeze repeated whitespace

    return s.strip().lower()

In [82]:
# Swap every raw review for its cleaned, lowercase form.
df['review'] = df['review'].map(clean)

In [83]:
df.head()

Unnamed: 0,review,sentiment
0,one of the other reviewers has mentioned that ...,positive
1,a wonderful little production the filming tech...,positive
2,i thought this was a wonderful way to spend ti...,positive
3,basically there s a family where a little boy ...,negative
4,petter mattei s love in the time of money is a...,positive


In [84]:
df.iloc[0].review

'one of the other reviewers has mentioned that after watching just oz episode you ll be hooked they are right as this is exactly what happened with me the first thing that struck me about oz was its brutality and unflinching scenes of violence which set in right from the word go trust me this is not a show for the faint hearted or timid this show pulls no punches with regards to drugs sex or violence its is hardcore in the classic use of the word it is called oz as that is the nickname given to the oswald maximum security state penitentary it focuses mainly on emerald city an experimental section of the prison where all the cells have glass fronts and face inwards so privacy is not high on the agenda em city is home to many aryans muslims gangstas latinos christians italians irish and more so scuffles death stares dodgy dealings and shady agreements are never far away i would say the main appeal of the show is due to the fact that it goes where other shows wouldn t dare forget pretty p

In [85]:
# Inputs are the review strings; targets are the sentiment labels.
X, y = df['review'].values, df['sentiment'].values

In [86]:
from sklearn.model_selection import train_test_split

In [87]:
# Hold out 20% of the samples for evaluation; the fixed seed keeps the split repeatable.
xtrain, xtest, ytrain, ytest = train_test_split(
    X, y, test_size=0.2, random_state=12
)

In [88]:
vect = CountVectorizer()

In [None]:
#vect.get_feature_names()

In [None]:
# Learn the vocabulary from the training reviews only and encode them as token counts.
cv_train = vect.fit_transform(xtrain)  #for training data

# Encode the test reviews with that same vocabulary (no re-fitting), so both
# matrices have identical columns.
cv_test = vect.transform(xtest)        #for testing data

In [None]:
#vect.get_stop_words()

In [23]:
cv_train.shape

NameError: name 'cv_train' is not defined

In [None]:
#len(vect.get_feature_names_out())

In [24]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

In [25]:
log = LogisticRegression()

In [26]:
# Train on the count features, then label the held-out reviews.
pred = log.fit(cv_train, ytrain).predict(cv_test)

# Share of test reviews whose predicted sentiment matches the true label.
score = accuracy_score(ytest, pred)
score

NameError: name 'cv_train' is not defined

In [None]:
# Count matrix for the complete corpus. A separate vectorizer is fitted here so that
# `vect` keeps the training-only vocabulary; re-fitting `vect` would change its columns
# and make later vect.transform(...) output incompatible with the model trained on cv_train.
x = CountVectorizer().fit_transform(df['review'])

In [27]:
x

NameError: name 'x' is not defined

In [None]:
#vect.get_feature_names()