In [None]:
import pandas as pd
import numpy as np

from bs4 import BeautifulSoup
import re
import nltk
from nltk.corpus import stopwords

from sklearn.feature_extraction.text import CountVectorizer

In [None]:
# Load the tab-separated training phrases. The first row holds the column
# names, and quoting=3 (csv.QUOTE_NONE) makes the parser treat quote
# characters inside a phrase as ordinary text.
train_df = pd.read_csv('train.tsv', sep='\t', header=0, quoting=3)

train_df.head()

Unnamed: 0,PhraseId,SentenceId,Phrase,Sentiment
0,1,1,A series of escapades demonstrating the adage ...,1
1,2,1,A series of escapades demonstrating the adage ...,2
2,3,1,A series,2
3,4,1,A,2
4,5,1,series,2


In [None]:
train_df['Phrase'][0]

'A series of escapades demonstrating the adage that what is good for the goose is also good for the gander , some of which occasionally amuses but none of which amounts to much of a story .'

In [None]:
train_df['Phrase'][1]

'A series of escapades demonstrating the adage that what is good for the goose'

In [None]:
# Load the tab-separated test phrases with the same parser settings as the
# training file (header row, quoting=3 i.e. csv.QUOTE_NONE).
test_df = pd.read_csv('test.tsv', sep='\t', header=0, quoting=3)

test_df.head()

Unnamed: 0,PhraseId,SentenceId,Phrase
0,156061,8545,An intermittently pleasing but mostly routine ...
1,156062,8545,An intermittently pleasing but mostly routine ...
2,156063,8545,An
3,156064,8545,intermittently pleasing but mostly routine effort
4,156065,8545,intermittently pleasing but mostly routine


In [None]:
train_df['Phrase'].head(20)

0     A series of escapades demonstrating the adage ...
1     A series of escapades demonstrating the adage ...
2                                              A series
3                                                     A
4                                                series
5     of escapades demonstrating the adage that what...
6                                                    of
7     escapades demonstrating the adage that what is...
8                                             escapades
9     demonstrating the adage that what is good for ...
10                              demonstrating the adage
11                                        demonstrating
12                                            the adage
13                                                  the
14                                                adage
15                      that what is good for the goose
16                                                 that
17                           what is good for th

In [None]:
pd.options.display.max_colwidth = 500

In [None]:
train_df['Phrase'].head(50)

0     A series of escapades demonstrating the adage that what is good for the goose is also good for the gander , some of which occasionally amuses but none of which amounts to much of a story .
1                                                                                                                    A series of escapades demonstrating the adage that what is good for the goose
2                                                                                                                                                                                         A series
3                                                                                                                                                                                                A
4                                                                                                                                                                                           series
5                        

In [None]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [None]:
print(stopwords.words('english'))

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', '

In [None]:
len(train_df)

156060

In [None]:
len(test_df)

66292

In [None]:
# Stop-word sets already loaded from NLTK, keyed by language name.
_STOPWORD_CACHE = {}


def _stopword_set(language="english"):
    """Return NLTK's stop-word list for ``language`` as a frozenset, loading it only once."""
    if language not in _STOPWORD_CACHE:
        _STOPWORD_CACHE[language] = frozenset(stopwords.words(language))
    return _STOPWORD_CACHE[language]


def clean_words(raw_words):
    """Convert one raw phrase into a space-separated string of content words.

    Processing steps:
      1. Strip any markup with BeautifulSoup and keep the text.
      2. Replace every character outside a-z / A-Z with a space.
      3. Lower-case the text and split it on whitespace.
      4. Drop NLTK's English stop words.
      5. Join the surviving words with single spaces.

    Parameters
    ----------
    raw_words : str
        A single raw phrase.

    Returns
    -------
    str
        The cleaned phrase. This is the empty string when every token is a
        stop word or a non-letter (for example the phrase 'A').
    """
    # Name the parser explicitly: without it BeautifulSoup picks whichever
    # parser happens to be installed, so results can differ between machines.
    text = BeautifulSoup(raw_words, "html.parser").get_text()

    # Digits and punctuation all become spaces.
    letters_only = re.sub("[^a-zA-Z]", " ", text)

    words = letters_only.lower().split()

    # The stop-word set is built once and reused; this function is called
    # once per phrase, so rebuilding it on every call is wasted work.
    stops = _stopword_set()

    return " ".join(w for w in words if w not in stops)

In [None]:
clean_words(train_df['Phrase'][0])

'series escapades demonstrating adage good goose also good gander occasionally amuses none amounts much story'

In [None]:
train_df['Phrase'][0]

'A series of escapades demonstrating the adage that what is good for the goose is also good for the gander , some of which occasionally amuses but none of which amounts to much of a story .'

In [None]:
length = len(train_df['Phrase'])

length

156060

In [None]:
# Clean every training phrase. Iterating over the Series itself visits the
# rows in order, so this does not depend on the index labels being exactly
# 0..n-1 the way a `train_df['Phrase'][i]` lookup inside a range loop does.
cleaned_phrases = [clean_words(phrase) for phrase in train_df['Phrase']]

  ' Beautiful Soup.' % markup)


In [None]:
# Preview only the first few cleaned phrases; displaying the whole list
# (one entry per training row) floods the notebook output.
cleaned_phrases[:10]

['series escapades demonstrating adage good goose also good gander occasionally amuses none amounts much story',
 'series escapades demonstrating adage good goose',
 'series',
 '',
 'series',
 'escapades demonstrating adage good goose',
 '',
 'escapades demonstrating adage good goose',
 'escapades',
 'demonstrating adage good goose',
 'demonstrating adage',
 'demonstrating',
 'adage',
 '',
 'adage',
 'good goose',
 '',
 'good goose',
 '',
 'good goose',
 '',
 'good goose',
 'good',
 'goose',
 '',
 'goose',
 'goose',
 'also good gander occasionally amuses none amounts much story',
 'also good gander occasionally amuses none amounts much story',
 'also',
 'also',
 'good gander occasionally amuses none amounts much story',
 'gander occasionally amuses none amounts much story',
 'gander occasionally amuses none amounts much story',
 'gander',
 'gander',
 'gander',
 '',
 'occasionally amuses none amounts much story',
 '',
 '',
 '',
 '',
 'occasionally amuses none amounts much story',
 'occa

In [None]:
len(cleaned_phrases)

156060

In [None]:
# scikit-learn's bag-of-words featuriser, limited to a vocabulary of at most
# 5000 terms. Every other option is left at its default (word-level analyzer,
# built-in tokenizer and preprocessor, no stop-word list).
vectorizer = CountVectorizer(max_features=5000)

# fit_transform() learns the vocabulary from the cleaned phrases and, in the
# same call, encodes them as a sparse phrase-by-term count matrix. It expects
# a list of strings.
train_feat = vectorizer.fit_transform(cleaned_phrases)

# Dense NumPy copy of the same counts for the cells that inspect it.
# NOTE: this allocates one cell per (phrase, term) pair, which is large for
# this data set; the models below are fitted on the sparse matrix instead.
train_feat_array = train_feat.toarray()

In [None]:
train_feat

<156060x5000 sparse matrix of type '<class 'numpy.int64'>'
	with 508653 stored elements in Compressed Sparse Row format>

In [None]:
train_feat_array.shape

(156060, 5000)

In [None]:
# `get_feature_names()` was deprecated in scikit-learn 1.0 and removed in 1.2
# in favour of `get_feature_names_out()`. Prefer the new accessor and fall
# back to the old one so the cell also runs on older installs.
if hasattr(vectorizer, "get_feature_names_out"):
    vocab = vectorizer.get_feature_names_out()
else:
    vocab = vectorizer.get_feature_names()
len(vocab)

5000

In [None]:
# Total number of occurrences of each vocabulary term across all training
# phrases. Summing the sparse matrix column-wise gives the same totals as
# summing the dense copy, without reading the multi-gigabyte dense array.
dist = np.asarray(train_feat.sum(axis=0)).ravel()

# Show the most frequent terms instead of printing one line per vocabulary
# entry, which produced thousands of lines of truncated output.
pd.Series(dist, index=vocab, name="count").sort_values(ascending=False).head(20)

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
130 ability
176 able
34 absolute
82 absolutely
82 absorbing
75 abstract
64 absurd
50 absurdity
28 abundant
56 abuse
57 academy
46 accent
53 accents
49 accept
57 acceptable
32 access
67 accessible
28 accident
65 accomplished
24 accomplishments
42 account
43 accurate
28 accurately
26 acerbic
41 ache
29 achieve
72 achievement
69 achieves
41 achingly
25 acknowledges
156 across
241 act
179 acted
660 acting
1170 action
41 actions
286 actor
586 actors
159 actress
70 actresses
84 acts
55 actual
423 actually
38 ad
83 adam
54 adams
160 adaptation
35 adapted
106 add
34 added
66 addition
49 adds
38 adequate
26 adequately
85 admirable
70 admire
25 admission
56 admit
45 admittedly
31 adolescence
85 adolescent
26 adrenaline
101 adult
194 adults
41 advantage
245 adventure
51 adventures
48 adventurous
34 advice
92 affair
24 affected
70 affecting
104 affection
102 affirming
26 affleck
65 afraid
76 african
26 afterlife
37 afternoon
356 age


In [None]:
# Replace missing labels with 0 by assigning the result back to the column.
# Calling fillna(..., inplace=True) on `train_df['Sentiment']` is chained
# assignment: it raises a FutureWarning on pandas 2.x and leaves `train_df`
# unchanged once Copy-on-Write is enabled.
# NOTE(review): 0 is presumably also a real sentiment class, so any filled
# rows would become fabricated examples of it; confirm whether dropping such
# rows is preferable. The check below reported no missing labels afterwards.
train_df['Sentiment'] = train_df['Sentiment'].fillna(0)
train_df['Sentiment'].isna().sum()

0

In [None]:
train_feat_array.shape

(156060, 5000)

In [None]:
from sklearn.ensemble import RandomForestClassifier

# 100-tree random forest. The seed is pinned so that re-running the notebook
# grows the same trees and yields the same predictions; this matches the
# random_state=0 already used for the LinearSVC model further down.
rf = RandomForestClassifier(n_estimators = 100, random_state = 0)

# Fit on the sparse bag-of-words counts against the Sentiment labels.
# fit() returns the estimator itself, so `mod` and `rf` are the same object.
mod = rf.fit(train_feat,train_df['Sentiment'])

In [None]:
# Number of test phrases (kept as a notebook-level name, as before).
length = len(test_df['Phrase'])

# Clean every test phrase with the same function used for the training data.
# Iterating over the Series directly avoids a label lookup per row and does
# not depend on the index labels being exactly 0..n-1.
cleaned_test_phrases = [clean_words(phrase) for phrase in test_df['Phrase']]

  ' Beautiful Soup.' % markup)


In [None]:
# Preview only the first few cleaned test phrases; displaying the whole list
# (one entry per test row) floods the notebook output.
cleaned_test_phrases[:10]

['intermittently pleasing mostly routine effort',
 'intermittently pleasing mostly routine effort',
 '',
 'intermittently pleasing mostly routine effort',
 'intermittently pleasing mostly routine',
 'intermittently pleasing',
 'intermittently pleasing',
 'intermittently',
 'pleasing',
 '',
 'mostly routine',
 'mostly',
 'routine',
 'effort',
 '',
 'kidman really thing worth watching birthday girl film stage trained jez butterworth lrb mojo rrb serves yet another example sad decline british comedies post full monty world',
 'kidman',
 'really thing worth watching birthday girl film stage trained jez butterworth lrb mojo rrb serves yet another example sad decline british comedies post full monty world',
 'really thing worth watching birthday girl film stage trained jez butterworth lrb mojo rrb serves yet another example sad decline british comedies post full monty world',
 'really',
 '',
 'really',
 'thing worth watching birthday girl film stage trained jez butterworth lrb mojo rrb serve

In [None]:
# Encode the cleaned test phrases with the vocabulary already learned from
# the training data (transform only, no re-fitting).
test_feat = vectorizer.transform(cleaned_test_phrases)

# The dense `test_feat.toarray()` copy that used to be built here has been
# dropped: no cell in this notebook reads it, the models consume the sparse
# matrix directly, and it allocated one cell per (phrase, term) pair.

# Use the random forest to make sentiment label predictions
result = rf.predict(test_feat)

NameError: ignored

In [None]:
# Build the submission table: the PhraseId of every test row next to the
# label the random forest predicted for it.
output = test_df[["PhraseId"]].assign(Sentiment=result)

# Write it out as a comma-separated file without the index column;
# quoting=3 turns field quoting off.
output.to_csv("Bag_of_Words_model.csv", index=False, quoting=3)

In [None]:
from sklearn.svm import LinearSVC
from sklearn.preprocessing import normalize

In [None]:
# Rescale each row (one phrase) of the count matrices to unit L2 norm.
# norm="l2" and axis=1 are the defaults of `normalize`, spelled out here.
train_feat_norm = normalize(train_feat, norm="l2", axis=1)
test_feat_norm = normalize(test_feat, norm="l2", axis=1)

In [None]:
train_feat_norm

<156060x5000 sparse matrix of type '<class 'numpy.float64'>'
	with 508653 stored elements in Compressed Sparse Row format>

In [None]:
# Linear support-vector classifier with its random seed pinned to 0; all
# other hyper-parameters are left at the library defaults.
svm_mod = LinearSVC(random_state=0)

In [None]:
# Train the SVM on the normalised training features against the Sentiment
# labels; the fitted estimator is the cell's displayed value.
svm_mod.fit(train_feat_norm,train_df['Sentiment'])

LinearSVC(C=1.0, class_weight=None, dual=True, fit_intercept=True,
          intercept_scaling=1, loss='squared_hinge', max_iter=1000,
          multi_class='ovr', penalty='l2', random_state=0, tol=0.0001,
          verbose=0)

In [None]:
# Predict a sentiment label for every row of the normalised test features.
svm_pred = svm_mod.predict(test_feat_norm)

In [None]:
# Build the submission table: the PhraseId of every test row next to the
# label the linear SVM predicted for it.
output = test_df[["PhraseId"]].assign(Sentiment=svm_pred)

# Write it out as a comma-separated file without the index column;
# quoting=3 turns field quoting off.
output.to_csv("Bag_of_Words_SVM_model.csv", index=False, quoting=3)