In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
import re
from nltk.corpus import stopwords
from nltk.tokenize import RegexpTokenizer
from wordcloud import WordCloud, STOPWORDS
import spacy
from sklearn import preprocessing
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import CountVectorizer
from sklearn import metrics

In [2]:
df = pd.read_csv('../Datasets/trainData/all-data.csv', encoding = "ISO-8859-1" , header=None , names=["Sentiment", "Reviews"]) 
df.head()

Unnamed: 0,Sentiment,Reviews
0,neutral,"According to Gran , the company has no plans t..."
1,neutral,Technopolis plans to develop in stages an area...
2,negative,The international electronic industry company ...
3,positive,With the new production plant the company woul...
4,positive,According to the company 's updated strategy f...


In [3]:
nlp = spacy.load('en_core_web_sm', disable=['parser', 'ner'])

In [4]:
df['Reviews'] = df['Reviews'].apply(lambda x: " ".join(x.lower() for x in x.split()))
df.head()

Unnamed: 0,Sentiment,Reviews
0,neutral,"according to gran , the company has no plans t..."
1,neutral,technopolis plans to develop in stages an area...
2,negative,the international electronic industry company ...
3,positive,with the new production plant the company woul...
4,positive,according to the company 's updated strategy f...


In [5]:
df['Reviews'] = df['Reviews'].str.replace('[^\w\s]','')
df.head()

Unnamed: 0,Sentiment,Reviews
0,neutral,according to gran the company has no plans to...
1,neutral,technopolis plans to develop in stages an area...
2,negative,the international electronic industry company ...
3,positive,with the new production plant the company woul...
4,positive,according to the company s updated strategy fo...


In [6]:
stop = stopwords.words('english')
df['Reviews'] = df['Reviews'].apply(lambda x: " ".join(x for x in x.split() if x not in stop))
df.head()

Unnamed: 0,Sentiment,Reviews
0,neutral,according gran company plans move production r...
1,neutral,technopolis plans develop stages area less 100...
2,negative,international electronic industry company elco...
3,positive,new production plant company would increase ca...
4,positive,according company updated strategy years 20092...


In [7]:
def space(comment):
    doc = nlp(comment)
    return " ".join([token.lemma_ for token in doc])
df['Reviews']= df['Reviews'].apply(space)
df.head(20)

Unnamed: 0,Sentiment,Reviews
0,neutral,accord gran company plan move production russi...
1,neutral,technopoli plan develop stage area less 100000...
2,negative,international electronic industry company elco...
3,positive,new production plant company would increase ca...
4,positive,accord company update strategy year 20092012 b...
5,positive,finance aspocomp growth aspocomp aggressively ...
6,positive,last quarter 2010 componenta net sale double e...
7,positive,third quarter 2010 net sale increase 52 eur 20...
8,positive,operate profit rise eur 131 mn eur 87 mn corre...
9,positive,operate profit total eur 211 mn eur 186 mn 200...


In [8]:
token = RegexpTokenizer(r'[a-zA-Z0-9]+')
cv = CountVectorizer(stop_words='english',ngram_range = (1,1),tokenizer = token.tokenize)
text_counts = cv.fit_transform(df['Reviews'])
X_train, X_test, Y_train, Y_test = train_test_split(text_counts,df['Sentiment'] ,test_size=0.25, random_state=5)

In [10]:
MNB = MultinomialNB()
MNB.fit(X_train, Y_train)
#gnb.fit(data_train, target_train)

MultinomialNB()

In [11]:
predicted = MNB.predict(X_test)
accuracy_score = metrics.accuracy_score(predicted, Y_test)
print(str('{:04.2f}'.format(accuracy_score*100))+'%')

67.99%
