In [1]:
import pandas as pd  
data=pd.read_csv('data.csv' , encoding='latin-1')
data.head()

Unnamed: 0,sentiment,text
0,neutral,Technopolis plans to develop in stages an area...
1,negative,The international electronic industry company ...
2,positive,With the new production plant the company woul...
3,positive,According to the company 's updated strategy f...
4,positive,FINANCING OF ASPOCOMP 'S GROWTH Aspocomp is ag...


In [2]:
import re
def preprocessor(text):
    text = re.sub('<[^>]*>', '', text)
    emoticons = re.findall('(?::|;|=)(?:-)?(?:\)|\(|D|P)', text)
    text = (re.sub('[\W]+', ' ', text.lower()) +' '.join(emoticons).replace('-', ''))
    return text

In [3]:
data['text'] = data['text'].apply(preprocessor)

In [4]:
 def tokenizer(text):
         return text.split()

In [5]:
from nltk.stem.porter import PorterStemmer
porter = PorterStemmer()
def tokenizer_porter(text):
     return [porter.stem(word) for word in text.split()]


In [6]:
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Sriya\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [7]:
from nltk.corpus import stopwords
stop = stopwords.words('english')

In [8]:
data

Unnamed: 0,sentiment,text
0,neutral,technopolis plans to develop in stages an area...
1,negative,the international electronic industry company ...
2,positive,with the new production plant the company woul...
3,positive,according to the company s updated strategy fo...
4,positive,financing of aspocomp s growth aspocomp is agg...
...,...,...
4840,negative,london marketwatch share prices ended lower in...
4841,neutral,rinkuskiai s beer sales fell by 6 5 per cent t...
4842,negative,operating profit fell to eur 35 4 mn from eur ...
4843,negative,net sales of the paper segment decreased to eu...


In [9]:
from sklearn.model_selection import train_test_split
X_train, X_test, Y_train, Y_test = train_test_split(data['text'], data['sentiment'], test_size=0.25, random_state=5)

In [10]:
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import TfidfVectorizer

# Tf-idf vectorizer with Logistic regression

In [11]:
tfidf = TfidfVectorizer(strip_accents=None,lowercase=False,preprocessor=None)
param_grid = [{'vect__ngram_range': [(1,1)], 'vect__stop_words': [stop, None],'vect__tokenizer': [tokenizer,tokenizer_porter],'clf__penalty': ['l1', 'l2'],'clf__C': [1.0, 10.0,100.0]}]
lr_tfidf = Pipeline([('vect', tfidf),('clf',LogisticRegression(random_state=0))])
gs_lr_tfidf = GridSearchCV(lr_tfidf, param_grid, scoring='accuracy', cv=5, verbose=1, n_jobs=-1)
gs_lr_tfidf.fit(X_train, Y_train)

Fitting 5 folds for each of 24 candidates, totalling 120 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:   10.4s
[Parallel(n_jobs=-1)]: Done 120 out of 120 | elapsed:   49.8s finished


GridSearchCV(cv=5, error_score='raise-deprecating',
             estimator=Pipeline(memory=None,
                                steps=[('vect',
                                        TfidfVectorizer(analyzer='word',
                                                        binary=False,
                                                        decode_error='strict',
                                                        dtype=<class 'numpy.float64'>,
                                                        encoding='utf-8',
                                                        input='content',
                                                        lowercase=False,
                                                        max_df=1.0,
                                                        max_features=None,
                                                        min_df=1,
                                                        ngram_range=(1, 1),
                                         

In [12]:
print('Best parameter set: %s ' % gs_lr_tfidf.best_params_)

Best parameter set: {'clf__C': 10.0, 'clf__penalty': 'l1', 'vect__ngram_range': (1, 1), 'vect__stop_words': None, 'vect__tokenizer': <function tokenizer at 0x00000210AC8AFA68>} 


In [13]:
print('CV Accuracy: %.3f'% gs_lr_tfidf.best_score_)

CV Accuracy: 0.766


In [14]:
clf = gs_lr_tfidf.best_estimator_
print('Test Accuracy: %.3f' % clf.score(X_test, Y_test))

Test Accuracy: 0.769


# Naive Bayes Classifiers

# Multinomial NB

In [15]:
from sklearn.feature_extraction.text import CountVectorizer
from nltk.tokenize import RegexpTokenizer
token = RegexpTokenizer(r'[a-zA-Z0-9]+')
cv = CountVectorizer(stop_words='english',ngram_range = (1,1),tokenizer = token.tokenize)
text_counts = cv.fit_transform(data['text'])

In [16]:
from sklearn.model_selection import train_test_split
X_train, X_test, Y_train, Y_test = train_test_split(text_counts, data['sentiment'], test_size=0.25, random_state=5)

In [17]:
from sklearn.naive_bayes import MultinomialNB

In [18]:
MNB = MultinomialNB()
MNB.fit(X_train, Y_train)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [19]:
from sklearn import metrics
predicted = MNB.predict(X_test)
accuracy_score = metrics.accuracy_score(predicted, Y_test)

In [20]:
print(str('{:04.2f}'.format(accuracy_score*100))+'%')

67.82%


# Multinomial NB with 2_grams

In [21]:
#from sklearn.feature_extraction import CountVectorizer
#from nltk.tokenize import RegexpTokenizer
#token = RegexpTokenizer(r'[A-Za-z0-9]+')
cv = CountVectorizer(stop_words='english', ngram_range = (2,2), tokenizer = token.tokenize)
text_counts = cv.fit_transform(data['text'])

#from sklearn.model_selection import train_test_split()
X_train, X_test, Y_train, Y_test = train_test_split(text_counts, data['sentiment'],test_size=0.25, random_state=5)

#Defining the model-> we will use MultinomialNB

#Compiling the model -> We will import precompiled MNB from sklearn library
#from sklearn.naive_bayes import MultinomialNB 

#Fitting the model
MNB = MultinomialNB()
MNB.fit(X_train, Y_train)

#Evaulating the model
#form sklearn import metrics
accuracy_score = metrics.accuracy_score(MNB.predict(X_test), Y_test)
print(str('{:04.2f}'.format(accuracy_score*100))+'%')

46.86%


# Multinomial NB with 3_grams 

In [22]:
cv = CountVectorizer(stop_words='english', ngram_range = (3,3), tokenizer = token.tokenize)
text_counts = cv.fit_transform(data['text'])
X_train, X_test, Y_train, Y_test = train_test_split(text_counts, data['sentiment'],test_size=0.25, random_state=5)
MNB = MultinomialNB()
MNB.fit(X_train, Y_train)
accuracy_score = metrics.accuracy_score(MNB.predict(X_test), Y_test)
print(str('{:04.2f}'.format(accuracy_score*100))+'%')

38.45%


# Complement NB

In [32]:
cv = CountVectorizer(stop_words='english', ngram_range=(1,1), tokenizer=token.tokenize)
text_count = cv.fit_transform(data['text'])


X_train, X_test, Y_train, Y_test = train_test_split(text_count, data['sentiment'], test_size=0.25, random_state=2)


from sklearn.naive_bayes import ComplementNB


CNB = ComplementNB()
CNB.fit(X_train, Y_train)


accuracy_score = metrics.accuracy_score(CNB.predict(X_test),Y_test)

print(str('{:4.2f}'.format(accuracy_score*100))+'%')

67.82%


# Gaussian NB

In [33]:
from sklearn.naive_bayes import GaussianNB
GNB = GaussianNB()
GNB.fit(X_train.todense(), Y_train)
accuracy_score = metrics.accuracy_score(CNB.predict(X_test),Y_test)

print('GNB accuracy = ' + str('{:4.2f}'.format(accuracy_score*100))+'%')

GNB accuracy = 67.82%


# Bernoulli NB

In [25]:
from sklearn.naive_bayes import BernoulliNB
BNB = BernoulliNB()
BNB.fit(X_train, Y_train)
accuracy_score_bnb = metrics.accuracy_score(BNB.predict(X_test),Y_test)
print('BNB accuracy = ' + str('{:4.2f}'.format(accuracy_score_bnb*100))+'%')

BNB accuracy = 67.57%


# Using tf-idf vectorizer

In [26]:
from sklearn.feature_extraction.text import TfidfVectorizer
tfidf = TfidfVectorizer()
text_count_2 = tfidf.fit_transform(data['text'])

#splitting the data in test and training
#from sklearn.model_selection() import train_test_split()
x_train, x_test, y_train, y_test = train_test_split(text_count_2, data['sentiment'],test_size=0.25,random_state=5)

#defining the model
#compilimg the model -> we are going to use already used models GNB, MNB, CNB, BNB
#fitting the model
MNB.fit(x_train, y_train)
accuracy_score_mnb = metrics.accuracy_score(MNB.predict(x_test), y_test)
print('accuracy_score_mnb = '+str('{:4.2f}'.format(accuracy_score_mnb*100))+'%')

BNB.fit(x_train, y_train)
accuracy_score_bnb = metrics.accuracy_score(BNB.predict(x_test), y_test)
print('accuracy_score_bnb = '+str('{:4.2f}'.format(accuracy_score_bnb*100))+'%')

CNB.fit(x_train, y_train)
accuracy_score_cnb = metrics.accuracy_score(CNB.predict(x_test), y_test)
print('accuracy_score_cnb = '+str('{:4.2f}'.format(accuracy_score_cnb*100))+'%')

GNB.fit(x_train.todense(), y_train)
accuracy_score_gnb = metrics.accuracy_score(GNB.predict(x_test.todense()), y_test)
print('accuracy_score_gnb = '+str('{:4.2f}'.format(accuracy_score_gnb*100))+'%')

accuracy_score_mnb = 64.69%
accuracy_score_bnb = 67.90%
accuracy_score_cnb = 69.72%
accuracy_score_gnb = 55.78%


# Using SVC  with SGD 

In [27]:
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.svm import LinearSVC
SGDC = SGDClassifier()
LSVC = LinearSVC()

#on TF-IDF data
LSVC.fit(x_train, y_train)
accuracy_score_lsvc = metrics.accuracy_score(LSVC.predict(x_test), y_test)
print('accuracy_score_lsvc = '+str('{:4.2f}'.format(accuracy_score_lsvc*100))+'%')

SGDC.fit(x_train, y_train)
accuracy_score_sgdc = metrics.accuracy_score(SGDC.predict(x_test), y_test)
print('accuracy_score_sgdc = '+str('{:4.2f}'.format(accuracy_score_sgdc*100))+'%')

#on CountVectorize data
LSVC.fit(X_train, Y_train)
accuracy_score_lsvc_CV = metrics.accuracy_score(LSVC.predict(X_test), Y_test)
print('accuracy_score_lsvc_cv = '+str('{:4.2f}'.format(accuracy_score_lsvc_CV*100))+'%')

SGDC.fit(X_train, Y_train)
accuracy_score_sgdc_CV = metrics.accuracy_score(SGDC.predict(X_test), Y_test)
print('accuracy_score_sgdc_cv = '+str('{:4.2f}'.format(accuracy_score_sgdc_CV*100))+'%')

accuracy_score_lsvc = 76.73%
accuracy_score_sgdc = 77.06%
accuracy_score_lsvc_cv = 73.60%
accuracy_score_sgdc_cv = 72.61%
