### Loading Data

In [1]:
import numpy as np

In [2]:
import pandas as pd

In [3]:
train_data = pd.read_csv(r"C:\Users\HP\Twitter Sentiment Analysis\input\train.csv",encoding='ISO-8859-1')
train_data.shape

(99989, 3)

In [4]:
train_data

Unnamed: 0,ItemID,Sentiment,SentimentText
0,1,0,is so sad for my APL frie...
1,2,0,I missed the New Moon trail...
2,3,1,omg its already 7:30 :O
3,4,0,.. Omgaga. Im sooo im gunna CRy. I'...
4,5,0,i think mi bf is cheating on me!!! ...
...,...,...,...
99984,99996,0,@Cupcake seems like a repeating problem hop...
99985,99997,1,@cupcake__ arrrr we both replied to each other...
99986,99998,0,@CuPcAkE_2120 ya i thought so
99987,99999,1,@Cupcake_Dollie Yes. Yes. I'm glad you had mor...


In [5]:
train_data.drop_duplicates(subset='SentimentText', keep=False, inplace=True)

In [6]:
train_data

Unnamed: 0,ItemID,Sentiment,SentimentText
0,1,0,is so sad for my APL frie...
1,2,0,I missed the New Moon trail...
2,3,1,omg its already 7:30 :O
3,4,0,.. Omgaga. Im sooo im gunna CRy. I'...
4,5,0,i think mi bf is cheating on me!!! ...
...,...,...,...
99984,99996,0,@Cupcake seems like a repeating problem hop...
99985,99997,1,@cupcake__ arrrr we both replied to each other...
99986,99998,0,@CuPcAkE_2120 ya i thought so
99987,99999,1,@Cupcake_Dollie Yes. Yes. I'm glad you had mor...


### Visualizing the tweets

In [7]:
#Taking a look at random tweets to gain insights
rand_indexs = np.random.randint(1,len(train_data),50).tolist()
train_data["SentimentText"][rand_indexs]

32156    @AfricanKueen87    don't think im doin much th...
43401                       @angelzdope Don't like coffee 
91889    @colinm53 Get 100 followers a day using www.tw...
68044    @amylovegrove I'm a little drunk and warm  close?
6786     #myweakness  Is music and i live to meet the p...
65079                                       @bjandcompany 
62436     @bigwormy sounds interesting, whats that about? 
92932    @bashsash fridayyy!  so so weird. what about y...
17667              ...ready to be confused all over again 
69825    @bunnyteeth ps! would you like to make me a ba...
29556    @acummings we had rain for days on end so you ...
79332    @camillejaiden Dude.  Wow   Now I kind of sort...
9228       #bigupz to @cbondemand for knowing how to cook 
69369    @bradjward you had to get me going on a gorgeo...
16177                                  ... Baby got Bumbo 
22696                                @0mie I would never! 
12855    *spazzes* I also desperately wana see &quot;Ha.

#### Emoticons
The internet language includes so many emoticons, people also tend to create their own, so we will first analyze the emoticons included in our dataset, try to classify them as happy and said, and make sure that our model know about them.

In [8]:
#Finding what emoticons have been used in out dataset
import re
tweets_text = train_data.SentimentText.str.cat()
emos = set(re.findall(r" ([xX:;][-']?.) ",tweets_text))
emos_count = []
for emo in emos:
    emos_count.append((tweets_text.count(emo), emo))
sorted(emos_count,reverse=True)

[(3281, ':/'),
 (2874, 'x '),
 (2626, ': '),
 (1339, 'x@'),
 (1214, 'xx'),
 (1162, 'xa'),
 (984, ';3'),
 (887, 'xp'),
 (842, 'xo'),
 (713, ';)'),
 (483, 'xe'),
 (431, ';I'),
 (353, ';.'),
 (254, 'xD'),
 (251, 'x.'),
 (245, '::'),
 (234, 'X '),
 (217, ';t'),
 (209, ';s'),
 (185, ':O'),
 (176, ':3'),
 (166, ';D'),
 (159, ":'"),
 (157, 'XD'),
 (146, 'x3'),
 (142, ':p'),
 (126, ":'("),
 (118, ':@'),
 (117, 'xh'),
 (117, ':S'),
 (109, 'xm'),
 (104, ';p'),
 (104, ';-)'),
 (92, ':|'),
 (91, 'x,'),
 (89, ';P'),
 (76, 'xd'),
 (75, ';o'),
 (75, ';d'),
 (71, ':o'),
 (65, 'XX'),
 (63, ':L'),
 (59, 'Xx'),
 (59, ':1'),
 (58, ':]'),
 (57, ':s'),
 (56, ':0'),
 (54, 'XO'),
 (44, ';;'),
 (43, ';('),
 (38, ':-D'),
 (37, 'xk'),
 (36, 'XT'),
 (35, 'x?'),
 (35, 'x)'),
 (34, 'x2'),
 (33, ';/'),
 (32, 'x:'),
 (32, ':\\'),
 (31, 'x-'),
 (27, 'Xo'),
 (27, 'XP'),
 (27, ':-/'),
 (26, ':-P'),
 (25, ':*'),
 (23, 'xX'),
 (22, ":')"),
 (17, 'xP'),
 (16, ':['),
 (16, ':-p'),
 (14, 'x]'),
 (14, 'XM'),
 (13, ':-O'),
 (1

We should by now know which emoticons are used (and its frequency) to build two regex, one for the happy ones and another for the sad ones. We will then use them in the preprocessing process to mark them as using happy emoticons or sad ones.

In [9]:
HAPPY_EMO = r" ([xX;:]-?[dD)]|:-?[\)]|[;:][pP]) "
SAD_EMO = r" (:'?[/|\(]) "
print("Happy emoticons:", set(re.findall(HAPPY_EMO, tweets_text)))
print("Sad emoticons:", set(re.findall(SAD_EMO, tweets_text)))

Happy emoticons: {';d', ';-D', ';p', ':d', 'XD', ';)', 'x)', ':D', 'xd', ';P', ';D', ':p', 'xD', ';-)', ':-D'}
Sad emoticons: {':|', ":'(", ':/', ':('}


#### Most used Words
What we are going to do next is to define a function that will show us top words, so we may fix things before running our learning algorithm. This function takes as input a text and output words sorted according to their frequency, starting with the most used word.

In [10]:
import nltk
from nltk.tokenize import word_tokenize

#nltk.download('punkt')
def most_used_words(text):
    tokens = word_tokenize(text)
    frequency_dist = nltk.FreqDist(tokens)
    print("There is %d different words" % len(set(tokens)))
    return sorted(frequency_dist,key=frequency_dist.__getitem__, reverse=True)

In [11]:
most_used_words(train_data.SentimentText.str.cat())[:100]

There is 128915 different words


['@',
 '!',
 '.',
 'I',
 ',',
 'to',
 'the',
 'you',
 '?',
 'a',
 'it',
 'i',
 ';',
 'and',
 '&',
 '...',
 'my',
 'for',
 'is',
 'that',
 "'s",
 "n't",
 'in',
 'me',
 'of',
 'have',
 'on',
 'quot',
 "'m",
 'so',
 ':',
 'but',
 '#',
 'do',
 'was',
 'be',
 '..',
 'not',
 'your',
 'are',
 'just',
 'with',
 'like',
 '-',
 'at',
 '*',
 'too',
 'get',
 'good',
 'u',
 'up',
 'know',
 'all',
 'this',
 'now',
 'no',
 'we',
 'out',
 ')',
 'love',
 'lol',
 'can',
 'what',
 'one',
 '(',
 'will',
 'go',
 'about',
 'did',
 'got',
 "'ll",
 'there',
 'amp',
 'day',
 'http',
 'see',
 "'re",
 'if',
 'time',
 'they',
 'think',
 'as',
 'when',
 'from',
 'You',
 'It',
 'going',
 'really',
 'well',
 'am',
 'work',
 'had',
 'would',
 'how',
 'he',
 'here',
 'thanks',
 'some',
 '....',
 'haha']

#### Stop Words
What we can see is that stop words are the most used, but in fact they don't help us determine if a tweet is happy/sad, however, they are consuming memory and they are making the learning process slower, so we really need to get rid of them.

In [12]:
from nltk.corpus import stopwords

#nltk.download("stopwords")

mw = most_used_words(train_data.SentimentText.str.cat())
most_words = []
for w in mw:
    if len(most_words) == 1000:
        break
    if w in stopwords.words("english"):
        continue
    else:
        most_words.append(w)

There is 128915 different words


In [13]:
#Getting a look over the top 1000 words
sorted(most_words)

['!',
 '#',
 '$',
 '%',
 '&',
 "'",
 "'d",
 "'ll",
 "'m",
 "'re",
 "'s",
 "'ve",
 '(',
 ')',
 '*',
 '+',
 ',',
 '-',
 '--',
 '.',
 '..',
 '...',
 '....',
 '.....',
 '......',
 '/',
 '1',
 '10',
 '100',
 '12',
 '1st',
 '2',
 '20',
 '2nd',
 '3',
 '30',
 '30SECONDSTOMARS',
 '4',
 '5',
 '6',
 '7',
 '8',
 ':',
 ';',
 '=',
 '?',
 '@',
 'A',
 'AND',
 'Ah',
 'AlexAllTimeLow',
 'All',
 'Also',
 'Alyssa_Milano',
 'Am',
 'And',
 'Are',
 'As',
 'At',
 'Aw',
 'Awesome',
 'Aww',
 'Awww',
 'BSB',
 'Birthday',
 'But',
 'Ca',
 'Can',
 'Chris',
 'Come',
 'Congrats',
 'Cool',
 'D',
 'DM',
 'DO',
 'Damn',
 'Day',
 'Did',
 'Do',
 'Enjoy',
 'FF',
 'Follow',
 'FollowFriday',
 'For',
 'Friday',
 'Get',
 'Glad',
 'Go',
 'God',
 'Good',
 'Got',
 'Great',
 'Had',
 'Haha',
 'Happy',
 'Have',
 'He',
 'Hello',
 'Hey',
 'Hi',
 'Hope',
 'How',
 'I',
 'IS',
 'IT',
 'If',
 'Im',
 'In',
 'Is',
 'It',
 'Its',
 'July',
 'June',
 'Just',
 'Keep',
 'LA',
 'LMAO',
 'LOL',
 'LOVE',
 'Let',
 'Like',
 'Lol',
 'London',
 'Love',

#### Stemming
As we can observe, there are several words having the same meaning but have been written in a different manner,sometimes in the plural from and sometimes in the suffix. This would make our model think that they are different words,also making our vocab bigger, thus wasting memory and time for the llearning process. 

In [14]:
from nltk.stem.snowball import SnowballStemmer
from nltk.stem import WordNetLemmatizer

#nltk.download('wordnet')

def stem_tokenize(text):
    stemmer = SnowballStemmer("english")
    stemmer = WordNetLemmatizer()
    return [stemmer.lemmatize(token) for token in word_tokenize(text)]

def lemmatize_tokenize(text):
    lemmatizer = WordNetLemmatizer()
    return [lemmatizer.lemmatize(token) for token in word_tokenize(text)]

In [15]:
##Inserted later
stem_tokenize(train_data.SentimentText.str.cat())

['is',
 'so',
 'sad',
 'for',
 'my',
 'APL',
 'friend',
 '.............',
 'I',
 'missed',
 'the',
 'New',
 'Moon',
 'trailer',
 '...',
 'omg',
 'it',
 'already',
 '7:30',
 ':',
 'O',
 '..',
 'Omgaga',
 '.',
 'Im',
 'sooo',
 'im',
 'gunna',
 'CRy',
 '.',
 'I',
 "'ve",
 'been',
 'at',
 'this',
 'dentist',
 'since',
 '11',
 '..',
 'I',
 'wa',
 'suposed',
 '2',
 'just',
 'get',
 'a',
 'crown',
 'put',
 'on',
 '(',
 '30mins',
 ')',
 '...',
 'i',
 'think',
 'mi',
 'bf',
 'is',
 'cheating',
 'on',
 'me',
 '!',
 '!',
 '!',
 'T_T',
 'or',
 'i',
 'just',
 'worry',
 'too',
 'much',
 '?',
 'Juuuuuuuuuuuuuuuuussssst',
 'Chillin',
 '!',
 '!',
 'Sunny',
 'Again',
 'Work',
 'Tomorrow',
 ':',
 '-|',
 'TV',
 'Tonight',
 'handed',
 'in',
 'my',
 'uniform',
 'today',
 '.',
 'i',
 'miss',
 'you',
 'already',
 'hmmmm',
 '....',
 'i',
 'wonder',
 'how',
 'she',
 'my',
 'number',
 '@',
 '-',
 ')',
 'I',
 'must',
 'think',
 'about',
 'positive',
 '..',
 'thanks',
 'to',
 'all',
 'the',
 'hater',
 'up',
 'in',

In [16]:
#inserted later
lemmatize_tokenize(train_data.SentimentText.str.cat())

['is',
 'so',
 'sad',
 'for',
 'my',
 'APL',
 'friend',
 '.............',
 'I',
 'missed',
 'the',
 'New',
 'Moon',
 'trailer',
 '...',
 'omg',
 'it',
 'already',
 '7:30',
 ':',
 'O',
 '..',
 'Omgaga',
 '.',
 'Im',
 'sooo',
 'im',
 'gunna',
 'CRy',
 '.',
 'I',
 "'ve",
 'been',
 'at',
 'this',
 'dentist',
 'since',
 '11',
 '..',
 'I',
 'wa',
 'suposed',
 '2',
 'just',
 'get',
 'a',
 'crown',
 'put',
 'on',
 '(',
 '30mins',
 ')',
 '...',
 'i',
 'think',
 'mi',
 'bf',
 'is',
 'cheating',
 'on',
 'me',
 '!',
 '!',
 '!',
 'T_T',
 'or',
 'i',
 'just',
 'worry',
 'too',
 'much',
 '?',
 'Juuuuuuuuuuuuuuuuussssst',
 'Chillin',
 '!',
 '!',
 'Sunny',
 'Again',
 'Work',
 'Tomorrow',
 ':',
 '-|',
 'TV',
 'Tonight',
 'handed',
 'in',
 'my',
 'uniform',
 'today',
 '.',
 'i',
 'miss',
 'you',
 'already',
 'hmmmm',
 '....',
 'i',
 'wonder',
 'how',
 'she',
 'my',
 'number',
 '@',
 '-',
 ')',
 'I',
 'must',
 'think',
 'about',
 'positive',
 '..',
 'thanks',
 'to',
 'all',
 'the',
 'hater',
 'up',
 'in',

In [17]:
##inserted later
train_data.drop_duplicates(subset = 'SentimentText',keep = False, inplace = True)

In [18]:
##Inserted later
train_data['lemmatized_SentimentText'] = train_data['SentimentText'].apply(lambda x: lemmatize_tokenize(x))

In [19]:
##Inserted later
#train_data.lemmatized_SentimentText.str.cat()[:100]
revised_data = []
for i in train_data['lemmatized_SentimentText']:
    revised_data.append(i)

In [20]:
revised_data

[['is', 'so', 'sad', 'for', 'my', 'APL', 'friend', '.............'],
 ['I', 'missed', 'the', 'New', 'Moon', 'trailer', '...'],
 ['omg', 'it', 'already', '7:30', ':', 'O'],
 ['..',
  'Omgaga',
  '.',
  'Im',
  'sooo',
  'im',
  'gunna',
  'CRy',
  '.',
  'I',
  "'ve",
  'been',
  'at',
  'this',
  'dentist',
  'since',
  '11',
  '..',
  'I',
  'wa',
  'suposed',
  '2',
  'just',
  'get',
  'a',
  'crown',
  'put',
  'on',
  '(',
  '30mins',
  ')',
  '...'],
 ['i',
  'think',
  'mi',
  'bf',
  'is',
  'cheating',
  'on',
  'me',
  '!',
  '!',
  '!',
  'T_T'],
 ['or', 'i', 'just', 'worry', 'too', 'much', '?'],
 ['Juuuuuuuuuuuuuuuuussssst', 'Chillin', '!', '!'],
 ['Sunny', 'Again', 'Work', 'Tomorrow', ':', '-|', 'TV', 'Tonight'],
 ['handed',
  'in',
  'my',
  'uniform',
  'today',
  '.',
  'i',
  'miss',
  'you',
  'already'],
 ['hmmmm', '....', 'i', 'wonder', 'how', 'she', 'my', 'number', '@', '-', ')'],
 ['I', 'must', 'think', 'about', 'positive', '..'],
 ['thanks',
  'to',
  'all',
  't

In [21]:
##revised_data.drop_duplicates(subset='')

### Preparing Data

#### Bag of Words
We are going to use the Bag of Words algorithm, which basically takes a text as input, extract words from it (this is our vocabulary) to use them in the vectorization process. When a tweet comes in, it will vectorize it by counting the number of occurrences of each word in our vocabulary.

In [22]:
from sklearn.feature_extraction.text import TfidfVectorizer

#### Building the pipeline

In [23]:
from sklearn.base import TransformerMixin, BaseEstimator
from sklearn.pipeline import Pipeline

In [25]:
class TextPreProc(BaseEstimator,TransformerMixin):
    def __init__(self, use_mention=False):
        self.use_mention = use_mention
    
    def fit(self, X, y=None):
        return self
    
    def transform(self, X, y=None):
        # We can choose between keeping the mentions
        # or deleting them
        if self.use_mention:
            X = X.str.replace(r"@[a-zA-Z0-9_]* ", " @tags ")
        else:
            X = X.str.replace(r"@[a-zA-Z0-9_]* ", "")
            
        # Keeping only the word after the #
        X = X.str.replace("#", "")
        X = X.str.replace(r"[-\.\n]", "")
        # Removing HTML garbage
        X = X.str.replace(r"&\w+;", "")
        # Removing links
        X = X.str.replace(r"https?://\S*", "")
        # replace repeated letters with only two occurences
        # heeeelllloooo => heelloo
        X = X.str.replace(r"(.)\1+", r"\1\1")
        # mark emoticons as happy or sad
        X = X.str.replace(HAPPY_EMO, " happyemoticons ")
        X = X.str.replace(SAD_EMO, " sademoticons ")
        X = X.str.lower()
        return X
    

In [26]:
##inserted later
#trans(train_data.SentimentText.str.cat())
train_data

Unnamed: 0,ItemID,Sentiment,SentimentText,lemmatized_SentimentText
0,1,0,is so sad for my APL frie...,"[is, so, sad, for, my, APL, friend, ............."
1,2,0,I missed the New Moon trail...,"[I, missed, the, New, Moon, trailer, ...]"
2,3,1,omg its already 7:30 :O,"[omg, it, already, 7:30, :, O]"
3,4,0,.. Omgaga. Im sooo im gunna CRy. I'...,"[.., Omgaga, ., Im, sooo, im, gunna, CRy, ., I..."
4,5,0,i think mi bf is cheating on me!!! ...,"[i, think, mi, bf, is, cheating, on, me, !, !,..."
...,...,...,...,...
99984,99996,0,@Cupcake seems like a repeating problem hop...,"[@, Cupcake, seems, like, a, repeating, proble..."
99985,99997,1,@cupcake__ arrrr we both replied to each other...,"[@, cupcake__, arrrr, we, both, replied, to, e..."
99986,99998,0,@CuPcAkE_2120 ya i thought so,"[@, CuPcAkE_2120, ya, i, thought, so]"
99987,99999,1,@Cupcake_Dollie Yes. Yes. I'm glad you had mor...,"[@, Cupcake_Dollie, Yes, ., Yes, ., I, 'm, gla..."


In [27]:
from sklearn.model_selection import train_test_split

sentiments = train_data['Sentiment']
tweets = train_data['SentimentText']

# I get those parameters from the 'Fine tune the model' part
vectorizer = TfidfVectorizer(tokenizer=lemmatize_tokenize, ngram_range=(1,2))
pipeline = Pipeline([
    ('text_pre_processing', TextPreProc(use_mention=True)),
    ('vectorizer', vectorizer),
])

#### Splitting the data into learning and testing set

In [28]:
learn_data, test_data, sentiments_learning, sentiments_test = train_test_split(tweets, sentiments, test_size=0.3)

In [29]:
#transforming learning data from simple text into vector
learning_data = pipeline.fit_transform(learn_data)

### Selecting a model

In [32]:
from sklearn.model_selection import cross_val_score
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import BernoulliNB, MultinomialNB

lr = LogisticRegression()
bnb = BernoulliNB()
mnb = MultinomialNB()

models = {
    'logitic regression': lr,
    'bernoulliNB': bnb,
    'multinomialNB': mnb,
}

for model in models.keys():
    scores = cross_val_score(models[model], learning_data, sentiments_learning, scoring="f1", cv=10)
    print("===", model, "===")
    print("scores = ", scores)
    print("mean = ", scores.mean())
    print("variance = ", scores.var())
    models[model].fit(learning_data, sentiments_learning)[:8000]
    print("score on the learning data (accuracy) = ", accuracy_score(models[model].predict(learning_data), sentiments_learning))
    print("")

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

=== logitic regression ===
scores =  [0.8082076  0.80792273 0.81591074 0.81273454 0.81613372 0.8092978
 0.80983527 0.81401813 0.81273637 0.81819281]
mean =  0.8124989711572963
variance =  1.1639177628605304e-05


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


TypeError: 'LogisticRegression' object is not subscriptable

Choosing MultinomialNB

### Fine tuning the model

In [33]:
from sklearn.model_selection import GridSearchCV

In [34]:
grid_search_pipeline = Pipeline([
    ('text_pre_processing', TextPreProc()),
    ('vectorizer', TfidfVectorizer()),
    ('model', MultinomialNB())
])

params = [
    {
        'text_pre_processing__use_mention' : [True, False],
        'vectorizer__max_features' : [1000, 2000, 5000, 10000, 20000, None],
        'vectorizer__ngram_range' : [(1,1), (1,2)],
    },
]


In [35]:
grid_search = GridSearchCV(grid_search_pipeline, params, cv=5, scoring='f1')
grid_search.fit(learn_data, sentiments_learning)

GridSearchCV(cv=5,
             estimator=Pipeline(steps=[('text_pre_processing', TextPreProc()),
                                       ('vectorizer', TfidfVectorizer()),
                                       ('model', MultinomialNB())]),
             param_grid=[{'text_pre_processing__use_mention': [True, False],
                          'vectorizer__max_features': [1000, 2000, 5000, 10000,
                                                       20000, None],
                          'vectorizer__ngram_range': [(1, 1), (1, 2)]}],
             scoring='f1')

In [36]:
print(grid_search.best_params_)

{'text_pre_processing__use_mention': True, 'vectorizer__max_features': None, 'vectorizer__ngram_range': (1, 2)}


### Testing

In [37]:
mnb.fit(learning_data, sentiments_learning)

MultinomialNB()

In [38]:
testing_data = pipeline.transform(test_data)
mnb.score(testing_data, sentiments_test)

0.755075507550755

In [39]:
#predicting on test.csv
sub_data = pd.read_csv("C:/Users/HP/Twitter Sentiment Analysis/input/test.csv", encoding='ISO-8859-1')
sub_learning = pipeline.transform(sub_data.SentimentText)
sub = pd.DataFrame(sub_data.ItemID, columns=("ItemID", "Sentiment"))
sub["Sentiment"] = mnb.predict(sub_learning)
print(sub)

        ItemID  Sentiment
0            1          0
1            2          0
2            3          1
3            4          0
4            5          0
...        ...        ...
299984  299996          1
299985  299997          1
299986  299998          1
299987  299999          1
299988  300000          1

[299989 rows x 2 columns]


In [None]:
#
#mnb.score(sub_learning,sentiments_test )