In [1]:
import re

import pandas as pd
import nltk
nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\sakya\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

## Training our own classifier

 - ### __Preprocessing__

In [2]:
# Load the labelled training data from the working directory.
data = pd.read_csv('train.csv')

In [3]:
# Show the first row to check which columns were loaded.
data.head(1)

Unnamed: 0,textID,text,sentiment
0,cb774db0d1,"I`d have responded, if I were going",neutral


In [4]:
# Drop rows whose `text` is missing: the cleaning steps below call string
# functions on every value of this column.
data.dropna(subset=['text'], inplace=True)

Remove punctuation

In [5]:
def _strip_punctuation(tweet):
    """Return `tweet` with every character that is neither a word character nor whitespace removed."""
    return re.sub(r'[^\w\s]', '', tweet)


data['text'] = data['text'].apply(_strip_punctuation)

Remove stopwords

In [6]:
from nltk.corpus import stopwords as stopwords_corpus

# Fetch the stop-word corpus if it is missing so this cell also runs on a
# fresh environment (the setup cell only downloads 'punkt').
nltk.download('stopwords', quiet=True)

# The name `stopwords` is kept for the word collection because the test-set
# cleaning further down looks words up in it. A set gives constant-time
# membership checks instead of scanning a list for every token.
stopwords = set(stopwords_corpus.words('english'))

# Tokenize each text, drop tokens whose lower-cased form is a stop word,
# and join the remaining tokens back together with single spaces.
data.text = data.text.apply(
    lambda x: ' '.join(
        word for word in nltk.word_tokenize(x) if word.lower() not in stopwords
    )
)

Remove links

In [7]:
def _strip_links(tweet):
    """Delete each run that starts at 'http' (optionally preceded by '(') and extends to the next whitespace."""
    return re.sub(r'\(?http\S+', '', tweet)


data['text'] = data['text'].apply(_strip_links)

Stemming and Lemmatization

In [8]:
from nltk.stem import PorterStemmer
# Porter stemmer instance, used in the next cell to demonstrate stemming.
porter_stemmer = PorterStemmer()

In [9]:
# Small demo sentence; `text` is reused by the lemmatization demo below.
text = nltk.word_tokenize('women run running runs ran')

# Stem every token of the demo sentence and display the result.
stemmed = list(map(porter_stemmer.stem, text))
stemmed

['women', 'run', 'run', 'run', 'ran']

In [10]:
# WordNetLemmatizer reads the 'wordnet' corpus; download it when absent so a
# fresh environment does not fail with a LookupError on first use.
nltk.download('wordnet', quiet=True)

lemmatizer = nltk.WordNetLemmatizer()

# Lemmatize the same demo tokens as the stemming cell for comparison.
# lemmatize() is called without a part-of-speech argument, so its default applies.
lemmatized = [lemmatizer.lemmatize(word) for word in text]
lemmatized

['woman', 'run', 'running', 'run', 'ran']

In [11]:
def _lemmatize_tweet(tweet):
    """Tokenize `tweet`, lemmatize each token, and re-join the tokens with single spaces."""
    tokens = nltk.word_tokenize(tweet)
    return ' '.join(lemmatizer.lemmatize(token) for token in tokens)


data['text'] = data['text'].apply(_lemmatize_tweet)

 - ### __Training a naive bayes sentiment classifier__

In [12]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB, GaussianNB
from sklearn import metrics

Use CountVectorizer to turn the cleaned text into a matrix of token counts that the classifier can learn from.

In [13]:
# CountVectorizer with all default settings; it is fitted on the training
# text in the next cell and reused unchanged for the test text.
vectorizer = CountVectorizer()

In [14]:
# NOTE(review): this repeats the dropna done right after loading; the cleaning
# steps in between all return strings, so it presumably removes nothing.
data.dropna(subset=['text'], inplace=True)
# Learn the vocabulary from the cleaned training text and encode it as X.
X = vectorizer.fit_transform(data.text)

Instantiate a multinomial naive Bayes model, then fit it.

In [15]:
# Multinomial naive Bayes classifier with default hyper-parameters.
nb = MultinomialNB()

# fit(features, labels): X is the matrix built from the training text and the
# `sentiment` column is the class to predict. The call is left as the last
# expression so the fitted estimator is displayed.
nb.fit(X=X, y=data['sentiment'])

MultinomialNB()

### Apply the same preprocessing to the test data, then transform it with the CountVectorizer that was fitted on the training data.

In [16]:
# Load the test set from the working directory (per the preview below it has
# `textID` and `text` but no `sentiment` column).
dataTest = pd.read_csv('test.csv')

In [17]:
# Show the first test row to check which columns were loaded.
dataTest.head(1)

Unnamed: 0,textID,text
0,f87dea47db,Last session of the day http://twitpic.com/67ezh


In [18]:
# Drop test rows whose `text` is missing: the cleaning steps below call
# string functions on every value of this column.
dataTest.dropna(subset=['text'], inplace=True)

In [19]:
# Same punctuation stripping as for the training text: remove every character
# that is neither a word character nor whitespace.
dataTest['text'] = dataTest['text'].map(
    lambda tweet: re.sub(r'[^\w\s]', '', tweet)
)

In [20]:
def _drop_stopwords(tweet):
    """Tokenize `tweet` and re-join only the tokens whose lower-cased form is not in `stopwords`."""
    kept = []
    for token in nltk.word_tokenize(tweet):
        if token.lower() in stopwords:
            continue
        kept.append(token)
    return ' '.join(kept)


dataTest['text'] = dataTest['text'].apply(_drop_stopwords)

In [21]:
# Same link removal as for the training text: delete each run that starts at
# 'http' (optionally preceded by '(') and extends to the next whitespace.
link_pattern = re.compile(r'\(?http\S+')
dataTest['text'] = dataTest['text'].map(lambda tweet: link_pattern.sub('', tweet))

In [22]:
# Same lemmatization as for the training text: tokenize, lemmatize each token
# with the shared `lemmatizer`, and re-join with single spaces.
dataTest['text'] = dataTest['text'].map(
    lambda tweet: ' '.join(map(lemmatizer.lemmatize, nltk.word_tokenize(tweet)))
)

In [23]:
# transform() only, no fitting: the test text is encoded with the vocabulary
# learned from the training text, so Y has the same columns as X.
# Note: despite its name, Y holds the test feature matrix, not labels.
Y = vectorizer.transform(dataTest.text)

In [24]:
# Shape of the test matrix; its column count should match X.
Y.shape


(3534, 26934)

In [25]:
# Shape of the training matrix, for comparison with Y.
X.shape

(27480, 26934)

### Do the prediction

In [26]:
# Predict one sentiment label per row of the test matrix.
predicted = nb.predict(Y)

In [27]:
# Sanity check: there should be one prediction per test row.
predicted.shape

(3534,)

In [28]:
# Output frame: every column of the test data except the cleaned text.
dfOutput = dataTest.drop(columns=['text'])

In [29]:
# Preview what remains after dropping the text column.
dfOutput.head(5)

Unnamed: 0,textID
0,f87dea47db
1,96d74cb729
2,eee518ae67
3,01082688c6
4,33987a8ee5


In [30]:
# Attach the predictions as a new column. `predicted` was computed from the
# same dataTest rows that dfOutput was derived from, so the rows line up.
# NOTE(review): assumes nb.predict returns a plain array that is assigned by
# position rather than by index label; confirm.
dfOutput['sentiment'] = predicted

In [31]:
# Write the output frame to Prediction.csv in the working directory, without
# the DataFrame index.
dfOutput.to_csv('Prediction.csv', index=False)

In [32]:
# Final check of the output dimensions (rows, columns).
dfOutput.shape

(3534, 2)