# First we define some functions to preprocess the text documents before we do the training.


In [1]:
import re           
"""
    we need re for basic string pattern 
    like removing non alphabet characters in removeNonAlpha function
"""
import nltk
stop_words_set = set(nltk.corpus.stopwords.words('english'))
stemmer = nltk.stem.PorterStemmer()
lemmatizer = nltk.stem.WordNetLemmatizer()

def removeNonAlpha(s: str):
    return re.sub(r'[^A-Za-z\s]', '', s)
"""
    here because to apply three operations (remove stop words, stemming and lemmatizing)
    we need to tokenize the whole string and then concat all tokens at the end. we did all three operations in one functions
    to avoid repeating tokenizing and joining tokens ... 
"""



def removeStopWordsAndStemAndLemmatize(s: str):
    return ' '.join([stemmer.stem(lemmatizer.lemmatize(token)) for token in nltk.tokenize.word_tokenize(s) if not token in stop_words_set])
"""
    In this function we first lemmatize the token. for example it converts "cities" -> "city"
    or some irregular cases like: "mice" -> "mouse"

    Then we stemm the token. somehow it converts some tokens to their root. 
    like: several -> sever   

    And then we add it to the list only if it's not a stop word. then we join all tokens with ' ' and return the string

    Reference for preprocessing: 
        https://github.com/aravinthsci/Text-Preprocessing-in-Python/blob/master/Text_Preprocessing_in_Python.ipynb
"""





def strPreProccess(s: str):
    return removeStopWordsAndStemAndLemmatize(removeNonAlpha(s.strip().lower()))
"""
A funciton that does all the job at once
"""


'\nA funciton that does all the job at once\n'

# First the "SandersPosNeg" dataset

## Now we read the data from the file.

In [2]:

import pandas as pd

data = pd.read_csv("./SandersPosNeg.csv", sep='\t', header=None) # we have no header in dataset so set header=None
data # a preview of the dataset

Unnamed: 0,0,1
0,0,RT @cjwallace03: So apparently @apple put MB c...
1,0,RT @Jewelz2611 @mashable @apple iphones r 2 ex...
2,0,@mashable @apple iphones r 2 expensive. Most w...
3,0,THiS IS WHAT WiLL KiLL APPLE http://t.co/72Jw4...
4,4,Now all @Apple has to do is get swype on the i...
...,...,...
1219,4,RT @ahhfuckitsguss: #twitter can be so useful ...
1220,4,My 3 biggest obsessions: #twitter #dancemoms a...
1221,0,My mentions aren't showing properly ... PAY AT...
1222,4,#twitter is jumpin as usual :)


## As you see the texts need to be proccessed and get ready for training(i checked and saw it increases the accuracy about 2 to 3 percent)

### For a better understanding of dataset we rename the columns
#### And then apply the preprocessing to the tweet text column 

In [3]:
data.rename(columns={0: "label", 1: "tweet text"}, inplace=True)
data['tweet text'] = data['tweet text'].apply(strPreProccess)
data # a preview of data

Unnamed: 0,label,tweet text
0,0,rt cjwallac appar appl put mb cap sm new updat...
1,0,rt jewelz mashabl appl iphon r expens went w h...
2,0,mashabl appl iphon r expens went w htcgalaxi c...
3,0,kill appl httptcojwzc rip appl
4,4,appl get swype iphon crack iphon
...,...,...
1219,4,rt ahhfuckitsguss twitter use let feel cant re...
1220,4,biggest obsess twitter dancemom desperatehouse...
1221,0,mention arent show properli pay attent twitter
1222,4,twitter jumpin usual


## As you can see the data is now ready to train

### First we import some neccesary libraries

In [4]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import ShuffleSplit, cross_val_score
import sklearn.naive_bayes as NB

### now before we use a predicter we need to convert the text to numerical values to work on
#### To do this we use Tf-idf vectorization

In [5]:
vectorizer = tfidf_vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(data['tweet text'])
Y = data['label']



### Then we create an Multinomial Naive Bayse model instance to give to the cross_val_score function so it fits the data into given model and measure our accuracy using 10-fold-cross-validation method. 

#### (Obviously the X will be the vectors(converted from tweet texts) 
#### and Y will be labels (0 for negative and 4 for positive))

In [6]:
model = NB.MultinomialNB()
NB_result = cross_val_score(model, X, Y, cv=ShuffleSplit(10, test_size=0.2, random_state=0)).mean()
print(f'NB: {NB_result.mean()*100}')

NB: 84.08163265306122


# Now we do the same thing with the OMD dataset. 
### Except for some tiny details. for example we need to use "mac_roman" encoding.
### Also the csv file is seperated by camma which also appeare in tweet texts so setting the sep=',' would cause a mess!
### To handle the situation first we read the whole line in a step.(we will have one column)
### Next step we split it by the first camma we see(it seperates the label and tweet text)


In [7]:
data = pd.read_csv("./OMD.csv", header=None, sep='\t', encoding='mac_roman')

def getLabel(s: str) -> int:
    return int(s[:s.find(',')])

def getTweet(s: str) -> str:
    return s[s.find(','):]

data['label'] = data[0].apply(getLabel)
data['tweet text'] = data[0].apply(getTweet)
del data[0]
data


Unnamed: 0,label,tweet text
0,0,",Watching by myself #tweetdebate Not drinking..."
1,0,",@ahg3 @MichDot Yeah, slime was actually my se..."
2,0,",Preparing to have a heart attack #tweetdebate,"
3,0,",no debate moderators under 50, sorry #tweetd..."
4,0,",@current Now staring at black screen on http:..."
...,...,...
1901,4,",@Imarilove Yes, He did. I liked the eye conta..."
1902,4,",Bloggers right; mccain won because it is his ..."
1903,4,",Anyone trying to say John McCain is a liar, s..."
1904,4,",CNN post debate polling is saying the debate ..."


# And from now on everything is the same as SandersPosNeg dataset

In [8]:
data["tweet text"] = data['tweet text'].apply(strPreProccess)
data

Unnamed: 0,label,tweet text
0,0,watch tweetdeb drink wait start cring mccain b...
1,0,ahg michdot yeah slime actual second choic can...
2,0,prepar heart attack tweetdeb
3,0,debat moder sorri tweetdeb
4,0,current stare black screen httpwwwcurrentcomde...
...,...,...
1901,4,imarilov ye like eye contact debat tweetdeb
1902,4,blogger right mccain issu debat
1903,4,anyon tri say john mccain liar suffer misanthr...
1904,4,cnn post debat poll say debat went obama point...


## One tiny point! we set the alpha parameter here in our MultinomialNB to 0.49 so it gives us about 2 percent more accuracy!

In [9]:
vectorizer = tfidf_vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(data['tweet text'])
Y = data['label']

model = NB.MultinomialNB(alpha=0.49)
NB_result = cross_val_score(model, X, Y, cv=ShuffleSplit(10, test_size=0.2, random_state=0)).mean()
print(f'NB: {NB_result.mean()*100}')


NB: 75.81151832460733


In [10]:
(84+75.8)/2


79.9