#      ****     SENTIMENT ANALYSIS OF TWEETS       ****

In [12]:
import re
import string
import numpy as np
import pandas as pd
from porter_stemmer import PorterStemmer
from sklearn.feature_extraction import stop_words
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.stop_words import ENGLISH_STOP_WORDS



### Data Reading

In [None]:
# Load the raw tweet CSV. `names` supplies the six column labels; only the
# sentiment label and the tweet text are used later, so the other four
# columns are dropped straight away.
data = pd.read_csv(
    "./input_data/tweet_data.csv",
    names=["sentiment", "date", "user", "xgfg", "dxgfd", "tweet"],
    encoding='latin-1',
)
data = data.drop(columns=["date", "user", "xgfg", "dxgfd"])

# Preview the first ten rows. As the last expression of the cell this is
# rendered by the notebook, so no print() is needed.
data.head(10)

### Cleaning of data

In [8]:
def processTweet(tweet, stemmer=None, stopwords=None):
    """Normalise a raw tweet into a space-separated string of stemmed tokens.

    Steps, in order: strip HTML entities, @mentions and $tickers; lowercase;
    strip hyperlinks; replace punctuation (except '@') with spaces; drop
    words of one or two characters; drop stop words; stem what is left.
    Hashtag words are kept: only the '#' itself goes away with the
    punctuation.

    Args:
        tweet: Raw tweet text.
        stemmer: Optional object exposing ``stem(word)``. Defaults to a new
            ``PorterStemmer()``; pass one in to reuse it across many tweets.
        stopwords: Optional container of lowercase words to discard.
            Defaults to ``stop_words.ENGLISH_STOP_WORDS``.

    Returns:
        The cleaned tweet as one string, tokens joined by a single space,
        with no leading or trailing whitespace ('' when nothing survives).
    """
    if stemmer is None:
        stemmer = PorterStemmer()
    if stopwords is None:
        # NOTE(review): relies on the file's `stop_words` import; newer
        # scikit-learn releases expose this set from
        # sklearn.feature_extraction.text instead -- confirm the pinned version.
        stopwords = stop_words.ENGLISH_STOP_WORDS

    # Remove HTML special entities (e.g. &amp;)
    tweet = re.sub(r'&\w*;', '', tweet)

    # Remove @username mentions
    tweet = re.sub(r'@\S+', '', tweet)

    # Remove tickers (e.g. $AAPL)
    tweet = re.sub(r'\$\w*', '', tweet)

    # To lowercase
    tweet = tweet.lower()

    # Remove hyperlinks. Matching up to the next whitespace keeps the pattern
    # from swallowing ordinary words that merely precede a later '/', and
    # also catches URLs that have no path component.
    tweet = re.sub(r'https?://\S+', '', tweet)

    # Replace punctuation with a space so that 's, 't, 've become separate
    # short words for the filter below. '@' is deliberately left alone.
    # re.escape keeps ']', '\\', '^' and '-' literal inside the class.
    punctuation = re.escape(string.punctuation.replace('@', ''))
    tweet = re.sub('[' + punctuation + ']+', ' ', tweet)

    # Remove words with 2 or fewer letters
    tweet = re.sub(r'\b\w{1,2}\b', '', tweet)

    # split() with no argument collapses every run of whitespace (including
    # newlines) and never yields empty tokens, so no separate whitespace
    # clean-up is required.
    kept = [word for word in tweet.split() if word not in stopwords]

    # Stem the remaining words and stitch them back together.
    return ' '.join(stemmer.stem(word) for word in kept)




# Clean every tweet, preserving row order. Iterating the 'tweet' column
# directly avoids DataFrame.iterrows(), which builds a full Series for each
# row only to read a single field from it.
processed_data = [processTweet(raw_tweet) for raw_tweet in data['tweet']]

In [11]:
# Attach the cleaned text as a new column, lined up row-for-row with the
# frame's own index, then preview the first rows.
data['processed'] = pd.Series(processed_data, index=data.index)
data.head()

Unnamed: 0,sentiment,tweet,processed
0,0,"@switchfoot http://twitpic.com/2y1zl - Awww, t...",awww bummer shoulda got david carr dai
1,0,is upset that he can't update his Facebook by ...,upset updat facebook tex result school todai b...
2,0,@Kenichan I dived many times for the ball. Man...,dive time ball manag save rest bound
3,0,my whole body feels itchy and like its on fire,bodi feel itchi like
4,0,"@nationwideclass no, it's not behaving at all....",behav mad


### Train-Test-Validation Split

In [6]:
# Fixed seed so the train/validation/test partition is identical on every
# run of the notebook.
RANDOM_STATE = 42

X = data['processed']
Y = data['sentiment']

# Hold out 20% for testing, then take 25% of the remaining 80% for
# validation: a 60/20/20 train/validation/test split overall.
X_train_val, X_test, Y_train_val, Y_test = train_test_split(
    X, Y, test_size=0.2, random_state=RANDOM_STATE)
X_train, X_val, Y_train, Y_val = train_test_split(
    X_train_val, Y_train_val, test_size=0.25, random_state=RANDOM_STATE)


In [9]:
# def tokenize(str):
#     return str.split(" ")

# def stemming(list_of_tokens):
#     ps = PorterStemmer()
#     return [ ps.stem(word) for word in list_of_tokens ] 

# word_list = []
# for st in data.iloc[:10,1]:
#     tokens = tokenize(st)
#     stemmed = stemming(tokens)
#     for wd in stemmed:
#         if wd in ENGLISH_STOP_WORDS:
# #             print wd
#             stemmed.remove(wd)
#         if wd == '':
#             stemmed.remove(wd)
#     word_list += [stemmed]
    
# print word_list

In [7]:
# def remove_pattern(input_txt, pattern): 
#     r = re.findall(pattern, input_txt)
#     for i in r:
#         input_txt = re.sub(i, '', input_txt)
        
#     return input_txt

# # remove twitter handles (@user)
# data.iloc[:10,1] = np.vectorize(remove_pattern)(data.iloc[:10,1], "@[\w]*")
# print (data.iloc[:10,1])

# # # remove url
# # data.iloc[:10,1] = np.vectorize(remove_pattern)(data.iloc[:10,1],)# "^https?:\/\/.*[\r\n]*")#, text, flags=re.MULTILINE))
# # print (data.iloc[:10,1])

# # remove special characters, numbers, punctuations
# data.iloc[:10,1] = data.iloc[:10,1].str.replace("[^a-zA-Z#]", " ")
# print (data.iloc[:10,1])