In [34]:
# Import data and inspect it
import pandas as pd
# Load the raw tweet dataset and preview its first rows.
# NOTE(review): presumably the default UTF-8 decoding fails on this file, hence the
# explicit 'unicode_escape' codec — confirm.
df = pd.read_csv(
    'Data/tweet_product_company.csv',
    encoding='unicode_escape',
)
df.head()

Unnamed: 0,tweet_text,emotion_in_tweet_is_directed_at,is_there_an_emotion_directed_at_a_brand_or_product
0,.@wesley83 I have a 3G iPhone. After 3 hrs twe...,iPhone,Negative emotion
1,@jessedee Know about @fludapp ? Awesome iPad/i...,iPad or iPhone App,Positive emotion
2,@swonderlin Can not wait for #iPad 2 also. The...,iPad,Positive emotion
3,@sxsw I hope this year's festival isn't as cra...,iPad or iPhone App,Negative emotion
4,@sxtxstate great stuff on Fri #SXSW: Marissa M...,Google,Positive emotion


In [35]:
# Keep only the tweet text and the sentiment label, under shorter column names.
data = (
    df[['tweet_text', 'is_there_an_emotion_directed_at_a_brand_or_product']]
    .rename(columns={
        'tweet_text': 'text',
        'is_there_an_emotion_directed_at_a_brand_or_product': 'category',
    })
)

# Strip apostrophes and commas from the labels, then fold the two non-sentiment
# labels ('No emotion toward brand or product' and 'I cant tell') into
# 'Neutral emotion'.
# The result is assigned back to the column rather than calling
# replace(..., inplace=True) on data['category']: an in-place call on a selected
# column is chained assignment, which pandas deprecates and which leaves `data`
# unchanged once Copy-on-Write is active.
data['category'] = (
    data['category']
    .replace("[',]", "", regex=True)
    .replace({
        'No emotion toward brand or product': 'Neutral emotion',
        'I cant tell': 'Neutral emotion',
    })
)
data.head()

Unnamed: 0,text,category
0,.@wesley83 I have a 3G iPhone. After 3 hrs twe...,Negative emotion
1,@jessedee Know about @fludapp ? Awesome iPad/i...,Positive emotion
2,@swonderlin Can not wait for #iPad 2 also. The...,Positive emotion
3,@sxsw I hope this year's festival isn't as cra...,Negative emotion
4,@sxtxstate great stuff on Fri #SXSW: Marissa M...,Positive emotion


In [36]:
# Number of missing values in each column
data.isna().sum()

text        1
category    0
dtype: int64

In [37]:
# Drop every row that has a missing value in any column
data = data.dropna()

In [38]:
from sklearn.model_selection import train_test_split

# Predictors (tweet text) and outcome (sentiment label)
X, y = data['text'], data['category']

# Hold out 10% of the rows as the test set
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.1, random_state=42
)

# Carve a validation set (20%) out of the remaining training rows
X_train_final, X_val, y_train_final, y_val = train_test_split(
    X_train, y_train, test_size=0.2, random_state=42
)

In [39]:
# Class distribution of the training, validation and test labels, in that order
label_counts = [labels.value_counts() for labels in (y_train_final, y_val, y_test)]
print(*label_counts, sep=' \n\n ')

Neutral emotion     4004
Positive emotion    2138
Negative emotion     403
Name: category, dtype: int64 

 Neutral emotion     984
Positive emotion    550
Negative emotion    103
Name: category, dtype: int64 

 Neutral emotion     556
Positive emotion    290
Negative emotion     64
Name: category, dtype: int64


In [40]:
# Turn each split into a DataFrame that holds the tweet text next to its label.
# Passing columns=['text', 'category'] to pd.DataFrame() together with a Series named
# 'text' only creates an all-NaN 'category' column, so the labels are attached
# explicitly; assign() aligns them with the text on the shared row index.
X_train_final = pd.DataFrame(X_train_final, columns=['text']).assign(category=y_train_final)
X_val = pd.DataFrame(X_val, columns=['text']).assign(category=y_val)
X_test = pd.DataFrame(X_test, columns=['text']).assign(category=y_test)

In [41]:
# Lowercase the tweet text in every split
for frame in (X_train_final, X_val, X_test):
    frame['text'] = frame['text'].str.lower()

In [42]:
# Strip @mentions first, then #hashtags, from the text of every split
for frame in (X_train_final, X_val, X_test):
    for pattern in ('@[A-Za-z0-9_]+', '#[A-Za-z0-9_]+'):
        frame['text'] = frame['text'].str.replace(pattern, '', regex=True)

In [43]:
# Tokenize the data
import nltk
from nltk.tokenize import RegexpTokenizer

# A token is a run of two or more word characters delimited by word boundaries,
# so single characters and punctuation are left out.
basic_token_pattern = r"(?u)\b\w\w+\b"
tokenizer = RegexpTokenizer(basic_token_pattern)

# Store the token list of each tweet in a new 'tokenized' column
for frame in (X_train_final, X_val, X_test):
    frame['tokenized'] = frame['text'].apply(tokenizer.tokenize)

In [44]:
# Remove stopwords
from nltk.corpus import stopwords

# NLTK's English stop-word list; remove_stopwords filters tokens against it
stopwords_list = stopwords.words('english')
def remove_stopwords(token_list, stop_words=None):
    """Return the tokens of ``token_list`` that are not stop words.

    Parameters
    ----------
    token_list : iterable of str
        Tokens of a single document.
    stop_words : iterable of str, optional
        Words to filter out. Defaults to the module-level ``stopwords_list``.

    Returns
    -------
    list of str
        The remaining tokens, in their original order.
    """
    if stop_words is None:
        stop_words = stopwords_list
    # Membership tests against a list scan the whole list for every token;
    # a set makes each lookup constant-time.
    if not isinstance(stop_words, (set, frozenset)):
        stop_words = frozenset(stop_words)
    return [token for token in token_list if token not in stop_words]

# Store the stop-word-free token list of each tweet in a 'no_stopwords' column
for frame in (X_train_final, X_val, X_test):
    frame['no_stopwords'] = frame['tokenized'].apply(remove_stopwords)

In [45]:
# Download the collection of tags for classifying part-of-speech
import nltk
# Fetch the 'tagsets' package first: it provides the help text printed on the next line
nltk.download('tagsets')
# Print the description and examples of every tag in the UPenn tagset (long output)
nltk.help.upenn_tagset()

$: dollar
    $ -$ --$ A$ C$ HK$ M$ NZ$ S$ U.S.$ US$
'': closing quotation mark
    ' ''
(: opening parenthesis
    ( [ {
): closing parenthesis
    ) ] }
,: comma
    ,
--: dash
    --
.: sentence terminator
    . ! ?
:: colon or ellipsis
    : ; ...
CC: conjunction, coordinating
    & 'n and both but either et for less minus neither nor or plus so
    therefore times v. versus vs. whether yet
CD: numeral, cardinal
    mid-1890 nine-thirty forty-two one-tenth ten million 0.5 one forty-
    seven 1987 twenty '79 zero two 78-degrees eighty-four IX '60s .025
    fifteen 271,124 dozen quintillion DM2,000 ...
DT: determiner
    all an another any both del each either every half la many much nary
    neither no some such that the them these this those
EX: existential there
    there
FW: foreign word
    gemeinschaft hund ich jeux habeas Haementeria Herr K'ang-si vous
    lutihaw alai je jour objets salutaris fille quibusdam pas trop Monte
    terram fiche oui corporis ...
IN: preposition or

[nltk_data] Downloading package tagsets to /Users/olga/nltk_data...
[nltk_data]   Package tagsets is already up-to-date!


In [46]:
# Tag the words from the text
from nltk import pos_tag

# Run the part-of-speech tagger on each tweet's token list and store the result in 'tagged'
for frame in (X_train_final, X_val, X_test):
    frame['tagged'] = frame['no_stopwords'].apply(pos_tag)

In [47]:
# Map the Treebank POS tags to the WordNet POS constants that the lemmatizer accepts
from nltk.corpus import wordnet

def get_wordnet_pos(treebank_tag):
    """Translate a Treebank POS tag into the matching WordNet POS constant.

    Tags starting with 'J', 'V', 'N' and 'R' map to adjective, verb, noun and
    adverb respectively; any other tag falls back to noun.
    """
    prefix_to_pos = (
        ('J', wordnet.ADJ),
        ('V', wordnet.VERB),
        ('N', wordnet.NOUN),
        ('R', wordnet.ADV),
    )
    for prefix, pos in prefix_to_pos:
        if treebank_tag.startswith(prefix):
            return pos
    # Unrecognised tags are treated as nouns
    return wordnet.NOUN
    
def tagged(token_list):
    """Replace the tag of every (word, tag) pair with its WordNet POS constant.

    Returns a new list of (word, wordnet_pos) tuples in the original order.
    """
    doc_tagged = []
    for pair in token_list:
        doc_tagged.append((pair[0], get_wordnet_pos(pair[1])))
    return doc_tagged

# Overwrite the 'tagged' column with the WordNet-compatible (word, pos) pairs
for frame in (X_train_final, X_val, X_test):
    frame['tagged'] = frame['tagged'].apply(tagged)

In [48]:
# Lemmatize the text
# One WordNet lemmatizer instance, shared by every call to lemmatize_text
lemmatizer = nltk.stem.WordNetLemmatizer()

def lemmatize_text(text):
    """Lemmatize a list of (word, WordNet POS) pairs and return the lemmas as a list."""
    lemmas = []
    for pair in text:
        lemmas.append(lemmatizer.lemmatize(pair[0], pair[1]))
    return lemmas

# Store the list of lemmas of each tweet in a 'lemmatized' column
for frame in (X_train_final, X_val, X_test):
    frame['lemmatized'] = frame['tagged'].apply(lemmatize_text)

In [49]:
# Transform lemmatized tokens into lemmatized text
def lemmatized(text):
    """Join a sequence of lemma strings into one space-separated string."""
    separator = " "
    return separator.join(text)

# Store the space-joined lemmas of each tweet in a 'lemmatized_text' column
for frame in (X_train_final, X_val, X_test):
    frame['lemmatized_text'] = frame['lemmatized'].apply(lemmatized)

In [60]:
# Export the lemmatized training text as a one-column CSV, without the row index
X_train_processed = X_train_final.loc[:, 'lemmatized_text']
X_train_processed.to_csv(path_or_buf='Data/X_train_final.csv', index=False)

In [62]:
# How often each distinct lemmatized tweet occurs in the training split
lemmatized_train = X_train_final['lemmatized_text']
lemmatized_train.value_counts()

rt google launch major new social network call circle possibly today link           18
google launch major new social network call circle possibly today link              13
google launch major new social network call circle possibly today link via          12
rt marissa mayer google connect digital amp physical world mobile link               9
rt new app store include uberguide sponsor link                                      7
                                                                                    ..
apple open popup store ipad launch genius bastard link                               1
rt quot classy fascist company america really elegant quot                           1
toolkit laptop charger iphone h2o bike amp shade walk apple ad touch rei             1
secret search sauce google bing spill bean link                                      1
guy next carefully study quot best party 11 quot spreadsheet make ipad ipad n00b     1
Name: lemmatized_text, Length: 6260, dtype:

In [61]:
# Read the exported training text back to check the file that was written.
# A separate name is used so that the raw tweet frame held in `df` (which the
# column-renaming cell reads) is not overwritten by this check; rebinding `df`
# here would make that earlier cell fail when it is re-run.
X_train_check = pd.read_csv('Data/X_train_final.csv')
# NOTE(review): the recorded output shows 6543 non-null values in 6545 rows —
# presumably tweets whose text became empty during preprocessing are read back
# as NaN; confirm and handle them before vectorizing.
X_train_check.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6545 entries, 0 to 6544
Data columns (total 1 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   lemmatized_text  6543 non-null   object
dtypes: object(1)
memory usage: 51.3+ KB


In [54]:
# Export the lemmatized validation text as a one-column CSV, without the row index
X_val_processed = X_val.loc[:, 'lemmatized_text']
X_val_processed.to_csv(path_or_buf='Data/X_val.csv', index=False)

In [55]:
# Export the lemmatized test text as a one-column CSV, without the row index
X_test_processed = X_test.loc[:, 'lemmatized_text']
X_test_processed.to_csv(path_or_buf='Data/X_test.csv', index=False)

In [56]:
# Write each label split to its own CSV file, without the row index
for labels, path in (
    (y_train_final, 'Data/y_train_final.csv'),
    (y_val, 'Data/y_val.csv'),
    (y_test, 'Data/y_test.csv'),
):
    labels.to_csv(path, index=False)