# Project 4 Notebook: NLP Classification

In [1]:
import os
import sys
module_path = os.path.abspath(os.path.join(os.pardir, os.pardir))
if module_path not in sys.path:
    sys.path.append(module_path)
    
import pandas as pd
import nltk
from sklearn.model_selection import train_test_split
from nltk.probability import FreqDist
from nltk.corpus import stopwords
from nltk.tokenize import regexp_tokenize, word_tokenize, RegexpTokenizer
import matplotlib.pyplot as plt
import string
import re

In [2]:
#Creation of Dataframe

tweets = pd.read_csv('./data/tweets.csv', encoding='unicode_escape')
tweets.shape

(9093, 3)

In [3]:
tweets.head()

Unnamed: 0,tweet_text,emotion_in_tweet_is_directed_at,is_there_an_emotion_directed_at_a_brand_or_product
0,.@wesley83 I have a 3G iPhone. After 3 hrs twe...,iPhone,Negative emotion
1,@jessedee Know about @fludapp ? Awesome iPad/i...,iPad or iPhone App,Positive emotion
2,@swonderlin Can not wait for #iPad 2 also. The...,iPad,Positive emotion
3,@sxsw I hope this year's festival isn't as cra...,iPad or iPhone App,Negative emotion
4,@sxtxstate great stuff on Fri #SXSW: Marissa M...,Google,Positive emotion


In [4]:
tweets.loc[0].tweet_text

'.@wesley83 I have a 3G iPhone. After 3 hrs tweeting at #RISE_Austin, it was dead!  I need to upgrade. Plugin stations at #SXSW.'

In [5]:
#Creation of list containing Classification

classif = ['Positive', 'Negative', 'Neutral']

In [6]:
#Creation of predictied classification column

tweets['predicted_classif'] = [[] for x in tweets['tweet_text']]

In [7]:
tweets.head()

Unnamed: 0,tweet_text,emotion_in_tweet_is_directed_at,is_there_an_emotion_directed_at_a_brand_or_product,predicted_classif
0,.@wesley83 I have a 3G iPhone. After 3 hrs twe...,iPhone,Negative emotion,[]
1,@jessedee Know about @fludapp ? Awesome iPad/i...,iPad or iPhone App,Positive emotion,[]
2,@swonderlin Can not wait for #iPad 2 also. The...,iPad,Positive emotion,[]
3,@sxsw I hope this year's festival isn't as cra...,iPad or iPhone App,Negative emotion,[]
4,@sxtxstate great stuff on Fri #SXSW: Marissa M...,Google,Positive emotion,[]


## Data Cleaning

In [8]:
tweets.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9093 entries, 0 to 9092
Data columns (total 4 columns):
 #   Column                                              Non-Null Count  Dtype 
---  ------                                              --------------  ----- 
 0   tweet_text                                          9092 non-null   object
 1   emotion_in_tweet_is_directed_at                     3291 non-null   object
 2   is_there_an_emotion_directed_at_a_brand_or_product  9093 non-null   object
 3   predicted_classif                                   9093 non-null   object
dtypes: object(4)
memory usage: 284.3+ KB


In [9]:
tweets['emotion_in_tweet_is_directed_at'].fillna('None', inplace=True)

In [10]:
tweets.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9093 entries, 0 to 9092
Data columns (total 4 columns):
 #   Column                                              Non-Null Count  Dtype 
---  ------                                              --------------  ----- 
 0   tweet_text                                          9092 non-null   object
 1   emotion_in_tweet_is_directed_at                     9093 non-null   object
 2   is_there_an_emotion_directed_at_a_brand_or_product  9093 non-null   object
 3   predicted_classif                                   9093 non-null   object
dtypes: object(4)
memory usage: 284.3+ KB


In [11]:
tweets['emotion_in_tweet_is_directed_at'].replace('None', 'Unknown', inplace = True)

In [12]:
tweets.tail()

Unnamed: 0,tweet_text,emotion_in_tweet_is_directed_at,is_there_an_emotion_directed_at_a_brand_or_product,predicted_classif
9088,Ipad everywhere. #SXSW {link},iPad,Positive emotion,[]
9089,"Wave, buzz... RT @mention We interrupt your re...",Unknown,No emotion toward brand or product,[]
9090,"Google's Zeiger, a physician never reported po...",Unknown,No emotion toward brand or product,[]
9091,Some Verizon iPhone customers complained their...,Unknown,No emotion toward brand or product,[]
9092,Ï¡Ïàü_ÊÎÒ£Áââ_£â_ÛâRT @...,Unknown,No emotion toward brand or product,[]


In [13]:
tweets.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9093 entries, 0 to 9092
Data columns (total 4 columns):
 #   Column                                              Non-Null Count  Dtype 
---  ------                                              --------------  ----- 
 0   tweet_text                                          9092 non-null   object
 1   emotion_in_tweet_is_directed_at                     9093 non-null   object
 2   is_there_an_emotion_directed_at_a_brand_or_product  9093 non-null   object
 3   predicted_classif                                   9093 non-null   object
dtypes: object(4)
memory usage: 284.3+ KB


In [14]:
tweets['tweet_text'].replace('-', 'None', inplace = True)

In [26]:
tweets['tweet_text'].fillna('None', inplace = True)

In [27]:
tweets.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9093 entries, 0 to 9092
Data columns (total 4 columns):
 #   Column                                              Non-Null Count  Dtype 
---  ------                                              --------------  ----- 
 0   tweet_text                                          9093 non-null   object
 1   emotion_in_tweet_is_directed_at                     9093 non-null   object
 2   is_there_an_emotion_directed_at_a_brand_or_product  9093 non-null   object
 3   predicted_classif                                   9093 non-null   object
dtypes: object(4)
memory usage: 284.3+ KB


## Tokenization

In [28]:
split_tweets = tweets['tweet_text'].str.split(' ')

In [38]:
X = split_tweets
y = tweets['is_there_an_emotion_directed_at_a_brand_or_product']

In [39]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, train_size = 0.7, random_state=42)

In [40]:
X_train.head()

2954    [#UXdes, @mention, is, glad, there, are, no, s...
2709    [if, you, are, ready, to, take, your, tech, ge...
2357    [#saveustechies, panel, at, #sxsw, getting, of...
6195    [RT, @mention, Join, actsofsharing.com, tonigh...
1366    [@mention, -, spread, the, word,, our, #SXSW, ...
Name: tweet_text, dtype: object

In [41]:
y_train.head()

2954        I can't tell
2709    Positive emotion
2357    Positive emotion
6195    Positive emotion
1366    Positive emotion
Name: is_there_an_emotion_directed_at_a_brand_or_product, dtype: object

In [42]:
y_train.value_counts()

No emotion toward brand or product    3792
Positive emotion                      2063
Negative emotion                       397
I can't tell                           113
Name: is_there_an_emotion_directed_at_a_brand_or_product, dtype: int64

In [43]:
lowered_tweets = split_tweets.str.lower()

In [44]:
tweets.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9093 entries, 0 to 9092
Data columns (total 4 columns):
 #   Column                                              Non-Null Count  Dtype 
---  ------                                              --------------  ----- 
 0   tweet_text                                          9093 non-null   object
 1   emotion_in_tweet_is_directed_at                     9093 non-null   object
 2   is_there_an_emotion_directed_at_a_brand_or_product  9093 non-null   object
 3   predicted_classif                                   9093 non-null   object
dtypes: object(4)
memory usage: 284.3+ KB


In [45]:
noPunc_tweets = [s.translate(str.maketrans('', '', string.punctuation))\
                  for s in lowered_tweets]

AttributeError: 'float' object has no attribute 'translate'

In [None]:
basic_token_pattern = r"(?u)\b\w\w+\b"

tokenizer = RegexpTokenizer(basic_token_pattern)
tokenizer.tokenize(lowered_tweets)

In [None]:
tweets['tweet_text'].head()