In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
import nltk
from nltk.stem import PorterStemmer
from nltk.corpus import *
import re
from nltk.tokenize import TweetTokenizer
import string

In [2]:
df = pd.read_csv('Tweets.csv', header = 'infer')

In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 27481 entries, 0 to 27480
Data columns (total 4 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   textID         27481 non-null  object
 1   text           27480 non-null  object
 2   selected_text  27480 non-null  object
 3   sentiment      27481 non-null  object
dtypes: object(4)
memory usage: 858.9+ KB


In [4]:
df.dropna(inplace = True)

In [5]:
df.sentiment.value_counts()

neutral     11117
positive     8582
negative     7781
Name: sentiment, dtype: int64

In [6]:
data = df[df.sentiment!='neutral']

In [7]:
data

Unnamed: 0,textID,text,selected_text,sentiment
1,549e992a42,Sooo SAD I will miss you here in San Diego!!!,Sooo SAD,negative
2,088c60f138,my boss is bullying me...,bullying me,negative
3,9642c003ef,what interview! leave me alone,leave me alone,negative
4,358bd9e861,"Sons of ****, why couldn`t they put them on t...","Sons of ****,",negative
6,6e0c6d75b1,2am feedings for the baby are fun when he is a...,fun,positive
...,...,...,...,...
27475,b78ec00df5,enjoy ur night,enjoy,positive
27476,4eac33d1c0,wish we could come see u on Denver husband l...,d lost,negative
27477,4f4c4fc327,I`ve wondered about rake to. The client has ...,", don`t force",negative
27478,f67aae2310,Yay good for both of you. Enjoy the break - y...,Yay good for both of you.,positive


In [8]:
data['sentiment'] = data['sentiment'].replace(['negative'], 0)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['sentiment'] = data['sentiment'].replace(['negative'], 0)


In [9]:
data['sentiment'] = data['sentiment'].replace(['positive'], 1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['sentiment'] = data['sentiment'].replace(['positive'], 1)


In [10]:
def process_tweet(tweet):
    stemmer = PorterStemmer()
    stopwords_english = stopwords.words('english')
    # remove stock market tickers like $GE
    tweet = re.sub(r'\$\w*', '', tweet)
    # remove old style retweet text "RT"
    tweet = re.sub(r'^RT[\s]+', '', tweet)
    # remove hyperlinks
    tweet = re.sub(r'https?:\/\/.*[\r\n]*', '', tweet)
    # remove hashtags
    # only removing the hash # sign from the word
    tweet = re.sub(r'#', '', tweet)
    # tokenize tweets
    tokenizer = TweetTokenizer(preserve_case=False, strip_handles=True,
                               reduce_len=True)
    tweet_tokens = tokenizer.tokenize(tweet)

    tweets_clean = []
    for word in tweet_tokens:
        if (word not in stopwords_english and  # remove stopwords
                word not in string.punctuation):  # remove punctuation
            # tweets_clean.append(word)
            stem_word = stemmer.stem(word)  # stemming word
            tweets_clean.append(stem_word)

    return tweets_clean

In [11]:
def build_freqs(tweets, ys):
    # Convert np array to list since zip needs an iterable.
    yslist = np.squeeze(ys).tolist()

    #Count freqs and generate dictionary
    freqs = {}
    for y, tweet in zip(yslist, tweets):
        for word in process_tweet(tweet):
            pair = (word, y)
            if pair in freqs:
                freqs[pair] += 1
            else:
                freqs[pair] = 1
    return freqs

In [12]:
freqs = build_freqs(data.selected_text,data.sentiment)

In [13]:
X = np.zeros((16363,3))

In [14]:
def extract_features(tweet, freqs, process_tweet=process_tweet):
    '''
    Input: 
        tweet: a list of words for one tweet
        freqs: a dictionary corresponding to the frequencies of each tuple (word, label)
    Output: 
        x: a feature vector of dimension (1,3)
    '''
    # process_tweet tokenizes, stems, and removes stopwords
    word_l = process_tweet(tweet)
    
    # 3 elements in the form of a 1 x 3 vector
    x = np.zeros((1, 3)) 
    
    #bias term is set to 1
    x[0,0] = 1 
    
    
    # loop through each word in the list of words
    for word in word_l:
        
        # increment the word count for the positive label 1
        x[0,1] += freqs.get((word, 1.0),0)
        
        # increment the word count for the negative label 0
        x[0,2] += freqs.get((word, 0),0)
        
    assert(x.shape == (1, 3))
    return x

In [15]:
data.reset_index(inplace = True)

In [16]:
X = np.zeros((16363, 3)) 

for i in range(16363):
    X[i,:] = extract_features(data.selected_text[i],freqs,process_tweet=process_tweet)

In [17]:
y = data.sentiment

In [18]:
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size = 0.2)

In [19]:
model = LogisticRegression()

In [20]:
model.fit(X_train,y_train)

In [21]:
model.coef_

array([[-0.07627965,  0.0076406 , -0.01041468]])

In [22]:
model.intercept_

array([-0.07627986])

In [23]:
y_pred = model.predict(X_test)

In [24]:
y_pred

array([0, 0, 1, ..., 0, 0, 1])

In [25]:
accuracy_score(y_test,y_pred)

0.8353192789489765

In [27]:
pd.set_option('display.max_colwidth', None)
data[data.index.isin(y_test[y_pred!=y_test].index.sort_values())]

Unnamed: 0,index,textID,text,selected_text,sentiment
73,125,6649f3558c,Not a prob hun,Not a prob,1
127,217,ca832cad51,Feeling smooth like chrome,Feeling smooth,1
131,226,151c10cc39,"JONAS BROTHERS - Live to party. It`s rocking so hard I love the song,",JONAS BROTHERS - Live to party. It`s rocking so hard,1
198,346,a97db072ed,Gonna celebrate Mothers Day with the family but gonna start the partying tonite,partying,1
210,362,b94aaf845e,Please Review Sunehre Ad Placement http://tinyurl.com/oow6mk,Please Re,1
...,...,...,...,...,...
16077,26977,dd02e1b8a8,"Allianz interview went well, got a rejection from MOD though .... going to be a long weekend as I hear from Allianz on Monday","went well,",1
16144,27091,6173fd7c2e,what a riot..now you can tell you`re friends you`ve got chlamydia..but you`re hoping to get pox-syphilis soon,hoping to get pox-syphilis soon,0
16187,27167,490da06e8f,i just looove my bf u are awesoome!!!! [hannah montana the movie was amazing best movie ever!!] // cool http://gykd.net,awesoome!,1
16332,27428,e02ea8a95c,i hate my presentation hahah whatever im glad its over,im glad,1
