In [37]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import xgboost as xgb
import nltk
#nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
stpwords = stopwords.words('english')
import re
from gensim.models import Word2Vec
import os
from sklearn import preprocessing
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score

In [38]:
# Load the test-id splits (comma-separated) and the full review table
# (tab-separated); both files are read from the working directory.
testIDs = pd.read_csv('project3_splits.csv')
all_data = pd.read_csv('alldata.tsv', delimiter='\t')

In [39]:
# Helper function for cleaning the text
def clean_corpus(text):
    '''
    INPUT
    text - string
    OUTPUT
    clean text - string of lower-cased tokens, each followed by one space
    This function processes the input using the following steps :
    1. Replace every character that is not an ASCII letter (punctuation,
       digits, ...) with a space
    2. Tokenize with word_tokenize and lower-case each token
    3. Remove stop words
    '''
    # A set gives constant-time membership tests; the module-level stop word
    # list would otherwise be scanned linearly for every token.
    stop_set = set(stpwords)

    # Remove punctuation characters and numbers
    letters_only = re.sub(r"[^a-zA-Z]", " ", text)

    # Collect the surviving tokens in a list instead of growing a string
    kept = []
    for word in word_tokenize(letters_only):
        clean_tok = word.lower().strip()
        if clean_tok not in stop_set:
            kept.append(clean_tok)

    # Each kept token is followed by a single space, so a non-empty result
    # ends with a space (same output format as before).
    return ''.join(f'{tok} ' for tok in kept)

In [40]:
# Applying the cleaning function to the review column of the dataset
# all_data['clean_text'] = all_data['review'].apply(clean_corpus)

In [41]:
all_data['sentiment'].value_counts()

1    25000
0    25000
Name: sentiment, dtype: int64

In [42]:
all_data.loc[all_data['sentiment'] == 1, 'sentiment']

0        1
4        1
5        1
9        1
11       1
        ..
49987    1
49988    1
49989    1
49990    1
49999    1
Name: sentiment, Length: 25000, dtype: int64

In [43]:
# Separate the reviews by sentiment label (1 -> all_positive, 0 -> all_negative).
# reset_index() renumbers each subset from 0 and keeps the row's previous index
# as a regular column.
all_positive = all_data[all_data['sentiment'] == 1].reset_index()
all_negative = all_data[all_data['sentiment'] == 0].reset_index()

In [59]:
# split the data into two pieces, one for training and one for testing (validation set) 
test_pos = all_positive[20000:]
train_pos = all_positive
test_neg = all_negative[20000:]
train_neg = all_negative

train_x = train_pos.append(train_neg) 
test_x = test_pos.append(test_neg)

test_x = test_x.reset_index()
train_x = train_x.reset_index()

In [60]:
train_x.head(2)

Unnamed: 0,level_0,index,id,sentiment,score,review
0,0,0,1,1,10,Naturally in a film who's main themes are of m...
1,1,4,5,1,7,A very accurate depiction of small time mob li...


In [61]:
test_pos = all_positive[20000:]
train_pos = all_positive
test_neg = all_negative[20000:]
train_neg = all_negative

train_y = train_pos.append(train_neg)['sentiment'] 
test_y = test_pos.append(test_neg)['sentiment'] 

# train_y = train_y.reset_index()
# test_y = test_y.reset_index()

In [62]:
train_x.shape, train_y.shape, test_x.shape, test_y.shape

((50000, 6), (50000,), (10000, 6), (10000,))

In [63]:
from nltk.stem import PorterStemmer
import string

# Built once when this cell runs: loading the stop word corpus and creating a
# stemmer on every call is wasted work for a function invoked once per text.
_STOPWORDS_ENGLISH = frozenset(stopwords.words('english'))
_PUNCTUATION_CHARS = frozenset(string.punctuation)
_STEMMER = PorterStemmer()


def process_tweet(tweet):
    """Process tweet function.
    Input:
        tweet: a string containing a tweet
    Output:
        tweets_clean: a list of words containing the processed tweet
            (lower-cased, stop words and punctuation-only tokens removed,
            remaining tokens stemmed)
    """
    # remove stock market tickers like $GE
    tweet = re.sub(r'\$\w*', '', tweet)
    # remove old style retweet text "RT"
    tweet = re.sub(r'^RT[\s]+', '', tweet)
    # remove hyperlinks; \S+ stops at the first whitespace so the text that
    # follows a link on the same line is kept
    tweet = re.sub(r'https?://\S+', '', tweet)
    # remove hashtags
    # only removing the hash # sign from the word
    tweet = re.sub(r'#', '', tweet)

    tweets_clean = []
    for token in word_tokenize(tweet):
        # The stop word list is lower-case, so compare in lower case;
        # otherwise capitalised stop words ("The", "I") slip through.
        word = token.lower()
        if word in _STOPWORDS_ENGLISH:  # remove stopwords
            continue
        # Drop tokens made up only of punctuation characters. This also
        # covers multi-character tokens such as "..." or "--", which a
        # substring test against string.punctuation does not catch.
        if all(ch in _PUNCTUATION_CHARS for ch in word):  # remove punctuation
            continue
        tweets_clean.append(_STEMMER.stem(word))  # stemming word

    return tweets_clean


def build_freqs(tweets, ys):
    """Build frequencies.
    Input:
        tweets: a list of tweets
        ys: an m x 1 array with the sentiment label of each tweet
            (either 0 or 1); a flat list or a single-element array also works
    Output:
        freqs: a dictionary mapping each (word, sentiment) pair to its
        frequency
    """
    # Flatten the labels to a plain 1-D Python list. Unlike squeeze(), ravel()
    # still yields a one-element list (not a bare scalar) when there is only
    # a single label, so zip below always receives an iterable.
    labels = np.asarray(ys).ravel().tolist()

    # Count every (processed word, label) pair over all tweets. zip stops at
    # the shorter of the two inputs.
    freqs = {}
    for label, tweet in zip(labels, tweets):
        for word in process_tweet(tweet):
            pair = (word, label)
            freqs[pair] = freqs.get(pair, 0) + 1

    return freqs

In [None]:
# create frequency dictionary
freqs = build_freqs(train_x['review'], train_y)

In [None]:
# UNQ_C3 (UNIQUE CELL IDENTIFIER, DO NOT EDIT)
def extract_features(tweet, freqs):
    '''
    Input:
        tweet: the raw text of one tweet (it is passed to process_tweet here)
        freqs: a dictionary corresponding to the frequencies of each tuple (word, label)
    Output:
        x: a feature vector of dimension (1,3) holding
           [bias, summed positive-label counts, summed negative-label counts]
    '''
    # process_tweet turns the raw text into the list of words to look up
    tokens = process_tweet(tweet)

    # total frequency of the words under the positive label 1;
    # pairs missing from freqs contribute 0
    positive_total = sum(freqs.get((tok, 1.0), 0) for tok in tokens)

    # total frequency of the words under the negative label 0
    negative_total = sum(freqs.get((tok, 0.0), 0) for tok in tokens)

    # 1 x 3 row vector; the leading 1.0 is the bias term
    x = np.array([[1.0, positive_total, negative_total]])

    assert(x.shape == (1, 3))
    return x

In [None]:
# Check your function

# test 1
# test on training data: build the 1 x 3 feature vector for the review stored
# under index label 0 of train_x and display it
tmp1 = extract_features(train_x['review'][0], freqs)
print(tmp1)

In [None]:
# test 2:
# check for when the words are not in the freqs dictionary
# (presumably none of these made-up words occur in the corpus; in that case
# only the bias entry is non-zero)
tmp2 = extract_features('blorb bleeeeb bloooob', freqs)
print(tmp2)

In [None]:
# Build the training matrix 'X': one row of features per training review
X = np.zeros((len(train_x), 3))
for i, review in enumerate(train_x['review']):
    X[i, :] = extract_features(review, freqs)

# training labels corresponding to X
Y = train_y

In [None]:
# Build the validation matrix 'X_test': one row of features per held-out review
X_test = np.zeros((len(test_x), 3))
for i, review in enumerate(test_x['review']):
    X_test[i, :] = extract_features(review, freqs)

# validation labels corresponding to X_test
Y_test = test_y

In [None]:
# Fit a logistic regression on the three count features
clf = LogisticRegression(C=1.0)
clf.fit(X, Y)
predictions = clf.predict(X_test)

# ROC AUC ranks examples by a continuous score, so use the predicted
# probability of the positive class (column 1, i.e. clf.classes_[1]) rather
# than the hard 0/1 predictions. roc_auc_score expects the true labels first
# and the scores second.
positive_scores = clf.predict_proba(X_test)[:, 1]
test_auc = roc_auc_score(np.array(Y_test), positive_scores)

# Report the score without relying on a loop counter left over from an
# earlier cell.
print("AUC : ", test_auc)