In [1]:
import pandas as pd
# Both splits are read with the same explicit column names. The first line of
# each file is skipped (presumably a header row - confirm against the data).
COLUMN_NAMES = ["ItemID", "Sentiment", "SentimentSource", "SentimentText"]

train_df = pd.read_csv('./train_90.txt', skiprows=1, names=COLUMN_NAMES)
test_df = pd.read_csv('./test_10.txt', skiprows=1, names=COLUMN_NAMES)

In [2]:
import numpy as np
import re
import nltk

# Fetch the NLTK stopwords package before the corpus is read below.
nltk.download('stopwords')
from nltk.corpus import stopwords

# English stop words; clean_data drops any token found in this collection.
stop = stopwords.words('english')

# NOTE(review): wildcard import - only PorterStemmer is used in the cells
# visible here; consider importing it by name.
from nltk.stem.porter import *
# Shared stemmer instance used by clean_data to stem every remaining token.
stemmer = PorterStemmer()

def remove_pattern(input_txt, pattern):
    """Return ``input_txt`` with every match of the regex ``pattern`` removed.

    Parameters
    ----------
    input_txt : str
        Text to clean.
    pattern : str
        Regular expression (``re`` syntax) describing the spans to delete.

    Returns
    -------
    str
        ``input_txt`` with all non-overlapping matches of ``pattern`` deleted.

    Raises
    ------
    re.error
        If ``pattern`` itself is not a valid regular expression.
    """
    # Substitute with the pattern itself. Re-using each matched string as a new
    # regex would misbehave (or raise) when a match contains metacharacters
    # such as "(", "$" or "+".
    return re.sub(pattern, '', input_txt)

def clean_data(text):
    """Add cleaned-text columns to a tweet DataFrame and return it.

    The frame is modified in place (two columns are added) and the same object
    is returned.

    Parameters
    ----------
    text : pandas.DataFrame
        Must contain a string column ``'SentimentText'``.

    Returns
    -------
    pandas.DataFrame
        ``text`` with two extra columns:

        * ``'tidy_tweet'`` - the tweet with ``@handle`` mentions removed,
          lower-cased, and every character outside ``a-z0-9#`` replaced by a
          space.
        * ``'tokenized_tweet'`` - despite the name, a single space-joined
          string of the stemmed tokens that are not stop words.
    """
    # Strip @mentions. Series.apply (unlike np.vectorize) also copes with an
    # empty column.
    text['tidy_tweet'] = text['SentimentText'].apply(
        lambda tweet: remove_pattern(tweet, r"@[\w]*")
    )
    text['tidy_tweet'] = text['tidy_tweet'].str.lower()
    # regex=True is explicit: newer pandas versions treat the pattern as a
    # literal string by default, which would leave punctuation untouched.
    text['tidy_tweet'] = text['tidy_tweet'].str.replace(r"[^a-z0-9#]", " ", regex=True)

    # Set membership is O(1) per token instead of scanning the stop-word list.
    stop_words = set(stop)

    def _stem_without_stop_words(tweet):
        # Split on whitespace, drop stop words, stem what is left, re-join.
        return ' '.join(
            stemmer.stem(token) for token in tweet.split() if token not in stop_words
        )

    text['tokenized_tweet'] = text['tidy_tweet'].apply(_stem_without_stop_words)
    return text
    

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/pawanjeetkaur/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [3]:
# Clean the training split first, then the test split. clean_data adds its
# columns to the frame it is given and returns that same frame.
cleaned_data, cleaned_data_test = clean_data(train_df), clean_data(test_df)

# Ground-truth labels of the held-out test split.
y_test = cleaned_data_test["Sentiment"]

In [4]:
# Preview the cleaned, stemmed tweet text of the training split.
print(cleaned_data['tokenized_tweet'])

0                                           sad apl friend
1                                    miss new moon trailer
2                                         omg alreadi 7 30
3        omgaga im sooo im gunna cri dentist sinc 11 su...
4                                        think mi bf cheat
                               ...                        
89984    gnome hat problem finish size pointi top mama ...
89985    saw linn bakeri thought veggi friendli worri f...
89986                                           would love
89987                                                 evid
89988    spine thing sound good back exercis fun best l...
Name: tokenized_tweet, Length: 89989, dtype: object


In [5]:
from sklearn.feature_extraction.text import TfidfVectorizer
tfidf_vectorizer = TfidfVectorizer(
    max_df=0.90,
    min_df=10,
    stop_words='english',
)

train_corpus = cleaned_data['tokenized_tweet']
test_corpus = cleaned_data_test['tokenized_tweet']

# Learn the vocabulary and IDF weights on the training text only, then
# materialise the features as a dense array.
tfidf = tfidf_vectorizer.fit_transform(train_corpus).toarray()

# Project the test text onto the vocabulary fitted above (no re-fitting).
tfidf_test = tfidf_vectorizer.transform(test_corpus).toarray()

In [19]:
# Inspect the dense TF-IDF training matrix.
tfidf

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [6]:
# Vocabulary learned by the fitted vectorizer.
# NOTE(review): `temp` is not referenced by any cell visible here.
temp = tfidf_vectorizer.vocabulary_

In [7]:
# Display the training frame, including the columns added by clean_data.
cleaned_data

Unnamed: 0,ItemID,Sentiment,SentimentSource,SentimentText,tidy_tweet,tokenized_tweet
0,1,0,Sentiment140,is so sad for my APL frie...,is so sad for my apl frie...,sad apl friend
1,2,0,Sentiment140,I missed the New Moon trail...,i missed the new moon trail...,miss new moon trailer
2,3,1,Sentiment140,omg its already 7:30 :O,omg its already 7 30 o,omg alreadi 7 30
3,4,0,Sentiment140,.. Omgaga. Im sooo im gunna CRy. I'...,omgaga im sooo im gunna cry i ...,omgaga im sooo im gunna cri dentist sinc 11 su...
4,5,0,Sentiment140,i think mi bf is cheating on me!!! ...,i think mi bf is cheating on me ...,think mi bf cheat
...,...,...,...,...,...,...
89984,89996,1,Sentiment140,@clevercatsknit Re: gnome hat. Was the problem...,re gnome hat was the problem the finished s...,gnome hat problem finish size pointi top mama ...
89985,89997,1,Sentiment140,@clevercatsknit Saw Linnes Bakery but thought ...,saw linnes bakery but thought it not too vegg...,saw linn bakeri thought veggi friendli worri f...
89986,89998,1,Sentiment140,@cleverdaisies I would LOVE to!!!,i would love to,would love
89987,89999,0,Sentiment140,@cleverick evidently not,evidently not,evid


In [8]:
# Feature matrix: the dense TF-IDF array built from the training tweets.
X = tfidf
# Target: the 'Sentiment' column of the cleaned training frame.
y = cleaned_data['Sentiment']

In [9]:
def sigmoid(X, weight):
    """Logistic function applied to the linear scores ``np.dot(X, weight)``.

    Parameters
    ----------
    X : array-like
        Feature matrix (or vector).
    weight : array-like
        Weights; must be compatible with ``X`` under ``np.dot``.

    Returns
    -------
    numpy.ndarray
        ``1 / (1 + exp(-z))`` element-wise, where ``z = np.dot(X, weight)``.
    """
    z = np.dot(X, weight)
    # exp(-|z|) lies in [0, 1], so it cannot overflow. For z >= 0 this is the
    # textbook formula unchanged; for z < 0 the algebraically equal form
    # e / (1 + e) avoids evaluating exp of a large positive number.
    e = np.exp(-np.abs(z))
    return np.where(z >= 0, 1 / (1 + e), e / (1 + e))

def gradient_descent(X, h, y):
    """Return ``np.dot(X.T, h - y)`` divided by the number of entries in ``y``.

    Despite the name, this performs no descent step; it only computes the
    averaged gradient term from features ``X``, predictions ``h`` and
    targets ``y``.
    """
    residual = h - y
    n_samples = y.shape[0]
    return np.dot(X.T, residual) / n_samples

def update_weight_loss(weight, learning_rate, gradient):
    """Return ``weight`` moved against ``gradient`` by ``learning_rate``.

    The inputs are not modified; the result is ``weight - learning_rate * gradient``.
    """
    step = learning_rate * gradient
    return weight - step

def predict_t(x, theta):
    """Return ``sigmoid`` of ``x`` against ``theta`` reshaped to a column.

    A new trailing axis is added to ``theta`` before the product, so for a
    1-D ``theta`` the result has one column rather than being flat.
    """
    theta_column = theta[:, None]  # None is what np.newaxis stands for
    probabilities = sigmoid(x, theta_column)
    return probabilities

def cal_acc(actual, pred):
    """Return the fraction of thresholded predictions that equal ``actual``.

    ``pred`` is cut at 0.5 (values >= 0.5 become 1, the rest 0) and flattened
    to one dimension before being compared element-wise with ``actual``.
    """
    is_positive = pred >= 0.5
    labels = is_positive.astype(int).ravel()
    matches = labels == actual
    return np.mean(matches)


In [10]:
def run_grad(X, y, num_iter=100, learning_rate=0.1):
    """Fit weights with a fixed number of full-batch gradient steps.

    Parameters
    ----------
    X : numpy.ndarray
        Feature matrix; ``X.shape[1]`` sets the number of weights.
    y : array-like
        Targets, one per row of ``X``.
    num_iter : int, optional
        Number of update steps (default 100, the previously hard-coded value).
    learning_rate : float, optional
        Step size passed to ``update_weight_loss`` (default 0.1, the
        previously hard-coded value).

    Returns
    -------
    numpy.ndarray
        Weight vector of length ``X.shape[1]``, started from all zeros.
    """
    theta = np.zeros(X.shape[1])

    for _ in range(num_iter):
        h = sigmoid(X, theta)
        gradient = gradient_descent(X, h, y)
        theta = update_weight_loss(theta, learning_rate, gradient)
    return theta

In [11]:
from numpy import array
from sklearn.model_selection import KFold

# Prepare cross validation: 10 shuffled folds with a fixed seed.
# shuffle and random_state are passed by keyword because scikit-learn
# accepts them as keyword-only arguments.
kfold = KFold(n_splits=10, shuffle=True, random_state=1)
bestaccuracy = 0
theta_final = np.zeros(X.shape[1])

# Train on each fold's training part and keep the weights of the fold with
# the highest validation accuracy.
for train, test in kfold.split(X):
    X_train = X[train]
    X_validate = X[test]

    # kfold.split yields positional indices, so select rows by position
    # (.iloc) rather than by index label.
    Y_train = y.iloc[train]
    Y_validate = y.iloc[test]

    theta_out = run_grad(X_train, Y_train)

    pred = predict_t(X_validate, theta_out)

    acc = cal_acc(Y_validate, pred)

    if acc > bestaccuracy:
        theta_final = theta_out
        bestaccuracy = acc

In [12]:
# Weights from the best cross-validation fold, then that fold's validation accuracy.
print(theta_final)
print(bestaccuracy)

[1.17983528e-04 3.96188669e-04 9.77293871e-05 ... 7.12300904e-05
 5.67245301e-05 4.92836337e-04]
0.7033003667074119


In [13]:
# Predicted probabilities for the held-out test features, using the selected weights.
# NOTE(review): the saved output under this cell shows two shapes that this
# statement does not print - presumably stale from an earlier version; re-run.
test_out = predict_t(tfidf_test, theta_final)

(10000, 4803)
(89989, 4803)


In [14]:
# Accuracy on the held-out test split (this rebinds `acc`, which the
# cross-validation loop also assigns).
acc = cal_acc(y_test, test_out)

In [15]:
# Display the test-set accuracy computed in the previous cell.
acc

0.705

In [21]:
from sklearn.metrics import average_precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import precision_score
from sklearn.metrics import confusion_matrix


# Hard 0/1 labels from the predicted probabilities, using the same 0.5
# cut-off that cal_acc applies.
test_final_out = (test_out >= 0.5).astype(int)

# Average precision is computed from the raw probabilities, not the labels.
average_precision = average_precision_score(y_true=y_test, y_score=test_out)

# Label-based metrics on the thresholded predictions.
prec = precision_score(y_true=y_test, y_pred=test_final_out)
recall = recall_score(y_true=y_test, y_pred=test_final_out)
conf = confusion_matrix(y_true=y_test, y_pred=test_final_out)

print('Average precision-recall score: {0:0.2f}'.format(average_precision))
print('Precision score: {0:0.2f}'.format(prec))
print('Recall score: {0:0.2f}'.format(recall))

0       1
1       1
2       1
3       1
4       0
       ..
9995    0
9996    1
9997    0
9998    1
9999    1
Name: Sentiment, Length: 10000, dtype: int64
[[0.50056451]
 [0.50011448]
 [0.50366968]
 ...
 [0.50135158]
 [0.50520769]
 [0.50587136]]
Average precision-recall score: 0.81
Precision score: 0.70
Recall score: 0.89
