In [2]:
import nltk
import random
import numpy as np
import re
import string
import pandas as pd
from nltk.corpus import twitter_samples
from nltk.corpus import stopwords


Check what all files are there in the corpus

In [3]:
fileids=twitter_samples.fileids()

In [4]:
fileids

['negative_tweets.json', 'positive_tweets.json', 'tweets.20150430-223406.json']

In [5]:
all_positive_tweets=twitter_samples.strings('positive_tweets.json')
all_negative_tweets=twitter_samples.strings('negative_tweets.json')
tweets=all_negative_tweets+all_positive_tweets



check wheter the training set is balanced or not

In [6]:
print('The number of positive samples are : ' ,len(all_positive_tweets))
print('The number of negative samples are : ' ,len(all_negative_tweets))

The number of positive samples are :  5000
The number of negative samples are :  5000




Distributing the dataset into training and testing data

# 0. CREATING THE TRAINING AND TESTING DATASETS

In [7]:
test_pos=all_positive_tweets[4000:]
train_pos=all_positive_tweets[:4000]
test_neg=all_negative_tweets[4000:]
train_neg=all_negative_tweets[:4000]

train_x=train_pos+train_neg
test_x=test_pos+test_neg

In [8]:
train_y=np.append(np.ones((len(train_pos),1)),np.zeros((len(train_neg),1)),axis=0)
test_y=np.append(np.ones((len(test_pos),1)),np.zeros((len(test_neg),1)),axis=0)

# 1. PREPROCESSING THE DATA

In [9]:
RTregex=r'^RT[/s]+'
hyperlinkregex=r'^https?:\/\/.*[\r\n]*'
stemmer=nltk.PorterStemmer()
def process_tweet(tweet):
    tweet=tweet.lower()           #converting all the words into lowercase
    tweet=re.sub(RTregex,'',tweet)    #removing all the RT signs
    tweet=re.sub(hyperlinkregex,'',tweet)  #removing all the hyperlinks
    tweet=re.sub(r'#','',tweet)   #removing all the hashtags
    tweet_tokens=tweet.split()
    tweet_tokens=[word for word in tweet_tokens if word not in stopwords.words('english')]
    tweet_tokens=[stemmer.stem(word) for word in tweet_tokens]
    return tweet_tokens
    

In [10]:
text='I am happy :)'
process_tweet(text)

['happi', ':)']

In [11]:
def build_freq(tweets,y):
    #tweets:List of all tweets
    #y:list of respective sentiments
    freqs={}
    y=np.squeeze(y).tolist()
    for tweet,ys in zip(tweets,y):
        for word in process_tweet(tweet):
            pair=(word , ys)
            if pair in freqs.keys():
                freqs[pair]+=1
            else:
                freqs[pair]=1
    return freqs

In [12]:
freqs=build_freq(train_x,train_y)

In [13]:
print(freqs.items())

dict_items([(('followfriday', 1.0), 23), (('@france_int', 1.0), 1), (('@pkuchly57', 1.0), 1), (('@milipol_pari', 1.0), 1), (('top', 1.0), 28), (('engag', 1.0), 7), (('member', 1.0), 13), (('commun', 1.0), 26), (('week', 1.0), 66), ((':)', 1.0), 2622), (('@lamb2ja', 1.0), 1), (('hey', 1.0), 50), (('james!', 1.0), 3), (('odd', 1.0), 2), ((':/', 1.0), 5), (('pleas', 1.0), 68), (('call', 1.0), 26), (('contact', 1.0), 4), (('centr', 1.0), 1), (('02392441234', 1.0), 1), (('abl', 1.0), 6), (('assist', 1.0), 1), (('mani', 1.0), 25), (('thanks!', 1.0), 15), (('@despiteoffici', 1.0), 1), (('listen', 1.0), 13), (('last', 1.0), 38), (('night', 1.0), 41), (('bleed', 1.0), 2), (('amaz', 1.0), 33), (('track.', 1.0), 1), (('scotland?!', 1.0), 1), (('@97side', 1.0), 1), (('congrat', 1.0), 13), (('yeaaaah', 1.0), 1), (('yippppy!!!', 1.0), 1), (('accnt', 1.0), 1), (('verifi', 1.0), 2), (('rqst', 1.0), 1), (('succeed', 1.0), 1), (('got', 1.0), 57), (('blue', 1.0), 8), (('tick', 1.0), 1), (('mark', 1.0), 1

# 2. TRAINING A MODEL

# 2.1 Logistic Regression From Scratch

In [14]:
def sigmoid(z):
    return 1/(1+np.exp(-z))

In [15]:
def gradientdescent(x,y,theta,alpha,num_iters):
    #x-> matrix of features (m,n+1)  here is n is 2 ie = pos,neg
    #y-> corresponding label of the input matrix (m,1)
    #theta->weight vector(n+1,1)
    #alpha->learning rate
    #num_iters->number of iterations
    #OUTPUT: 1) Final Cost , 2) Final weight matrix thetha 
    m=x.shape[0]
    for i in range(num_iters):
        z=np.dot(x,theta)
        h=sigmoid(z)
        J=(-1/m)*(np.dot(y.T,np.log(h))+np.dot((1-y).T,np.log(1-h)))
        theta-=(alpha/m)*(np.dot(x.T,(h-y)))
    J=float(J)
    return J,theta

In [16]:
def extract_features(tweet,freqs):
    #tweet->a single tweet
    #freqs -> frequnecy dictionary which stroes the pair (word,label):freq as items
    #OUTPUT : x->feature vector of dimension (1,3)
    
    x=np.zeros((1,3))
    word_l=process_tweet(tweet)
    x[0,0]=1   #bias term is set to 1
    for word in word_l:
        x[0,1]+=freqs.get((word,1),0)
        
        x[0,2]+=freqs.get((word,0),0)
        
    return x        

In [17]:
#TRAINING YOUR MODEL 
X=np.zeros((len(train_x),3))
for i in range(len(train_x)):
    X[i,:]=extract_features(train_x[i],freqs)
Y=train_y


In [18]:
cost, theta =gradientdescent(X,Y,np.zeros((3,1)),1e-9,30000)
print("The final cost using the gradient descent function is : ", cost)

The final cost using the gradient descent function is :  0.1487542759012061


In [19]:
#TEST YOUR MODEL 
def predict_tweet(tweet,freqs,theta):
    x=extract_features(tweet,freqs)
    z=np.dot(x,theta)
    z=sigmoid(z)
    return z
    

In [20]:
tweets=['great','great great','great great great','greta great great great great ']
for tweet in tweets:
    print(predict_tweet(tweet,freqs,theta))

[[0.5498807]]
[[0.59877826]]
[[0.64578569]]
[[0.69013575]]


In [21]:
count=0
for i in range(len(test_x)):
    prediction=predict_tweet(test_x[i],freqs,theta)
    if prediction>=0.5:
        ans=1
    else:
        ans=0
    if ans==test_y[i]:
        count+=1
accuracy=count/len(test_x)
print('The accuracy of the model is : ',accuracy)


The accuracy of the model is :  0.9725


In [22]:
text=':)'
predict_tweet(text,freqs,theta)

array([[0.99285266]])

In [48]:
texts=['neutral','i am so happy :) ','i hate being around them !! Its too awkward','i did well on this exam','i am liking machine learning ']
for text in texts:
    print(text,'---->',predict_tweet(text,freqs,theta))

neutral ----> [[0.50000006]]
i am so happy :)  ----> [[0.99446595]]
i hate being around them !! Its too awkward ----> [[0.48967048]]
i did well on this exam ----> [[0.50455964]]
i am liking machine learning  ----> [[0.51442128]]


# 2.2 Logistic Regression using scikit learn

In [39]:
X_test=np.zeros((len(test_x),3))
for i in range(len(test_x)):
    X_test[i,:]=extract_features(test_x[i],freqs)
Y_test=test_y

In [47]:
from sklearn.linear_model import LogisticRegression
lr=LogisticRegression(C=0.0001)
lr.fit(X,Y.ravel())
pred=lr.predict(X_test).reshape(-1,1)
print('The accuracy is : ',np.mean(pred==Y_test))

The accuracy is :  0.9455
