#### Following Code using logestic regression to predict (positive or negative) the mood of the scentence. The model is trained on NLTK's internal twitter data. 

In [41]:
import nltk
from nltk.corpus import twitter_samples# sample Twitter dataset from NLTK
import matplotlib.pyplot as plt
import random
import numpy as np

import re
import string

from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import TweetTokenizer

from sklearn.linear_model import LogisticRegression

In [42]:
# downloads sample twitter dataset.
#nltk.download('twitter_samples')
#nltk.download('stopwords')

In [43]:
# select the set of positive and negative tweets
all_positive_tweets = twitter_samples.strings('positive_tweets.json')
all_negative_tweets = twitter_samples.strings('negative_tweets.json')

print('Number of positive tweets: ', len(all_positive_tweets))
print('Number of negative tweets: ', len(all_negative_tweets))

Number of positive tweets:  5000
Number of negative tweets:  5000


In [44]:
### Seperating Train and test
train_pos,train_neg =all_positive_tweets[:4000],all_negative_tweets[:4000]
test_pos, test_neg=all_positive_tweets[4000:],all_negative_tweets[4000:]

y_train_pos,y_train_neg=[1]*len(train_pos),[0]*len(train_neg)
y_test_pos,y_test_neg=[1]*len(test_pos),[0]*len(test_neg)

## Train and Test Set
train_sen_X=train_pos+train_neg
train_sen_y=y_train_pos+y_train_neg
#random.Random(4).shuffle(train_sen_X)
#random.Random(4).shuffle(train_sen_y)

test_sen_X=test_pos+test_neg
test_sen_y=y_test_pos+y_test_neg

In [45]:
def preprocessing_tweets(scentences_list,stopwords=stopwords):
    '''Follwing operations are performed on the list of tweets fed into the function
    1. handles, hyperlinks and hashtags are removed
    2. Tokenization of words is done
    3. Stopwords and puncutation are removed

    OUTPUT: 
    '''
    stopwords_english = stopwords.words('english') 
    output=[]
    for scentence in scentences_list:
        scentence = re.sub(r'^RT[\s]+', '', scentence)
        scentence = re.sub(r'https?://[^\s\n\r]+', '', scentence)
        scentence = re.sub(r'#', '', scentence)
        scentence = re.sub(r'#', '', scentence)

        ## Creating a list of words in a particular scentence, making all small case
        tokenizer = TweetTokenizer(preserve_case=False, strip_handles=True,reduce_len=True)
        token_scentence = tokenizer.tokenize(scentence)

        ## removing punctuation and stop words
        scentence_clean = []
        for word in token_scentence: # Go through every word in your tokens list
            if (word not in stopwords_english and word not in string.punctuation): # remove stopwords:  # remove punctuation
                scentence_clean.append(word)

        ## Stemming: (leran, learning, learned)
        stemmer = PorterStemmer()
        stemmed_scentence=[]
        for word in scentence_clean:
            stem_word = stemmer.stem(word)  # stemming word
            stemmed_scentence.append(stem_word)  # append to the list

        output.append(stemmed_scentence)

    return output

In [46]:
## Creating freq dictionary
def creating_freq_dict(preprocessed_scentences,y_tags=[1,1,1,1,1]):
    '''
    Preprocessed_scentemces are the list of list
    Output: dictionary for (word,y) as key and value as freq.
    '''
    di={}
    for i in range(len(preprocessed_scentences)):
        for word in preprocessed_scentences[i]:
            di[(word,y_tags[i])]=di.get((word,y_tags[i]),0)+1
    return di



In [47]:
### Vectorization of Cleaned Scentences
def vectorize_cleaned_scentences(freq_dict,preprocessed_scen ):
    '''Preprocessed_sen is output from preprocessing_tweets which is a list of list of words
    freq_dict: dictionary of freq'''
    
    X=np.zeros(shape=(len(preprocessed_scen),3))
    for i in range(len(preprocessed_scen)):
        for word in preprocessed_scen[i]:
            pos_freq=freq_dict.get((word,1),0)
            neg_freq=freq_dict.get((word,0),0)
        X[i]=np.array([1,pos_freq,neg_freq])
    return X
    

In [48]:
def training_logestic(train_sentences_X:list,train_sentences_tags:list,test_sentences_X:list, test_sentences_tags:list):
    '''
    train_sentences_X: List of sentences used in training
    train_sentences_tags: tags (pos or neg) for training sentences

    Output: Trained model, train_accurancy,test accuracy
    '''
    model=LogisticRegression(max_iter=1000)
    ## Preprocessing Tweets ##
    X_train=preprocessing_tweets(train_sentences_X)
    X_test=preprocessing_tweets(test_sentences_X)
    y_train=np.array(train_sentences_tags)
    y_test=np.array(test_sentences_tags)
    ##################################

    ## Vectorizing Train test sentences ####
    freqs_dict=creating_freq_dict(X_train,y_train)
    X_train=vectorize_cleaned_scentences(freqs_dict,X_train)
    X_test=vectorize_cleaned_scentences(freqs_dict,X_test)
    ###########################################

    ## Training ###
    model.fit(X_train,y_train)
    predictions_test=model.predict(X_test)
    predictions_train=model.predict(X_train)

    accuracy_train=np.sum(y_train==predictions_train)/len(y_train)
    accuracy_test=np.sum(y_test==predictions_test)/len(y_test)
    

    return model,accuracy_train,accuracy_test,freqs_dict


In [49]:
model, train_acc, test_acc,frequency_dict=training_logestic(train_sen_X,train_sen_y,test_sen_X,test_sen_y)
print(f'Training Accuracy is {train_acc}')
print(f'Test Accuracy is {test_acc}')

Training Accuracy is 0.896875
Test Accuracy is 0.882


##### Test your own scentences
In order to misclassify questionable scentences we have created a third class, Questionable, to prevent misclassify questionable scentences

In [58]:
def testing_custom_scentences(model=model,frequency_dict=frequency_dict):

    interpret_dict={1:'Positive',0:'Negative',0.5: 'Questionable'}
    color_dict={1:'\033[92m ', 0:'\033[91m ',0.5: '\033[94m'}
    bold='\033[1m'

    scentence=input('Enter Your Scentence:')
    print(bold+'Input Scentence: ',scentence)
    scentence=[scentence]
    X=preprocessing_tweets(scentence)
    X=vectorize_cleaned_scentences(frequency_dict,X)
    
    #result=model.predict(X)
    #print(result)
    
    proba= model.predict_proba(X)[0][0]
    if proba>0.6:
        result=0
    elif proba<0.4:
        result=1
    else:
        result=0.5
    
    result_statement=interpret_dict[result]
    print(f'{bold+color_dict[result]}The Scentence is predicted {result_statement}')
    print('Probability of prediction being negative is:', proba)
    return None


In [59]:
testing_custom_scentences()

[1mInput Scentence:  how was your day
[1m[92m The Scentence is predicted Positive
Probability of prediction being negative is: 0.3197354964014757


#### Ceavats to the model
-  The model is basically trained on frequency of word occurances, It doesn't take into account the words preceeding and trailing words.
-  Model is trained on tweets and tweets have a little different fornat than actual coversations,thus results my differ.