# Importing Libraries and Modules

In [144]:
import collections
import io
import os
import re, string, random

import numpy as np
import pandas as pd

import nltk
from nltk.stem import WordNetLemmatizer,PorterStemmer
from nltk.corpus import stopwords
from nltk.tag import pos_tag
from nltk.tokenize import word_tokenize,TweetTokenizer
from nltk import FreqDist, classify, NaiveBayesClassifier
from nltk.metrics import *
from nltk.corpus import wordnet

In [145]:
# Fetch the NLTK resources requested by this notebook: the punkt tokenizer,
# the stopword lists, WordNet and the averaged-perceptron POS tagger.
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


True

## Importing Dataset from My Drive


In [146]:
# Mount Google Drive inside the Colab runtime so the dataset path under
# /content/drive can be read below.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [147]:
#Storing dataset into a variable 'df'
# The CSV is read with explicit column names, so no row is consumed as a header.
dataset_path = '/content/drive/My Drive/Twitter Dataset/Twitter.csv'
column_names = ['target','ids','date,time','flag','user','text']
df = pd.read_csv(dataset_path, encoding='cp1252', names=column_names)
df.shape

(1600000, 6)

In [148]:
#Taking out Tweets and Target out from dataset 'df'
# Both names are pandas Series at this point (text column and label column).
tweets, target = df['text'], df['target']


In [149]:
# Display the tweet text Series (pandas truncates the output).
tweets

0          @switchfoot http://twitpic.com/2y1zl - Awww, t...
1          is upset that he can't update his Facebook by ...
2          @Kenichan I dived many times for the ball. Man...
3            my whole body feels itchy and like its on fire 
4          @nationwideclass no, it's not behaving at all....
                                 ...                        
1599995    Just woke up. Having no school is the best fee...
1599996    TheWDB.com - Very cool to hear old Walt interv...
1599997    Are you ready for your MoJo Makeover? Ask me f...
1599998    Happy 38th Birthday to my boo of alll time!!! ...
1599999    happy #charitytuesday @theNSPCC @SparksCharity...
Name: text, Length: 1600000, dtype: object

In [150]:
#Converting Pandas series to Numpy array
# Re-derived from `df` directly, so this cell does not depend on the Series
# variables created earlier.
tweets, target = df['text'].to_numpy(), df['target'].to_numpy()

In [151]:
# Display the tweet texts, now held in a NumPy object array.
tweets

array(["@switchfoot http://twitpic.com/2y1zl - Awww, that's a bummer.  You shoulda got David Carr of Third Day to do it. ;D",
       "is upset that he can't update his Facebook by texting it... and might cry as a result  School today also. Blah!",
       '@Kenichan I dived many times for the ball. Managed to save 50%  The rest go out of bounds',
       ..., 'Are you ready for your MoJo Makeover? Ask me for details ',
       'Happy 38th Birthday to my boo of alll time!!! Tupac Amaru Shakur ',
       'happy #charitytuesday @theNSPCC @SparksCharity @SpeakingUpH4H '],
      dtype=object)

In [152]:
#Shuffling data
# Both arrays are passed to one shuffle call so texts and labels stay aligned;
# random_state fixes the permutation for reproducible runs.
from sklearn.utils import shuffle
tweets,target= shuffle(tweets,target, random_state=10)

## Cleaning

In [153]:
#Function to remove stopwords
stopwords = nltk.corpus.stopwords.words('english')
# Frozen copy used for O(1) membership tests. `stopwords` itself stays a list;
# re-run this cell if that list is edited so the set is rebuilt.
_STOPWORD_SET = frozenset(stopwords)
def remove_stopwords(string):
    """Lower-case `string` and drop every English stopword from it.

    The text is split on whitespace, so punctuation stays attached to its
    word (e.g. "the," is not treated as the stopword "the").

    Parameters
    ----------
    string : str
        Raw text of a single tweet.

    Returns
    -------
    str
        The remaining lower-cased words joined by single spaces; an empty
        string when nothing is left.
    """
    kept_words = [word for word in string.lower().split()
                  if word not in _STOPWORD_SET]
    return ' '.join(kept_words)

Stemming

In [154]:
#Function to perform Stemming
ps = PorterStemmer()
def stem(string):
    """Return `string` lower-cased with every whitespace-separated word
    replaced by its Porter stem, re-joined with single spaces."""
    stemmed_words = []
    for word in string.lower().split():
        stemmed_words.append(ps.stem(word))
    return ' '.join(stemmed_words)

Lemmatization

In [155]:
#Function to perform Lemmatization on a Corpus
lemmatizer = WordNetLemmatizer()
def lemmatize(tweets):
    """Lemmatize every word of every tweet in `tweets`.

    Each tweet is lower-cased and split on whitespace. Every word is POS-tagged
    on its own (as a one-word sentence), the first letter of the tag is mapped
    to a WordNet category, and the word is lemmatized with that category.

    Parameters
    ----------
    tweets : iterable of str
        The texts to lemmatize.

    Returns
    -------
    list of str
        One string per input tweet, words re-joined with single spaces.
    """
    # Map the first character of the POS tag to the category lemmatize()
    # accepts. Built once per call instead of once per word.
    tag_dict = {"J": wordnet.ADJ,
                "N": wordnet.NOUN,
                "V": wordnet.VERB,
                "R": wordnet.ADV}

    # Words are tagged in isolation, so the lemma of a word does not depend on
    # its neighbours. Caching it per distinct word avoids re-running the tagger
    # for every occurrence without changing the result.
    lemma_cache = {}

    def lemma_of(word):
        if word not in lemma_cache:
            tag = nltk.pos_tag([word])[0][1][0].upper()
            # Tags outside the table fall back to NOUN.
            wordnet_pos = tag_dict.get(tag, wordnet.NOUN)
            lemma_cache[word] = lemmatizer.lemmatize(word, wordnet_pos)
        return lemma_cache[word]

    lemmatized_tweets = []
    for tweet in tweets:
        words = tweet.lower().split()
        lemmatized_tweets.append(' '.join(lemma_of(word) for word in words))
    return lemmatized_tweets

Removing Noise

In [156]:
# Patterns are written as raw strings (so `\(` and `\)` are not invalid string
# escapes) and compiled once instead of being re-parsed for every tweet.
_URL_PATTERN = re.compile(
    r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+#]|[!*\(\),]|'
    r'(?:%[0-9a-fA-F][0-9a-fA-F]))+')
_MENTION_PATTERN = re.compile(r"(@[A-Za-z0-9_]+)")

def remove_noise(tweets):
    """Strip URLs and @mentions from every tweet.

    Parameters
    ----------
    tweets : iterable of str
        Texts to clean.

    Returns
    -------
    list of str
        One string per input tweet with http/https URLs removed first and
        @handles removed second. Surrounding whitespace is left untouched, so
        removed items may leave double spaces behind.
    """
    return [_MENTION_PATTERN.sub("", _URL_PATTERN.sub("", tweet))
            for tweet in tweets]

Tokenization of corpus

In [157]:
def tokenize(tweets):
    """Split each tweet on whitespace and return the list of token lists."""
    token_lists = []
    for text in tweets:
        token_lists.append(text.split())
    return token_lists

## PREPROCESSING

In [158]:
# Work on the first `size` shuffled tweets and their labels only.
size=800000
tweets=tweets[0:size]
target=target[0:size]
#Removing stopwords and performing stemming
cleaned_tweets=[stem(remove_stopwords(raw_tweet)) for raw_tweet in tweets]
l=lemmatize(cleaned_tweets)  #Lemmatization
corpus=tokenize(remove_noise(l)) #Removing noise and tokenizing the corpus

In [159]:
# Sanity check: number of tweets kept after slicing to `size`.
len(tweets)

800000

## Preparation of Model

In [160]:
def create_dictionary(tokenized_cleaned_corpus,target):
    """Build the (features, label) pairs used to train and test the classifier.

    Each token list becomes a dict mapping every token to True. The label is
    "Positive" when the entry of `target` at the same position equals 4 and
    "Negative" otherwise.
    """
    return [
        (dict.fromkeys(tweet_tokens, True),
         "Positive" if target[position]==4 else "Negative")
        for position, tweet_tokens in enumerate(tokenized_cleaned_corpus)
    ]

In [161]:
# Pair every tokenized tweet with its "Positive"/"Negative" label.
final_corpus=create_dictionary(corpus,target)

In [162]:
# Inspect one (features, label) pair to check the structure.
final_corpus[25000]

({'chair': True,
  'goober!': True,
  'love': True,
  'push': True,
  'stage': True,
  'u': True,
  'ur': True,
  'wheel': True,
  'ya': True},
 'Positive')

In [163]:
# Number of labelled examples available for training and testing.
len(final_corpus)

800000

Training and Testing using Naive Bayes

In [164]:
# First 75% of the (already shuffled) corpus trains the model, the rest tests it.
split_at = int(size*0.75)
train_data, test_data = final_corpus[:split_at], final_corpus[split_at:]
print("Size of Training data is ",len(train_data))
print("Size of Testing data is ",len(test_data))

Size of Training data is  600000
Size of Testing data is  200000


In [165]:
from nltk import classify
from nltk import NaiveBayesClassifier
# Train a Naive Bayes classifier on the labelled feature dictionaries.
classifier = NaiveBayesClassifier.train(train_data)
# show_most_informative_features() prints the table itself and returns None,
# so wrapping it in print() only added a stray "None" line to the output.
classifier.show_most_informative_features(10)

Most Informative Features
                  farrah = True           Negati : Positi =     49.6 : 1.0
                  sadden = True           Negati : Positi =     44.2 : 1.0
                 *cries* = True           Negati : Positi =     40.4 : 1.0
             heartbroken = True           Negati : Positi =     37.7 : 1.0
                  condol = True           Negati : Positi =     37.0 : 1.0
                   died! = True           Negati : Positi =     31.3 : 1.0
                 nauseou = True           Negati : Positi =     27.0 : 1.0
               heartburn = True           Negati : Positi =     25.0 : 1.0
              headache.. = True           Negati : Positi =     24.4 : 1.0
                  devast = True           Negati : Positi =     24.2 : 1.0
None


## Accuracy, Confusion Matrix, Precision, Recall, F1 score

In [166]:
# Token lists for the same 25% slice used as test_data (same split index),
# without the labels attached.
new_test_data=corpus[int(size*0.75):]
new_test_data[0]


['prepar', 'nice', 'websit']

In [167]:
from sklearn.metrics import precision_recall_fscore_support

# Predicted labels for the held-out token lists and the matching reference
# labels taken from the last element of every test_data pair.
pred=classifier.classify_many(dict.fromkeys(tokens, True) for tokens in new_test_data)
true=[labelled_example[-1] for labelled_example in test_data]
matrix=ConfusionMatrix(true,pred)

# Macro-averaged precision / recall / F-score over the two classes.
y_true=np.array(true)
y_pred=np.array(pred)
ans=precision_recall_fscore_support(y_true, y_pred, average='macro')

In [168]:
# Report accuracy as a percentage, then the confusion matrix and the
# macro-averaged scores computed in the previous cell.
accuracy_pct = (classify.accuracy(classifier, test_data))*100
print("Accuracy :", accuracy_pct,"%")

print("Confusion Matrix :")
print(matrix)

# `ans` holds (precision, recall, fscore, support); only the first three are shown.
for metric_name, metric_value in zip(("Precision", "Recall", "F1 score"), ans):
  print(metric_name + " : ", metric_value)

Accuracy : 75.3465 %
Confusion Matrix :
         |     N     P |
         |     e     o |
         |     g     s |
         |     a     i |
         |     t     t |
         |     i     i |
         |     v     v |
         |     e     e |
---------+-------------+
Negative |<81767>18297 |
Positive | 31010<68926>|
---------+-------------+
(row = reference; col = test)

Precision :  0.7576299673613962
Recall :  0.7534242174025594
F1 score :  0.7524445064387022


In [169]:
import tweepy

In [170]:
# NOTE(review): presumably meant to load the Twitter API keys from a separate
# notebook; the recorded output shows the file was not found — confirm the path.
%run ./keys.ipynb

ERROR:root:File `'./keys.ipynb.py'` not found.


In [171]:
# Twitter API credentials are read from environment variables so that no secret
# is stored in the notebook. A missing variable raises KeyError naming it.
# The literal keys that used to be written in this cell were exposed in the
# notebook and should be revoked and regenerated.
consumer_key=os.environ["TWITTER_CONSUMER_KEY"]
consumer_secret=os.environ["TWITTER_CONSUMER_SECRET"]
access_token=os.environ["TWITTER_ACCESS_TOKEN"]
access_token_secret=os.environ["TWITTER_ACCESS_TOKEN_SECRET"]

In [172]:
# Build an authenticated tweepy client from the consumer key pair and the
# user access token pair.
auth=tweepy.OAuthHandler(consumer_key,consumer_secret)
auth.set_access_token(access_token, access_token_secret)
api=tweepy.API(auth)

In [173]:
# Collect the full text of up to `number_of_tweets` items from the user's timeline.
number_of_tweets=10
likes=[]
timeline=tweepy.Cursor(api.user_timeline, id="sachin_srt", tweet_mode="extended")
t=[status.full_text for status in timeline.items(number_of_tweets)]

In [174]:
# Convert the list of fetched tweet texts into a NumPy array.
t=np.array(t)

In [175]:
# Same pipeline as for the training corpus: stopword removal, stemming,
# lemmatization, noise removal and tokenization.
cleaned_tweets_x=[stem(remove_stopwords(fetched_tweet)) for fetched_tweet in t]
l_x=lemmatize(cleaned_tweets_x)  #Lemmatization
corpus_x=tokenize(remove_noise(l_x))

In [176]:
# Show the token lists produced for the fetched tweets.
corpus_x

[['look',
  'match',
  'ind',
  'v',
  'pak..aft',
  'long',
  'time',
  'match',
  '2',
  'countries...i',
  'hope',
  'india',
  'rock',
  'again....'],
 ['gud', '9t', 'all..'],
 ['hope',
  'like',
  'feel',
  'one',
  'year',
  'babi',
  'u',
  'thrown',
  'up...h',
  'laugh',
  "b'coz",
  'know',
  'u',
  'catch',
  'him...thi',
  'hope!!'],
 ['thought', 'day-how', 'take', 'noth', 'high', 'fli'],
 ['sehwag&gambhir',
  'drive',
  'india',
  'home',
  'comfortable..viru',
  'bowl',
  'well',
  'gauti',
  'do',
  'great',
  'job',
  'batting....but',
  'miss',
  'thunder',
  'viru!!!'],
 ['gud',
  '9t',
  'all..in',
  "today'",
  'match',
  "afridi'",
  'great',
  'knock',
  'doesnt',
  'work',
  'pak',
  'loo',
  '16',
  'runs..',
  'malinga',
  'bowl',
  'well',
  'today'],
 ['news',
  'frm',
  'away',
  'holiday',
  'kids.wont',
  'get',
  'holiday',
  'next',
  'year'],
 ['india',
  'win',
  'asia',
  'cup.....our',
  'side',
  'look',
  'good',
  'n',
  'balanc',
  'bat',
  'n',


In [177]:
# Classify every fetched tweet. Iterating over corpus_x itself (instead of a
# fixed range of 10) avoids an IndexError when the timeline returned fewer
# tweets than requested.
ans=[classifier.classify(dict.fromkeys(tokens, True)) for tokens in corpus_x]

In [178]:
# Show the original text of the fetched tweets for comparison with the predictions.
t

array(['Looking for the match between ind vs pak..after a long time the match between these 2 countries...i hope india to rock again....',
       'Gud 9t all..',
       "Hope is like a feeling of a one year baby when u thrown him up...he laughs b'coz he knowns u will catch him...this is the hope!!",
       'Thought for the day-how you take off has nothing to do with how high you will fly',
       'Sehwag&gambhir drives india home comfortable..viru bowled well and gauti done great job in batting....but we miss the thunders from viru!!!',
       "Gud 9t all..in today's match afridi's great knock doesnt work for pak they loose by 16 runs.. Malinga bowled well today",
       'No news frm me for a while as i m away on holiday with my kids.wont get a holiday with them in next year',
       'India will win the asia cup.....our side looks good n balancing in both batting n bowling.....all the best team india make us proud!!',
       '@samalasam hi samalasam thanks for ur comments',
       '&lt

In [179]:
# Predicted label for each fetched tweet, in the same order as `t`.
ans

['Negative',
 'Positive',
 'Negative',
 'Negative',
 'Negative',
 'Negative',
 'Negative',
 'Positive',
 'Positive',
 'Negative']

Some samples

In [193]:
def prediction(s):
  """Classify a single sentence with the trained classifier.

  The sentence goes through the same steps as the training data (stopword
  removal, stemming, lemmatization, noise removal, tokenization) and the
  resulting tokens are turned into a {token: True} feature dict.
  Returns the label produced by `classifier.classify`.
  """
  stemmed = stem(remove_stopwords(s))
  lemmatized = lemmatize([stemmed])  #Lemmatization
  tokens = tokenize(remove_noise(lemmatized))[0]
  features = dict.fromkeys(tokens, True)
  return classifier.classify(features)

In [196]:
# Interactive sample: read a sentence from the user and show its predicted label.
res=input("Enter a sentence! \n")
prediction(res)

Enter a sentence! 
I like beautiful sky


'Positive'

In [197]:
# Second interactive sample, same as above with a different sentence.
res=input("Enter a sentence! \n")
prediction(res)

Enter a sentence! 
I hate it when you are near me


'Negative'