https://www.digitalocean.com/community/tutorials/how-to-perform-sentiment-analysis-in-python-3-using-the-natural-language-toolkit-nltk

In [3]:
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.corpus import twitter_samples, stopwords
from nltk.tag import pos_tag
from nltk.tokenize import word_tokenize
from nltk import FreqDist, classify, NaiveBayesClassifier
import torch
import pandas as pd
from tqdm.notebook import tqdm
import re, string, random

In [4]:
def remove_noise(tweet_tokens, stop_words = ()):

    cleaned_tokens = []

    for token, tag in pos_tag(tweet_tokens):
        token = re.sub('http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+#]|[!*\(\),]|'\
                       '(?:%[0-9a-fA-F][0-9a-fA-F]))+','', token)
        token = re.sub("(@[A-Za-z0-9_]+)","", token)

#        if tag.startswith("NN"):
#            pos = 'n'
#        elif tag.startswith('VB'):
#            pos = 'v'
#        else:
#            pos = 'a'

        lemmatizer = WordNetLemmatizer()
        token = lemmatizer.lemmatize(token, pos)

        if len(token) > 0 and token not in string.punctuation and token.lower() not in stop_words:
            cleaned_tokens.append(token.lower())
    return cleaned_tokens

In [5]:
def get_all_words(cleaned_tokens_list):
    for tokens in cleaned_tokens_list:
        for token in tokens:
            yield token

def get_tweets_for_model(cleaned_tokens_list):
    for tweet_tokens in cleaned_tokens_list:
        yield dict([token, True] for token in tweet_tokens)

In [6]:
import nltk
nltk.download('averaged_perceptron_tagger')
nltk.download('wordnet')
from nltk.tokenize import word_tokenize

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\benha\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\benha\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [7]:
import string
from nltk.corpus import stopwords

In [262]:
df= pd.read_csv('C:\\Users\\benha\\OneDrive\\Desktop\\Utahlaketweets\\utahlake_2016.csv', engine='python',
       names = ['id','text','time', 'utahlake', 'waterquality', 'positive','current'],)

In [264]:
if __name__ == "__main__":
    df= pd.read_csv(
        'C:\\Users\\benha\\OneDrive\\Desktop\\Utahlaketweets\\utahlake_2016.csv', engine='python',
        names = ['id','text','time', 'utahlake', 'waterquality', 'positive','current'],)
    repetitive = ['Utah Lake', 'UtahLake', 'Utah lake', 'Utahlake', 'utahlake', '&amp', 'utah', 'lake',
                  'Jordan Lake', 'Jordan lake',"n't","s't",'1','2','3','4','5','6','7','8','9','0',"'re","'d","cuz","``","--"]
    for i in repetitive:  
        df['text'] = df['text'].str.replace(i, '')
    for i in string.punctuation:
       df['text'] = df['text'].str.replace(i, ' ')
    #df = df[1:]
    df=df[df.utahlake!='0']
    df= df[df['text'].str[:2]!='RT']
    df = df.fillna('-1')
    positive = df['text'][df['waterquality']=='Yes'].tolist()
    negative = df['text'][df['waterquality']=='No'].tolist()
    #neutral = df['text'][df['waterquality']=='Not sure'].tolist()
    
    stop_words = stopwords.words('english')
    
    positive_tokens = [word_tokenize(i) for i in positive]
    negative_tokens = [word_tokenize(i) for i in negative]
    #neutral_tokens = [word_tokenize(i) for i in neutral]

    positive_cleaned_tokens_list = []
    negative_cleaned_tokens_list = []
    #neutral_cleaned_tokens_list = []

    for tokens in positive_tokens:
        positive_cleaned_tokens_list.append(remove_noise(tokens, stop_words,))

    for tokens in negative_tokens:
        negative_cleaned_tokens_list.append(remove_noise(tokens, stop_words))

    #for tokens in neutral_tokens:
    #    neutral_cleaned_tokens_list.append(remove_noise(tokens, stop_words))
        
    all_pos_words = get_all_words(positive_cleaned_tokens_list)

    freq_dist_pos = FreqDist(all_pos_words)
    print(freq_dist_pos.most_common(10))

    positive_tokens_for_model = get_tweets_for_model(positive_cleaned_tokens_list)
    negative_tokens_for_model = get_tweets_for_model(negative_cleaned_tokens_list)
    #neutral_tokens_for_model = get_tweets_for_model(neutral_cleaned_tokens_list)

    positive_dataset = [(tweet_dict, "Water_quality")
                         for tweet_dict in positive_tokens_for_model]

    negative_dataset = [(tweet_dict, "Non_water_quality")
                         for tweet_dict in negative_tokens_for_model]
    #neutral_dataset = [(tweet_dict, "Neutral")
    #                     for tweet_dict in neutral_tokens_for_model]

    dataset=positive_dataset+negative_dataset#+neutral_dataset

    random.shuffle(dataset)

    train_data = dataset[:1800]
    test_data = dataset[1800:]

    classifier_water = NaiveBayesClassifier.train(train_data)

    print("Accuracy is:", classify.accuracy(classifier_water, test_data))

    print(classifier_water.show_most_informative_features(20))

[('algae', 143), ('water', 127), ('bloom', 120), ('toxic', 80), ('utah', 54), ('algal', 50), ('clean', 47), ('get', 46), ('people', 41), ('like', 39)]
Accuracy is: 0.8222222222222222
Most Informative Features
                   bloom = True           Water_ : Non_wa =    228.8 : 1.0
                   nasty = True           Water_ : Non_wa =     30.3 : 1.0
                 utahdeq = True           Water_ : Non_wa =     19.5 : 1.0
                   clean = True           Water_ : Non_wa =     15.9 : 1.0
                  health = True           Water_ : Non_wa =     13.8 : 1.0
                  closed = True           Water_ : Non_wa =     13.3 : 1.0
                    ncga = True           Water_ : Non_wa =     13.3 : 1.0
           ncpolicywatch = True           Water_ : Non_wa =     13.3 : 1.0
                    uchd = True           Water_ : Non_wa =     13.3 : 1.0
                  affect = True           Water_ : Non_wa =     12.4 : 1.0
                   avoid = True          

In [12]:
if __name__ == "__main__":
    df= pd.read_csv(
        'C:\\Users\\benha\\OneDrive\\Desktop\\Utahlaketweets\\utahlake_2016.csv', engine='python',
        names = ['id','text','time', 'utahlake', 'waterquality', 'positive','current'],)
    repetitive = ['Utah Lake', 'UtahLake', 'Utah lake', 'Utahlake', 'utah lake', 'utahlake', '&amp',
                  'Jordan Lake', 'Jordan lake',"n't","s't",'utah','1','2','3','4','5','6','7','8','9','0',"'re","'d","cuz","``","--"]
    for i in repetitive:  
        df['text'] = df['text'].str.replace(i, '')
    for i in string.punctuation:
       df['text'] = df['text'].str.replace(i, ' ')
    #df = df[1:]
    df=df[df.utahlake!='0']
    df= df[df['text'].str[:2]!='RT']
    df = df.fillna('-1')
    positive = df['text'][df['positive']=='Positive'].tolist()
    negative = df['text'][df['positive']=='Negative'].tolist()
    #neutral = df['text'][df['positive']=='Neutral'].tolist()
    
    stop_words = stopwords.words('english')
    
    positive_tokens = [word_tokenize(i) for i in positive]
    negative_tokens = [word_tokenize(i) for i in negative]
   # neutral_tokens = [word_tokenize(i) for i in neutral]

    positive_cleaned_tokens_list = []
    negative_cleaned_tokens_list = []
   # neutral_cleaned_tokens_list = []

    for tokens in positive_tokens:
        positive_cleaned_tokens_list.append(remove_noise(tokens, stop_words))

    for tokens in negative_tokens:
        negative_cleaned_tokens_list.append(remove_noise(tokens, stop_words))

    #for tokens in neutral_tokens:
     #   neutral_cleaned_tokens_list.append(remove_noise(tokens, stop_words))
        
    all_pos_words = get_all_words(positive_cleaned_tokens_list)

    freq_dist_pos = FreqDist(all_pos_words)
    print(freq_dist_pos.most_common(10))

    positive_tokens_for_model = get_tweets_for_model(positive_cleaned_tokens_list)
    negative_tokens_for_model = get_tweets_for_model(negative_cleaned_tokens_list)
    #neutral_tokens_for_model = get_tweets_for_model(neutral_cleaned_tokens_list)

    positive_dataset = [(tweet_dict, "Positive")
                         for tweet_dict in positive_tokens_for_model]

    negative_dataset = [(tweet_dict, "Negative")
                         for tweet_dict in negative_tokens_for_model]
    #neutral_dataset = [(tweet_dict, "Neutral")
                      #   for tweet_dict in neutral_tokens_for_model]

    dataset=positive_dataset+negative_dataset#+neutral_dataset

    random.shuffle(dataset)

    train_data = dataset[:1900]
    test_data = dataset[1900:]

    classifier_positive = NaiveBayesClassifier.train(train_data)

    print("Accuracy is:", classify.accuracy(classifier_positive, test_data))

    print(classifier_positive.show_most_informative_features(20))
     

[('go', 83), ('love', 81), ('good', 79), ('get', 69), ('day', 58), ('like', 52), ('great', 50), ('quot', 50), ('one', 47), ('time', 47)]
Accuracy is: 0.7022900763358778
Most Informative Features
                   algae = True           Negati : Positi =     19.6 : 1.0
                   thank = True           Positi : Negati =     16.2 : 1.0
                     due = True           Negati : Positi =     12.7 : 1.0
                   bloom = True           Negati : Positi =     11.4 : 1.0
                   green = True           Negati : Positi =     10.3 : 1.0
                   cause = True           Negati : Positi =      9.8 : 1.0
                     sad = True           Negati : Positi =      9.0 : 1.0
               goodnight = True           Positi : Negati =      8.8 : 1.0
                    swim = True           Negati : Positi =      7.9 : 1.0
                    dump = True           Negati : Positi =      6.5 : 1.0
                    fire = True           Negati : Posi

In [107]:
df

Unnamed: 0,id,text,time,utahlake,waterquality,positive,current
4,4,The real Freak Lake as envisioned while I wro...,12/31/2016 4:27,1,Yes,Negative,-1
5,5,fav to come iceskating on ⛸,12/29/2016 20:51,1,No,Positive,-1
6,6,KDMA let s go run into and freeze to deat...,12/22/2016 1:52,1,No,Neutral,1
7,7,rcfullmer Tsebresos DexFenik Ew is so nasty,12/21/2016 20:32,1,Yes,Negative,1
8,8,WindWolfArt Tsebresos DexFenik Or if you ar...,12/21/2016 20:21,1,No,Positive,0
...,...,...,...,...,...,...,...
1392,-1,Interior BlueRidgeNPS SecretaryZinke “ B...,-1,1,No,Positive,-1
1395,-1,TriangleBiways Great time capsule The Sea...,-1,1,No,Neutral,-1
1396,-1,“A life worth living is one of compassion And...,-1,1,No,Positive,-1
1397,-1,After yays story about PFOAs PFAs etc in Ca...,-1,1,Yes,Negative,-1


In [221]:
###PREDICTION
dataset= pd.read_csv(
        'C:\\Users\\benha\\OneDrive\\Desktop\\Utahlaketweets\\utahlake_2016_links.csv', 
        names = ['id','location','text','time', 'waterquality', 'positive','current'])
dataset = dataset[1:]
dataset


Unnamed: 0,id,location,text,time,waterquality,positive,current
1.0,2,"Provo, UT","No matter how you fish at Utah Lake, we hope ...",12/31/2016 21:45,,FALSE,
2.0,3,"Provo, UT",What's YOUR favorite activity on the ice of Ut...,12/31/2016 18:20,,FALSE,
3.0,4,,"Sunset at Utah Lake, 5 minutes drive from my h...",12/31/2016 1:26,"{'type': 'Point', 'coordinates': [-111.735, 40...",FALSE,Place(_api=<tweepy.api.API object at 0x000001F...
4.0,5,"Utah, USA",This is in our back yard! Toxic algae in Utah ...,12/31/2016 0:50,,FALSE,
5.0,6,"Utah, USA",This is in our back yard! Toxic algae in Utah ...,12/31/2016 0:50,,FALSE,
...,...,...,...,...,...,...,...
6805.0,6806,"Provo, UT",Fav to come to a insta meet I just planned w a...,1/2/2016 7:12,,FALSE,Place(_api=<tweepy.api.API object at 0x000001F...
6806.0,6807,,"Thanks to the Jenne and Camp families, this ic...",1/1/2016 20:19,,FALSE,
6807.0,6808,,RT @GarrettElls: Yesterday was fun too bad the...,1/1/2016 19:39,,FALSE,
6808.0,6809,,Yesterday was fun too bad the Utah lake ones d...,1/1/2016 19:38,,FALSE,


In [222]:
from nltk.tokenize import word_tokenize
WATER_pre=[]
POS_pre=[]
for i in range(1,len(dataset)+1):
    tweet = dataset.text[i]
    tokens = remove_noise(word_tokenize(tweet))
    WATER_pre.append(classifier_water.classify(dict([token, True] for token in tokens)))
    POS_pre.append(classifier_positive.classify(dict([token, True] for token in tokens)))
dataset['water_prediction']= WATER_pre
dataset['pos_prediction']=POS_pre

In [223]:
dataset

Unnamed: 0,id,location,text,time,waterquality,positive,current,water_prediction,pos_prediction
1.0,2,"Provo, UT","No matter how you fish at Utah Lake, we hope ...",12/31/2016 21:45,,FALSE,,Non_water_quality,Negative
2.0,3,"Provo, UT",What's YOUR favorite activity on the ice of Ut...,12/31/2016 18:20,,FALSE,,Non_water_quality,Neutral
3.0,4,,"Sunset at Utah Lake, 5 minutes drive from my h...",12/31/2016 1:26,"{'type': 'Point', 'coordinates': [-111.735, 40...",FALSE,Place(_api=<tweepy.api.API object at 0x000001F...,Non_water_quality,Neutral
4.0,5,"Utah, USA",This is in our back yard! Toxic algae in Utah ...,12/31/2016 0:50,,FALSE,,Water_quality,Negative
5.0,6,"Utah, USA",This is in our back yard! Toxic algae in Utah ...,12/31/2016 0:50,,FALSE,,Water_quality,Negative
...,...,...,...,...,...,...,...,...,...
6805.0,6806,"Provo, UT",Fav to come to a insta meet I just planned w a...,1/2/2016 7:12,,FALSE,Place(_api=<tweepy.api.API object at 0x000001F...,Non_water_quality,Positive
6806.0,6807,,"Thanks to the Jenne and Camp families, this ic...",1/1/2016 20:19,,FALSE,,Non_water_quality,Positive
6807.0,6808,,RT @GarrettElls: Yesterday was fun too bad the...,1/1/2016 19:39,,FALSE,,Water_quality,Negative
6808.0,6809,,Yesterday was fun too bad the Utah lake ones d...,1/1/2016 19:38,,FALSE,,Water_quality,Negative
