# Import Tweet Data

In [1]:
import pandas as pd
import pickle

with open("TwitterRawData.dat", "rb") as filePath:
    TwitterData = pd.DataFrame(pickle.load(file=filePath))[["id", "full_text"]]

In [2]:
TwitterData.head()

Unnamed: 0,id,full_text
0,1220957331149557765,i guess its time to switch majors. data scienc...
1,1220955374867701761,#TechnoCool: Data Science Community Rocked by ...
2,1220954168057389056,Confused about how data science and data analy...
3,1220953376189366272,Creating Robust Python Workflows: Learn to dev...
4,1220952323167440896,National Level Seminar on COMPUTATIONAL MATHEM...


In [3]:
TwitterData.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1094 entries, 0 to 1093
Data columns (total 2 columns):
id           1094 non-null int64
full_text    1094 non-null object
dtypes: int64(1), object(1)
memory usage: 17.2+ KB


In [4]:
TwitterData["full_text"].apply(len).describe()

count    1094.000000
mean      178.421389
std        78.420762
min        23.000000
25%       111.250000
50%       168.000000
75%       255.000000
max       319.000000
Name: full_text, dtype: float64

# Clean Data

In [5]:
import html

# Convert html encoded special characters to usable format
TwitterData["full_text"] = TwitterData["full_text"].apply(lambda x: html.unescape(x))

In [6]:
import re
# Drop URI's completely.
TwitterData["full_text"] = TwitterData["full_text"].apply(
    lambda x: re.sub(string = x, pattern = "https\:\/\/[\w]+[.]?[\w]+?[\/\w]+\/*", repl = ""))

In [7]:
# Extract hash tags to split later into constituent words
TwitterData["HashTags"] = TwitterData["full_text"].apply(lambda x: re.findall(string = x, pattern = r"\#\w+\b"))

In [8]:
# Converts hashtags to plain words for later processing.
TwitterData["full_text"] = TwitterData["full_text"].apply(lambda x: re.sub(string = x, pattern = "[\#]*", repl = ""))

In [9]:
# Remove emails and @user
TwitterData["full_text"] = TwitterData["full_text"].apply(
    lambda x: re.sub(string = x, pattern = "\b?[a-zA-Z0-9\.\_\%\+\-]*@[a-zA-Z0-9\.\-\_]+\b?", repl = ""))

In [10]:
# Set all text to lowercase to simply processing
TwitterData["full_text"] = TwitterData["full_text"].str.lower()

In [11]:
display(TwitterData.head())

Unnamed: 0,id,full_text,HashTags
0,1220957331149557765,i guess its time to switch majors. data scienc...,[]
1,1220955374867701761,technocool: data science community rocked by p...,"[#TechnoCool, #tech, #technology, #datascience..."
2,1220954168057389056,confused about how data science and data analy...,"[#CareerKarma, #breakintotech, #21DayCkChallenge]"
3,1220953376189366272,creating robust python workflows: learn to dev...,[#DataScience]
4,1220952323167440896,national level seminar on computational mathem...,[]


In [12]:
display(TwitterData.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1094 entries, 0 to 1093
Data columns (total 3 columns):
id           1094 non-null int64
full_text    1094 non-null object
HashTags     1094 non-null object
dtypes: int64(1), object(2)
memory usage: 25.8+ KB


None

#### Extract and normalize contractions and abbreviations

#### Correct mispellings

# Calculate Sentiment

In [13]:
# Installing nltk modules
# import nltk
# nltk.download("punkt")
# nltk.download('averaged_perceptron_tagger')
# nltk.download('tagsets')
# nltk.download("wordnet")
# nltk.help.upenn_tagset()

## Create Reference Sentiment
>Using TextBlob built-in

In [14]:
#!pip install --trusted-host pypi.python.org textblob

def GetTextBlobSentiments(TwitterData):
    from textblob import TextBlob
    import pandas as pd

    sentimentData = pd.concat([
        TwitterData["id"],
        pd.DataFrame(
            columns = ["TextBlobPolarity", "TextBlobSentiment"],
            data = [TextBlob(x).sentiment for x in TwitterData["full_text"]],
        )
    ], axis = 1)
    return sentimentData

In [15]:
TwitterSentimentData = GetTextBlobSentiments(TwitterData)

In [16]:
TwitterSentimentData.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1094 entries, 0 to 1093
Data columns (total 3 columns):
id                   1094 non-null int64
TextBlobPolarity     1094 non-null float64
TextBlobSentiment    1094 non-null float64
dtypes: float64(2), int64(1)
memory usage: 25.8 KB


In [17]:
TwitterSentimentData.head(10)

Unnamed: 0,id,TextBlobPolarity,TextBlobSentiment
0,1220957331149557765,0.0,0.0
1,1220955374867701761,0.5,0.5
2,1220954168057389056,0.05625,0.55
3,1220953376189366272,0.0,0.0
4,1220952323167440896,-0.6,1.0
5,1220950780783415298,0.4,0.9
6,1220950425932726272,-0.4,0.7
7,1220949449528291329,0.0,0.0
8,1220949247249412096,0.268651,0.40377
9,1220948833565175808,0.5,0.5


In [18]:
# Bucket Textblob polarity for classifier
TwitterSentimentData["TextBlobPolarity(Bucketed)"] = TwitterSentimentData["TextBlobPolarity"].apply(lambda x: -1.0 if(x < -0.3) else 0.0 if (x < 0.3) else 1.0)

In [19]:
TwitterSentimentData.describe()

Unnamed: 0,id,TextBlobPolarity,TextBlobSentiment,TextBlobPolarity(Bucketed)
count,1094.0,1094.0,1094.0,1094.0
mean,1.220764e+18,0.145139,0.311635,0.213894
std,90655430000000.0,0.248656,0.291836,0.452652
min,1.220597e+18,-0.75,0.0,-1.0
25%,1.220699e+18,0.0,0.0,0.0
50%,1.220758e+18,0.0,0.3,0.0
75%,1.220825e+18,0.260691,0.5,0.0
max,1.220957e+18,1.0,1.0,1.0


## Create sentiment data by training with pre-labeled text corpus

### Generate word counts for predictions

#### Load Sentiment Training Dataset from file

In [20]:
def LoadYelpReviewData():
    import pandas as pd
    return pd.read_csv("YelpReviewData.csv", dtype = {"StarRating": "int8", "ReviewText":"str"})

In [21]:
TrainingData = LoadYelpReviewData()

In [22]:
TrainingData.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 31157 entries, 0 to 31156
Data columns (total 2 columns):
StarRating    31157 non-null int8
ReviewText    31157 non-null object
dtypes: int8(1), object(1)
memory usage: 274.0+ KB


In [23]:
TrainingData["StarRating"].value_counts()

5    7245
4    7245
1    7245
3    5467
2    3955
Name: StarRating, dtype: int64

#### Expand contractions and abbreviations

#### Tokenize words and tag parts of speech.

#### Drop undesirable words

#### Lemmatize adjectives, words, nouns

In [24]:
import nltk
from sklearn.feature_extraction.text import CountVectorizer
import re
from nltk.stem import WordNetLemmatizer
from nltk.corpus.reader import wordnet
from nltk.util import ngrams

LemmatizerEngine = WordNetLemmatizer()

POSTagToLemmaTag_Dict = {
    "J" : wordnet.ADJ,
    "N" : wordnet.NOUN,
    "V" : wordnet.VERB,
    "R" : wordnet.ADV,
}

def FilterForKeyWords(TextString):
    removeWords_List = list(set([
        # Prepositions
        "of", "with", "without", "at", "from", "into", "during", "including", "until", "against", "through", "throughput",
        "towards", "to", "upon", "concerning", "in", "out", "for", "on", "below", "by", "over", "under", "despite",
        "before", "after", "between", "since", "among", "along", "following", "across", "behind", "beyond", "except",
        "but", "up", "down", "aboard", "amid", "as", "behind", "considering", "during", "inside", "minus", "off", "per",
        "versus", "via",
    ]))
    alphaCheck = re.compile(r"^[a-z]+$")

    return str([LemmatizerEngine.lemmatize(word, POSTagToLemmaTag_Dict[pos[0]]) 
                for (word, pos) in nltk.pos_tag(nltk.word_tokenize(TextString.lower()))
            if (
                (len(word) > 1)
                & (alphaCheck.match(word) != None)
                & (word not in removeWords_List)
                & (pos[0] in [
                    "J",#"JJ", "JJR", "JJS", # Adjectives
                    #"N",#"NN", "NNS", "NNP", "NNPS", # Nouns
                    "R",#"RB", "RBR", "RBS", # Adverbs
                    "V",#"VB", "VBD", "VBG", "VBN", "VBP", "VBZ", #Verbs
                    ])
               )
               ])

def GetPrincipalWordCounts(DataFrame, TextColumnName, MinFreq = 2):
    from datetime import datetime # For debugging performance data

    Vectorizer = CountVectorizer(lowercase = False, strip_accents = "ascii", preprocessor = FilterForKeyWords,
                                 min_df = MinFreq, ngram_range = (1, 3),
                                )
    startTime = datetime.now() # For debugging performance data
    print("Starting Word Extraction at " + str(startTime))

    # Filter out unwanted words in each row, then create count columns for remaining words 
    WordCounts = pd.DataFrame(
        Vectorizer.fit_transform(DataFrame[TextColumnName]).toarray(), 
        columns=Vectorizer.get_feature_names(), 
        dtype = "uint",
    )

    print("Execution Time: " + str(datetime.now() - startTime)) # For debugging performance data

    return WordCounts

In [25]:
TrainingDataWordCounts = GetPrincipalWordCounts(TrainingData, "ReviewText", MinFreq = 0.005)
display(TrainingDataWordCounts.info())

Starting Word Extraction at 2020-02-01 03:08:56.325881
Execution Time: 0:03:07.965834
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 31157 entries, 0 to 31156
Columns: 1501 entries, able to yummy
dtypes: uint32(1501)
memory usage: 178.4 MB


None

In [26]:
TrainingDataWordCounts.head()

Unnamed: 0,able,able get,about,absolutely,accept,accommodate,acknowledge,act,actual,actually,...,worth,wrap,write,wrong,wrong be,yell,yes,yet,young,yummy
0,0,0,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,1,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [27]:
TrainingDataWordCounts.sum()

able          1157
able get       177
about          737
absolutely    1138
accept         212
              ... 
yell           199
yes            184
yet            944
young          480
yummy          391
Length: 1501, dtype: int64

### Match word count columns from Twitter data

In [28]:
TwitterDataWordCounts = GetPrincipalWordCounts(TwitterData, "full_text", MinFreq = 4)

Starting Word Extraction at 2020-02-01 03:12:04.483752
Execution Time: 0:00:01.697684


In [29]:
len(set(TrainingDataWordCounts.columns) - set(TwitterDataWordCounts.columns))

1329

In [30]:
len(set(TwitterDataWordCounts.columns) - set(TrainingDataWordCounts.columns))

173

In [31]:
# Remove columns for words not in model

def GetAlignedWordCounts(SourceData, ReferenceData):
    # Setup resulting Dataframe to ensure word columns align.
    wordCountsData = pd.DataFrame(columns = ReferenceData.columns)

    # Copy over matching columns with data
    for column in wordCountsData.columns.to_list():
        if(column in SourceData.columns.to_list()):
            wordCountsData[column] = SourceData[column]

    # Fill missing word columns with 0
    wordCountsData = wordCountsData.fillna(0)

    for column in wordCountsData.columns.to_list():
        wordCountsData[column] = wordCountsData[column].astype("int8")

    return wordCountsData

In [32]:
SentimentDataWordCounts = GetAlignedWordCounts(TwitterDataWordCounts, TrainingDataWordCounts)
SentimentDataWordCounts.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1094 entries, 0 to 1093
Columns: 1501 entries, able to yummy
dtypes: int8(1501)
memory usage: 1.6 MB


### Model polarity

In [33]:
TrainingData["StarRating"].value_counts()

5    7245
4    7245
1    7245
3    5467
2    3955
Name: StarRating, dtype: int64

#### Scale to range of 2.0 to match range of -1.0 to 1.0 for textblob sentiment

In [34]:
TrainingData["StarRating"] = TrainingData["StarRating"].map({1:2.0, 2:2.0, 3:3.0, 4:3.0, 5:4.0})

#### Bias by +3.0 to set zero point at 2.0

In [35]:
SentimentPredictionBias = 3.0

In [36]:
TrainingData = pd.concat([TrainingData["StarRating"], TrainingDataWordCounts], axis = 1)

In [37]:
TrainingData.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 31157 entries, 0 to 31156
Columns: 1502 entries, StarRating to yummy
dtypes: float64(1), uint32(1501)
memory usage: 178.6 MB


In [38]:
from sklearn.model_selection import train_test_split
Train_X, Test_X, Train_Y, Test_Y = train_test_split(TrainingData.drop("StarRating", axis = 1), TrainingData["StarRating"], test_size = 0.2, random_state = 13)

In [39]:
print(Train_X.shape)
print(Train_Y.shape)
print(Test_X.shape)
print(Test_Y.shape)

(24925, 1501)
(24925,)
(6232, 1501)
(6232,)


### Train Test Model

In [40]:
# Run through multiple classifiers and rank results

from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, precision_recall_fscore_support
from sklearn.naive_bayes import GaussianNB, BernoulliNB, MultinomialNB, ComplementNB
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier, ExtraTreesClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier, ExtraTreeClassifier
from xgboost import XGBClassifier
import numpy as np

def AssessClassifierModels(TrainingDataColumns, TrainingDataResults, TestingDataColumns, TestingDataResults, Algorithms_List):
    from datetime import datetime
    functionStartTime = datetime.now()
    print()
    
    # results container
    results_list = pd.DataFrame()
    
    # calculated metrics and append to list
    for algorithm in Algorithms_List:
        loopStartTime = datetime.now()
        print("Starting " + str(algorithm.__name__) + " at " + str(loopStartTime))

        algorithmObject = algorithm()
        
        if(str(algorithm.__name__) == "XGBClassifier"):
            algorithmObject = XGBClassifier(nthread=4)

        algorithmObject.fit(TrainingDataColumns, TrainingDataResults)
        algorithmPredictions = algorithmObject.predict(TestingDataColumns)
        (algorithmPrecision, algorithmRecall, algorithmF1, algorithmSupportList) = precision_recall_fscore_support(
            TestingDataResults, algorithmPredictions, labels = np.sort(TrainingDataResults.unique()))
        algorithmExecutionTime = str(datetime.now() - loopStartTime)
        
        results_list = results_list.append({"Name":  algorithm.__name__,
                                            "Precision": algorithmPrecision,
                                            "Recall": algorithmRecall,
                                            "F1": algorithmF1,
                                            "Support": algorithmSupportList,
#                                            "ConfusionMatrix": "",# confusion_matrix(TestingDataResults, algorithmPredictions),
                                            "ModelData" : algorithmObject,
                                            "ExecutionTime": algorithmExecutionTime, 
                                            }, ignore_index = True)


    # Set index to a meaningful value
    results_list.set_index("Name")
    print("Assessment Complete.")
    return results_list

In [41]:
ClassifierResults_List = AssessClassifierModels(Train_X, Train_Y.apply(str).astype("category"), Test_X, Test_Y.apply(str).astype("category"), [XGBClassifier, MultinomialNB, GaussianNB, BernoulliNB, DecisionTreeClassifier, ExtraTreeClassifier])

with pd.option_context("display.max_rows", None, "display.max_columns", None, "display.max_colwidth", 40):
    print(ClassifierResults_List[["Name", "Precision", "Recall", "F1", "Support", "ExecutionTime"]])


Starting XGBClassifier at 2020-02-01 03:12:13.394766
Starting MultinomialNB at 2020-02-01 03:12:57.244691
Starting GaussianNB at 2020-02-01 03:12:59.067585
Starting BernoulliNB at 2020-02-01 03:13:00.487441
Starting DecisionTreeClassifier at 2020-02-01 03:13:03.857336
Starting ExtraTreeClassifier at 2020-02-01 03:13:13.850055
Assessment Complete.
                     Name  \
0           XGBClassifier   
1           MultinomialNB   
2              GaussianNB   
3             BernoulliNB   
4  DecisionTreeClassifier   
5     ExtraTreeClassifier   

                                                       Precision  \
0   [0.7276134943773428, 0.6402753872633391, 0.7311015118790497]   
1   [0.7999067164179104, 0.6867469879518072, 0.6706270627062706]   
2  [0.7943676939426142, 0.6766081871345029, 0.44696969696969696]   
3   [0.7854100106496272, 0.6864450127877237, 0.4610254272613589]   
4   [0.6415525114155252, 0.5418586789554531, 0.4603616133518776]   
5  [0.6116111611161116, 0.526653306613

In [42]:
PredictionModel = ClassifierResults_List.loc[ClassifierResults_List["Name"] == "MultinomialNB", "ModelData"].iloc[0]
PredictionModel

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

#### Calculate predictions

In [43]:
TwitterSentimentData["PredictedPolarity"] = pd.Series(PredictionModel.predict(SentimentDataWordCounts)).astype("float64") - SentimentPredictionBias

#### Compare predictions to standard

In [44]:
def CompareClassificationPredictions(TestData, ComparisonData):
    import numpy as np
    from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, precision_recall_fscore_support

    (precision, recall, f1score, supportList) = precision_recall_fscore_support(TestData, ComparisonData, labels = np.sort(TestData.unique()))
    metrics = pd.DataFrame(data = {
        "Precision": precision,
        "Recall": recall,
        "F1": f1score,
        "Support": supportList,
        })

    with pd.option_context("display.max_rows", None, "display.max_columns", None, "display.max_colwidth", 100):
        print("Confusion Matrix")
        print(confusion_matrix(TestData, ComparisonData))
        print("")
        print(metrics)

In [45]:
CompareClassificationPredictions(TwitterSentimentData["TextBlobPolarity(Bucketed)"], TwitterSentimentData["PredictedPolarity"])

Confusion Matrix
[[  9  11   0]
 [288 475  57]
 [ 69 125  60]]

   Precision    Recall        F1  Support
0   0.024590  0.450000  0.046632       20
1   0.777414  0.579268  0.663871      820
2   0.512821  0.236220  0.323450      254


In [46]:
TwitterSentimentData.describe()

Unnamed: 0,id,TextBlobPolarity,TextBlobSentiment,TextBlobPolarity(Bucketed),PredictedPolarity
count,1094.0,1094.0,1094.0,1094.0,1094.0
mean,1.220764e+18,0.145139,0.311635,0.213894,-0.227605
std,90655430000000.0,0.248656,0.291836,0.452652,0.624541
min,1.220597e+18,-0.75,0.0,-1.0,-1.0
25%,1.220699e+18,0.0,0.0,0.0,-1.0
50%,1.220758e+18,0.0,0.3,0.0,0.0
75%,1.220825e+18,0.260691,0.5,0.0,0.0
max,1.220957e+18,1.0,1.0,1.0,1.0
