# Import Tweet Data

In [1]:
import pandas as pd
import pickle

with open("TwitterRawData.dat", "rb") as filePath:
    TwitterData = pd.DataFrame(pickle.load(file=filePath))[["id", "full_text"]]

In [2]:
TwitterData.head()

Unnamed: 0,id,full_text
0,1220957331149557765,i guess its time to switch majors. data scienc...
1,1220955374867701761,#TechnoCool: Data Science Community Rocked by ...
2,1220954168057389056,Confused about how data science and data analy...
3,1220953376189366272,Creating Robust Python Workflows: Learn to dev...
4,1220952323167440896,National Level Seminar on COMPUTATIONAL MATHEM...


In [3]:
TwitterData.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1094 entries, 0 to 1093
Data columns (total 2 columns):
id           1094 non-null int64
full_text    1094 non-null object
dtypes: int64(1), object(1)
memory usage: 17.2+ KB


In [4]:
TwitterData["full_text"].apply(len).describe()

count    1094.000000
mean      178.421389
std        78.420762
min        23.000000
25%       111.250000
50%       168.000000
75%       255.000000
max       319.000000
Name: full_text, dtype: float64

# Clean Data

In [5]:
import html

# Convert html encoded special characters to usable format
TwitterData["full_text"] = TwitterData["full_text"].apply(lambda x: html.unescape(x))

In [6]:
import re
# Drop URI's completely.
TwitterData["full_text"] = TwitterData["full_text"].apply(
    lambda x: re.sub(string = x, pattern = "https\:\/\/[\w]+[.]?[\w]+?[\/\w]+\/*", repl = ""))

In [7]:
# Extract hash tags to split later into constituent words
TwitterData["HashTags"] = TwitterData["full_text"].apply(lambda x: re.findall(string = x, pattern = r"\#\w+\b"))

In [8]:
# Converts hashtags to plain words for later processing.
TwitterData["full_text"] = TwitterData["full_text"].apply(lambda x: re.sub(string = x, pattern = "[\#]*", repl = ""))

In [9]:
# Remove emails and @user
TwitterData["full_text"] = TwitterData["full_text"].apply(
    lambda x: re.sub(string = x, pattern = "\b?[a-zA-Z0-9\.\_\%\+\-]*@[a-zA-Z0-9\.\-\_]+\b?", repl = ""))

In [10]:
# Set all text to lowercase to simply processing
TwitterData["full_text"] = TwitterData["full_text"].str.lower()

In [11]:
display(TwitterData.head())

Unnamed: 0,id,full_text,HashTags
0,1220957331149557765,i guess its time to switch majors. data scienc...,[]
1,1220955374867701761,technocool: data science community rocked by p...,"[#TechnoCool, #tech, #technology, #datascience..."
2,1220954168057389056,confused about how data science and data analy...,"[#CareerKarma, #breakintotech, #21DayCkChallenge]"
3,1220953376189366272,creating robust python workflows: learn to dev...,[#DataScience]
4,1220952323167440896,national level seminar on computational mathem...,[]


In [12]:
display(TwitterData.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1094 entries, 0 to 1093
Data columns (total 3 columns):
id           1094 non-null int64
full_text    1094 non-null object
HashTags     1094 non-null object
dtypes: int64(1), object(2)
memory usage: 25.8+ KB


None

#### Extract and normalize contractions and abbreviations

#### Correct mispellings

# Calculate Sentiment

In [13]:
# Installing nltk modules
# import nltk
# nltk.download("punkt")
# nltk.download('averaged_perceptron_tagger')
# nltk.download('tagsets')
# nltk.download("wordnet")
# nltk.help.upenn_tagset()

## Create Reference Sentiment
>Using TextBlob built-in

In [14]:
#!pip install --trusted-host pypi.python.org textblob

def GetTextBlobSentiments(TwitterData):
    from textblob import TextBlob
    import pandas as pd

    sentimentData = pd.concat([
        TwitterData["id"],
        pd.DataFrame(
            columns = ["TextBlobPolarity", "TextBlobSentiment"],
            data = [TextBlob(x).sentiment for x in TwitterData["full_text"]],
        )
    ], axis = 1)
    return sentimentData

In [15]:
TwitterSentimentData = GetTextBlobSentiments(TwitterData)

In [16]:
TwitterSentimentData.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1094 entries, 0 to 1093
Data columns (total 3 columns):
id                   1094 non-null int64
TextBlobPolarity     1094 non-null float64
TextBlobSentiment    1094 non-null float64
dtypes: float64(2), int64(1)
memory usage: 25.8 KB


In [17]:
TwitterSentimentData.head(10)

Unnamed: 0,id,TextBlobPolarity,TextBlobSentiment
0,1220957331149557765,0.0,0.0
1,1220955374867701761,0.5,0.5
2,1220954168057389056,0.05625,0.55
3,1220953376189366272,0.0,0.0
4,1220952323167440896,-0.6,1.0
5,1220950780783415298,0.4,0.9
6,1220950425932726272,-0.4,0.7
7,1220949449528291329,0.0,0.0
8,1220949247249412096,0.268651,0.40377
9,1220948833565175808,0.5,0.5


In [18]:
TwitterSentimentData.describe()

Unnamed: 0,id,TextBlobPolarity,TextBlobSentiment
count,1094.0,1094.0,1094.0
mean,1.220764e+18,0.145139,0.311635
std,90655430000000.0,0.248656,0.291836
min,1.220597e+18,-0.75,0.0
25%,1.220699e+18,0.0,0.0
50%,1.220758e+18,0.0,0.3
75%,1.220825e+18,0.260691,0.5
max,1.220957e+18,1.0,1.0


## Create sentiment data by training with pre-labeled text corpus

### Generate word counts for predictions

#### Load Sentiment Training Dataset from file

In [19]:
def LoadYelpReviewData():
    import pandas as pd
    return pd.read_csv("YelpReviewData.csv", dtype = {"StarRating": "int8", "ReviewText":"str"})

In [20]:
TrainingData = LoadYelpReviewData()

In [21]:
TrainingData.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 31157 entries, 0 to 31156
Data columns (total 2 columns):
StarRating    31157 non-null int8
ReviewText    31157 non-null object
dtypes: int8(1), object(1)
memory usage: 274.0+ KB


In [22]:
TrainingData["StarRating"].value_counts()

5    7245
4    7245
1    7245
3    5467
2    3955
Name: StarRating, dtype: int64

#### Expand contractions and abbreviations

#### Tokenize words and tag parts of speech.

#### Drop undesirable words

#### Lemmatize adjectives, words, nouns

In [23]:
import nltk
from sklearn.feature_extraction.text import CountVectorizer
import re
from nltk.stem import WordNetLemmatizer
from nltk.corpus.reader import wordnet
from nltk.util import ngrams

LemmatizerEngine = WordNetLemmatizer()

POSTagToLemmaTag_Dict = {
    "J" : wordnet.ADJ,
    "N" : wordnet.NOUN,
    "V" : wordnet.VERB,
    "R" : wordnet.ADV,
}

def FilterForKeyWords(TextString):
    removeWords_List = list(set([
        # Prepositions
        "of", "with", "without", "at", "from", "into", "during", "including", "until", "against", "through", "throughput",
        "towards", "to", "upon", "concerning", "in", "out", "for", "on", "below", "by", "over", "under", "despite",
        "before", "after", "between", "since", "among", "along", "following", "across", "behind", "beyond", "except",
        "but", "up", "down", "aboard", "amid", "as", "behind", "considering", "during", "inside", "minus", "off", "per",
        "versus", "via",
    ]))
    alphaCheck = re.compile(r"^[a-z]+$")

    return str([LemmatizerEngine.lemmatize(word, POSTagToLemmaTag_Dict[pos[0]]) 
                for (word, pos) in nltk.pos_tag(nltk.word_tokenize(TextString.lower()))
            if (
                (len(word) > 1)
                & (alphaCheck.match(word) != None)
                & (word not in removeWords_List)
                & (pos[0] in [
                    "J",#"JJ", "JJR", "JJS", # Adjectives
                    #"N",#"NN", "NNS", "NNP", "NNPS", # Nouns
                    "R",#"RB", "RBR", "RBS", # Adverbs
                    "V",#"VB", "VBD", "VBG", "VBN", "VBP", "VBZ", #Verbs
                    ])
               )
               ])

def GetPrincipalWordCounts(DataFrame, TextColumnName, MinFreq = 2):
    from datetime import datetime # For debugging performance data

    Vectorizer = CountVectorizer(lowercase = False, strip_accents = "ascii", preprocessor = FilterForKeyWords,
                                 min_df = MinFreq, ngram_range = (1, 3),
                                )
    startTime = datetime.now() # For debugging performance data
    print("Starting Word Extraction at " + str(startTime))

    # Filter out unwanted words in each row, then create count columns for remaining words 
    WordCounts = pd.DataFrame(
        Vectorizer.fit_transform(DataFrame[TextColumnName]).toarray(), 
        columns=Vectorizer.get_feature_names(), 
        dtype = "uint",
    )

    print("Execution Time: " + str(datetime.now() - startTime)) # For debugging performance data

    return WordCounts

In [24]:
TrainingDataWordCounts = GetPrincipalWordCounts(TrainingData, "ReviewText", MinFreq = 0.005)
display(TrainingDataWordCounts.info())

Starting Word Extraction at 2020-02-01 01:57:15.830663
Execution Time: 0:03:13.162132
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 31157 entries, 0 to 31156
Columns: 1501 entries, able to yummy
dtypes: uint32(1501)
memory usage: 178.4 MB


None

In [25]:
TrainingDataWordCounts.head()

Unnamed: 0,able,able get,about,absolutely,accept,accommodate,acknowledge,act,actual,actually,...,worth,wrap,write,wrong,wrong be,yell,yes,yet,young,yummy
0,0,0,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,1,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [26]:
TrainingDataWordCounts.sum()

able          1157
able get       177
about          737
absolutely    1138
accept         212
              ... 
yell           199
yes            184
yet            944
young          480
yummy          391
Length: 1501, dtype: int64

### Match word count columns from Twitter data

In [27]:
TwitterDataWordCounts = GetPrincipalWordCounts(TwitterData, "full_text", MinFreq = 4)

Starting Word Extraction at 2020-02-01 02:00:29.171475
Execution Time: 0:00:01.944360


In [28]:
len(set(TrainingDataWordCounts.columns) - set(TwitterDataWordCounts.columns))

1329

In [29]:
len(set(TwitterDataWordCounts.columns) - set(TrainingDataWordCounts.columns))

173

In [30]:
# Remove columns for words not in model

def GetAlignedWordCounts(SourceData, ReferenceData):
    # Setup resulting Dataframe to ensure word columns align.
    wordCountsData = pd.DataFrame(columns = ReferenceData.columns)

    # Copy over matching columns with data
    for column in wordCountsData.columns.to_list():
        if(column in SourceData.columns.to_list()):
            wordCountsData[column] = SourceData[column]

    # Fill missing word columns with 0
    wordCountsData = wordCountsData.fillna(0)

    for column in wordCountsData.columns.to_list():
        wordCountsData[column] = wordCountsData[column].astype("int8")

    return wordCountsData

In [31]:
SentimentDataWordCounts = GetAlignedWordCounts(TwitterDataWordCounts, TrainingDataWordCounts)
SentimentDataWordCounts.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1094 entries, 0 to 1093
Columns: 1501 entries, able to yummy
dtypes: int8(1501)
memory usage: 1.6 MB


### Model polarity

In [32]:
TrainingData["StarRating"].value_counts()

5    7245
4    7245
1    7245
3    5467
2    3955
Name: StarRating, dtype: int64

#### Scale to range of 2.0 to match range of -1.0 to 1.0 for textblob sentiment

In [33]:
TrainingData["StarRating"] = TrainingData["StarRating"].map({1:2.0, 2:2.5, 3:3.0, 4:3.5, 5:4.0})

#### Bias by +3.0 to set zero point at 2.0

In [34]:
SentimentPredictionBias = 3.0

In [35]:
TrainingData = pd.concat([TrainingData["StarRating"], TrainingDataWordCounts], axis = 1)

In [36]:
TrainingData.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 31157 entries, 0 to 31156
Columns: 1502 entries, StarRating to yummy
dtypes: float64(1), uint32(1501)
memory usage: 178.6 MB


In [37]:
from sklearn.model_selection import train_test_split
Train_X, Test_X, Train_Y, Test_Y = train_test_split(TrainingData.drop("StarRating", axis = 1), TrainingData["StarRating"], test_size = 0.2, random_state = 13)

In [38]:
print(Train_X.shape)
print(Train_Y.shape)
print(Test_X.shape)
print(Test_Y.shape)

(24925, 1501)
(24925,)
(6232, 1501)
(6232,)


### Train Test Model

In [39]:
import pandas as pd
def TrainXGBRegressionModel(x_train, y_train, x_test, y_test):
    from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
    import xgboost as xgb
    from xgboost import XGBRegressor
    from datetime import datetime
    import warnings
    warnings.simplefilter(action='ignore', category=FutureWarning)

    learningRate = 0.05
    maxDepth = 12
    estimatorsCount = 400
    
    loopStartTime = datetime.now()

    xgbModel = XGBRegressor(objective = "reg:squarederror",
                            colsample_bytree = 1,
                            colsample_bylevel = 1,
                            colsample_bynode = 1,
                            learning_rate = learningRate,
                            max_depth = maxDepth,
                            tree_method = "hist",
                            grow_policy = "lossguide",
                            n_estimators = estimatorsCount,
                            nthread = 6,
                            booster="gbtree",
                           )

    print("Starting XGBRegressor at " + str(loopStartTime))

    xgbModel.fit(x_train, y_train)
    xgbPredictor = xgbModel.predict(x_test)

    loopEndTime = datetime.now()
    # results container
    results_list = pd.DataFrame(data = {
        "Name" : "XGBRegressor",
        "R2 Score": r2_score(y_test, xgbPredictor),
        "RMS Error": (mean_squared_error(y_test, xgbPredictor)),
        "Mean Absolute Error": mean_absolute_error(y_test, xgbPredictor),
        "ModelData" : xgbModel,
        "RunTime": str(loopEndTime - loopStartTime),
        "LearningRate": str(learningRate),
        "MaxDepth":str(maxDepth),
        "Estimators":[str(estimatorsCount)],
        })

    print("\tEnding XGBRegressor at " + str(datetime.now()))

    return results_list

In [40]:
XGBResults = TrainXGBRegressionModel(Train_X, Train_Y, Test_X, Test_Y)

with pd.option_context("display.max_rows", None, "display.max_columns", None, "display.max_colwidth", 40):
    display(XGBResults.drop("ModelData", axis = 1))

Starting XGBRegressor at 2020-02-01 02:00:38.566459
	Ending XGBRegressor at 2020-02-01 02:03:34.172983


Unnamed: 0,Name,R2 Score,RMS Error,Mean Absolute Error,RunTime,LearningRate,MaxDepth,Estimators
0,XGBRegressor,0.614857,0.212045,0.359084,0:02:55.603513,0.05,12,400


In [41]:
PredictionModel = XGBResults["ModelData"][0]

#### Calculate predictions

In [42]:
TwitterSentimentData["PredictedPolarity"] = pd.Series(PredictionModel.predict(SentimentDataWordCounts)).astype("float64") - SentimentPredictionBias

#### Compare predictions to standard

In [43]:
def CompareRegressionPredictions(TestData, ComparisonData):
    import numpy as np
    from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error, confusion_matrix
    
    metrics = pd.DataFrame(data = {
        "R2 Score": r2_score(TestData, ComparisonData),
        "RMS Error": mean_squared_error(TestData, ComparisonData),
        "Mean Absolute Error": [mean_absolute_error(TestData, ComparisonData)],
        })
    
    with pd.option_context("display.max_rows", None, "display.max_columns", None, "display.max_colwidth", 100):
        print(metrics)

In [44]:
CompareRegressionPredictions(TwitterSentimentData["TextBlobPolarity"], TwitterSentimentData["PredictedPolarity"])

   R2 Score  RMS Error  Mean Absolute Error
0 -0.018816   0.062935             0.175754


In [45]:
TwitterSentimentData["PredictedPolarity"].describe()

count    1094.000000
mean        0.055099
std         0.159446
min        -0.652050
25%        -0.003292
50%         0.050542
75%         0.050542
max         0.766508
Name: PredictedPolarity, dtype: float64

In [46]:
TwitterSentimentData["TextBlobPolarity"].describe()

count    1094.000000
mean        0.145139
std         0.248656
min        -0.750000
25%         0.000000
50%         0.000000
75%         0.260691
max         1.000000
Name: TextBlobPolarity, dtype: float64