# Import Tweet Data

In [1]:
import pandas as pd
import pickle

# with open("TwitterKeys.dat", "rb") as SaveFile:
#     ReadData = pickle.load(file=SaveFile)
# from twython import Twython
# TwitterStream = Twython(ReadData["ConsumerKey"],ReadData["ConsumerSecret"])

# Temporarily load tweet data from saved file to keep debug and testing consistent
with open("TwitterRawData.dat", "rb") as filePath:
    TwitterData = pd.DataFrame(pickle.load(file=filePath))[["id", "full_text"]]

In [2]:
TwitterData.head()

Unnamed: 0,id,full_text
0,1220957331149557765,i guess its time to switch majors. data scienc...
1,1220955374867701761,#TechnoCool: Data Science Community Rocked by ...
2,1220954168057389056,Confused about how data science and data analy...
3,1220953376189366272,Creating Robust Python Workflows: Learn to dev...
4,1220952323167440896,National Level Seminar on COMPUTATIONAL MATHEM...


In [3]:
TwitterData.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1094 entries, 0 to 1093
Data columns (total 2 columns):
id           1094 non-null int64
full_text    1094 non-null object
dtypes: int64(1), object(1)
memory usage: 17.2+ KB


In [4]:
TwitterData["full_text"].apply(len).describe()

count    1094.000000
mean      178.421389
std        78.420762
min        23.000000
25%       111.250000
50%       168.000000
75%       255.000000
max       319.000000
Name: full_text, dtype: float64

# Clean Data

In [5]:
import html

# Convert html encoded special characters to usable format
TwitterData["full_text"] = TwitterData["full_text"].apply(lambda x: html.unescape(x))

print(TwitterData["full_text"].head())

0    i guess its time to switch majors. data scienc...
1    #TechnoCool: Data Science Community Rocked by ...
2    Confused about how data science and data analy...
3    Creating Robust Python Workflows: Learn to dev...
4    National Level Seminar on COMPUTATIONAL MATHEM...
Name: full_text, dtype: object


In [6]:
import re
# Drop URI's completely.
TwitterData["full_text"] = TwitterData["full_text"].apply(lambda x: re.sub(string = x, pattern = "https\:\/\/[\w]+[.]?[\w]+?[\/\w]+\/*", repl = ""))

print(TwitterData["full_text"].head())

0    i guess its time to switch majors. data scienc...
1    #TechnoCool: Data Science Community Rocked by ...
2    Confused about how data science and data analy...
3    Creating Robust Python Workflows: Learn to dev...
4    National Level Seminar on COMPUTATIONAL MATHEM...
Name: full_text, dtype: object


In [7]:
# Extract hash tags to split later into constituent words
TwitterData["HashTags"] = TwitterData["full_text"].apply(lambda x: re.findall(string = x, pattern = r"\#\w+\b"))
    
print(TwitterData["HashTags"].head())

0                                                   []
1    [#TechnoCool, #tech, #technology, #datascience...
2    [#CareerKarma, #breakintotech, #21DayCkChallenge]
3                                       [#DataScience]
4                                                   []
Name: HashTags, dtype: object


In [8]:
# Converts hashtags to plain words for later processing.
TwitterData["full_text"] = TwitterData["full_text"].apply(lambda x: re.sub(string = x, pattern = "[\#]*", repl = ""))

print(TwitterData["full_text"].head())

0    i guess its time to switch majors. data scienc...
1    TechnoCool: Data Science Community Rocked by P...
2    Confused about how data science and data analy...
3    Creating Robust Python Workflows: Learn to dev...
4    National Level Seminar on COMPUTATIONAL MATHEM...
Name: full_text, dtype: object


In [9]:
# Remove emails and @user
TwitterData["full_text"] = TwitterData["full_text"].apply(lambda x: re.sub(string = x, pattern = "\b?[a-zA-Z0-9\.\_\%\+\-]*@[a-zA-Z0-9\.\-\_]+\b?", repl = ""))
print(TwitterData.head())

                    id                                          full_text  \
0  1220957331149557765  i guess its time to switch majors. data scienc...   
1  1220955374867701761  TechnoCool: Data Science Community Rocked by P...   
2  1220954168057389056  Confused about how data science and data analy...   
3  1220953376189366272  Creating Robust Python Workflows: Learn to dev...   
4  1220952323167440896  National Level Seminar on COMPUTATIONAL MATHEM...   

                                            HashTags  
0                                                 []  
1  [#TechnoCool, #tech, #technology, #datascience...  
2  [#CareerKarma, #breakintotech, #21DayCkChallenge]  
3                                     [#DataScience]  
4                                                 []  


In [10]:
print(TwitterData.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1094 entries, 0 to 1093
Data columns (total 3 columns):
id           1094 non-null int64
full_text    1094 non-null object
HashTags     1094 non-null object
dtypes: int64(1), object(2)
memory usage: 25.8+ KB
None


In [11]:
# Set all text to lowercase to simply processing
TwitterData["full_text"] = TwitterData["full_text"].str.lower()

#### Extract and normalize contractions and abbreviations

In [4]:
import nltk
#nltk.download("punkt")
#nltk.download('averaged_perceptron_tagger')
#nltk.download('tagsets')
#nltk.download("wordnet")
#nltk.help.upenn_tagset()

[nltk_data] Error loading corpus: Package 'corpus' not found in index


False

#### Correct mispellings

***
***

In [None]:
# Temporary save data to file for debug consistency
import pickle
with open("TwitterTextData.dat", "wb") as filePath:
    pickle.dump(TwitterData, file=filePath)

In [1]:
import pickle
import pandas as pd
with open("TwitterTextData.dat", "rb") as filePath:
    TwitterData = pd.DataFrame(pickle.load(filePath))

TwitterData.head()

Unnamed: 0,id,full_text,HashTags
0,1220957331149557765,i guess its time to switch majors. data scienc...,[]
1,1220955374867701761,technocool: data science community rocked by p...,"[#TechnoCool, #tech, #technology, #datascience..."
2,1220954168057389056,confused about how data science and data analy...,"[#CareerKarma, #breakintotech, #21DayCkChallenge]"
3,1220953376189366272,creating robust python workflows: learn to dev...,[#DataScience]
4,1220952323167440896,national level seminar on computational mathem...,[]


In [13]:
TwitterData.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1094 entries, 0 to 1093
Data columns (total 3 columns):
id           1094 non-null int64
full_text    1094 non-null object
HashTags     1094 non-null object
dtypes: int64(1), object(2)
memory usage: 25.8+ KB


# Calculate Sentiment

## Create Reference Sentiment
>Using TextBlob built-in

In [14]:
#!pip install --trusted-host pypi.python.org textblob

from textblob import TextBlob
import numpy as np

SentimentData = pd.concat([
    TwitterData["id"],
    pd.DataFrame(
        columns = ["TextBlobPolarity", "TextBlobSentiment"],
        data = [TextBlob(x).sentiment for x in TwitterData["full_text"]],
    )
], axis = 1)

In [15]:
SentimentData.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1094 entries, 0 to 1093
Data columns (total 3 columns):
id                   1094 non-null int64
TextBlobPolarity     1094 non-null float64
TextBlobSentiment    1094 non-null float64
dtypes: float64(2), int64(1)
memory usage: 25.8 KB


In [16]:
SentimentData.describe()

Unnamed: 0,id,TextBlobPolarity,TextBlobSentiment
count,1094.0,1094.0,1094.0
mean,1.220764e+18,0.145139,0.311635
std,90655430000000.0,0.248656,0.291836
min,1.220597e+18,-0.75,0.0
25%,1.220699e+18,0.0,0.0
50%,1.220758e+18,0.0,0.3
75%,1.220825e+18,0.260691,0.5
max,1.220957e+18,1.0,1.0


In [17]:
SentimentData.head()

Unnamed: 0,id,TextBlobPolarity,TextBlobSentiment
0,1220957331149557765,0.0,0.0
1,1220955374867701761,0.5,0.5
2,1220954168057389056,0.05625,0.55
3,1220953376189366272,0.0,0.0
4,1220952323167440896,-0.6,1.0


In [18]:
# Temporary save data to file for debug consistency
import pickle
with open("TextblobSentiment.dat", "wb") as filePath:
    pickle.dump(SentimentData, file=filePath)

In [40]:
import pickle
import pandas as pd
with open("TextblobSentiment.dat", "rb") as filePath:
    SentimentData = pd.DataFrame(pickle.load(filePath))

SentimentData.head()

Unnamed: 0,id,TextBlobPolarity,TextBlobSentiment
0,1220957331149557765,0.0,0.0
1,1220955374867701761,0.5,0.5
2,1220954168057389056,0.05625,0.55
3,1220953376189366272,0.0,0.0
4,1220952323167440896,-0.6,1.0


In [19]:
SentimentData.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1094 entries, 0 to 1093
Data columns (total 3 columns):
id                   1094 non-null int64
TextBlobPolarity     1094 non-null float64
TextBlobSentiment    1094 non-null float64
dtypes: float64(2), int64(1)
memory usage: 25.8 KB


In [20]:
SentimentData["TextBlobPolarity"].describe()

count    1094.000000
mean        0.145139
std         0.248656
min        -0.750000
25%         0.000000
50%         0.000000
75%         0.260691
max         1.000000
Name: TextBlobPolarity, dtype: float64

## Create sentiment data by training with pre-labeled text corpus

### Generate word counts for predictions

#### Load Sentiment Training Dataset from file

In [10]:
import pandas as pd
TrainingData = pd.read_csv("YelpReviewData.csv", dtype = {"StarRating": "int8", "ReviewText":"str"})

In [11]:
TrainingData.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 189354 entries, 0 to 189353
Data columns (total 2 columns):
StarRating    189354 non-null int8
ReviewText    189354 non-null object
dtypes: int8(1), object(1)
memory usage: 1.6+ MB


In [12]:
TrainingData["StarRating"].value_counts()

5    43983
4    43983
1    43983
3    33173
2    24232
Name: StarRating, dtype: int64

#### Expand contractions and abbreviations

#### Tokenize words and tag parts of speech.

#### Drop undesirable words

#### Lemmatize adjectives, words, nouns

#### Generate n-grams

In [100]:
import nltk
from sklearn.feature_extraction.text import CountVectorizer
import re
from nltk.stem import WordNetLemmatizer
from nltk.corpus.reader import wordnet
from nltk.util import ngrams

LemmatizerEngine = WordNetLemmatizer()

POSTagToLemmaTag_Dict = {
    "J" : wordnet.ADJ,
    "N" : wordnet.NOUN,
    "V" : wordnet.VERB,
    "R" : wordnet.ADV,
}

def FilterForKeyWords(TextString):
    removeWords_List = list(set([
        # Prepositions
        "of", "with", "without", "at", "from", "into", "during", "including", "until", "against", "through", "throughput",
        "towards", "to", "upon", "concerning", "in", "out", "for", "on", "below", "by", "over", "under", "despite",
        "before", "after", "between", "since", "among", "along", "following", "across", "behind", "beyond", "except",
        "but", "up", "down", "aboard", "amid", "as", "behind", "considering", "during", "inside", "minus", "off", "per",
        "versus", "via",
    ]))
    alphaCheck = re.compile(r"^[a-z]+$")

    return str([LemmatizerEngine.lemmatize(word, POSTagToLemmaTag_Dict[pos[0]]) for (word, pos) in nltk.pos_tag(nltk.word_tokenize(TextString.lower()))
            if (
                (len(word) > 1)
                & (alphaCheck.match(word) != None)
                & (word not in removeWords_List)
                & (pos[0] in [
                    "J",#"JJ", "JJR", "JJS", # Adjectives
                    #"N",#"NN", "NNS", "NNP", "NNPS", # Nouns
                    "R",#"RB", "RBR", "RBS", # Adverbs
                    "V",#"VB", "VBD", "VBG", "VBN", "VBP", "VBZ", #Verbs
                    ])
               )
               ])

def GetPrincipalWordCounts(DataFrame, TextColumnName):
    from datetime import datetime # For debugging performance data

    Vectorizer = CountVectorizer(lowercase = False, strip_accents = "ascii", preprocessor = FilterForKeyWords,
                                 min_df = 0.005, ngram_range = (1, 3),
                                )
    startTime = datetime.now() # For debugging performance data

    # Filter out unwanted words in each row, then create count columns for remaining words 
    WordCounts = pd.DataFrame(
        Vectorizer.fit_transform(DataFrame[TextColumnName]).toarray(), 
        columns=Vectorizer.get_feature_names(), 
        dtype = "uint8",
    )

    print(str(datetime.now() - startTime)) # For debugging performance data

    print(WordCounts.info())
    return WordCounts

In [101]:
TrainingData["ReviewText"][0]

'Total bill for this horrible service? Over $8Gs. These crooks actually had the nerve to charge us $69 for 3 pills. I checked online the pills can be had for 19 cents EACH! Avoid Hospital ERs at all costs.'

In [102]:
from datetime import datetime
print("Vectorizing Start. ", str(datetime.now()))
TrainingDataWordCounts = GetPrincipalWordCounts(TrainingData.iloc[:1000, :], "ReviewText")
print("Vectorizing End. ", str(datetime.now()))

Vectorizing Start.  2020-01-30 04:17:43.561429
0:00:09.423795
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Columns: 1999 entries, able to yummy be
dtypes: uint8(1999)
memory usage: 1.9 MB
None
Vectorizing End.  2020-01-30 04:17:53.152775


In [103]:
TrainingDataWordCounts.head()

Unnamed: 0,able,able get,about,about be,absolutely,accommodate,accompany,accompany be,acknowledge,act,...,yeah,yell,yes,yet,yet be,yet have,young,young be,yummy,yummy be
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,1,0,...,0,0,0,1,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [104]:
TrainingDataWordCounts.sum().sort_values()

be highly          5
very little        5
end have           5
have only be       5
be great too       5
                ... 
get              728
not              879
do              1041
have            1757
be              6229
Length: 1999, dtype: int64

In [22]:
import pickle
with open("Z:\\Downloads\\yelp_dataset\\yelp_dataset~\\TrainingDataWordCounts.dat", "wb") as filePath:
    pickle.dump(TrainingDataWordCounts, file=filePath)

In [7]:
import pickle
with open("Z:\\Downloads\\yelp_dataset\\yelp_dataset~\\TrainingDataWordCounts.dat", "rb") as filePath:
    TrainingDataWordCounts = pickle.load(file=filePath)

In [28]:
TrainingDataWordCounts.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 189354 entries, 0 to 189353
Columns: 1743 entries, 00 to zero
dtypes: uint8(1743)
memory usage: 314.8 MB


#### Generate matching word count columns from Twitter data

In [29]:
TwitterDataWordCounts = GetPrincipalWordCounts(TwitterData, "full_text")

0:00:03.105048
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1094 entries, 0 to 1093
Columns: 514 entries, 10 to your
dtypes: uint8(514)
memory usage: 549.3 KB
None


In [30]:
len(set(TrainingDataWordCounts.columns) - set(TwitterDataWordCounts.columns))

1483

In [31]:
len(set(TwitterDataWordCounts.columns) - set(TrainingDataWordCounts.columns))

254

In [35]:
# Remove columns for words not in model

# Setup resulting Dataframe to ensure word columns align.
SentimentDataWordCounts = pd.DataFrame(columns = TrainingDataWordCounts.columns)

# Copy over matching columns with data
for column in TwitterWordCounts.columns.to_list():
    if(column in SentimentDataWordCounts.columns.to_list()):
        SentimentDataWordCounts[column] = TwitterWordCounts[column]

# Fill missing word columns with 0
SentimentDataWordCounts = SentimentDataWordCounts.fillna(0)

for column in SentimentDataWordCounts.columns.to_list():
    SentimentDataWordCounts[column] = SentimentDataWordCounts[column].astype("int8")

SentimentDataWordCounts.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1094 entries, 0 to 1093
Columns: 1743 entries, 00 to zero
dtypes: int64(1483), uint8(260)
memory usage: 12.6 MB
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1094 entries, 0 to 1093
Columns: 1743 entries, 00 to zero
dtypes: int8(1743)
memory usage: 1.8 MB


### Model polarity

In [36]:
TrainingData["StarRating"].value_counts()

5    43983
4    43983
1    43983
3    33173
2    24232
Name: StarRating, dtype: int64

In [37]:
TrainingData["StarRating"] = TrainingData["StarRating"].map({1:0.0, 2:0.4, 3:0.8, 4:1.2, 5:2.0})

In [38]:
TrainingData = pd.concat([TrainingData["StarRating"], WordCounts], axis = 1)

In [39]:
TrainingData.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 189354 entries, 0 to 189353
Columns: 1744 entries, StarRating to zero
dtypes: float64(1), uint8(1743)
memory usage: 316.2 MB


In [40]:
from sklearn.model_selection import train_test_split
Train_X, Test_X, Train_Y, Test_Y = train_test_split(TrainingData.drop("StarRating", axis = 1), TrainingData["StarRating"], test_size = 0.2, random_state = 13)

In [41]:
print(Train_X.shape)
print(Train_Y.shape)
print(Test_X.shape)
print(Test_Y.shape)

(151483, 1743)
(151483,)
(37871, 1743)
(37871,)


In [51]:
import pickle
with open("Z:\\Downloads\\yelp_dataset\\yelp_dataset~\\TrainTestData.dat", "wb") as filePath:
    pickle.dump(Train_X, file=filePath)
    pickle.dump(Train_Y, file=filePath)
    pickle.dump(Test_X, file=filePath)
    pickle.dump(Test_Y, file=filePath)

In [28]:
import pickle
with open("Z:\\Downloads\\yelp_dataset\\yelp_dataset~\\TrainTestData.dat", "rb") as filePath:
    Train_X = pickle.load(file=filePath)
    Train_Y = pickle.load(file=filePath)
    Test_X = pickle.load(file=filePath)
    Test_Y = pickle.load(file=filePath)

In [29]:
print(Train_X.shape)
print(Train_Y.shape)
print(Test_X.shape)
print(Test_Y.shape)

(151483, 1743)
(151483,)
(37871, 1743)
(37871,)


### Train Test Classifier

In [None]:
# Run through multiple classifiers and rank results

from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, precision_recall_fscore_support
from sklearn.naive_bayes import GaussianNB, BernoulliNB, MultinomialNB, ComplementNB
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier, ExtraTreesClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier, ExtraTreeClassifier
from xgboost import XGBClassifier
import numpy as np

def AssessClassifierModels(TrainingDataColumns, TrainingDataResults, TestingDataColumns, TestingDataResults, Algorithms_List):
    from datetime import datetime
    functionStartTime = datetime.now()
    print()
    
    # results container
    results_list = pd.DataFrame( columns = ["Name",
                                            "Precision",
                                            "Recall",
                                            "F1",
                                            "Support",
                                            "ModelData",
                                            "ExecutionTime",
                                           ]) # Set index later to avoid empty row
    
    # calculated metrics and append to list
    for algorithm in Algorithms_List:
        loopStartTime = datetime.now()
        print("Starting " + str(algorithm.__name__) + " at " + str(loopStartTime))

        algorithmObject = algorithm()
        
        if(str(algorithm.__name__) == "XGBClassifier"):
            algorithmObject = XGBClassifier(nthread=4)

        algorithmObject.fit(TrainingDataColumns, TrainingDataResults)
        algorithmPredictions = algorithmObject.predict(TestingDataColumns)
        (algorithmPrecision, algorithmRecall, algorithmF1, algorithmSupportList) = precision_recall_fscore_support(
            TestingDataResults, algorithmPredictions, labels = np.sort(TrainingDataResults.unique()))
        algorithmExecutionTime = str(datetime.now() - loopStartTime)
        
        results_list = results_list.append({"Name":  algorithm.__name__,
                                            "Precision": algorithmPrecision,
                                            "Recall": algorithmRecall,
                                            "F1": algorithmF1,
                                            "Support": algorithmSupportList,
#                                            "ConfusionMatrix": "",# confusion_matrix(TestingDataResults, algorithmPredictions),
                                            "ModelData" : algorithmObject,
                                            "ExecutionTime": algorithmExecutionTime, 
                                            }, ignore_index = True)
#         print("\tEnding " + str(algorithm.__name__) + " at " + str(datetime.now()) + "\n")
        
#         with pd.option_context("display.max_rows", None, "display.max_columns", None, "display.max_colwidth", 1000):
#             print(results_list.iloc[len(results_list)-1, :])

    # Set index to a meaningful value
    results_list.set_index("Name")
    print("Assessment Complete.")
    return results_list

In [None]:
ClassifierResults_List = AssessClassifierModels(Train_X, Train_Y.apply(str).astype("category"), Test_X, Test_Y.apply(str).astype("category"), [XGBClassifier, MultinomialNB, GaussianNB, BernoulliNB, KNeighborsClassifier, DecisionTreeClassifier, ExtraTreeClassifier])

In [None]:
with pd.option_context("display.max_rows", None, "display.max_columns", None, "display.max_colwidth", 1000):
    print(ClassifierResults_List[["Name", "Precision", "Recall", "F1", "Support", "ExecutionTime"]])

In [None]:
# Save models to file for debug consistency
import pickle
with open("Z:\\Downloads\\yelp_dataset\\yelp_dataset~\\ClassifierResults.dat", "wb") as filePath:
    pickle.dump(ClassifierResults_List, file=filePath)

In [None]:
import gc
#del ClassifierResults_List
gc.collect()
gc.collect()

In [74]:
import pickle
with open("Z:\\Downloads\\yelp_dataset\\yelp_dataset~\\ClassifierResults.dat", "rb") as filePath:
    ClassifierResults_List = pickle.load(file=filePath)
    with pd.option_context("display.max_rows", None, "display.max_columns", None, "display.max_colwidth", 1000):
        print(ClassifierResults_List.drop("ModelData", axis = 1))

                     Name  \
0           XGBClassifier   
1           MultinomialNB   
2              GaussianNB   
3             BernoulliNB   
4  DecisionTreeClassifier   
5     ExtraTreeClassifier   

                                                                                                 Precision  \
0    [0.43410852713178294, 0.62510845045983, 0.46622542339887096, 0.45076060848678945, 0.5988313520048358]   
1    [0.32357043235704325, 0.6955237446034992, 0.4434848484848485, 0.5111710323574731, 0.6925350122060902]   
2    [0.2851963746223565, 0.6786147419485244, 0.3448133635801196, 0.39832775919732444, 0.4130954570535222]   
3  [0.30156537753222834, 0.6761139311900733, 0.37977315689981095, 0.45304172027200884, 0.3993250127356088]   
4    [0.21397849462365592, 0.5853964632059326, 0.2966114572253457, 0.3647869815798395, 0.5114114779525815]   
5    [0.19717376904393905, 0.4862628268785171, 0.2645865834633385, 0.3355742935278031, 0.4458374573848015]   

                         

In [58]:
PredictionModel = ClassifierResults_List.loc[ClassifierResults_List["Name"] == "MultinomialNB", "ModelData"].iloc[0]

In [None]:
PredictionModel

In [59]:
import gc
del ClassifierResults_List
gc.collect()

40

### Predict Twitter Sentiment

In [67]:
def CompareClassificationPredictions(TestData, ComparisonData):
    import numpy as np
    from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, precision_recall_fscore_support

    (precision, recall, f1score, supportList) = precision_recall_fscore_support(TestData, ComparisonData, labels = np.sort(TestData.unique()))
    metrics = pd.DataFrame(data = {
        "Precision": precision,
        "Recall": recall,
        "F1": f1score,
        "Support": supportList,
        })


#     metrics = pd.DataFrame(data = {
#         "Accuracy": accuracy_score(TestData, ComparisonData),
#         "Precision": precision_score(TestData, ComparisonData),
#         "Recall": recall_score(TestData, ComparisonData),
#         "F1": [f1_score(TestData, ComparisonData)],
#         })
    
    with pd.option_context("display.max_rows", None, "display.max_columns", None, "display.max_colwidth", 1000):
        print("Confusion Matrix")
        print(confusion_matrix(TestData, ComparisonData))
        print("")
        print(metrics)

#### Calculate predictions

In [68]:
SentimentPredictions = pd.Series(PredictionModel.predict(TwitterAdjustedWordCounts)).astype("float64")

#### Compare predictions to standard

In [62]:
SentimentPredictions.describe()

count    1094.000000
mean        0.102377
std         0.922442
min        -1.000000
25%        -1.000000
50%         0.500000
75%         1.000000
max         1.000000
dtype: float64

In [63]:
SentimentData["TextBlobPolarity"].describe()

count    1094.000000
mean        0.145139
std         0.248656
min        -0.750000
25%         0.000000
50%         0.000000
75%         0.260691
max         1.000000
Name: TextBlobPolarity, dtype: float64

In [71]:

CompareClassificationPredictions(SentimentData["TextBlobPolarity"].apply(lambda x: 1.0 if (x > 0.4) else -1.0 if (x < -0.4) else 0.0), SentimentPredictions.apply(lambda x: 1.0 if (x > 0.4) else -1.0 if (x < -0.4) else 0.0))

Confusion Matrix
[[  7   0   6]
 [372  11 524]
 [ 59   8 107]]

   Precision    Recall        F1  Support
0   0.015982  0.538462  0.031042       13
1   0.578947  0.012128  0.023758      907
2   0.167975  0.614943  0.263872      174


In [42]:
import pandas as pd
def AssessXGBRegression(x_train, y_train, x_test, y_test, MaxDepth = 6, LearningRate = 0.1):
    from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
    import xgboost as xgb
    from xgboost import XGBRegressor
    from datetime import datetime
    import warnings
    warnings.simplefilter(action='ignore', category=FutureWarning)

    loopStartTime = datetime.now()
    # results container
    results_list = pd.DataFrame( columns = ["Name", "R2 Score", "RMS Error", "Mean Absolute Error", "ModelData", "ExecutionTime"])

    xgbModel = XGBRegressor(objective = "reg:squarederror",
                            colsample_bytree = 1,
                            colsample_bylevel = 1,
                            colsample_bynode = 1,
                            learning_rate = LearningRate,
                            max_depth = MaxDepth,
                            tree_method = "hist",
                            grow_policy = "lossguide",
                            n_estimators = 200,
                            nthread = 6,
                           )

    print("Starting XGBRegressor at " + str(loopStartTime))
    print("\tLearning Rate: ", str(LearningRate), "\tTree Depth: ", str(MaxDepth))

    xgbModel.fit(x_train, y_train)
    xgbPredictor = xgbModel.predict(x_test)

    loopEndTime = datetime.now()
    results_list = results_list.append({"Name" : "XGBRegressor",
                                        "R2 Score": r2_score(y_test, xgbPredictor),
                                        "RMS Error": (mean_squared_error(y_test, xgbPredictor)),
                                        "Mean Absolute Error": mean_absolute_error(y_test, xgbPredictor),
                                        "ModelData" : xgbModel,
                                        "ExecutionTime": str(loopEndTime - loopStartTime),
                                        "LearningRate": str(LearningRate),
                                        "MaxDepth":str(MaxDepth),
                                        }, ignore_index = True)

    print("\tEnding XGBRegressor at " + str(datetime.now()))

    return results_list

In [43]:
def CompareRegressionPredictions(TestData, ComparisonData):
    import numpy as np
    from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error, confusion_matrix
    
    metrics = pd.DataFrame(data = {
        "R2 Score": r2_score(TestData, ComparisonData),
        "RMS Error": mean_squared_error(TestData, ComparisonData),
        "Mean Absolute Error": [mean_absolute_error(TestData, ComparisonData)],
        })
    
    with pd.option_context("display.max_rows", None, "display.max_columns", None, "display.max_colwidth", 1000):
        print(metrics)

In [44]:
XGBResults = AssessXGBRegression(Train_X, Train_Y, Test_X, Test_Y, LearningRate = 0.2, MaxDepth = 9)

with pd.option_context("display.max_rows", None, "display.max_columns", None, "display.max_colwidth", 1000):
    print(XGBResults.drop("ModelData", axis = 1))

Starting XGBRegressor at 2020-01-25 15:40:34.170720
	Learning Rate:  0.2 	Tree Depth:  9
	Ending XGBRegressor at 2020-01-25 15:45:25.172392
           Name  R2 Score  RMS Error  Mean Absolute Error   ExecutionTime  \
0  XGBRegressor  0.642646   0.186775             0.338399  0:04:50.985665   

  LearningRate MaxDepth  
0          0.2        9  


In [32]:
import pickle
with open("XGBRegressorResults.dat", "rb") as filePath:
    XGBResults = pickle.load(file=filePath)

In [33]:
XGBResults

Unnamed: 0,Name,R2 Score,RMS Error,Mean Absolute Error,ModelData,ExecutionTime,LearningRate,MaxDepth
0,XGBRegressor,0.667182,0.734399,0.671007,"XGBRegressor(base_score=0.5, booster='gbtree',...",0:04:37.756705,0.2,9


In [45]:
PredictionModel = XGBResults["ModelData"][0]

In [46]:
SentimentPredictions = pd.Series(PredictionModel.predict(TwitterAdjustedWordCounts)).astype("float64")

In [50]:
CompareRegressionPredictions(SentimentData["TextBlobPolarity"].apply(lambda x: (x + 1.0)), SentimentPredictions)

   R2 Score  RMS Error  Mean Absolute Error
0  -0.56532   0.096695              0.23428


In [48]:
SentimentPredictions.describe()

count    1094.000000
mean        0.944958
std         0.151800
min         0.403817
25%         0.892358
50%         0.911873
75%         0.963613
max         1.612821
dtype: float64