# Reddit Miners Final Project Code

In [5]:
import pandas as pd
import numpy as np
import re

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.svm import SVC, LinearSVC
from sklearn.multioutput import MultiOutputClassifier
from sklearn.model_selection import GridSearchCV, cross_val_score


from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score

import warnings
warnings.filterwarnings('ignore')

In [6]:
import os

# Show the directory that the relative data and lexicon file names used below are resolved against.
current_directory = os.getcwd()
print("Current Working Directory:", current_directory)


Current Working Directory: /Users/sridharmalladi/Desktop/DF


In [7]:
# Import and display sample of data
# The workbook name is a relative path, so it is looked up in the current working directory.
data = pd.read_excel('RedditMiners_GoldenDataset.xlsx')
data.head()

Unnamed: 0,ID,Text_Sample,Tech,Political,Business,Entertainment,Misc
0,0,Usually waze is my go to. But today it didnâ€™...,Yes,No,No,No,No
1,1,The Adderall shortage is a real problem. I'm a...,No,No,Yes,No,No
2,2,This is my first ever comment/post on Reddit. ...,Yes,Yes,No,No,No
3,3,This article is hardly an article and more of ...,Yes,Yes,Yes,No,No
4,4,I can help with the cell phone part since I'm ...,Yes,No,Yes,No,No


## Variables to be constructed:

Sentiment - positive, neutral, negative

Length of sample - numerical 

Presence of web address - binary

Count of words in emotional lexicons - Anger, Joy, Sadness, Surprise, Anticipation

In [8]:
# Update dataframe to have columns relating to variables

# One placeholder column per engineered variable; the list order fixes the
# column order in the dataframe.
new_columns = [
    'Positive', 'Neutral', 'Negative',
    'Length',
    'Web Address',
    'Anger', 'Joy', 'Sadness', 'Surprise', 'Anticipation',
]

for column in new_columns:
    data[column] = None

## Sentiment Analysis

In [9]:
# Sentiment analysis, code taken from PS3

class LexiconClassifier():
    """
        Sentiment classifier that counts words found in a positive and a
        negative word list.
    """

    def __init__(self, positive_path='positive-words.txt',
                 negative_path='negative-words.txt'):
        """
            Initialize the Lexicon classifier by loading lexicons.

            Keyword arguments:
            positive_path -- path of the positive word list (read as utf-8)
            negative_path -- path of the negative word list (read as iso-8859-1)

            Raises:
            FileNotFoundError -- if either lexicon file does not exist
        """
        self.positive_words = self._load_words(positive_path, 'utf-8')
        self.negative_words = self._load_words(negative_path, 'iso-8859-1')

    @staticmethod
    def _load_words(path, encoding):
        """
            Reads one word per line from path and returns them as a set.

            Blank lines and lines starting with ';' are skipped so they are
            never treated as sentiment words.
            NOTE(review): ';' is presumably the comment marker of the lexicon
            header -- confirm against the actual files.
        """
        words = set()
        with open(path, encoding=encoding) as iFile:
            for row in iFile:
                word = row.strip()
                if word and not word.startswith(';'):
                    words.add(word)
        return words

    @staticmethod
    def _tokens(sentence):
        """Lower-cases the sentence and splits it on whitespace."""
        return sentence.lower().split()

    def predict(self, sentence):
        """
            Returns a sentiment prediction given an input string.

            Keyword arguments:
            sentence -- string (e.g., "This is good good good")

            Returns:
            pred -- a string ("positive", "negative", or "neutral")
        """
        num_pos_words = 0
        num_neg_words = 0
        for word in self._tokens(sentence):
            # A word present in both lexicons is counted as positive only.
            if word in self.positive_words:
                num_pos_words += 1
            elif word in self.negative_words:
                num_neg_words += 1

        if num_pos_words > num_neg_words:
            return 'positive'
        if num_pos_words < num_neg_words:
            return 'negative'
        return 'neutral'

    def count_pos_words(self, sentence):
        """
            Returns the number of positive words in string

            Keyword arguments:
            sentence -- string (e.g., "This is good good good")

            Returns:
            pred -- an integer (e.g., 3)
        """
        return sum(1 for word in self._tokens(sentence) if word in self.positive_words)

    def count_neg_words(self, sentence):
        """
            Returns the number of negative words in string

            Keyword arguments:
            sentence -- string (e.g., "This is bad bad bad")

            Returns:
            pred -- an integer (e.g., 3)
        """
        return sum(1 for word in self._tokens(sentence) if word in self.negative_words)

In [10]:
# Changes values in dataframe based on sentiment prediction

lex_luthor = LexiconClassifier()

# Predict once per sample, then one-hot encode the result. Whole-column
# assignment replaces the row-by-row chained indexing (data[col][i] = ...),
# which pandas does not guarantee to write back into the dataframe.
sentiment = data['Text_Sample'].map(lex_luthor.predict)

for column, outcome in (('Positive', 'positive'),
                        ('Neutral', 'neutral'),
                        ('Negative', 'negative')):
    data[column] = (sentiment == outcome).astype(int)

FileNotFoundError: [Errno 2] No such file or directory: 'positive-words.txt'

## Length of Sample

In [6]:
# Changes value of length column for each sample

# Length is the number of whitespace-separated tokens. Computing it for the
# whole column at once avoids writing through chained indexing row by row.
data['Length'] = data['Text_Sample'].str.split().str.len()

In [7]:
# Report the longest and shortest sample, measured in tokens.
longest = data['Length'].max()
print(f'Max Length: {longest}')

shortest = data['Length'].min()
print(f'Min Length: {shortest}')

Max Length: 1429
Min Length: 1


## Web Address Presence

https://www.freecodecamp.org/news/how-to-write-a-regular-expression-for-a-url/

In [8]:
# Changes web address column to 1 if text sample contains a web address 
# Regexpression taken from link above, helped capture some addresses I couldn't seem to catch properly

# Compiled once instead of being re-parsed for every sample.
# NOTE(review): the scheme prefix is optional, so any two runs of 2+
# alphanumerics joined by a dot also match (e.g. "12.50") -- confirm that
# these false positives are acceptable for this feature.
url_pattern = re.compile(r'(https:\/\/www\.|http:\/\/www\.|https:\/\/|http:\/\/)?[a-zA-Z0-9]{2,}(\.[a-zA-Z0-9]{2,})(\.[a-zA-Z0-9]{2,})?')

# Whole-column assignment replaces the chained-indexing writes.
data['Web Address'] = data['Text_Sample'].map(
    lambda sam: 1 if url_pattern.search(sam) else 0
)

In [9]:
# Count how many samples were flagged (1) versus not flagged (0).
data['Web Address'].value_counts()

0    792
1    208
Name: Web Address, dtype: int64

## Emotion Word Count

In [10]:
# Function to create a list of emotion associated words from lexicon
# Taken from my submission of Problem set 2

def emotion_list(emotion):
    """
    Return the words flagged with '1' in '<emotion>-NRC-Emotion-Lexicon.txt'.

    Each line of the file is expected to be 'word<TAB>flag'. Every line is
    inspected, so the result does not depend on the flagged rows being
    listed first, and lines without a flag column (e.g. blank lines) are
    ignored rather than raising.

    Parameters:
    emotion -- string prefix of the lexicon file name (e.g., "anger")

    Returns:
    emoList -- list of words whose flag is '1', in file order

    Raises:
    FileNotFoundError -- if the lexicon file does not exist
    """
    file = emotion + '-NRC-Emotion-Lexicon.txt'

    emoList = list()

    # The context manager closes the file even if a line fails to parse.
    with open(file) as text:
        for row in text:
            info = row.rstrip('\r\n').split('\t')
            # Strip the flag so a missing trailing newline still matches.
            if len(info) >= 2 and info[1].strip() == '1':
                emoList.append(info[0])

    return emoList

In [11]:
# Import lexicon words to list

emotions = ['anger', 'joy', 'sadness', 'surprise', 'anticipation']

# Dictionary keyed by emotion name; each value is the word list generated
# from that emotion's lexicon file.
emoteDict = {emot: emotion_list(emot) for emot in emotions}

In [12]:
# Updates count of words belonging to each emotion in respective emotion column for 5 listed emotions

emotion_columns = {
    'Anger': 'anger',
    'Joy': 'joy',
    'Sadness': 'sadness',
    'Surprise': 'surprise',
    'Anticipation': 'anticipation',
}

# Tokenise each sample once and reuse the tokens for all five emotions.
sample_tokens = data['Text_Sample'].str.split()

# NOTE(review): tokens are matched exactly as written (no lower-casing or
# punctuation stripping), unlike the sentiment step -- confirm this is intended.
for column, emotion in emotion_columns.items():
    # Set membership is constant-time; the lexicon lists were previously
    # scanned linearly for every word of every sample.
    vocabulary = set(emoteDict[emotion])
    # Whole-column assignment replaces the chained-indexing writes.
    data[column] = sample_tokens.map(
        lambda words, vocabulary=vocabulary: sum(1 for word in words if word in vocabulary)
    )

In [13]:
# Modifies target label data to integers, yes = 1, no = 0

for label_col in ['Tech', 'Political', 'Business', 'Entertainment', 'Misc']:
    # Accepting both 'Yes' and 1 makes the cell safe to re-run: a second pass
    # over already-encoded labels keeps them instead of resetting them to 0.
    # Whole-column assignment also replaces the chained-indexing writes.
    data[label_col] = data[label_col].isin(['Yes', 1]).astype(int)

## Final Data for Modeling

In [14]:
# Preview the dataframe after the feature and label columns have been filled in.
data.head()

Unnamed: 0,ID,Text_Sample,Tech,Political,Business,Entertainment,Misc,Positive,Neutral,Negative,Length,Web Address,Anger,Joy,Sadness,Surprise,Anticipation
0,0,Usually waze is my go to. But today it didnâ€™...,1,0,0,0,0,0,1,0,20,0,0,0,0,0,0
1,1,The Adderall shortage is a real problem. I'm a...,0,0,1,0,0,0,0,1,18,0,1,1,1,1,1
2,2,This is my first ever comment/post on Reddit. ...,1,1,0,0,0,0,0,1,60,0,5,0,3,1,1
3,3,This article is hardly an article and more of ...,1,1,1,0,0,0,1,0,60,0,1,1,0,0,0
4,4,I can help with the cell phone part since I'm ...,1,0,1,0,0,0,1,0,81,0,1,1,1,1,3


# Classifier


In [15]:
# Split data for variables and target labels

y_cols = ['Tech', 'Political', 'Business', 'Entertainment', 'Misc']
x_cols = [
    'Text_Sample',
    'Positive', 'Neutral', 'Negative',
    'Length', 'Web Address',
    'Anger', 'Joy', 'Sadness', 'Surprise', 'Anticipation',
]

# X holds the raw text plus the engineered variables; Y holds the five labels.
X, Y = data[x_cols], data[y_cols]


In [16]:
# Preview the columns selected into X.
X.head()

Unnamed: 0,Text_Sample,Positive,Neutral,Negative,Length,Web Address,Anger,Joy,Sadness,Surprise,Anticipation
0,Usually waze is my go to. But today it didnâ€™...,0,1,0,20,0,0,0,0,0,0
1,The Adderall shortage is a real problem. I'm a...,0,0,1,18,0,1,1,1,1,1
2,This is my first ever comment/post on Reddit. ...,0,0,1,60,0,5,0,3,1,1
3,This article is hardly an article and more of ...,0,1,0,60,0,1,1,0,0,0
4,I can help with the cell phone part since I'm ...,0,1,0,81,0,1,1,1,1,3


In [17]:
# Preview the label columns selected into Y.
Y.head()

Unnamed: 0,Tech,Political,Business,Entertainment,Misc
0,1,0,0,0,0
1,0,0,1,0,0
2,1,1,0,0,0
3,1,1,1,0,0
4,1,0,1,0,0


In [18]:
# Split data

# 30% of the samples are held out for testing; random_state fixes the shuffle.
split = train_test_split(X, Y, test_size=0.3, random_state=47)
x_train, x_test, y_train, y_test = split

In [19]:
# Function to break data out of dataframe object and into numpy array

def df_arr(df):
    """
    Convert a dataframe to a numpy array, one inner list per row.

    Parameters:
    df -- pandas DataFrame

    Returns:
    arr -- numpy array built from the row lists, in positional row order
    """
    rows = [df.iloc[position].to_list() for position in range(df.shape[0])]
    return np.array(rows)

In [20]:
# Keep the raw text of the test samples as a one-column dataframe.
x_test_text = x_test[['Text_Sample']]

In [21]:
# Column groups behind each feature set.
emotion_cols = ['Anger', 'Joy', 'Sadness', 'Surprise', 'Anticipation']
sentiment_cols = ['Positive', 'Neutral', 'Negative']
shape_cols = ['Length', 'Web Address']
all_cols = sentiment_cols + shape_cols + emotion_cols

# Feature set 1: emotion word counts
x_train1, x_test1 = df_arr(x_train[emotion_cols]), df_arr(x_test[emotion_cols])

# Feature set 2: sentiment indicators
x_train2, x_test2 = df_arr(x_train[sentiment_cols]), df_arr(x_test[sentiment_cols])

# Feature set 3: length and web address indicator
x_train3, x_test3 = df_arr(x_train[shape_cols]), df_arr(x_test[shape_cols])

# Feature set 0: every engineered variable. This must stay last because it
# rebinds x_train / x_test from dataframes to arrays.
x_train, x_test = df_arr(x_train[all_cols]), df_arr(x_test[all_cols])

In [22]:
# One train/test target array per label; each keeps its single label column.

# Tech
y_tech_train, y_tech_test = df_arr(y_train[['Tech']]), df_arr(y_test[['Tech']])

# Political
y_pol_train, y_pol_test = df_arr(y_train[['Political']]), df_arr(y_test[['Political']])

# Business
y_bus_train, y_bus_test = df_arr(y_train[['Business']]), df_arr(y_test[['Business']])

# Entertainment
y_ent_train, y_ent_test = df_arr(y_train[['Entertainment']]), df_arr(y_test[['Entertainment']])

# Misc
y_misc_train, y_misc_test = df_arr(y_train[['Misc']]), df_arr(y_test[['Misc']])

In [23]:
def run_data(x_train, y_train, x_test, y_test, label, random_state=47, cv=None):
    """
    Fit an SVC and a LinearSVC on one label and print their test-set scores.

    Both models are trained on (x_train, y_train) and scored on
    (x_test, y_test) with macro- and micro-averaged precision, recall and F1.
    The scores are printed as a table with one row per model, followed by a
    separator line.

    Parameters:
    x_train, x_test -- array-like feature matrices for the train / test split
    y_train, y_test -- array-like targets; column vectors are flattened to 1-D
    label           -- name of the target label; accepted for caller
                       compatibility and not included in the printed table
    random_state    -- seed passed to LinearSVC so repeated runs on the same
                       data give the same scores
    cv              -- optional number of folds; when given, the mean
                       cross_val_score on the training data is added as an
                       extra column

    Returns:
    None (the table is printed)
    """
    # The callers pass (n, 1) column vectors; the estimators expect 1-D targets.
    y_train = np.ravel(y_train)
    y_test = np.ravel(y_test)

    models = {
        'SVC': SVC(),
        'LinearSVC': LinearSVC(random_state=random_state),
    }
    metrics = (
        ('Prec', precision_score),
        ('Recall', recall_score),
        ('F1', f1_score),
    )

    rows = []
    for model in models.values():
        model.fit(x_train, y_train)
        preds = model.predict(x_test)

        row = {}
        for metric_name, metric in metrics:
            for average in ('macro', 'micro'):
                column = '{} {}'.format(average.capitalize(), metric_name)
                row[column] = metric(y_test, preds, average=average)

        # Cross-validation is expensive, so it only runs when requested.
        if cv is not None:
            scores = cross_val_score(model, x_train, y_train, cv=cv)
            row['CV Accuracy'] = scores.mean()

        rows.append(row)

    # Building the frame in one step replaces the positional chained
    # assignments into a string-indexed frame.
    mod_data = pd.DataFrame(rows, index=list(models))

    print(mod_data.head())
    print()
    print('===========================================================================')
    print()


## Feature Set 0: All Variables

In [24]:
# Score every label on the complete feature set.
label_splits = [
    ('Tech', y_tech_train, y_tech_test),
    ('Political', y_pol_train, y_pol_test),
    ('Business', y_bus_train, y_bus_test),
    ('Entertainment', y_ent_train, y_ent_test),
    ('Miscellaneous', y_misc_train, y_misc_test),
]

for label, y_tr, y_te in label_splits:
    run_data(x_train, y_tr, x_test, y_te, label)

          Macro Prec Micro Prec Macro Recall Micro Recall  Macro F1  Micro F1
SVC         0.882943   0.766667     0.507042     0.766667  0.447601  0.766667
LinearSVC   0.381667   0.763333          0.5     0.763333  0.432892  0.763333


          Macro Prec Micro Prec Macro Recall Micro Recall  Macro F1  Micro F1
SVC         0.929766       0.86     0.511628         0.86  0.484957      0.86
LinearSVC   0.875856   0.876667      0.57945     0.876667  0.603557  0.876667


          Macro Prec Micro Prec Macro Recall Micro Recall  Macro F1  Micro F1
SVC         0.586202       0.58     0.556897         0.58    0.5275      0.58
LinearSVC   0.594568   0.586667     0.593129     0.586667  0.586207  0.586667


          Macro Prec Micro Prec Macro Recall Micro Recall  Macro F1  Micro F1
SVC         0.934783       0.87       0.5125         0.87  0.489507      0.87
LinearSVC   0.433333   0.866667          0.5     0.866667  0.464286  0.866667


          Macro Prec Micro Prec Macro Recall Micro Recal

## Feature Set 1: Anger, Joy, Sadness, Surprise, Anticipation

In [25]:
# Score every label on feature set 1 (emotion word counts).
label_splits = [
    ('Tech', y_tech_train, y_tech_test),
    ('Political', y_pol_train, y_pol_test),
    ('Business', y_bus_train, y_bus_test),
    ('Entertainment', y_ent_train, y_ent_test),
    ('Miscellaneous', y_misc_train, y_misc_test),
]

for label, y_tr, y_te in label_splits:
    run_data(x_train1, y_tr, x_test1, y_te, label)

          Macro Prec Micro Prec Macro Recall Micro Recall  Macro F1  Micro F1
SVC         0.684746   0.766667      0.51676     0.766667   0.47268  0.766667
LinearSVC   0.719388       0.77     0.523802         0.77  0.485982      0.77


          Macro Prec Micro Prec Macro Recall Micro Recall  Macro F1  Micro F1
SVC          0.93266   0.866667     0.534884     0.866667  0.529116  0.866667
LinearSVC    0.93266   0.866667     0.534884     0.866667  0.529116  0.866667


          Macro Prec Micro Prec Macro Recall Micro Recall  Macro F1  Micro F1
SVC         0.561819   0.563333     0.539318     0.563333  0.504982  0.563333
LinearSVC   0.559829   0.556667     0.527241     0.556667  0.467083  0.556667


          Macro Prec Micro Prec Macro Recall Micro Recall  Macro F1  Micro F1
SVC         0.934783       0.87       0.5125         0.87  0.489507      0.87
LinearSVC   0.433333   0.866667          0.5     0.866667  0.464286  0.866667


          Macro Prec Micro Prec Macro Recall Micro Recal

## Feature Set 2: Positive, Neutral, Negative

In [26]:
# Score every label on feature set 2 (sentiment indicators).
label_splits = [
    ('Tech', y_tech_train, y_tech_test),
    ('Political', y_pol_train, y_pol_test),
    ('Business', y_bus_train, y_bus_test),
    ('Entertainment', y_ent_train, y_ent_test),
    ('Miscellaneous', y_misc_train, y_misc_test),
]

for label, y_tr, y_te in label_splits:
    run_data(x_train2, y_tr, x_test2, y_te, label)

          Macro Prec Micro Prec Macro Recall Micro Recall  Macro F1  Micro F1
SVC         0.381667   0.763333          0.5     0.763333  0.432892  0.763333
LinearSVC   0.381667   0.763333          0.5     0.763333  0.432892  0.763333


          Macro Prec Micro Prec Macro Recall Micro Recall Macro F1  Micro F1
SVC         0.428333   0.856667          0.5     0.856667   0.4614  0.856667
LinearSVC   0.428333   0.856667          0.5     0.856667   0.4614  0.856667


          Macro Prec Micro Prec Macro Recall Micro Recall  Macro F1  Micro F1
SVC         0.592362   0.596667     0.587896     0.596667  0.586518  0.596667
LinearSVC   0.592362   0.596667     0.587896     0.596667  0.586518  0.596667


          Macro Prec Micro Prec Macro Recall Micro Recall  Macro F1  Micro F1
SVC         0.433333   0.866667          0.5     0.866667  0.464286  0.866667
LinearSVC   0.433333   0.866667          0.5     0.866667  0.464286  0.866667


          Macro Prec Micro Prec Macro Recall Micro Recall  

## Feature Set 3: Length, Web Address

In [27]:
# Score every label on feature set 3 (length and web address indicator).
label_splits = [
    ('Tech', y_tech_train, y_tech_test),
    ('Political', y_pol_train, y_pol_test),
    ('Business', y_bus_train, y_bus_test),
    ('Entertainment', y_ent_train, y_ent_test),
    ('Miscellaneous', y_misc_train, y_misc_test),
]

for label, y_tr, y_te in label_splits:
    run_data(x_train3, y_tr, x_test3, y_te, label)

          Macro Prec Micro Prec Macro Recall Micro Recall  Macro F1  Micro F1
SVC         0.882943   0.766667     0.507042     0.766667  0.447601  0.766667
LinearSVC   0.381667   0.763333          0.5     0.763333  0.432892  0.763333


          Macro Prec Micro Prec Macro Recall Micro Recall  Macro F1  Micro F1
SVC         0.929766       0.86     0.511628         0.86  0.484957      0.86
LinearSVC   0.560354   0.653333       0.6137     0.653333  0.541823  0.653333


          Macro Prec Micro Prec Macro Recall Micro Recall  Macro F1 Micro F1
SVC         0.586202       0.58     0.556897         0.58    0.5275     0.58
LinearSVC   0.613665       0.56     0.581857         0.56  0.536169     0.56


          Macro Prec Micro Prec Macro Recall Micro Recall  Macro F1  Micro F1
SVC         0.934783       0.87       0.5125         0.87  0.489507      0.87
LinearSVC   0.433333   0.866667          0.5     0.866667  0.464286  0.866667


          Macro Prec Micro Prec Macro Recall Micro Recall  

## Tech Label Prediction

In [28]:
# Compare the four feature sets (0-3, in order) on the Tech label.
feature_sets = [
    (x_train, x_test),
    (x_train1, x_test1),
    (x_train2, x_test2),
    (x_train3, x_test3),
]

for features_train, features_test in feature_sets:
    run_data(features_train, y_tech_train, features_test, y_tech_test, 'Tech')

          Macro Prec Micro Prec Macro Recall Micro Recall  Macro F1  Micro F1
SVC         0.882943   0.766667     0.507042     0.766667  0.447601  0.766667
LinearSVC   0.381667   0.763333          0.5     0.763333  0.432892  0.763333


          Macro Prec Micro Prec Macro Recall Micro Recall  Macro F1  Micro F1
SVC         0.684746   0.766667      0.51676     0.766667   0.47268  0.766667
LinearSVC   0.719388       0.77     0.523802         0.77  0.485982      0.77


          Macro Prec Micro Prec Macro Recall Micro Recall  Macro F1  Micro F1
SVC         0.381667   0.763333          0.5     0.763333  0.432892  0.763333
LinearSVC   0.381667   0.763333          0.5     0.763333  0.432892  0.763333


          Macro Prec Micro Prec Macro Recall Micro Recall  Macro F1  Micro F1
SVC         0.882943   0.766667     0.507042     0.766667  0.447601  0.766667
LinearSVC   0.381667   0.763333          0.5     0.763333  0.432892  0.763333




## Political Label Prediction

In [29]:
# Compare the four feature sets (0-3, in order) on the Political label.
# The label argument now names the target actually being scored; it used to
# be 'Tech' on every call.
feature_sets = [
    (x_train, x_test),
    (x_train1, x_test1),
    (x_train2, x_test2),
    (x_train3, x_test3),
]

for features_train, features_test in feature_sets:
    run_data(features_train, y_pol_train, features_test, y_pol_test, 'Political')

          Macro Prec Micro Prec Macro Recall Micro Recall  Macro F1  Micro F1
SVC         0.929766       0.86     0.511628         0.86  0.484957      0.86
LinearSVC   0.428333   0.856667          0.5     0.856667    0.4614  0.856667


          Macro Prec Micro Prec Macro Recall Micro Recall  Macro F1  Micro F1
SVC          0.93266   0.866667     0.534884     0.866667  0.529116  0.866667
LinearSVC    0.93266   0.866667     0.534884     0.866667  0.529116  0.866667


          Macro Prec Micro Prec Macro Recall Micro Recall Macro F1  Micro F1
SVC         0.428333   0.856667          0.5     0.856667   0.4614  0.856667
LinearSVC   0.428333   0.856667          0.5     0.856667   0.4614  0.856667


          Macro Prec Micro Prec Macro Recall Micro Recall  Macro F1 Micro F1
SVC         0.929766       0.86     0.511628         0.86  0.484957     0.86
LinearSVC   0.929766       0.86     0.511628         0.86  0.484957     0.86




## Business Label Prediction

In [30]:
# Compare the four feature sets (0-3, in order) on the Business label.
# The label argument now names the target actually being scored; it used to
# be 'Tech' on every call.
feature_sets = [
    (x_train, x_test),
    (x_train1, x_test1),
    (x_train2, x_test2),
    (x_train3, x_test3),
]

for features_train, features_test in feature_sets:
    run_data(features_train, y_bus_train, features_test, y_bus_test, 'Business')

          Macro Prec Micro Prec Macro Recall Micro Recall  Macro F1 Micro F1
SVC         0.586202       0.58     0.556897         0.58    0.5275     0.58
LinearSVC   0.520134       0.54     0.500537         0.54  0.357143     0.54


          Macro Prec Micro Prec Macro Recall Micro Recall  Macro F1  Micro F1
SVC         0.561819   0.563333     0.539318     0.563333  0.504982  0.563333
LinearSVC    0.55728   0.556667     0.528315     0.556667  0.473885  0.556667


          Macro Prec Micro Prec Macro Recall Micro Recall  Macro F1  Micro F1
SVC         0.592362   0.596667     0.587896     0.596667  0.586518  0.596667
LinearSVC   0.592362   0.596667     0.587896     0.596667  0.586518  0.596667


          Macro Prec Micro Prec Macro Recall Micro Recall  Macro F1  Micro F1
SVC         0.586202       0.58     0.556897         0.58    0.5275      0.58
LinearSVC   0.585714   0.576667     0.551664     0.576667  0.514965  0.576667




## Entertainment Label Prediction

In [31]:
# Compare the four feature sets (0-3, in order) on the Entertainment label.
# The label argument now names the target actually being scored; it used to
# be 'Tech' on every call.
feature_sets = [
    (x_train, x_test),
    (x_train1, x_test1),
    (x_train2, x_test2),
    (x_train3, x_test3),
]

for features_train, features_test in feature_sets:
    run_data(features_train, y_ent_train, features_test, y_ent_test, 'Entertainment')

          Macro Prec Micro Prec Macro Recall Micro Recall  Macro F1  Micro F1
SVC         0.934783       0.87       0.5125         0.87  0.489507      0.87
LinearSVC   0.433333   0.866667          0.5     0.866667  0.464286  0.866667


          Macro Prec Micro Prec Macro Recall Micro Recall  Macro F1  Micro F1
SVC         0.934783       0.87       0.5125         0.87  0.489507      0.87
LinearSVC   0.433333   0.866667          0.5     0.866667  0.464286  0.866667


          Macro Prec Micro Prec Macro Recall Micro Recall  Macro F1  Micro F1
SVC         0.433333   0.866667          0.5     0.866667  0.464286  0.866667
LinearSVC   0.433333   0.866667          0.5     0.866667  0.464286  0.866667


          Macro Prec Micro Prec Macro Recall Micro Recall  Macro F1 Micro F1
SVC         0.934783       0.87       0.5125         0.87  0.489507     0.87
LinearSVC   0.470517       0.83     0.489423         0.83   0.47225     0.83




## Misc Label Prediction

In [32]:
# Compare the four feature sets (0-3, in order) on the Misc label.
# The label argument now names the target actually being scored; it used to
# be 'Tech' on every call.
feature_sets = [
    (x_train, x_test),
    (x_train1, x_test1),
    (x_train2, x_test2),
    (x_train3, x_test3),
]

for features_train, features_test in feature_sets:
    run_data(features_train, y_misc_train, features_test, y_misc_test, 'Miscellaneous')

          Macro Prec Micro Prec Macro Recall Micro Recall  Macro F1  Micro F1
SVC         0.423333   0.846667          0.5     0.846667  0.458484  0.846667
LinearSVC   0.423333   0.846667          0.5     0.846667  0.458484  0.846667


          Macro Prec Micro Prec Macro Recall Micro Recall  Macro F1  Micro F1
SVC         0.423077   0.843333     0.498031     0.843333  0.457505  0.843333
LinearSVC   0.423077   0.843333     0.498031     0.843333  0.457505  0.843333


          Macro Prec Micro Prec Macro Recall Micro Recall  Macro F1  Micro F1
SVC         0.423333   0.846667          0.5     0.846667  0.458484  0.846667
LinearSVC   0.423333   0.846667          0.5     0.846667  0.458484  0.846667


          Macro Prec Micro Prec Macro Recall Micro Recall  Macro F1  Micro F1
SVC         0.423333   0.846667          0.5     0.846667  0.458484  0.846667
LinearSVC   0.423333   0.846667          0.5     0.846667  0.458484  0.846667




# Best Model Test Data Labeling

Best model for the tech label appears to be the LinearSVC trained on the emotion based feature set 1

In [33]:
# LinearSVC is seeded so the fitted model, and therefore the exported
# predictions, are the same on every run.
best_tech_mod = LinearSVC(random_state=47)
best_tech_mod.fit(x_train1, y_tech_train)
tech_preds = best_tech_mod.predict(x_test1)

Best model for the political label appears to be the SVC trained on complete feature set

In [34]:
# Political: SVC fitted on the complete feature set (fit returns the estimator itself).
best_pol_mod = SVC().fit(x_train, y_pol_train)
pol_preds = best_pol_mod.predict(x_test)

Best model for the business label appears to be LinearSVC on the complete feature set

In [35]:
# LinearSVC is seeded so the fitted model, and therefore the exported
# predictions, are the same on every run.
best_bus_mod = LinearSVC(random_state=47)
best_bus_mod.fit(x_train, y_bus_train)
bus_preds = best_bus_mod.predict(x_test)

Best model for the entertainment label appears to be the SVC() on feature set 3

In [36]:
# SVC is the model selected for the entertainment label on feature set 3;
# this cell previously fitted LinearSVC instead.
best_ent_mod = SVC()
best_ent_mod.fit(x_train3, y_ent_train)
ent_preds = best_ent_mod.predict(x_test3)

Best model for the miscellaneous label appears to be LinearSVC on feature set 3

In [37]:
# LinearSVC is seeded so the fitted model, and therefore the exported
# predictions, are the same on every run.
best_misc_mod = LinearSVC(random_state=47)
best_misc_mod.fit(x_train3, y_misc_train)
misc_preds = best_misc_mod.predict(x_test3)

# Output

Test text and ground truth labels alongside the best performing model for each label

In [38]:
# Start the output table from a copy of the test text so x_test_text itself is left untouched.
out_data = x_test_text.copy()
out_data

Unnamed: 0,Text_Sample
530,"At the moment, it looks like NEISD is trying t..."
926,Google RV rental.\n\nYour best bet may be to p...
586,Ompomp is on the right track. Many classes wil...
25,Most computer repair involves replacing compon...
332,Facebook page or website? Might drop by
...,...
783,Make sure ALL potential running water courses ...
592,Best picture I got from my phone using welding...
664,I think the coolest part of LEGOs is that you ...
709,Why arenâ€™t people more outraged about this


In [39]:
# Ground-truth columns first, then the matching prediction columns.
truth_cols = ['Tech Truth', 'Political Truth', 'Business Truth', 'Entertainment Truth', 'Misc Truth']
pred_cols = ['Tech Pred', 'Political Pred', 'Business Pred', 'Entertainment Pred', 'Misc Pred']
cols = truth_cols + pred_cols

# Add each column as an empty placeholder.
for name in cols:
    out_data[name] = None

out_data

Unnamed: 0,Text_Sample,Tech Truth,Political Truth,Business Truth,Entertainment Truth,Misc Truth,Tech Pred,Political Pred,Business Pred,Entertainment Pred,Misc Pred
530,"At the moment, it looks like NEISD is trying t...",,,,,,,,,,
926,Google RV rental.\n\nYour best bet may be to p...,,,,,,,,,,
586,Ompomp is on the right track. Many classes wil...,,,,,,,,,,
25,Most computer repair involves replacing compon...,,,,,,,,,,
332,Facebook page or website? Might drop by,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...
783,Make sure ALL potential running water courses ...,,,,,,,,,,
592,Best picture I got from my phone using welding...,,,,,,,,,,
664,I think the coolest part of LEGOs is that you ...,,,,,,,,,,
709,Why arenâ€™t people more outraged about this,,,,,,,,,,


In [40]:
# Renumber the rows 0..n-1 and discard the old index labels.
out_data = out_data.reset_index(drop=True)

In [41]:
# Updates 'None' values with appropriate 'Yes' or 'No' based on various sources

# For each topic: (ground truth labels for test samples, prediction labels for test samples)
label_sources = {
    'Tech': (y_tech_test, tech_preds),
    'Political': (y_pol_test, pol_preds),
    'Business': (y_bus_test, bus_preds),
    'Entertainment': (y_ent_test, ent_preds),
    'Misc': (y_misc_test, misc_preds),
}

for topic, (truth, preds) in label_sources.items():
    # np.ravel flattens the (n, 1) truth arrays so they line up with the rows.
    # Whole-column assignment replaces the chained indexing
    # (out_data[col][i] = ...), which pandas does not guarantee to write back.
    out_data[topic + ' Truth'] = np.where(np.ravel(truth) == 1, 'Yes', 'No')
    out_data[topic + ' Pred'] = np.where(np.ravel(preds) == 1, 'Yes', 'No')

out_data

Unnamed: 0,Text_Sample,Tech Truth,Political Truth,Business Truth,Entertainment Truth,Misc Truth,Tech Pred,Political Pred,Business Pred,Entertainment Pred,Misc Pred
0,"At the moment, it looks like NEISD is trying t...",Yes,No,No,No,No,Yes,No,Yes,No,No
1,Google RV rental.\n\nYour best bet may be to p...,Yes,No,Yes,No,No,Yes,No,No,No,No
2,Ompomp is on the right track. Many classes wil...,Yes,No,No,No,Yes,Yes,No,Yes,No,No
3,Most computer repair involves replacing compon...,Yes,No,Yes,No,No,Yes,No,No,No,No
4,Facebook page or website? Might drop by,Yes,No,No,No,No,Yes,No,No,No,No
...,...,...,...,...,...,...,...,...,...,...,...
295,Make sure ALL potential running water courses ...,Yes,No,No,No,Yes,Yes,No,No,No,No
296,Best picture I got from my phone using welding...,Yes,No,No,No,Yes,Yes,No,No,No,No
297,I think the coolest part of LEGOs is that you ...,Yes,No,No,Yes,No,Yes,No,Yes,No,No
298,Why arenâ€™t people more outraged about this,No,No,No,No,Yes,Yes,No,No,No,No


In [42]:
#out_data.to_csv('RedditMiners_PredictionsVsTest.csv')