In [66]:
import pandas as pd
import numpy as np

import nltk
# nltk.download('stopwords')
# nltk.download('punkt_tab')

from nltk.corpus import stopwords
from gensim.corpora import Dictionary
from gensim.models import TfidfModel

from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer

import json


# Data Modeling 
# pick two topics to compare
# develop a way to identify which topic a post is talking about 
# check occurences of different topics of conversation over time
# could these different conversation topics appear during a game period more often?


In [67]:
# text2tokens, gen_bow, porterstemmer

ps = PorterStemmer()

def text2tokens(text):
	stop_words = set(stopwords.words('english'))
	text = str(text)
	text = text.lower()
	textList = word_tokenize(text)
	textList = [word for word in textList if word not in stop_words and len(word) >= 3]

	textList = [ps.stem(word) for word in textList]
	return textList
    
def gen_bow(df, column):
	df['tokens'] = df[column].apply(text2tokens)
	dct = Dictionary(df['tokens'])
	dct.filter_extremes(no_below=5, no_above=0.5)
	df['bow'] = df['tokens'].apply(dct.doc2bow)
	worddict = {}

	for i in range(len(df)):
		for token in df.at[i,'bow']:
			if token in list(worddict.keys()):
				worddict[token] += 1
			else: 
				worddict[token] = 1

	df.drop('tokens', axis=1, inplace=True)	
	return df, worddict



In [68]:
gdf = pd.read_csv('hw2_step1_games.csv')
rdf = pd.read_csv('hw2_step1_reddit_posts.csv')
posNegWords = pd.read_excel('posNegList.xlsx').dropna()
# print(posNegWords.dtypes)


posWords = [ps.stem(word) for word in posNegWords['Positive Sense Word List'].to_list()]
negWords = [ps.stem(word) for word in posNegWords['Negative Sense Word List'].to_list()]



def get_sentiment(list):
	sentiment = 0
	
	for word in list:
		if word in posWords:
			sentiment += 1
		elif word in negWords:
			sentiment -= 1
	
	return sentiment
			


# rdf, word_dict = gen_bow(rdf, 'text')
rdf['tokens'] = rdf['text'].apply(text2tokens)
# rdf['sentiment'] = rdf['tokens'].apply(get_sentiment)

sent_array = np.array(rdf['tokens'].apply(get_sentiment))
max_sent = sent_array.max()
min_sent = sent_array.min()

rdf['sentiment'] = (sent_array - min_sent) / (max_sent - min_sent)

display(rdf.sort_values(by='sentiment'))


Unnamed: 0,text,time_of_post,game,team,tokens,sentiment
3183,SHIT PISS AND CUM SHIT PISS AND CUM SHIT PISS ...,2019-10-20,[],BOS,"[shit, piss, cum, shit, piss, cum, shit, piss,...",0.000000
3166,"Congrats, Houston!\n\nFUCK THE YANKEES\n\nFUCK...",2019-10-20,[],BOS,"[congrat, houston, fuck, yanke, fuck, yanke, f...",0.152416
1487,FUCK CHRIS SALE AND HIS GOOFY ASS DELIVERY\n\n...,2019-09-10,['401076896'],NYY,"[fuck, chri, sale, goofi, ass, deliveri, fuck,...",0.226766
912,"Fuck you Kluber, Fuck you Bauer, Fuck you Jose...",2017-10-11,[],NYY,"[fuck, kluber, fuck, bauer, fuck, jose, ramire...",0.234201
2459,Fuck this fucking bullshit\n\nDavid price has ...,2018-09-19,['380919110'],NYY,"[fuck, fuck, bullshit, david, price, domin, ya...",0.234201
...,...,...,...,...,...,...
2742,David Ortiz changed my life and I'll always ca...,2016-10-12,[],BOS,"[david, ortiz, chang, life, 'll, alway, carri,...",0.639405
5946,Is...this post in the right place? Have I mis...,2016-04-26,['360426114'],TOR,"[..., post, right, place, miss, someth, know, ...",0.639405
2478,Gary Sanchez has had one of the most up and do...,2021-06-24,['401228171'],NYY,"[gari, sanchez, one, career, athlet, last, thi...",0.776952
3291,"Excuse the throwaway (for anonymity), but I pr...",2018-10-19,['381018118'],BOS,"[excus, throwaway, anonym, promis, say, true, ...",0.914498


In [75]:
gdf = gdf.drop_duplicates()

for idx, row in gdf.iterrows():
    diff = row['home-score'] - row['away-score']
    if diff > 0:
        gdf.at[idx, 'winner'] = row['home']
    elif diff < 0:
        gdf.at[idx, 'winner'] = row['away']
    else:
        gdf.at[idx, 'winner'] = 'N/A'

bal_games = gdf.loc[(gdf['home'] == 'BAL') | (gdf['away'] == 'BAL')]
nyy_games = gdf.loc[(gdf['home'] == 'NYY') | (gdf['away'] == 'NYY')]
bos_games = gdf.loc[(gdf['home'] == 'BOS') | (gdf['away'] == 'BOS')]
tor_games = gdf.loc[(gdf['home'] == 'TOR') | (gdf['away'] == 'TOR')]
tb_games = gdf.loc[(gdf['home'] == 'TB') | (gdf['away'] == 'TB')]

team_sdfs = {
    "BAL": bal_games,
    "NYY": nyy_games,
    "BOS": bos_games,
    "TOR": tor_games,
    "TB": tb_games
}


def game_outcome(game_ids, team):
    win_diff = 0
    team_games = team_sdfs[team]
    games = []
    for idx, row in team_games.iterrows():
        game = str(row['game'])
        if game in game_ids:
            # display(row)
            games.append(row)

    for game in games:
        if row['winner'] == team:
            win_diff +=1
        elif row['winner'] != 'N/A':
            win_diff -= 1
    return win_diff




for idx, row in rdf.iterrows():
    team = row['team']
    # print(team)
    game_ids = row['game']
    # print(game_ids)

    if len(game_ids) == 0:
        rdf.at[idx, 'team_performance'] = 0
    else:
        outcome = game_outcome(game_ids, team)
        rdf.at[idx, 'team_performance'] = outcome



display(rdf)

Unnamed: 0,text,time_of_post,game,team,tokens,sentiment,team_performance
1170,THESE VAGABOND SHOESSSSSSS,2017-10-17,"['371016110', '371017110']",NYY,"[vagabond, shoesssssss]",0.256506,-2.0
844,Can we get candle flairs up in here,2020-09-05,"['401234674', '401226263']",NYY,"[get, candl, flair]",0.252788,-2.0
843,I drunk bought a pair of boosts tonight after ...,2020-09-05,"['401234674', '401226263']",NYY,"[drunk, bought, pair, boost, tonight, game, mo...",0.267658,-2.0
842,"Sorry, but I can't code in the scent of candle...",2020-09-05,"['401234674', '401226263']",NYY,"[sorri, n't, code, scent, candl, css]",0.260223,-2.0
841,🕯🌸Pinstripe Peonies🌸🕯,2020-09-05,"['401234674', '401226263']",NYY,"[🕯🌸pinstrip, peonies🌸🕯]",0.256506,-2.0
...,...,...,...,...,...,...,...
1499,#DAAAAAAAA JANKEES WIN!,2019-09-09,"['401076871', '401076886']",NYY,"[daaaaaaaa, janke, win]",0.260223,-2.0
1498,FAWK THOSE GUYS!!! #FEELSGOODMAN!!!,2019-09-09,"['401076871', '401076886']",NYY,"[fawk, guy, feelsgoodman]",0.256506,-2.0
1497,You love to see it,2019-09-09,"['401076871', '401076886']",NYY,"[love, see]",0.260223,-2.0
1494,Fuck Boston!,2019-09-09,"['401076871', '401076886']",NYY,"[fuck, boston]",0.252788,-2.0


In [76]:
rdf = rdf.sort_values(by='team_performance')
gdf.to_csv('hw2_step2_games.csv')
rdf.to_csv('hw2_step2_reddit_posts.csv')