## Text Pre-processing

In [9]:
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt 
import seaborn as sns
import re
import time
import string
import warnings
import glob
import os

# for all NLP related operations on text
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from nltk.stem import WordNetLemmatizer
from nltk.stem.porter import *
from nltk.classify import NaiveBayesClassifier
from wordcloud import WordCloud

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, confusion_matrix, accuracy_score
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB

# To mock web-browser and scrap tweets
from selenium import webdriver
from selenium.webdriver.common.keys import Keys

# To consume Twitter's API - not needed
#import tweepy
#from tweepy import OAuthHandler 

# To identify the sentiment of text
from textblob import TextBlob
from textblob.sentiments import NaiveBayesAnalyzer
from textblob.np_extractors import ConllExtractor

# ignoring all the warnings
warnings.filterwarnings("ignore", category=DeprecationWarning)

# downloading stopwords corpus
nltk.download('vader_lexicon')
nltk.download('averaged_perceptron_tagger')
nltk.download('movie_reviews')
nltk.download('punkt')
nltk.download('conll2000')
nltk.download('brown')
nltk.download('stopwords')
stopwords = set(stopwords.words("english"))

# for showing all the plots inline
%matplotlib inline

ModuleNotFoundError: No module named 'tweepy'

## Merge all .csv files

In [2]:
from pathlib import Path
path = 'raw data'
file_list = os.listdir('raw data')

full_path = []

for f in file_list:
    full_path.append(os.path.join(path,f))
    
full_path

["raw data/women's six nations_2022-01-01_2022-12-31.csv",
 'raw data/.DS_Store',
 'raw data/World Cup_2022-01-01_2022-12-31.csv',
 'raw data/Untitled.ipynb',
 'raw data/Super Bowl_2022-01-01_2022-12-31.csv',
 "raw data/Women's EURO_2022-01-01_2022-12-31.csv",
 'raw data/Winter Paralympics_2022-05-12_2022-12-31.csv',
 'raw data/Rugby World Cup_2022-01-01_2022-12-31.csv',
 'raw data/Rainbow Laces_2022-01-01_2022-12-31.csv',
 'raw data/The Open_2022-01-01_2022-12-31.csv',
 'raw data/EFL League_2022-01-01_2022-12-31.csv',
 'raw data/.ipynb_checkpoints',
 'raw data/EFL Play-Off_2022-01-01_2022-12-31.csv',
 'raw data/World Athletics Championships_2022-01-01_2022-12-31.csv',
 'raw data/Invictus Games_2022-01-01_2022-12-31.csv']

In [3]:
excl_list = []
for file in full_path:
    if file.endswith('.csv'):
        df = pd.read_csv(file)
        df['source'] = file
        excl_list.append(df)
        

# concatenate all DataFrames in the list
# into a single DataFrame, returns new DataFrame.
excl_merged = pd.concat(excl_list, ignore_index=True)
 
# exports the dataframe into excel file
# with specified name.
excl_merged.to_csv('combine_test.csv', index=False)

In [4]:
tweets_df = pd.read_csv('combine_test.csv')

# Get Event Name
tweets_df['source'] = tweets_df['source'].replace(regex=r"\_.*",value="").str.replace('raw data/','')
tweets_df['source'].unique()

array(["women's six nations", 'World Cup', 'Super Bowl', "Women's EURO",
       'Winter Paralympics', 'Rugby World Cup', 'Rainbow Laces',
       'The Open', 'EFL League', 'EFL Play-Off',
       'World Athletics Championships', 'Invictus Games'], dtype=object)

## Data Cleaning

In [5]:
tweets_df['tidy_tweets'] = tweets_df['Embedded_text'].str.replace("[^a-zA-Z# ]", "")

  tweets_df['tidy_tweets'] = tweets_df['Embedded_text'].str.replace("[^a-zA-Z# ]", "")


In [6]:
tweets_df = tweets_df[tweets_df['tidy_tweets']!='']
tweets_df.head(2)

Unnamed: 0,UserScreenName,UserName,Timestamp,Text,Embedded_text,Emojis,Comments,Likes,Retweets,Image link,Tweet URL,source,tidy_tweets
0,dianne pugh,@dpugh01,2022-01-01T23:49:03.000Z,"dianne pugh\r\n@dpugh01\r\n·\r\nJan 2, 2022",What a year it's been for England Rugby. Women...,🏆 🏟 🙌 💙,,,,[],https://twitter.com/dpugh01/status/14774266770...,women's six nations,What a year its been for England Rugby Womens ...
1,Ana Sanz,@AnaSanz73755440,2022-01-02T09:25:05.000Z,"Ana Sanz\r\n@AnaSanz73755440\r\n·\r\nJan 2, 2022",What a year it's been for England Rugby. Women...,🏆 🏟 🙌 💙,,,,[],https://twitter.com/AnaSanz73755440/status/147...,women's six nations,What a year its been for England Rugby Womens ...


In [7]:
tweets_df = tweets_df.astype(object).replace(np.nan, '0')
## tweets_df.to_csv('cleaned.csv')

In [8]:
def remove_pattern(text, pattern_regex):
    r = re.findall(pattern_regex, text)
    for i in r:
        text = re.sub(i, '', text)
    return text 

In [9]:
tweets_df['tidy_tweets'] = np.vectorize(remove_pattern)(tweets_df['tidy_tweets'], "@[\w]*: | *RT*")
tweets_df.tail(2)

Unnamed: 0,UserScreenName,UserName,Timestamp,Text,Embedded_text,Emojis,Comments,Likes,Retweets,Image link,Tweet URL,source,tidy_tweets
28106,Grey_Potato,@KimChikanga,2022-12-30T18:15:16.000Z,"Grey_Potato\n@KimChikanga\n·\nDec 31, 2022",Meghan Markle introducing Prince Harry at the ...,🥔,0,0,1,[],https://twitter.com/KimChikanga/status/1608889...,Invictus Games,Meghan Markle introducing Prince Harry at the ...
28107,Portia Brazen Hussy you say?,@poolton_portia,2022-12-30T19:58:35.000Z,Portia Brazen Hussy you say?\n@poolton_portia\...,Looking forward to next yr. #InvictusGames\nIn...,🐼 🐘 💛 🖤 💜,0,0,1,['https://pbs.twimg.com/profile_images/1582758...,https://twitter.com/poolton_portia/status/1608...,Invictus Games,Looking forward to next yr #InvictusGamesInvic...


In [10]:
cleaned_tweets = []

for index, row in tweets_df.iterrows():
    # Here we are filtering out all the words that contains link
    words_without_links = [word for word in row.tidy_tweets.split() if 'http' not in word]
    cleaned_tweets.append(' '.join(words_without_links))

tweets_df['tidy_tweets'] = cleaned_tweets

In [11]:
tweets_df = tweets_df.drop_duplicates(subset=['tidy_tweets'], keep=False)

In [12]:
tweets_df = tweets_df.reset_index(drop=True)
tweets_df.head(2)

Unnamed: 0,UserScreenName,UserName,Timestamp,Text,Embedded_text,Emojis,Comments,Likes,Retweets,Image link,Tweet URL,source,tidy_tweets
0,Virgin Media Sport,@VMSportIE,2022-01-04T09:30:59.000Z,Virgin Media Sport\r\n@VMSportIE\r\n·\r\nJan 4...,Men's Guinness Six Nations\r\n Women's Six Na...,▪ ▪ ▪ ▪ ▪ ▪ ▪ ▪ ▪,0,3.0,8,['https://pbs.twimg.com/media/FIP3uZCXEAARS4w....,https://twitter.com/VMSportIE/status/147829790...,women's six nations,Mens Guinness Six Nations Womens Six Nations U...
1,Sport For Women,@SportForWomen,2022-01-04T20:10:38.000Z,Sport For Women\r\n@SportForWomen\r\n·\r\nJan ...,Your guide to the Women's Six Nations 2022 htt...,0,0,6.0,1,[],https://twitter.com/SportForWomen/status/14784...,women's six nations,Your guide to the Womens Six Nations via their...


In [13]:
stopwords_set = set(stopwords)
cleaned_tweets = []

for index, row in tweets_df.iterrows():
    
    # filerting out all the stopwords 
    words_without_stopwords = [word for word in row.tidy_tweets.split() if not word in stopwords_set and '#' not in word.lower()]
    
    # finally creating tweets list of tuples containing stopwords(list) and sentimentType 
    cleaned_tweets.append(' '.join(words_without_stopwords))
    
tweets_df['tidy_tweets'] = cleaned_tweets
tweets_df.head(2)

Unnamed: 0,UserScreenName,UserName,Timestamp,Text,Embedded_text,Emojis,Comments,Likes,Retweets,Image link,Tweet URL,source,tidy_tweets
0,Virgin Media Sport,@VMSportIE,2022-01-04T09:30:59.000Z,Virgin Media Sport\r\n@VMSportIE\r\n·\r\nJan 4...,Men's Guinness Six Nations\r\n Women's Six Na...,▪ ▪ ▪ ▪ ▪ ▪ ▪ ▪ ▪,0,3.0,8,['https://pbs.twimg.com/media/FIP3uZCXEAARS4w....,https://twitter.com/VMSportIE/status/147829790...,women's six nations,Mens Guinness Six Nations Womens Six Nations U...
1,Sport For Women,@SportForWomen,2022-01-04T20:10:38.000Z,Sport For Women\r\n@SportForWomen\r\n·\r\nJan ...,Your guide to the Women's Six Nations 2022 htt...,0,0,6.0,1,[],https://twitter.com/SportForWomen/status/14784...,women's six nations,Your guide Womens Six Nations via theirishpost...


In [14]:
tokenized_tweet = tweets_df['tidy_tweets'].apply(lambda x: x.split())
tokenized_tweet.head()

0    [Mens, Guinness, Six, Nations, Womens, Six, Na...
1    [Your, guide, Womens, Six, Nations, via, their...
2    [Irelands, Six, Nations, Fixtures, For, Men, W...
3    [Fighting, Stigma, Irelands, Six, Nations, Fix...
4    [Irelands, Six, Nations, Fixtures, For, Men, W...
Name: tidy_tweets, dtype: object

In [15]:
word_lemmatizer = WordNetLemmatizer()

tokenized_tweet = tokenized_tweet.apply(lambda x: [word_lemmatizer.lemmatize(i) for i in x])
tokenized_tweet.head()

0    [Mens, Guinness, Six, Nations, Womens, Six, Na...
1    [Your, guide, Womens, Six, Nations, via, their...
2    [Irelands, Six, Nations, Fixtures, For, Men, W...
3    [Fighting, Stigma, Irelands, Six, Nations, Fix...
4    [Irelands, Six, Nations, Fixtures, For, Men, W...
Name: tidy_tweets, dtype: object

In [16]:
for i, tokens in enumerate(tokenized_tweet):
    tokenized_tweet[i] = ' '.join(tokens)

tweets_df['tidy_tweets'] = tokenized_tweet

In [17]:
tweets_df['Comments']

0          0
1          0
2          0
3          0
4          0
        ... 
25448    2.0
25449    8.0
25450      0
25451      0
25452      0
Name: Comments, Length: 25453, dtype: object

In [18]:
tweets_df['Timestamp'] = tweets_df['Timestamp'].astype('datetime64[ns]')
tweets_df['Likes'] = tweets_df['Likes'].str.replace(',','').str.replace('K','').astype('float')
tweets_df['Comments'] = tweets_df['Comments'].str.replace(',','').str.replace('K','').astype('float')
tweets_df['Retweets'] = tweets_df['Retweets'].str.replace(',','').str.replace('K','').astype('float')
tweets_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 25453 entries, 0 to 25452
Data columns (total 13 columns):
 #   Column          Non-Null Count  Dtype         
---  ------          --------------  -----         
 0   UserScreenName  25453 non-null  object        
 1   UserName        25453 non-null  object        
 2   Timestamp       25453 non-null  datetime64[ns]
 3   Text            25453 non-null  object        
 4   Embedded_text   25453 non-null  object        
 5   Emojis          25453 non-null  object        
 6   Comments        25453 non-null  float64       
 7   Likes           25453 non-null  float64       
 8   Retweets        25453 non-null  float64       
 9   Image link      25453 non-null  object        
 10  Tweet URL       25453 non-null  object        
 11  source          25453 non-null  object        
 12  tidy_tweets     25453 non-null  object        
dtypes: datetime64[ns](1), float64(3), object(9)
memory usage: 2.5+ MB


## Get Sentiment Score
reference for the defination of SIA score: https://analyticsindiamag.com/sentiment-analysis-made-easy-using-vader/

In [19]:
def fetch_sentiment_using_SIA(text):
    sid = SentimentIntensityAnalyzer()
    polarity_scores = sid.polarity_scores(text)
    return 'neg' if polarity_scores['neg'] > polarity_scores['pos'] else 'pos'

def fetch_sentiment_score(text):
    sid = SentimentIntensityAnalyzer()
    polarity_scores = sid.polarity_scores(text)
    return polarity_scores

In [20]:
sentiments_using_SIA = tweets_df.tidy_tweets.apply(lambda tweet: fetch_sentiment_using_SIA(tweet))
tweets_df['sentiment'] = sentiments_using_SIA
pd.DataFrame(sentiments_using_SIA.value_counts())

Unnamed: 0,tidy_tweets
pos,21746
neg,3707


In [21]:
tweets_df['sentiment_score'] = tweets_df.tidy_tweets.apply(lambda tweet: fetch_sentiment_score(tweet))

In [22]:
tweets_df["sentiment_score"].apply(pd.Series)
tweets_df.head(2)

Unnamed: 0,UserScreenName,UserName,Timestamp,Text,Embedded_text,Emojis,Comments,Likes,Retweets,Image link,Tweet URL,source,tidy_tweets,sentiment,sentiment_score
0,Virgin Media Sport,@VMSportIE,2022-01-04 09:30:59,Virgin Media Sport\r\n@VMSportIE\r\n·\r\nJan 4...,Men's Guinness Six Nations\r\n Women's Six Na...,▪ ▪ ▪ ▪ ▪ ▪ ▪ ▪ ▪,0.0,3.0,8.0,['https://pbs.twimg.com/media/FIP3uZCXEAARS4w....,https://twitter.com/VMSportIE/status/147829790...,women's six nations,Mens Guinness Six Nations Womens Six Nations U...,pos,"{'neg': 0.0, 'neu': 0.779, 'pos': 0.221, 'comp..."
1,Sport For Women,@SportForWomen,2022-01-04 20:10:38,Sport For Women\r\n@SportForWomen\r\n·\r\nJan ...,Your guide to the Women's Six Nations 2022 htt...,0,0.0,6.0,1.0,[],https://twitter.com/SportForWomen/status/14784...,women's six nations,Your guide Womens Six Nations via theirishpost...,pos,"{'neg': 0.0, 'neu': 1.0, 'pos': 0.0, 'compound..."


In [23]:
tweets_df.head(2)

Unnamed: 0,UserScreenName,UserName,Timestamp,Text,Embedded_text,Emojis,Comments,Likes,Retweets,Image link,Tweet URL,source,tidy_tweets,sentiment,sentiment_score
0,Virgin Media Sport,@VMSportIE,2022-01-04 09:30:59,Virgin Media Sport\r\n@VMSportIE\r\n·\r\nJan 4...,Men's Guinness Six Nations\r\n Women's Six Na...,▪ ▪ ▪ ▪ ▪ ▪ ▪ ▪ ▪,0.0,3.0,8.0,['https://pbs.twimg.com/media/FIP3uZCXEAARS4w....,https://twitter.com/VMSportIE/status/147829790...,women's six nations,Mens Guinness Six Nations Womens Six Nations U...,pos,"{'neg': 0.0, 'neu': 0.779, 'pos': 0.221, 'comp..."
1,Sport For Women,@SportForWomen,2022-01-04 20:10:38,Sport For Women\r\n@SportForWomen\r\n·\r\nJan ...,Your guide to the Women's Six Nations 2022 htt...,0,0.0,6.0,1.0,[],https://twitter.com/SportForWomen/status/14784...,women's six nations,Your guide Womens Six Nations via theirishpost...,pos,"{'neg': 0.0, 'neu': 1.0, 'pos': 0.0, 'compound..."


In [24]:
tweets_df.head(2)

Unnamed: 0,UserScreenName,UserName,Timestamp,Text,Embedded_text,Emojis,Comments,Likes,Retweets,Image link,Tweet URL,source,tidy_tweets,sentiment,sentiment_score
0,Virgin Media Sport,@VMSportIE,2022-01-04 09:30:59,Virgin Media Sport\r\n@VMSportIE\r\n·\r\nJan 4...,Men's Guinness Six Nations\r\n Women's Six Na...,▪ ▪ ▪ ▪ ▪ ▪ ▪ ▪ ▪,0.0,3.0,8.0,['https://pbs.twimg.com/media/FIP3uZCXEAARS4w....,https://twitter.com/VMSportIE/status/147829790...,women's six nations,Mens Guinness Six Nations Womens Six Nations U...,pos,"{'neg': 0.0, 'neu': 0.779, 'pos': 0.221, 'comp..."
1,Sport For Women,@SportForWomen,2022-01-04 20:10:38,Sport For Women\r\n@SportForWomen\r\n·\r\nJan ...,Your guide to the Women's Six Nations 2022 htt...,0,0.0,6.0,1.0,[],https://twitter.com/SportForWomen/status/14784...,women's six nations,Your guide Womens Six Nations via theirishpost...,pos,"{'neg': 0.0, 'neu': 1.0, 'pos': 0.0, 'compound..."


In [55]:
tweets_df = pd.concat([tweets_df, tweets_df["sentiment_score"].apply(pd.Series)], axis=1)
tweets_df = tweets_df.drop(columns="sentiment_score")

KeyError: 'sentiment_score'

In [32]:
tweets_df['sen_pos'] = 1
tweets_df.loc[tweets_df['sentiment'] == 'neg', "sen_pos"] = 0
tweets_df.head(2)

Unnamed: 0,UserScreenName,UserName,Timestamp,Text,Embedded_text,Emojis,Comments,Likes,Retweets,Image link,Tweet URL,source,tidy_tweets,sentiment,neg,neu,pos,compound,sen_pos
0,Virgin Media Sport,@VMSportIE,2022-01-04 09:30:59,Virgin Media Sport\r\n@VMSportIE\r\n·\r\nJan 4...,Men's Guinness Six Nations\r\n Women's Six Na...,▪ ▪ ▪ ▪ ▪ ▪ ▪ ▪ ▪,0.0,3.0,8.0,['https://pbs.twimg.com/media/FIP3uZCXEAARS4w....,https://twitter.com/VMSportIE/status/147829790...,women's six nations,Mens Guinness Six Nations Womens Six Nations U...,pos,0.0,0.779,0.221,0.7783,1
1,Sport For Women,@SportForWomen,2022-01-04 20:10:38,Sport For Women\r\n@SportForWomen\r\n·\r\nJan ...,Your guide to the Women's Six Nations 2022 htt...,0,0.0,6.0,1.0,[],https://twitter.com/SportForWomen/status/14784...,women's six nations,Your guide Womens Six Nations via theirishpost...,pos,0.0,1.0,0.0,0.0,1


## Pivot Table

In [33]:
tweets_df.to_csv('sentiment_analysis_test.csv')
tweets_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 25453 entries, 0 to 25452
Data columns (total 19 columns):
 #   Column          Non-Null Count  Dtype         
---  ------          --------------  -----         
 0   UserScreenName  25453 non-null  object        
 1   UserName        25453 non-null  object        
 2   Timestamp       25453 non-null  datetime64[ns]
 3   Text            25453 non-null  object        
 4   Embedded_text   25453 non-null  object        
 5   Emojis          25453 non-null  object        
 6   Comments        25453 non-null  float64       
 7   Likes           25453 non-null  float64       
 8   Retweets        25453 non-null  float64       
 9   Image link      25453 non-null  object        
 10  Tweet URL       25453 non-null  object        
 11  source          25453 non-null  object        
 12  tidy_tweets     25453 non-null  object        
 13  sentiment       25453 non-null  object        
 14  neg             25453 non-null  float64       
 15  ne

In [45]:
dfp = tweets_df.pivot_table(index = ['source'], 
                      values = ['tidy_tweets','Likes','Retweets','Comments','compound','neg','neu','pos','sen_pos'],
                      aggfunc = {'tidy_tweets':'count',
                                  'Likes':sum,
                                 'Retweets':sum,
                                 'Comments':sum,
                                 'compound':np.mean,
                                 'neg':np.mean,
                                 'neu':np.mean,
                                 'pos':np.mean,
                           "sen_pos":sum})
dfp['compound'] = dfp['compound'].round(2)
dfp['neu'] = dfp['neu'].round(2)
dfp['pos'] = dfp['pos'].round(2)
dfp['neg'] = dfp['neg'].round(2)

In [46]:
dfp['pos %'] = (dfp['sen_pos']/dfp['tidy_tweets']).round(2)
dfp['comments/tweet'] = (dfp['Comments']/dfp['tidy_tweets']).round(1)
dfp['Like/tweet'] = (dfp['Likes']/dfp['tidy_tweets']).round(1)
dfp['Retweets/tweet'] = (dfp['Retweets']/dfp['tidy_tweets']).round(1)

In [52]:
dfp.columns = [x.capitalize() for x in dfp.columns]
dfp.sort_values('Tidy_tweets',ascending = False)

Unnamed: 0_level_0,Comments,Likes,Retweets,Compound,Neg,Neu,Pos,Sen_pos,Tidy_tweets,Pos %,Comments/tweet,Like/tweet,Retweets/tweet
source,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
World Cup,137538.0,672487.5,2431346.8,0.27,0.06,0.78,0.16,5128,6294,0.81,21.9,106.8,386.3
Invictus Games,23864.0,123539.0,561015.6,0.26,0.05,0.8,0.15,3770,4503,0.84,5.3,27.4,124.6
EFL League,11018.0,39075.0,279646.7,0.26,0.04,0.82,0.14,3574,4199,0.85,2.6,9.3,66.6
Women's EURO,7057.0,38159.0,212308.7,0.34,0.03,0.81,0.16,3536,3989,0.89,1.8,9.6,53.2
Rainbow Laces,17337.0,17665.0,120921.0,0.37,0.04,0.76,0.2,2227,2524,0.88,6.9,7.0,47.9
women's six nations,1125.0,3853.0,24683.0,0.32,0.03,0.83,0.14,1331,1492,0.89,0.8,2.6,16.5
EFL Play-Off,8129.0,10401.0,47262.0,0.38,0.03,0.79,0.18,722,801,0.9,10.1,13.0,59.0
The Open,42345.0,249248.1,722767.7,0.09,0.08,0.79,0.12,272,365,0.75,116.0,682.9,1980.2
Rugby World Cup,2701.0,12315.0,61932.9,0.28,0.03,0.82,0.15,317,363,0.87,7.4,33.9,170.6
Super Bowl,43034.0,243891.2,734563.1,0.58,0.05,0.58,0.37,339,359,0.94,119.9,679.4,2046.1


In [61]:
df2 = pd.read_csv('sentiment_analysis_score.csv')
df2 = df2.drop(columns = 'Sentiment')
df2

Unnamed: 0,source,Comments,Likes,Retweets,Compound,Tidy_tweets
0,EFL League,0.01953,0.011268,0.029699,0.26,0.655767
1,EFL Play-Off,0.082454,0.016668,0.025976,0.38,0.097437
2,Invictus Games,0.041905,0.037879,0.058123,0.26,0.705718
3,Rainbow Laces,0.055028,0.007881,0.020537,0.37,0.380546
4,Rugby World Cup,0.05981,0.047407,0.080683,0.28,0.025468
5,Super Bowl,1.0,0.994849,1.0,0.58,0.024811
6,The Open,0.967737,1.0,0.967673,0.09,0.025797
7,Winter Paralympics,0.0,0.0,0.0,0.34,0.0
8,Women's EURO,0.012382,0.01165,0.023143,0.34,0.621262
9,World Athletics Championships,0.029604,0.042985,0.063578,0.64,0.024318


In [126]:
def select_event(comments_score, likes_score, retweet_score, compound_score, coverage_score):
    df2['comments_weighted'] = df2['Comments'].apply(lambda x: x*comments_score)
    
    df2['likes_weighted'] = df2['Likes'].apply(lambda x: x*likes_score)

    df2['retweet_weighted'] = df2['Retweets'].apply(lambda x: x*retweet_score)

    df2['compound_weighted'] = df2['Compound'].apply(lambda x: x*compound_score)

    df2['coverage_weighted'] = df2['Tidy_tweets'].apply(lambda x: x*coverage_score)

    df2['final_weighted'] = df2['comments_weighted'] + df2['likes_weighted'] + df2['retweet_weighted'] + df2['compound_weighted'] + df2['coverage_weighted']
    
    return df2.iloc[df2.final_weighted.argmax(),0]

select_event(5,1,4,1,3)
    
    

'Super Bowl'