# PROJECT TITLE: FINDING THE MOST INTERESTING TWEETS AND FAN SENTIMENT DURING A GAME TELECAST IN REALTIME

# PART 5: TIMEZONE AND TWEET PREPROCESSING:

In [1]:
import numpy as np
import pandas as pd
import warnings
warnings.filterwarnings('ignore')
from nltk.stem.snowball import SnowballStemmer
import nltk
from nltk.stem import WordNetLemmatizer
nltk.download('stopwords')
from nltk.corpus import stopwords

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Rohaan\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [2]:
df = pd.read_csv('tweets_proc.csv')
df.drop(['Unnamed: 0'], axis = 1, inplace = True)
df

Unnamed: 0,Tweet,weight,HTs,UMN,UMID,Tweet_Extra,day,mon,hr,mnt
0,Want to make this summer football that little ...,543,"['Win', 'WorldCup']",['Bargain Booze'],['BargainBooze'],"['win', 'worldcup', 'bargain booze', 'bargainb...",29,6,23,56
1,Didier Deschamps has confirmed Benjamin Mendy ...,23,"['FRA', 'ARG']",['City Watch'],['City_Watch'],"['fra', 'arg', 'city watch', 'city_watch']",29,6,23,56
2,Please play Power by EXO EXO also performed du...,1300,[''],"['Aeri Days', 'FIFA World Cup ?']","['aeridays', 'FIFAWorldCup']","['', 'aeri days', 'fifa world cup ?', 'aeriday...",29,6,23,56
3,weeks teams games goals The group stage was ev...,1259,['WorldCup'],['B/R Football'],['brfootball'],"['worldcup', 'b/r football', 'brfootball']",29,6,23,56
4,Please play Power by EXO EXO also performed du...,1300,[''],"['Aeri Days', 'FIFA World Cup ?']","['aeridays', 'FIFAWorldCup']","['', 'aeri days', 'fifa world cup ?', 'aeriday...",29,6,23,56
...,...,...,...,...,...,...,...,...,...,...
529995,Please guys please please help me ouuttt Pleas...,87,[''],"['ShakaZuluOnTheBeats', '#RecognizeMeEp - 10 A...","['K_SUPREME_ZA', 'TBanSA']","['', 'shakazuluonthebeats', '#recognizemeep - ...",15,7,23,14
529996,Dear France Congratulations on winning the of ...,96217,['WorldCup'],['Khaled Beydoun'],['KhaledBeydoun'],"['worldcup', 'khaled beydoun', 'khaledbeydoun']",15,7,23,14
529997,France have won the FIFA in Moscow,63116,"['FRA', 'WorldCup', 'FRACRO', 'WorldCupFinal']",['FIFA World Cup'],['FIFAWorldCup'],"['fra', 'worldcup', 'fracro', 'worldcupfinal',...",15,7,23,14
529998,Islamophobia Xenophobia and racism are global ...,1,[''],['Alex Florack'],['crimsonsith720'],"['', 'alex florack', 'crimsonsith720']",15,7,23,14


## 1. Change Timezone:

In [3]:
temp = []
for i in range(530000):
    temp.append([df['day'][i], df['mon'][i], df['hr'][i], df['mnt'][i]])

# Convert UTC to Russian Local Time (+3 hrs)
for i in temp:
    i[2] += 3
    if i[2] > 23:
        i[2] -= 24
        i[0] += 1
        if i[0] > 30:
            i[0] -= 30
            i[1] += 1

In [4]:
day_arr = []
mon_arr = []
hr_arr = []
mnt_arr = []
for i in temp:
    day_arr.append(i[0])
    mon_arr.append(i[1])
    hr_arr.append(i[2])
    mnt_arr.append(i[3])
df['Day'] = day_arr
df['Mon'] = mon_arr
df['Hr'] = hr_arr
df['Mnt'] = mnt_arr
df.drop(['day', 'mon', 'hr', 'mnt'], axis = 1, inplace = True)
df

Unnamed: 0,Tweet,weight,HTs,UMN,UMID,Tweet_Extra,Day,Mon,Hr,Mnt
0,Want to make this summer football that little ...,543,"['Win', 'WorldCup']",['Bargain Booze'],['BargainBooze'],"['win', 'worldcup', 'bargain booze', 'bargainb...",30,6,2,56
1,Didier Deschamps has confirmed Benjamin Mendy ...,23,"['FRA', 'ARG']",['City Watch'],['City_Watch'],"['fra', 'arg', 'city watch', 'city_watch']",30,6,2,56
2,Please play Power by EXO EXO also performed du...,1300,[''],"['Aeri Days', 'FIFA World Cup ?']","['aeridays', 'FIFAWorldCup']","['', 'aeri days', 'fifa world cup ?', 'aeriday...",30,6,2,56
3,weeks teams games goals The group stage was ev...,1259,['WorldCup'],['B/R Football'],['brfootball'],"['worldcup', 'b/r football', 'brfootball']",30,6,2,56
4,Please play Power by EXO EXO also performed du...,1300,[''],"['Aeri Days', 'FIFA World Cup ?']","['aeridays', 'FIFAWorldCup']","['', 'aeri days', 'fifa world cup ?', 'aeriday...",30,6,2,56
...,...,...,...,...,...,...,...,...,...,...
529995,Please guys please please help me ouuttt Pleas...,87,[''],"['ShakaZuluOnTheBeats', '#RecognizeMeEp - 10 A...","['K_SUPREME_ZA', 'TBanSA']","['', 'shakazuluonthebeats', '#recognizemeep - ...",16,7,2,14
529996,Dear France Congratulations on winning the of ...,96217,['WorldCup'],['Khaled Beydoun'],['KhaledBeydoun'],"['worldcup', 'khaled beydoun', 'khaledbeydoun']",16,7,2,14
529997,France have won the FIFA in Moscow,63116,"['FRA', 'WorldCup', 'FRACRO', 'WorldCupFinal']",['FIFA World Cup'],['FIFAWorldCup'],"['fra', 'worldcup', 'fracro', 'worldcupfinal',...",16,7,2,14
529998,Islamophobia Xenophobia and racism are global ...,1,[''],['Alex Florack'],['crimsonsith720'],"['', 'alex florack', 'crimsonsith720']",16,7,2,14


## 2. Preprocess Tweet:

In [5]:
df1 = df['Tweet']
df.Tweet = df.Tweet.fillna('')
tweet_arr = []
for i in df1:
    x = i.split(' ')
    tweet_arr.append(x)

## 2a. Perform Stemming / Lemmetization:

In [6]:
stemmer = SnowballStemmer(language='english')
tweets_stemmed = []
for i in tweet_arr:
    stemmed_list = [stemmer.stem(token) for token in i]
    tweets_stemmed.append(stemmed_list)
tweets_stemmed

[['want',
  'to',
  'make',
  'this',
  'summer',
  'footbal',
  'that',
  'littl',
  'bit',
  'hotter',
  'rt',
  'follow',
  'to'],
 ['didier',
  'deschamp',
  'has',
  'confirm',
  'benjamin',
  'mendi',
  'will',
  'miss',
  'tomorrow',
  'game',
  'for',
  'against',
  'due',
  'to',
  'muscular',
  'problem'],
 ['pleas',
  'play',
  'power',
  'by',
  'exo',
  'exo',
  'also',
  'perform',
  'dure',
  'the',
  'close',
  'ceremoni',
  'of',
  'the',
  'winter',
  'olymp',
  'the',
  'song',
  'is'],
 ['week',
  'team',
  'game',
  'goal',
  'the',
  'group',
  'stage',
  'was',
  'everyth',
  'we',
  'hope',
  'for'],
 ['pleas',
  'play',
  'power',
  'by',
  'exo',
  'exo',
  'also',
  'perform',
  'dure',
  'the',
  'close',
  'ceremoni',
  'of',
  'the',
  'winter',
  'olymp',
  'the',
  'song',
  'is'],
 ['hi',
  'everyon',
  'let',
  'get',
  'start',
  'round',
  'am',
  'so',
  'excit',
  'but',
  'miss',
  'everi',
  'footbal',
  'leagu',
  'that',
  'will',
  'come',
  '

In [7]:
lemmatizer = WordNetLemmatizer()
tweets_lemm = []
for i in tweet_arr:
    lemmatized_list = [lemmatizer.lemmatize(token) for token in i]
    tweets_lemm.append(lemmatized_list)
tweets_lemm

[['Want',
  'to',
  'make',
  'this',
  'summer',
  'football',
  'that',
  'little',
  'bit',
  'hotter',
  'RT',
  'follow',
  'to'],
 ['Didier',
  'Deschamps',
  'ha',
  'confirmed',
  'Benjamin',
  'Mendy',
  'will',
  'miss',
  'tomorrow',
  'game',
  'for',
  'against',
  'due',
  'to',
  'muscular',
  'problem'],
 ['Please',
  'play',
  'Power',
  'by',
  'EXO',
  'EXO',
  'also',
  'performed',
  'during',
  'the',
  'Closing',
  'Ceremony',
  'of',
  'the',
  'Winter',
  'Olympics',
  'The',
  'song',
  'is'],
 ['week',
  'team',
  'game',
  'goal',
  'The',
  'group',
  'stage',
  'wa',
  'everything',
  'we',
  'hoped',
  'for'],
 ['Please',
  'play',
  'Power',
  'by',
  'EXO',
  'EXO',
  'also',
  'performed',
  'during',
  'the',
  'Closing',
  'Ceremony',
  'of',
  'the',
  'Winter',
  'Olympics',
  'The',
  'song',
  'is'],
 ['Hi',
  'everyone',
  'Lets',
  'get',
  'started',
  'Round',
  'am',
  'so',
  'excited',
  'but',
  'miss',
  'every',
  'football',
  'league'

## 2b. Remove Stopwords:

In [8]:
stop = set(stopwords.words('english'))
stop

{'a',
 'about',
 'above',
 'after',
 'again',
 'against',
 'ain',
 'all',
 'am',
 'an',
 'and',
 'any',
 'are',
 'aren',
 "aren't",
 'as',
 'at',
 'be',
 'because',
 'been',
 'before',
 'being',
 'below',
 'between',
 'both',
 'but',
 'by',
 'can',
 'couldn',
 "couldn't",
 'd',
 'did',
 'didn',
 "didn't",
 'do',
 'does',
 'doesn',
 "doesn't",
 'doing',
 'don',
 "don't",
 'down',
 'during',
 'each',
 'few',
 'for',
 'from',
 'further',
 'had',
 'hadn',
 "hadn't",
 'has',
 'hasn',
 "hasn't",
 'have',
 'haven',
 "haven't",
 'having',
 'he',
 'her',
 'here',
 'hers',
 'herself',
 'him',
 'himself',
 'his',
 'how',
 'i',
 'if',
 'in',
 'into',
 'is',
 'isn',
 "isn't",
 'it',
 "it's",
 'its',
 'itself',
 'just',
 'll',
 'm',
 'ma',
 'me',
 'mightn',
 "mightn't",
 'more',
 'most',
 'mustn',
 "mustn't",
 'my',
 'myself',
 'needn',
 "needn't",
 'no',
 'nor',
 'not',
 'now',
 'o',
 'of',
 'off',
 'on',
 'once',
 'only',
 'or',
 'other',
 'our',
 'ours',
 'ourselves',
 'out',
 'over',
 'own',
 'r

In [9]:
tweets_no_sw = []
for i in tweets_lemm:
    no_stopwords = [token for token in i if token not in stop]
    tweets_no_sw.append(no_stopwords)
tweets_no_sw

[['Want',
  'make',
  'summer',
  'football',
  'little',
  'bit',
  'hotter',
  'RT',
  'follow'],
 ['Didier',
  'Deschamps',
  'ha',
  'confirmed',
  'Benjamin',
  'Mendy',
  'miss',
  'tomorrow',
  'game',
  'due',
  'muscular',
  'problem'],
 ['Please',
  'play',
  'Power',
  'EXO',
  'EXO',
  'also',
  'performed',
  'Closing',
  'Ceremony',
  'Winter',
  'Olympics',
  'The',
  'song'],
 ['week',
  'team',
  'game',
  'goal',
  'The',
  'group',
  'stage',
  'wa',
  'everything',
  'hoped'],
 ['Please',
  'play',
  'Power',
  'EXO',
  'EXO',
  'also',
  'performed',
  'Closing',
  'Ceremony',
  'Winter',
  'Olympics',
  'The',
  'song'],
 ['Hi',
  'everyone',
  'Lets',
  'get',
  'started',
  'Round',
  'excited',
  'miss',
  'every',
  'football',
  'league',
  'come',
  'World',
  'Cup',
  'well'],
 ['First',
  'night',
  'world',
  'cup',
  'game',
  'chat',
  'missus',
  'She',
  'seems',
  'nice',
  'girl',
  'fair'],
 ['Batshuayi', 'arguably', 'funniest', 'moment'],
 ['globa

## 2c. Convert strings to lowercase:

In [10]:
tweets_preproc = []
for i in tweets_no_sw:
    lower_list = [token.lower() for token in i]
    tweets_preproc.append(lower_list)
tweets_preproc

[['want',
  'make',
  'summer',
  'football',
  'little',
  'bit',
  'hotter',
  'rt',
  'follow'],
 ['didier',
  'deschamps',
  'ha',
  'confirmed',
  'benjamin',
  'mendy',
  'miss',
  'tomorrow',
  'game',
  'due',
  'muscular',
  'problem'],
 ['please',
  'play',
  'power',
  'exo',
  'exo',
  'also',
  'performed',
  'closing',
  'ceremony',
  'winter',
  'olympics',
  'the',
  'song'],
 ['week',
  'team',
  'game',
  'goal',
  'the',
  'group',
  'stage',
  'wa',
  'everything',
  'hoped'],
 ['please',
  'play',
  'power',
  'exo',
  'exo',
  'also',
  'performed',
  'closing',
  'ceremony',
  'winter',
  'olympics',
  'the',
  'song'],
 ['hi',
  'everyone',
  'lets',
  'get',
  'started',
  'round',
  'excited',
  'miss',
  'every',
  'football',
  'league',
  'come',
  'world',
  'cup',
  'well'],
 ['first',
  'night',
  'world',
  'cup',
  'game',
  'chat',
  'missus',
  'she',
  'seems',
  'nice',
  'girl',
  'fair'],
 ['batshuayi', 'arguably', 'funniest', 'moment'],
 ['globa

In [11]:
df['Tweet_Proc'] = tweets_preproc
df

Unnamed: 0,Tweet,weight,HTs,UMN,UMID,Tweet_Extra,Day,Mon,Hr,Mnt,Tweet_Proc
0,Want to make this summer football that little ...,543,"['Win', 'WorldCup']",['Bargain Booze'],['BargainBooze'],"['win', 'worldcup', 'bargain booze', 'bargainb...",30,6,2,56,"[want, make, summer, football, little, bit, ho..."
1,Didier Deschamps has confirmed Benjamin Mendy ...,23,"['FRA', 'ARG']",['City Watch'],['City_Watch'],"['fra', 'arg', 'city watch', 'city_watch']",30,6,2,56,"[didier, deschamps, ha, confirmed, benjamin, m..."
2,Please play Power by EXO EXO also performed du...,1300,[''],"['Aeri Days', 'FIFA World Cup ?']","['aeridays', 'FIFAWorldCup']","['', 'aeri days', 'fifa world cup ?', 'aeriday...",30,6,2,56,"[please, play, power, exo, exo, also, performe..."
3,weeks teams games goals The group stage was ev...,1259,['WorldCup'],['B/R Football'],['brfootball'],"['worldcup', 'b/r football', 'brfootball']",30,6,2,56,"[week, team, game, goal, the, group, stage, wa..."
4,Please play Power by EXO EXO also performed du...,1300,[''],"['Aeri Days', 'FIFA World Cup ?']","['aeridays', 'FIFAWorldCup']","['', 'aeri days', 'fifa world cup ?', 'aeriday...",30,6,2,56,"[please, play, power, exo, exo, also, performe..."
...,...,...,...,...,...,...,...,...,...,...,...
529995,Please guys please please help me ouuttt Pleas...,87,[''],"['ShakaZuluOnTheBeats', '#RecognizeMeEp - 10 A...","['K_SUPREME_ZA', 'TBanSA']","['', 'shakazuluonthebeats', '#recognizemeep - ...",16,7,2,14,"[please, guy, please, please, help, ouuttt, pl..."
529996,Dear France Congratulations on winning the of ...,96217,['WorldCup'],['Khaled Beydoun'],['KhaledBeydoun'],"['worldcup', 'khaled beydoun', 'khaledbeydoun']",16,7,2,14,"[dear, france, congratulations, winning, team,..."
529997,France have won the FIFA in Moscow,63116,"['FRA', 'WorldCup', 'FRACRO', 'WorldCupFinal']",['FIFA World Cup'],['FIFAWorldCup'],"['fra', 'worldcup', 'fracro', 'worldcupfinal',...",16,7,2,14,"[france, fifa, moscow]"
529998,Islamophobia Xenophobia and racism are global ...,1,[''],['Alex Florack'],['crimsonsith720'],"['', 'alex florack', 'crimsonsith720']",16,7,2,14,"[islamophobia, xenophobia, racism, global, phi..."


In [12]:
df0 = df.loc[df['Day'] == 30]
df1 = df.loc[df['Day'] == 1]
df2 = df.loc[df['Day'] == 2]
df3 = df.loc[df['Day'] == 3]
df4 = df.loc[df['Day'] == 4]
df5 = df.loc[df['Day'] == 5]
df6 = df.loc[df['Day'] == 6]
df7 = df.loc[df['Day'] == 7]
df8 = df.loc[df['Day'] == 8]
df9 = df.loc[df['Day'] == 9]
df10 = df.loc[df['Day'] == 10]
df11 = df.loc[df['Day'] == 11]
df12 = df.loc[df['Day'] == 12]
df13 = df.loc[df['Day'] == 13]
df14 = df.loc[df['Day'] == 14]
df15 = df.loc[df['Day'] == 15]
df16 = df.loc[df['Day'] == 16]

In [13]:
df0.to_csv('tweet_3006.csv')
df1.to_csv('tweet_0107.csv')
df2.to_csv('tweet_0207.csv')
df3.to_csv('tweet_0307.csv')
df4.to_csv('tweet_0407.csv')
df5.to_csv('tweet_0507.csv')
df6.to_csv('tweet_0607.csv')
df7.to_csv('tweet_0707.csv')
df8.to_csv('tweet_0807.csv')
df9.to_csv('tweet_0907.csv')
df10.to_csv('tweet_1007.csv')
df11.to_csv('tweet_1107.csv')
df12.to_csv('tweet_1207.csv')
df13.to_csv('tweet_1307.csv')
df14.to_csv('tweet_1407.csv')
df15.to_csv('tweet_1507.csv')
df16.to_csv('tweet_1607.csv')