In [8]:
#Libraries

import pandas as pd
import re

# Never truncate long cell values, so every tweet is displayed in full.
pd.set_option('display.max_colwidth', None)

In [11]:
# Load the dataset from 'Matcha#Dataset.csv' (a path relative to the working directory) into `df`.
# NOTE(review): df.info() reports the ID column as float64; if the file stores full integer IDs,
# reading them as floats can lose precision — TODO confirm and consider dtype={'ID': str}.
df = pd.read_csv('Matcha#Dataset.csv')

In [12]:
# List the column names of the dataset (returned as a pandas Index).
df.columns

Index(['Created at', 'User', 'Tweet', 'ID', 'Number of Retweets',
       'Number of likes', 'Source', 'Location'],
      dtype='object')

In [13]:
# Print a summary of the frame: for each column its non-null count and dtype, plus the memory usage.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3348 entries, 0 to 3347
Data columns (total 8 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   Created at          3348 non-null   object 
 1   User                3348 non-null   object 
 2   Tweet               3348 non-null   object 
 3   ID                  3348 non-null   float64
 4   Number of Retweets  3348 non-null   int64  
 5   Number of likes     3348 non-null   int64  
 6   Source              3348 non-null   object 
 7   Location            2390 non-null   object 
dtypes: float64(1), int64(2), object(5)
memory usage: 209.4+ KB


In [14]:
# Count the non-null values in each column; a count below the row total means the column has missing values.
df.count()

Created at            3348
User                  3348
Tweet                 3348
ID                    3348
Number of Retweets    3348
Number of likes       3348
Source                3348
Location              2390
dtype: int64

In [15]:
# Total number of rows in the dataset.
len(df)

3348

In [16]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns.
# NOTE(review): ID is presumably an identifier, so its statistics carry no analytical meaning — confirm.
df.describe()

Unnamed: 0,ID,Number of Retweets,Number of likes
count,3348.0,3348.0,3348.0
mean,1.577715e+18,0.25,3.643967
std,796289700000000.0,2.803906,40.340408
min,1.57625e+18,0.0,0.0
25%,1.57708e+18,0.0,0.0
50%,1.57773e+18,0.0,1.0
75%,1.57839e+18,0.0,2.0
max,1.57914e+18,77.0,2132.0


In [17]:
# Preview the first five rows of the dataset.
df.head()

Unnamed: 0,Created at,User,Tweet,ID,Number of Retweets,Number of likes,Source,Location
0,2022-10-09 15:56:27+00:00,SPStavrosAuthor,"@LynnDiavol I try to drink matcha over coffee, because it helps with anxiety problems instead of making them worse",1.57914e+18,0,0,Twitter for Android,Canada
1,2022-10-09 15:54:44+00:00,dellianarose,Height: 5‚Äô3‚Äù\nAge: 29\nShoe Size: 8 or 9 (depending on the shoe) \nZodiac Sign: Sagittarius \nTattoos: 1 \nPiercings: 2‚Ä¶ https://t.co/VA5nQx4NuK,1.57914e+18,0,0,Twitter for iPhone,"Ottawa, Ontario"
2,2022-10-09 15:36:25+00:00,DeeReads_,drink matcha and sin,1.57913e+18,0,1,Twitter Web App,
3,2022-10-09 14:57:00+00:00,atasteofmadness,"Easy Vanilla Matcha Chai Latte recipe. This healthy warm comforting drink idea is vegan and gluten free, perfect on‚Ä¶ https://t.co/CUpNVNNdbh",1.57912e+18,1,0,Twitter Web App,"Chicago, IL"
4,2022-10-09 14:21:21+00:00,ogidigest,did you drink your oatly matcha latte today? https://t.co/Hb9bu8SWGX,1.57911e+18,0,0,Twitter Web App,


In [18]:
# Dimensions of the dataset as (rows, columns).
df.shape

(3348, 8)

Data Issues:

In [19]:
# NORMALIZATION PROBLEM

# Some Twitter users spell the drink 'Macha' instead of 'Matcha'.
# To treat both spellings as the same word, every occurrence is normalized to 'Matcha' in all tweets.

# This function takes a tweet, corrects the drink's spelling, and returns the corrected text.
def normalize(tweet):
    """Return `tweet` with every 'macha' spelling replaced by 'matcha'.

    The match is case-insensitive and the capitalisation of the original
    word is kept: 'macha' -> 'matcha', 'Macha' -> 'Matcha', 'MACHA' -> 'MATCHA'.
    The pattern is matched anywhere in the text, including inside longer
    words or hashtags. Text that already uses 'matcha' is left unchanged.

    Parameters
    ----------
    tweet : str
        The raw tweet text.

    Returns
    -------
    str
        The tweet text with the spelling normalized.
    """
    def _respell(match):
        # Mirror the capitalisation style of the misspelled word.
        word = match.group(0)
        if word.isupper():
            return "MATCHA"
        if word[0].isupper():
            return "Matcha"
        return "matcha"

    # Return the substituted text (not the untouched input).
    return re.sub(r"macha", _respell, tweet, flags=re.IGNORECASE)

In [20]:
# Apply normalize() to every tweet and overwrite the 'Tweet' column with the result.
df['Tweet'] = df['Tweet'].apply(normalize)

In [21]:
# Show 5 randomly chosen rows of the dataset.
# No random_state is passed, so a different set of rows is shown on every run.
df.sample(5)

Unnamed: 0,Created at,User,Tweet,ID,Number of Retweets,Number of likes,Source,Location
3012,2022-10-04 05:46:06+00:00,dsemumi,@tabimori iced matcha tea latte w/ oatmilk and the brown sugar espresso the other person mentioned are undefeated.‚Ä¶ https://t.co/r0vI1IaIfh,1.57717e+18,0,1,Twitter for iPhone,seattle
1761,2022-10-03 18:24:03+00:00,lalisarubyjanes,@lrdhive iced chai latte with a shot of espresso or iced matcha latte with oat milk and 2 pumps of chai,1.577e+18,0,1,Twitter for iPhone,he/him 21
1682,2022-10-04 06:17:41+00:00,sngo1982,"I so miss @KurtGeiger shoes in #melbourne &amp; had an iced #matcha #greentea latte in the city, just doing some lite w‚Ä¶ https://t.co/1jfJlWGlFz",1.57718e+18,0,0,Twitter for iPhone,
467,2022-10-03 07:09:59+00:00,yanahluixz,no bc why am i even craving ung matcha drink sa sb???! i have never even ordered it before ü§° tanginang takaw tingin toh,1.57683e+18,0,2,Twitter for iPhone,shaik
617,2022-10-09 07:49:17+00:00,febsal_,trying to come to terms w the fact that i will never love another matcha drink than the last matcha latte recipe fr‚Ä¶ https://t.co/n1UbWgcMeO,1.57902e+18,0,1,Twitter for Android,


In [22]:
# Verify the normalization: no tweet should still contain the 'macha' spelling.
# The search ignores case so that variants such as 'Macha' or 'MACHA' are caught as well.
# An empty result means no misspelling is left. On its own it does not prove that anything was
# replaced, because a dataset that never contained 'macha' produces the same empty result.

df[df['Tweet'].str.contains('macha', case=False)]

Unnamed: 0,Created at,User,Tweet,ID,Number of Retweets,Number of likes,Source,Location


In [25]:
# Show 10 random tweets that contain the lowercase spelling 'matcha' (the match is case-sensitive).
# The filter only guarantees that 'matcha' is present; it does not by itself rule out 'macha'
# appearing elsewhere in the same tweet.
df[df['Tweet'].str.contains('matcha')].sample(10)

Unnamed: 0,Created at,User,Tweet,ID,Number of Retweets,Number of likes,Source,Location
248,2022-10-05 21:55:40+00:00,DrakesCupcake,@ThegirlJT Venti pink drink \nVanilla cold foam \nAnd a scoop of matcha powder,1.57778e+18,0,0,Twitter for iPhone,"Palm Beach, FL"
1732,2022-10-03 22:03:24+00:00,___kristieK,I want a matcha latte üôÉ,1.57706e+18,0,0,Twitter for iPhone,
428,2022-10-03 19:45:00+00:00,Obscure_Occult,@smuttysunflower Not yet but starbucks has a hocus pocus themed drink its pretty good if you like matcha.,1.57702e+18,0,0,Twitter for Android,
2327,2022-10-09 03:36:49+00:00,wahinertraveler,"@LynnDiavol Darjeeling, Earl Grey if accompanied by a salty snack, Roiboosüåç, @ times ginger tea, + matchaüíö in all f‚Ä¶ https://t.co/bdywHqZuib",1.57895e+18,0,2,Twitter Web App,
2096,2022-10-01 17:40:15+00:00,NicolePajer,Had a matcha tea latte at Starbucks yesterday (haven't had one in so long b/c of dairy but they now have coconut mi‚Ä¶ https://t.co/DLKo6h91pH,1.57627e+18,0,8,Twitter Web App,
848,2022-10-07 22:20:49+00:00,brittneytran01,Let my coworker who‚Äôs a coffee drinker try my matcha latte from Starbucks for the first time and SHE LOVED IT. Matcha girlies ü´∂üèºüíö,1.57851e+18,0,2,Twitter for iPhone,
2910,2022-10-04 21:12:53+00:00,f1girlelle,"five tags, five favs\n\nFood: quesadillas, tacos\nDrink: matcha tea \nColor: baby blue, green \nSeason: winter \nShow: Th‚Ä¶ https://t.co/XXX7ygizHf",1.57741e+18,0,5,Twitter for iPhone,CL 16|CS5 5|MV 1|SP 11|DR 3|LN
876,2022-10-07 18:59:09+00:00,immoniquerenee,Woke up this morning and decided to take a walk instead of driving to get my matcha latte ‚Ä¶,1.57846e+18,0,3,Twitter for iPhone,"Hermosa Beach, CA"
187,2022-10-06 17:37:27+00:00,hehe_xd333,@swordgirlfriend drink matcha or yerba,1.57808e+18,0,0,Twitter Web App,Europe
1157,2022-10-06 12:25:42+00:00,Yukikoona,"I made matcha latte and it taste pretty good, i was surprised lmao https://t.co/4zZzoFj5Dg",1.578e+18,1,2,Twitter for Android,Kevin's boobs
