In [3]:
#Libraries

import pandas as pd
import re 

# Display the tweet fully using:
pd.set_option('display.max_colwidth', None)
pd.set_option('max_colwidth', None)

In [4]:
df = pd.read_csv('Matcha#Dataset.csv')

In [5]:
# to return columns' names of the dataset
df.columns

Index(['Created at', 'User', 'Tweet', 'ID', 'Number of Retweets',
       'Number of likes', 'Source', 'Location'],
      dtype='object')

In [6]:
# to return each column with the corresponding non-null values count and its data type
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3348 entries, 0 to 3347
Data columns (total 8 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   Created at          3348 non-null   object 
 1   User                3348 non-null   object 
 2   Tweet               3348 non-null   object 
 3   ID                  3348 non-null   float64
 4   Number of Retweets  3348 non-null   int64  
 5   Number of likes     3348 non-null   int64  
 6   Source              3348 non-null   object 
 7   Location            2390 non-null   object 
dtypes: float64(1), int64(2), object(5)
memory usage: 209.4+ KB


In [7]:
# to return the count of non-null values in each column
df.count()

Created at            3348
User                  3348
Tweet                 3348
ID                    3348
Number of Retweets    3348
Number of likes       3348
Source                3348
Location              2390
dtype: int64

In [8]:
# to return the total number of rows
len(df)

3348

In [9]:
# to return statistical calculations
df.describe()

Unnamed: 0,ID,Number of Retweets,Number of likes
count,3348.0,3348.0,3348.0
mean,1.577715e+18,0.25,3.643967
std,796289700000000.0,2.803906,40.340408
min,1.57625e+18,0.0,0.0
25%,1.57708e+18,0.0,0.0
50%,1.57773e+18,0.0,1.0
75%,1.57839e+18,0.0,2.0
max,1.57914e+18,77.0,2132.0


In [10]:
df.head()

Unnamed: 0,Created at,User,Tweet,ID,Number of Retweets,Number of likes,Source,Location
0,2022-10-09 15:56:27+00:00,SPStavrosAuthor,"@LynnDiavol I try to drink matcha over coffee, because it helps with anxiety problems instead of making them worse",1.57914e+18,0,0,Twitter for Android,Canada
1,2022-10-09 15:54:44+00:00,dellianarose,Height: 5‚Äô3‚Äù\nAge: 29\nShoe Size: 8 or 9 (depending on the shoe) \nZodiac Sign: Sagittarius \nTattoos: 1 \nPiercings: 2‚Ä¶ https://t.co/VA5nQx4NuK,1.57914e+18,0,0,Twitter for iPhone,"Ottawa, Ontario"
2,2022-10-09 15:36:25+00:00,DeeReads_,drink matcha and sin,1.57913e+18,0,1,Twitter Web App,
3,2022-10-09 14:57:00+00:00,atasteofmadness,"Easy Vanilla Matcha Chai Latte recipe. This healthy warm comforting drink idea is vegan and gluten free, perfect on‚Ä¶ https://t.co/CUpNVNNdbh",1.57912e+18,1,0,Twitter Web App,"Chicago, IL"
4,2022-10-09 14:21:21+00:00,ogidigest,did you drink your oatly matcha latte today? https://t.co/Hb9bu8SWGX,1.57911e+18,0,0,Twitter Web App,


In [11]:
df.shape

(3348, 8)

Data Issues:

In [12]:
# NORMALIZATION PROBLEM

# It's been noticed that some Twitter users use 'Matcha' and sometimes 'Macha' while referring to the same drink.
# Therefore, it will be normalized to 'Matcha' in all tweets.

#This method takes the tweet as the parameter, normalizes the drink's spelling, then returns it.
def normalize(tweet):
    text = re.sub("macha","matcha", tweet)
    return tweet

In [13]:
df['Tweet'] = df['Tweet'].apply(normalize)

In [14]:
#Return a sample of the dataset
df.sample(5)

Unnamed: 0,Created at,User,Tweet,ID,Number of Retweets,Number of likes,Source,Location
2402,2022-10-08 16:28:12+00:00,mitziixoxo,"Catching up on my tags, thanks for the tag @izeluvnuggetss !! Love uu\n\nHeight = 5‚Äô4‚Äù (abt 163cm)\nZodiac = Sagittari‚Ä¶ https://t.co/ClPJxKcHOp",1.57878e+18,0,1,Twitter for iPhone,"Clowntown, USA"
2253,2022-10-01 20:30:24+00:00,n3komanc3r,@blackgirlsanime hm mirai would order something simple like a matcha latte OR something super convoluted like a smo‚Ä¶ https://t.co/UhCqq9S9vp,1.57631e+18,1,3,Twitter for iPhone,she/her
307,2022-10-05 08:21:53+00:00,zafrialimi,@tneyfatin Hahahaha meaning here that you can drink something with caffeine but not as high as coffee. \n\nYup matcha‚Ä¶ https://t.co/xAzRqs4vYO,1.57757e+18,0,0,Twitter for iPhone,Kuantan
52,2022-10-08 16:36:56+00:00,kayristar,"Hello taggy\n\nHeight: 5'4""\nZodiac: Sagittarius :) \nTattoos: No but if i dare i will someday \nFav colour: purple, bla‚Ä¶ https://t.co/wu8o0O3irQ",1.57879e+18,0,1,Twitter for Android,Dragon's Brew
1640,2022-10-04 11:43:46+00:00,arcalaghost,celebrating a small win today by reading my most fave book + a half full matcha latte (im thirsty im sorry HAHA) https://t.co/PiUNrvM7MY,1.57726e+18,0,19,Twitter for Android,"14.5874¬∞ N, 120.9839¬∞ E"


In [15]:
# To check if 'macha' has been replaced with 'matcha', an attempt to return any tweet that contains the word 'macha' is made
# However, the dataframe returned is empty, which means there aren't any tweets with the word 'macha', and our normalization
# is successful!

df[df['Tweet'].str.contains('macha')]

Unnamed: 0,Created at,User,Tweet,ID,Number of Retweets,Number of likes,Source,Location
2105,2022-10-09 09:26:00+00:00,Kidding_Jeff,"@peacefulplace I love macha,nearly equal green tea. https://t.co/4iLC3YApmx",1.57904e+18,0,0,Twitter for iPhone,JAPAN
2106,2022-10-09 08:08:13+00:00,SheWhoIsMe01,@LynnDiavol Try the Starbucks iced green (not macha) with Stevia. They shake it and it changes the viscosity a bit.‚Ä¶ https://t.co/2cr6Pq2GNx,1.57902e+18,0,0,Twitter for iPad,
2107,2022-10-09 02:37:40+00:00,EroticEeveeMFC,@JJ1XYZ It IS green tea! Not macha.\n\nMixed with raw honey &amp; oat milk &lt;3,1.57894e+18,0,0,Twitter Web App,Los Angeles
2109,2022-10-08 21:03:50+00:00,Bemcdon01Sr,@LynnDiavol I still drink coffee but I have macha green tea every day too. Straight.,1.57885e+18,0,0,Twitter Web App,
2110,2022-10-08 20:04:54+00:00,ADHD_Cyborg,@RiviereTosh @transientideas I love earl grey tea. You could also try green tea or a macha latte,1.57884e+18,0,2,Twitter for Android,
2111,2022-10-08 17:54:12+00:00,SpaceBard,@LynnDiavol Green tea. I have bough macha once and that was good too. I didn't make it any special way. I did try i‚Ä¶ https://t.co/kQtzcczFhX,1.57881e+18,0,0,Twitter Web App,Chicago
2115,2022-10-07 23:45:02+00:00,Emby1781,"@SHB13426 I'll definitely have to try it out! I think they had some kind of grass tea or a macha tea, so I'll defin‚Ä¶ https://t.co/JZgtniZLnp",1.57853e+18,0,1,Twitter for Android,
2118,2022-10-06 13:38:10+00:00,KingNeptune1220,@Twitch_Lobita I don‚Äôt even drink coffee‚Ä¶ but if I go to star bucks I get the green macha teaüò§ sooooo gooooooddd,1.57802e+18,0,0,Twitter for iPhone,
2127,2022-10-05 04:41:36+00:00,ivalicea,@windupelezen lemon cake with a side of green macha tea! she quite loves that one! üòöüíï,1.57752e+18,0,1,Twitter Web App,Radz-at-Han
2131,2022-10-03 03:12:03+00:00,Gyrowoof,#Waff loves our #macha Green tea! XD https://t.co/fBEZQMBoLu,1.57677e+18,0,0,Twitter for Android,Little red dot


In [16]:
# Here, all of the tweets in the sample contain 'matcha' and not 'macha'
df[df['Tweet'].str.contains('matcha')].sample(10)

Unnamed: 0,Created at,User,Tweet,ID,Number of Retweets,Number of likes,Source,Location
1569,2022-10-04 16:32:43+00:00,JaniseSky,"@Starbucks I always order matcha latte iced, add oatmilk, 1 shot of espresso &amp; 3pumps SF vanilla! Soo yummyüíöüíö",1.57734e+18,0,0,Twitter for Android,
1029,2022-10-06 23:58:12+00:00,snoopydyke,Drinking a really bad matcha latte,1.57817e+18,0,0,Twitter for iPhone,"Santa Cruz, CA"
3013,2022-10-04 05:26:55+00:00,touhid90278163,TAKE MATCHA TEA TO CONTROL YOUR INCRESS WEIGHT\nhttps://t.co/oSi665XWv4\n#matchaslim\n#matchaslimdm‚Ä¶ https://t.co/2RqYlRxwza,1.57717e+18,0,0,Twitter Web App,Berlin.Berlin
1608,2022-10-04 14:04:03+00:00,hopesnft,"@CaliTopp starbuck‚Äôs coffee is so expensive but has low quality coffee beans, i like their matcha latte tho",1.5773e+18,0,1,Twitter for iPhone,
2338,2022-10-09 01:12:52+00:00,xplayman_live,I have been so thirsty all freaking day! Finally got myself a matcha green tea üçµ https://t.co/vAGqTOAywk,1.57892e+18,0,2,Twitter for iPhone,
944,2022-10-07 13:41:18+00:00,Kurrueche,Not me trying a matcha latte and it tasting like a grass latte üò≠üò≠,1.57838e+18,0,0,Twitter for iPhone,From Japan To DC
3059,2022-10-03 23:38:23+00:00,rahyeesuh,matcha tea w/o that brown sugar syrup is not good &lt;&lt;,1.57708e+18,0,0,Twitter for iPhone,pluto
1892,2022-10-03 00:15:26+00:00,jvckienav,I got cold foam with my matcha latte for the first time and that mf a game changer,1.57673e+18,0,1,Twitter for iPhone,"Tempe, AZ"
2442,2022-10-08 07:56:24+00:00,lc_riley,@JeannaLStars I‚Äôm all about matcha green tea nowadays!,1.57866e+18,0,1,Twitter for iPhone,The World. Like everyone else.
590,2022-10-09 13:15:09+00:00,kaleater,"gm angels üçµ‚òÅÔ∏è\n\nmade a matcha latte after doing yoga this morning &lt;3 ‚Äî‚Äî\n\ni hope u have a beautiful sunday, reminder‚Ä¶ https://t.co/nMvl8ISBOh",1.5791e+18,0,31,Twitter for iPhone,.¬∑:*¬®‡º∫med/high res & i.f‡ºª¬®*:¬∑.


In [17]:
# Find duplicated rows that have the same value in 'Tweet' column
df[df.duplicated(subset='Tweet')].head()

Unnamed: 0,Created at,User,Tweet,ID,Number of Retweets,Number of likes,Source,Location
559,2022-10-09 15:54:44+00:00,dellianarose,Height: 5‚Äô3‚Äù\nAge: 29\nShoe Size: 8 or 9 (depending on the shoe) \nZodiac Sign: Sagittarius \nTattoos: 1 \nPiercings: 2‚Ä¶ https://t.co/VA5nQx4NuK,1.57914e+18,0,0,Twitter for iPhone,"Ottawa, Ontario"
569,2022-10-09 14:57:00+00:00,atasteofmadness,"Easy Vanilla Matcha Chai Latte recipe. This healthy warm comforting drink idea is vegan and gluten free, perfect on‚Ä¶ https://t.co/CUpNVNNdbh",1.57912e+18,1,0,Twitter Web App,"Chicago, IL"
576,2022-10-09 14:21:21+00:00,ogidigest,did you drink your oatly matcha latte today? https://t.co/Hb9bu8SWGX,1.57911e+18,0,0,Twitter Web App,
588,2022-10-09 13:28:29+00:00,fukuroumochi,"5 tags, 5 faves!\n\nfood: SUKIYAKI!! üç≤ \ndrink: matcha latte üçµ\ncolor: Seijoh color üíöü§ç (is it mint green or turquoise b‚Ä¶ https://t.co/tChOZzN5VX",1.5791e+18,0,2,Twitter Web App,she/her ‚Ä¢ 25+
605,2022-10-09 10:16:36+00:00,piirtelen,thank you for the tag cheeseüíï\n\n5 tags &amp; faves\nFood - i don't have a #1 but would currently love some bbq after fore‚Ä¶ https://t.co/b2L9KzfdAO,1.57905e+18,0,2,Twitter for Android,üá´üáÆ | they/them


In [18]:
# duplicates count
df.duplicated(subset='Tweet').sum()

413

In [19]:
# remving rows that have "RT" in the begining of the tweet
df = df[df["Tweet"].str.contains("RT") == False]

In [20]:
# check if rows that "RT" in the begining of the tweet have been removed
df[df["Tweet"].str.contains("RT")]
# empty dataframe means no tweet exists with such keyword

Unnamed: 0,Created at,User,Tweet,ID,Number of Retweets,Number of likes,Source,Location


In [21]:
# check if duplicates were removed
df.duplicated(subset='Tweet').sum()

411

In [22]:
# Display duplicated rows
duplicateRowsDF = df[df.duplicated(subset='Tweet')]
duplicateRowsDF.head(4)

Unnamed: 0,Created at,User,Tweet,ID,Number of Retweets,Number of likes,Source,Location
559,2022-10-09 15:54:44+00:00,dellianarose,Height: 5‚Äô3‚Äù\nAge: 29\nShoe Size: 8 or 9 (depending on the shoe) \nZodiac Sign: Sagittarius \nTattoos: 1 \nPiercings: 2‚Ä¶ https://t.co/VA5nQx4NuK,1.57914e+18,0,0,Twitter for iPhone,"Ottawa, Ontario"
569,2022-10-09 14:57:00+00:00,atasteofmadness,"Easy Vanilla Matcha Chai Latte recipe. This healthy warm comforting drink idea is vegan and gluten free, perfect on‚Ä¶ https://t.co/CUpNVNNdbh",1.57912e+18,1,0,Twitter Web App,"Chicago, IL"
576,2022-10-09 14:21:21+00:00,ogidigest,did you drink your oatly matcha latte today? https://t.co/Hb9bu8SWGX,1.57911e+18,0,0,Twitter Web App,
588,2022-10-09 13:28:29+00:00,fukuroumochi,"5 tags, 5 faves!\n\nfood: SUKIYAKI!! üç≤ \ndrink: matcha latte üçµ\ncolor: Seijoh color üíöü§ç (is it mint green or turquoise b‚Ä¶ https://t.co/tChOZzN5VX",1.5791e+18,0,2,Twitter Web App,she/her ‚Ä¢ 25+


In [23]:
# remove duplicates
df.drop_duplicates(subset='Tweet', inplace=True)

In [24]:
# check if duplicates were removed
df.duplicated(subset='Tweet').sum()

0

In [25]:
# check total number of rows in df
df.shape

(2929, 8)

In [1]:
#This function removes hyperlinks from the tweets
def remove_URL(sample):
    """Remove URLs from a sample string"""
    return re.sub(r"http\S+", "", sample)

df= df['Text'].apply(remove_URL)



NameError: name 'df' is not defined