In [46]:
import polars as pl
from nltk.corpus import stopwords
import nltk
import re
import demoji

In [47]:
# Load Stopwords and the dataset
nltk.download('stopwords')
ensw = stopwords.words('english')

df = pl.read_csv("Complete_Data.csv")

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\helpkreios\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [48]:
# Clean ID Part
df = df.select([
    pl.col("ID").apply(lambda id: id.split("-")[1]).cast(pl.Int32),
    pl.exclude("ID")
])

In [49]:
# Remove rows without tweet AND tweet URL
df = df.filter(pl.col("Tweet").is_not_null() & pl.col("Tweet URL").is_not_null())


In [51]:
df1=df.select(
    pl.col("ID"),
    pl.col("Account handle"),
    pl.col("Following"),
    pl.col("Followers"),
    pl.col("Account type")
)

colors = {
    'Identified': 'blue',
    'Media': 'orange',
    'Anonymous': 'red'
}
print(df1)

import plotly.express as px
import plotly.graph_objects as go

fig = go.Figure()

for account_type, color in colors.items():
    filtered_df = df.filter(pl.col('Account type') == account_type)
    fig.add_trace(go.Scatter(
        x=filtered_df['Following'],
        y=filtered_df['Followers'],
        mode='markers',
        text=filtered_df['Account handle'],
        hovertemplate='<b>%{text}</b><br>Following: %{x}<br>Followers: %{y}',
        marker=dict(
            size=10,
            color=color,
            opacity=0.7,
            line=dict(width=0.5, color='black')
        ),
        name=account_type  # Specify the name for each trace
    ))

# Customize the layout
fig.update_layout(
    title='Following vs Followers',
    xaxis_title='Following',
    yaxis_title='Followers',
    hoverlabel=dict(bgcolor='white', font_size=12),
    plot_bgcolor='white',
    legend_title='Account Type'
)




shape: (153, 5)
┌─────┬──────────────────┬───────────┬───────────┬──────────────┐
│ ID  ┆ Account handle   ┆ Following ┆ Followers ┆ Account type │
│ --- ┆ ---              ┆ ---       ┆ ---       ┆ ---          │
│ i32 ┆ str              ┆ i64       ┆ i64       ┆ str          │
╞═════╪══════════════════╪═══════════╪═══════════╪══════════════╡
│ 1   ┆ @arthur_vee      ┆ 2767      ┆ 4255      ┆ Anonymous    │
│ 2   ┆ @ItsJamMagno     ┆ 0         ┆ 76100     ┆ Identified   │
│ 3   ┆ @Brad_Kardinal   ┆ 1787      ┆ 2067      ┆ Anonymous    │
│ 4   ┆ @WinwinEklabu    ┆ 6462      ┆ 7082      ┆ Anonymous    │
│ …   ┆ …                ┆ …         ┆ …         ┆ …            │
│ 150 ┆ @MDSOnwardPH22   ┆ 7966      ┆ 12300     ┆ Anonymous    │
│ 151 ┆ @bivoc1          ┆ 5         ┆ 0         ┆ Anonymous    │
│ 152 ┆ @ManolitoMantal2 ┆ 20        ┆ 12        ┆ Anonymous    │
│ 153 ┆ @thatsthetweet00 ┆ 2032      ┆ 1117      ┆ Anonymous    │
└─────┴──────────────────┴───────────┴───────────┴──────────

In [57]:
import polars as pl
import matplotlib.pyplot as plt
import seaborn as sns

df2 = df.select(
    pl.col("ID"),
    pl.col("Account handle"),
    pl.col("Followers"),
    pl.col("Likes"),
    pl.col("Replies"),
    pl.col("Retweets"),
    pl.col("Quote Tweets")
)

df2.fill_null(0)
df2




ID,Account handle,Followers,Likes,Replies,Retweets,Quote Tweets
i32,str,i64,i64,i64,i64,i64
1,"""@arthur_vee""",4255,259,,94,6
2,"""@ItsJamMagno""",76100,1231,,271,21
3,"""@Brad_Kardinal…",2067,188,,78,15
4,"""@WinwinEklabu""",7082,29,,10,
5,"""@cierloX6""",532,55,,17,1
6,"""@LelangmoVlogg…",257,0,0,0,0
7,"""@iskonglasalis…",6097,5,0,0,0
8,"""@mendozadino""",37,0,,0,
9,"""@dTnalakRiderS…",3275,6,,1,0
10,"""@SLaKaTor""",252,0,,0,0


In [None]:
# Compute the correlation matrix
corr_matrix = df2.to_pandas().corr()

# Visualize the correlation matrix as a heatmap
plt.figure(figsize=(8, 6))
plt.title("Correlation Heatmap")
sns.heatmap(corr_matrix, annot=True, cmap="coolwarm")
plt.show()

In [None]:
# Include only necessary columns
df = df.select(
    pl.col("ID"),
    pl.col("Tweet"),
    pl.col("Tweet Translated").alias("Translated")
)

In [None]:
df

In [None]:
# Change tweet case to lowercase
df = df.select(
    pl.all(),
    pl.col("Translated").apply(lambda tweet: tweet.lower()).alias("Clean")
)

In [None]:
df

In [None]:
# Remove hashtags
df = df.select(
    pl.exclude("Clean"),
    pl.col("Clean").apply(lambda tweet: re.sub("#(\w+)", '', tweet))
)

In [None]:
# Replace all emojis into interpretation
def emoji_to_word(tweet):
  for symbol, interpretation in demoji.findall(tweet).items():
    interpretation = interpretation.lower()
    # Turn flag: Philippines into flagphilippines 
    interpretation = re.sub('[^0-9a-z]+', '', interpretation)
    # replace all emojis to "emojiinterpretation "
    tweet = re.sub(symbol, interpretation+' ', tweet)
  return tweet

df = df.select(
    pl.exclude("Clean"),
    pl.col("Clean").apply(emoji_to_word)
)

In [None]:
# Remove non alphanumeric characters
df = df.select(
    pl.exclude("Clean"),
    pl.col("Clean").apply(lambda tweet: re.sub('[^0-9a-z]+', ' ', tweet))
)

In [None]:
# Cast Tweets to word array instead of long string.
df = df.select(
    pl.all(),
    pl.col("Clean").apply(lambda tweet: tweet.split()).cast(pl.List(str)).alias("Tokenized")
)

In [None]:
# Strip english stopwords
df = df.select(
    pl.all(),
    pl.col("Tokenized").arr.eval(pl.element().filter(~pl.element().is_in(ensw)), parallel=True).alias("Stopwords Removed")
)

In [None]:
# Stem and Lemmatize.
from nltk.stem import PorterStemmer, WordNetLemmatizer
nltk.download('wordnet')
# Initialize the stemmer and lemmatizer
stemmer = PorterStemmer()
lemmatizer = WordNetLemmatizer()

df = df.select(
    pl.all(),
    pl.col("Stopwords Removed").apply(lambda words: [stemmer.stem(word) for word in words.to_list()]).alias("Stemmed"),
    pl.col("Stopwords Removed").apply(lambda words: [lemmatizer.lemmatize(word) for word in words.to_list()]).alias("Lemmatized")
)

In [None]:
print(df)
def serialize(arr):
  return " ".join(arr)
df_serialized = df.select(
    pl.col("ID"),
    pl.col("Tweet"),
    pl.col("Translated"),
    pl.col("Clean"),
    pl.col("Tokenized").apply(serialize).cast(str),
    pl.col("Stopwords Removed").apply(serialize).cast(str),
    pl.col("Stemmed").apply(serialize).cast(str),
    pl.col("Lemmatized").apply(serialize).cast(str)
)
df_serialized.write_csv("clean.csv")