<a href="https://colab.research.google.com/github/NikkiYng/Ex1/blob/main/Exercise_1_.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>


#  **Testing zipf's law of abbreviation on Tweets**

Weibo posts were scraped with key word "单身即地狱" (Single's inferno)
(see in the other notebook in the github repo)

tweets were imported from Kraggle with the topic "Squid Game" https://www.kaggle.com/datasets/deepcontractor/squid-game-netflix-twitter-data

In [83]:
# Load libraries
from importlib.util import find_spec

import pandas as pd
import numpy as np

import spacy
import en_core_web_sm

import jieba
import re
from collections import Counter
import nltk
from nltk.probability import FreqDist

**Download data**

In [84]:
# Prepare the English tweets data

tweets_url = 'https://drive.google.com/uc?export=download&id=1QN6t-o_jLl8vA8qD04xXYQXhRHX96dMy'
all_twts_df = pd.read_csv(tweets_url) # the imported twitter data is a raw file with all the unnecessary columns
text1000_twts_df = all_twts_df[['text']].head(1000) # Extract the 'text' column and the first 1000 rows
print(text1000_twts_df)


# Prepare the Chinese weibo data
weibo_url = 'https://drive.google.com/uc?export=download&id=1xPJUJ5l41MUpp4yztiyglxYjKOh6diAk'
wb_df = pd.read_csv(weibo_url)# the imported weibo data is in a csv file that only contains the text column
print(wb_df)


                                                  text
0    When life hits and the same time poverty strik...
1    That marble episode of #SquidGame  ruined me. 😭😭😭
2                                      #Squidgame time
3    //Blood on 1st slide\nI'm joining the squidgam...
4    The two first games, players were killed by th...
..                                                 ...
995               Starting episode 9 of #SquidGame now
996  LET ME BE YOUR WOMAN !\n\n#SquidGame #squidgam...
997  What if everyone on the unemployment fraud lis...
998  I think I have downloaded the wrong Squid game...
999  Anyone here finished watching #SquidGame? 👀 ht...

[1000 rows x 1 columns]
                                                  微博正文
0                          单身即地狱3第五期我真的笑喜不愧是韩女#单身即地狱3#
1                          前天一口气看了七期#单身即地狱3#什么时候更新啊着急看
2                                       看单身即地狱3好喜欢巧克力哥
3    1.1看了最新的单身即地狱吃了酸豆角炒肉好久没吃米饭了明天开始工作加油吃了褪黑素准备睡觉效果...
4                             一向不怎麼喜歡戀綜但

**Lemmatize English tweets**

In [85]:
# Convert to txt file
text1000_twts_df.to_csv('tweet.txt', index=False, header=None, sep='\t') #tweets
wb_df.to_csv('wb.txt', index=False, header=None, sep='\t') #weibo

# Load the spaCy model
nlp = spacy.load("en_core_web_sm")

# Initialize an empty list
lemmatized_tweets = []

# Read the tweet.txt file line by line and lemmatize
with open('tweet.txt', 'r', encoding='utf-8') as file:
    for line in file:
        # Process each line to get a Doc object
        doc = nlp(line.strip())
        # Extract the lemma for each token and filter out punctuation and whitespace
        lemmas = [token.lemma_ for token in doc if not token.is_punct and not token.is_space]
        lemmatized_tweets.append(" ".join(lemmas))

# Save the lemmatized lines to a new file
with open('lemmatized_tweet.txt', 'w', encoding='utf-8') as file:
    for line in lemmatized_tweets:
        file.write(line + "\n")

**Tokenize the txts**

In [86]:
# Tokenize the txt files
with open('lemmatized_tweet.txt', 'r', encoding='utf-8') as file:
    tweets = [word for line in file for word in line.strip().split()]  #tweets

with open('wb.txt', 'r', encoding='utf-8') as file:
    wb_posts = [word for line in file for word in jieba.cut(line.strip())] #weibo

type(tweets)
type(wb_posts)

list

**Data cleaning**

In [87]:
# Define cleaning functions with regex

# Remove emails
def remove_emails(text):
    return re.sub(r'[A-Za-z0-9+_]+@[A-Za-z0-9+_]+\.[A-Za-z0-9+_]+', '', text)

# Remove URLs
def remove_urls(text):
    return re.sub(r'http\S+|www\S+|https\S+', '', text, flags=re.MULTILINE)

# Remove hashtags
def remove_hashtags(text):
    return re.sub(r'#\S+', '', text)

# Remove punctuation/special characters
def remove_special_characters(text):
    return re.sub(r'[^A-Za-z0-9\s]', '', text)

# Remove HTML tags
def remove_html_tags(text):
    return re.sub(r'<.*?>', '', text)


In [88]:
# Process the English Tweets
tweets = [word for word in tweets if word != ''] # remove spaces
tweets = [word for word in tweets if word != 'squidgame'] # remove the shows name
tweets = [word for word in tweets if word != 'Game']

def process(text): # the final combination of all
    text = text.lower()
    text = remove_hashtags(text)
    text = remove_emails(text)
    text = remove_urls(text)
    text = remove_special_characters(text)
    text = remove_html_tags(text)
    return text.strip()

processed_tweets = [process(tweet) for tweet in tweets]
tweets = processed_tweets # replace the original tweets list with the processed one

tweets = [word for word in tweets if word != '']
print(tweets)

['when', 'life', 'hit', 'and', 'the', 'same', 'time', 'poverty', 'strike', 'you', 'gong', 'yoo', 'lets', 'play', 'a', 'game', 'squidgame', 'netflix', 'that', 'marble', 'episode', 'of', 'squidgame', 'ruin', 'i', 'time', 'blood', 'on', '1st', 'slide', 'i', 'be', 'join', 'the', 'thing', 'i', 'be', 'already', 'dead', 'by', 'sugar', 'honeycomb', 'ofc', 'squidgame', 'the', 'two', 'first', 'game', 'player', 'be', 'kill', 'by', 'the', 'mask', 'guy', 'the', 'bloody', 'night', 'and', 'the', 'third', 'game', 'they', 'kill', 'each', 'o', 'thg', 'go', 'to', 'explode', 'to', '4b', 'marketcap', 'very', 'soon', 'the', 'world', 'first', 'moba', 'this', 'game', 'be', 'on', 'another', 'level', 'kardiachain', 'bhundredhyun', 'pls', 'use', 'that', 'gun', 'on', 'i', 'baekhyun', 'exo', 'weareoneexo', 'please', 'vote', 'in', 'my', 'daily', 'poll', 'thank', 'do', 'you', 'think', 'donny', 'van', 'de', 'beek', 'should', 'leave', 'manchester', 'united', 'yes', 'or', 'no', 'i', 'have', 'see', 'bi', 'lingual', 'kor

In [89]:
# Process the Chinese weibo
wb_posts = [word for word in wb_posts if word != ''] # remove spaces
wb_posts = [word for word in wb_posts if word != '单身'] # remove the shows name
wb_posts = [word for word in wb_posts if word != '地狱']


cleaned_wb_posts = []
for post in wb_posts:
    post = remove_emails(post)
    post = remove_urls(post)
    post = remove_hashtags(post)
    post = remove_html_tags(post)
    post = re.sub(r'[^\w\s\u4e00-\u9fff]', '', post)# avoid using remove_special_characters bcuz it removes Chinese characters
    segmented_post = ' '.join(jieba.cut(post))
    cleaned_wb_posts.append(segmented_post)

wb_posts = cleaned_wb_posts # replace the original tweets list with the processed one
print(wb_posts)

['即', '3', '第五期', '我', '真的', '笑', '喜', '不愧', '是', '韩女', '', '即', '3', '', '前天', '一口气', '看', '了', '七期', '', '即', '3', '', '什么', '时候', '更新', '啊', '着急', '看', '看', '即', '3', '好', '喜欢', '巧克力', '哥', '11', '看', '了', '最新', '的', '即', '吃', '了', '酸豆角', '炒肉', '好久没', '吃', '米饭', '了', '明天', '开始', '工作', '加油', '吃', '了', '褪黑素', '准备', '睡觉', '效果', '太好了', '吃', '了', '后', '十几分钟', '已经', '困得', '跟', '狗', '一样', '了', '一向', '不', '怎麼', '喜', '歡戀', '綜但', '單身', '即', '地獄', '3', '作為', '喜劇', '看', '挺', '不錯', '现实', 'BE', '呜呜', '呜心', '好', '痛', '', '', '恋综', '', '即', '', '宋智雅', '', 'L', '男女', '那么些', '事', '的', '微博', '视频', '想', '看', '即', '和', '换乘', '恋爱', '了', '', '', '剪辑', '太', '好看', '第一次', '想', '看恋', '综', '有点', '小', '期待', '比起', '伊甸园', '和', '即', '换乘', '恋爱', '还是', '换乘', '恋爱', '啊', '刷到', '即', '三未播', '花絮', '', '有', '一点点', '能', 'get', '到', '男五', '了', '', '但是', '我', '还是', '磕', '男', '一女', '一', '', '在', '游轮', '上', '哪里', '也', '太 美', '了', '', '再次', '感叹', '好', '喜欢', '女一奎利', '', '太 美', '了', '', '就是', '镜头', '太', '少', '', '还有', '我们', '狐狸',

**Test for the law**

In [90]:

#English tweets
print("English tweets:")
freq_book1 = Counter(tweets)
print("top 10")
print(freq_book1.most_common(10))
print("last 10")
print(freq_book1.most_common()[-10:])

#Chinese weibo
print("Chinese weibo:")
freq_book2 = Counter(wb_posts)
print("top 10")
print(freq_book2.most_common(10))
print("last 10")
print(freq_book2.most_common()[-10:])


English tweets:
top 10
[('be', 523), ('squidgame', 480), ('the', 459), ('i', 409), ('to', 222), ('of', 198), ('a', 187), ('you', 181), ('it', 170), ('and', 168)]
last 10
[('indiain', 1), ('stick', 1), ('bridge', 1), ('bbnajia', 1), ('emmarose', 1), ('count', 1), ('unemployment', 1), ('fraud', 1), ('place', 1), ('punishment', 1)]
Chinese weibo:
top 10
[('', 5619), ('的', 1777), ('了', 1052), ('即', 1043), ('我', 689), ('3', 514), ('是', 454), ('看', 385), ('在', 326), ('真的', 300)]
last 10
[('大戏', 1), ('型哥', 1), ('不活', 1), ('性缩', 1), ('力哥', 1), ('快速', 1), ('分着', 1), ('奥巧', 1), ('周姐', 1), ('糖葫芦', 1)]
