<a href="https://colab.research.google.com/github/NikkiYng/Ex1/blob/main/Exercise_1_.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>


#  **Testing zipf's law of abbreviation on Tweets**

Weibo posts were scraped with key word "ÂçïË∫´Âç≥Âú∞Áã±" (Single's inferno)
(see in the other notebook in the github repo)

tweets were imported from Kraggle with the topic "Squid Game" https://www.kaggle.com/datasets/deepcontractor/squid-game-netflix-twitter-data

In [1]:
# Load libraries
from importlib.util import find_spec

import pandas as pd
import numpy as np

import spacy
import en_core_web_sm

import jieba
import re
from collections import Counter
import nltk
from nltk.probability import FreqDist

**Download data**

In [2]:
# Prepare the English tweets data

tweets_url = 'https://drive.google.com/uc?export=download&id=1QN6t-o_jLl8vA8qD04xXYQXhRHX96dMy'
all_twts_df = pd.read_csv(tweets_url) # the imported twitter data is a raw file with all the unnecessary columns
text1000_twts_df = all_twts_df[['text']].head(1000) # Extract the 'text' column and the first 1000 rows
print(text1000_twts_df)


# Prepare the Chinese weibo data
weibo_url = 'https://drive.google.com/uc?export=download&id=1xPJUJ5l41MUpp4yztiyglxYjKOh6diAk'
wb_df = pd.read_csv(weibo_url)# the imported weibo data is in a csv file that only contains the text column
print(wb_df)


                                                  text
0    When life hits and the same time poverty strik...
1    That marble episode of #SquidGame  ruined me. üò≠üò≠üò≠
2                                      #Squidgame time
3    //Blood on 1st slide\nI'm joining the squidgam...
4    The two first games, players were killed by th...
..                                                 ...
995               Starting episode 9 of #SquidGame now
996  LET ME BE YOUR WOMAN !\n\n#SquidGame #squidgam...
997  What if everyone on the unemployment fraud lis...
998  I think I have downloaded the wrong Squid game...
999  Anyone here finished watching #SquidGame? üëÄ ht...

[1000 rows x 1 columns]
                                                  ÂæÆÂçöÊ≠£Êñá
0                          ÂçïË∫´Âç≥Âú∞Áã±3Á¨¨‰∫îÊúüÊàëÁúüÁöÑÁ¨ëÂñú‰∏çÊÑßÊòØÈü©Â•≥#ÂçïË∫´Âç≥Âú∞Áã±3#
1                          ÂâçÂ§©‰∏ÄÂè£Ê∞îÁúã‰∫Ü‰∏ÉÊúü#ÂçïË∫´Âç≥Âú∞Áã±3#‰ªÄ‰πàÊó∂ÂÄôÊõ¥Êñ∞ÂïäÁùÄÊÄ•Áúã
2                                   

**Lemmatize English tweets**

In [3]:
# Convert to txt file
text1000_twts_df.to_csv('tweet.txt', index=False, header=None, sep='\t') #tweets
wb_df.to_csv('wb.txt', index=False, header=None, sep='\t') #weibo

# Load the spaCy model
nlp = spacy.load("en_core_web_sm")

# Initialize an empty list
lemmatized_tweets = []

# Read the tweet.txt file line by line and lemmatize
with open('tweet.txt', 'r', encoding='utf-8') as file:
    for line in file:
        # Process each line to get a Doc object
        doc = nlp(line.strip())
        # Extract the lemma for each token and filter out punctuation and whitespace
        lemmas = [token.lemma_ for token in doc if not token.is_punct and not token.is_space]
        lemmatized_tweets.append(" ".join(lemmas))

# Save the lemmatized lines to a new file
with open('lemmatized_tweet.txt', 'w', encoding='utf-8') as file:
    for line in lemmatized_tweets:
        file.write(line + "\n")

**Tokenize the txts**

In [4]:
# Tokenize the txt files
with open('lemmatized_tweet.txt', 'r', encoding='utf-8') as file:
    tweets = [word for line in file for word in line.strip().split()]  #tweets

with open('wb.txt', 'r', encoding='utf-8') as file:
    wb_posts = [word for line in file for word in jieba.cut(line.strip())] #weibo

type(tweets)
type(wb_posts)

Building prefix dict from the default dictionary ...
DEBUG:jieba:Building prefix dict from the default dictionary ...
Dumping model to file cache /tmp/jieba.cache
DEBUG:jieba:Dumping model to file cache /tmp/jieba.cache
Loading model cost 0.760 seconds.
DEBUG:jieba:Loading model cost 0.760 seconds.
Prefix dict has been built successfully.
DEBUG:jieba:Prefix dict has been built successfully.


list

**Data cleaning**

In [5]:
# Define cleaning functions with regex

# Remove emails
def remove_emails(text):
    return re.sub(r'[A-Za-z0-9+_]+@[A-Za-z0-9+_]+\.[A-Za-z0-9+_]+', '', text)

# Remove URLs
def remove_urls(text):
    return re.sub(r'http\S+|www\S+|https\S+', '', text, flags=re.MULTILINE)

# Remove hashtags
def remove_hashtags(text):
    return re.sub(r'#\S+', '', text)

# Remove punctuation/special characters
def remove_special_characters(text):
    return re.sub(r'[^A-Za-z0-9\s]', '', text)

# Remove HTML tags
def remove_html_tags(text):
    return re.sub(r'<.*?>', '', text)


In [6]:
# Process the English Tweets
tweets = [word for word in tweets if word != ''] # remove spaces
tweets = [word for word in tweets if word != '' and word.lower() != 'squidgame'] # remove the shows
def process(text): # the final combination of all
    text = text.lower()
    text = remove_hashtags(text)
    text = remove_emails(text)
    text = remove_urls(text)
    text = remove_special_characters(text)
    text = remove_html_tags(text)
    return text.strip()

processed_tweets = [process(tweet) for tweet in tweets]
tweets = processed_tweets # replace the original tweets list with the processed one

tweets = [word for word in tweets if word != '']
print(tweets)

['when', 'life', 'hit', 'and', 'the', 'same', 'time', 'poverty', 'strike', 'you', 'gong', 'yoo', 'lets', 'play', 'a', 'game', 'netflix', 'that', 'marble', 'episode', 'of', 'ruin', 'i', 'time', 'blood', 'on', '1st', 'slide', 'i', 'be', 'join', 'the', 'thing', 'i', 'be', 'already', 'dead', 'by', 'sugar', 'honeycomb', 'ofc', 'the', 'two', 'first', 'game', 'player', 'be', 'kill', 'by', 'the', 'mask', 'guy', 'the', 'bloody', 'night', 'and', 'the', 'third', 'game', 'they', 'kill', 'each', 'o', 'thg', 'go', 'to', 'explode', 'to', '4b', 'marketcap', 'very', 'soon', 'the', 'world', 'first', 'moba', 'this', 'game', 'be', 'on', 'another', 'level', 'kardiachain', 'bhundredhyun', 'pls', 'use', 'that', 'gun', 'on', 'i', 'baekhyun', 'exo', 'weareoneexo', 'please', 'vote', 'in', 'my', 'daily', 'poll', 'thank', 'do', 'you', 'think', 'donny', 'van', 'de', 'beek', 'should', 'leave', 'manchester', 'united', 'yes', 'or', 'no', 'i', 'have', 'see', 'bi', 'lingual', 'korean', 'speaker', 'slam', 'the', 'transl

In [7]:
# Process the Chinese weibo
wb_posts = [word for word in wb_posts if word != ''] # remove spaces
wb_posts = [word for word in wb_posts if word != 'ÂçïË∫´'] # remove the shows name
wb_posts = [word for word in wb_posts if word != 'Âú∞Áã±']


cleaned_wb_posts = []
for post in wb_posts:
    post = remove_emails(post)
    post = remove_urls(post)
    post = remove_hashtags(post)
    post = remove_html_tags(post)
    post = re.sub(r'[^\w\s\u4e00-\u9fff]', '', post)# avoid using remove_special_characters bcuz it removes Chinese characters
    segmented_post = ' '.join(jieba.cut(post))
    cleaned_wb_posts.append(segmented_post)

wb_posts = cleaned_wb_posts # replace the original tweets list with the processed one
print(wb_posts)

['Âç≥', '3', 'Á¨¨‰∫îÊúü', 'Êàë', 'ÁúüÁöÑ', 'Á¨ë', 'Âñú', '‰∏çÊÑß', 'ÊòØ', 'Èü©Â•≥', '', 'Âç≥', '3', '', 'ÂâçÂ§©', '‰∏ÄÂè£Ê∞î', 'Áúã', '‰∫Ü', '‰∏ÉÊúü', '', 'Âç≥', '3', '', '‰ªÄ‰πà', 'Êó∂ÂÄô', 'Êõ¥Êñ∞', 'Âïä', 'ÁùÄÊÄ•', 'Áúã', 'Áúã', 'Âç≥', '3', 'Â•Ω', 'ÂñúÊ¨¢', 'Â∑ßÂÖãÂäõ', 'Âì•', '11', 'Áúã', '‰∫Ü', 'ÊúÄÊñ∞', 'ÁöÑ', 'Âç≥', 'ÂêÉ', '‰∫Ü', 'ÈÖ∏Ë±ÜËßí', 'ÁÇíËÇâ', 'Â•Ω‰πÖÊ≤°', 'ÂêÉ', 'Á±≥È•≠', '‰∫Ü', 'ÊòéÂ§©', 'ÂºÄÂßã', 'Â∑•‰Ωú', 'Âä†Ê≤π', 'ÂêÉ', '‰∫Ü', 'Ë§™ÈªëÁ¥†', 'ÂáÜÂ§á', 'Áù°Ëßâ', 'ÊïàÊûú', 'Â§™Â•Ω‰∫Ü', 'ÂêÉ', '‰∫Ü', 'Âêé', 'ÂçÅÂá†ÂàÜÈíü', 'Â∑≤Áªè', 'Âõ∞Âæó', 'Ë∑ü', 'Áãó', '‰∏ÄÊ†∑', '‰∫Ü', '‰∏ÄÂêë', '‰∏ç', 'ÊÄéÈ∫º', 'Âñú', 'Ê≠°ÊàÄ', 'Á∂ú‰ΩÜ', 'ÂñÆË∫´', 'Âç≥', 'Âú∞ÁçÑ', '3', '‰ΩúÁÇ∫', 'ÂñúÂäá', 'Áúã', 'Êå∫', '‰∏çÈåØ', 'Áé∞ÂÆû', 'BE', 'ÂëúÂëú', 'ÂëúÂøÉ', 'Â•Ω', 'Áóõ', '', '', 'ÊÅãÁªº', '', 'Âç≥', '', 'ÂÆãÊô∫ÈõÖ', '', 'L', 'Áî∑Â•≥', 'ÈÇ£‰πà‰∫õ', '‰∫ã', 'ÁöÑ', 'ÂæÆÂçö', 'ËßÜÈ¢ë', 'ÊÉ≥', 'Áúã', 'Âç≥', 'Âíå', 'Êç¢‰πò', 'ÊÅãÁà±', '‰∫Ü', '', '', 'Ââ™Ëæë', 'Â§™', 'Â•ΩÁúã', 'Á¨¨‰∏ÄÊ¨°', 'ÊÉ≥', '

**Test for the law**

In [10]:
from collections import Counter
from math import log

#English
# Count the frequency of each word
word_freq = Counter(tweets)

# Create a list of tuples (word, frequency) sorted by frequency in descending order
freq_sorted = sorted(word_freq.items(), key=lambda x: x[1], reverse=True)

# Assign rank and calculate log frequency
word_rank_logfreq = [(word, freq, rank+1, log(freq)) for rank, (word, freq) in enumerate(freq_sorted)]

print(word_rank_logfreq)

[('be', 523, 1, 6.259581464064923), ('the', 459, 2, 6.129050210060545), ('i', 409, 3, 6.013715156042802), ('to', 222, 4, 5.402677381872279), ('of', 198, 5, 5.288267030694535), ('game', 196, 6, 5.278114659230517), ('a', 187, 7, 5.231108616854587), ('you', 181, 8, 5.198497031265826), ('it', 170, 9, 5.135798437050262), ('and', 168, 10, 5.123963979403259), ('not', 159, 11, 5.0689042022202315), ('in', 141, 12, 4.948759890378168), ('squid', 138, 13, 4.927253685157205), ('watch', 134, 14, 4.897839799950911), ('have', 132, 15, 4.882801922586371), ('that', 118, 16, 4.770684624465665), ('for', 111, 17, 4.709530201312334), ('do', 110, 18, 4.700480365792417), ('this', 106, 19, 4.663439094112067), ('on', 100, 20, 4.605170185988092), ('netflix', 94, 21, 4.543294782270004), ('just', 75, 22, 4.31748811353631), ('so', 74, 23, 4.30406509320417), ('get', 66, 24, 4.189654742026425), ('what', 63, 25, 4.143134726391533), ('episode', 61, 26, 4.110873864173311), ('if', 60, 27, 4.0943445622221), ('all', 60, 28

In [13]:
#Chinese
word_freq2 = Counter(wb_posts)


freq_sorted2 = sorted(word_freq2.items(), key=lambda x: x[1], reverse=True)

# Assign rank and calculate log frequency
word_rank_logfreq2 = [(word, freq, rank+1, log(freq)) for rank, (word, freq) in enumerate(freq_sorted2)]

print(word_rank_logfreq2)

[('', 5619, 1, 8.633908991112198), ('ÁöÑ', 1777, 2, 7.482681828154651), ('‰∫Ü', 1052, 3, 6.9584483932976555), ('Âç≥', 1043, 4, 6.949856455000773), ('Êàë', 689, 5, 6.535241271013659), ('3', 514, 6, 6.2422232654551655), ('ÊòØ', 454, 7, 6.118097198041348), ('Áúã', 385, 8, 5.953243334287785), ('Âú®', 326, 9, 5.786897381366708), ('ÁúüÁöÑ', 300, 10, 5.703782474656201), ('ÈÉΩ', 283, 11, 5.645446897643238), ('ÂñúÊ¨¢', 268, 12, 5.5909869805108565), ('‰πü', 250, 13, 5.521460917862246), ('‰ªñ', 244, 14, 5.497168225293202), ('‰∏ç', 242, 15, 5.488937726156687), ('Âíå', 242, 16, 5.488937726156687), ('Âæà', 233, 17, 5.4510384535657), ('Â•Ω', 217, 18, 5.37989735354046), ('ÂæÆÂçö', 213, 19, 5.3612921657094255), ('Â•≥', 209, 20, 5.342334251964811), ('Âïä', 198, 21, 5.288267030694535), ('ËØ¥', 193, 22, 5.262690188904886), ('ËßÜÈ¢ë', 191, 23, 5.25227342804663), ('ÊÅãÁªº', 188, 24, 5.236441962829949), ('Â∞±', 186, 25, 5.225746673713202), ('Â•π', 185, 26, 5.220355825078324), ('Âì•', 179, 27, 5.1873858058407