In [1]:
import pandas as pd
import datetime
import numpy as np

from collections import Counter

import re
from unidecode import unidecode
from emoji import UNICODE_EMOJI, demojize, emojize

import neattext.functions as ntf 

In [2]:
## Load the raw tweet export into a DataFrame and preview the first rows
RAW_TWEETS_CSV = 'Data/Kaggle/Twitter Jan Mar.csv'
tweet_df = pd.read_csv(RAW_TWEETS_CSV)
tweet_df.head()

Unnamed: 0,date,id,content,username,like_count,retweet_count
0,2023-03-29 22:58:21+00:00,1641213230730051584,"Free AI marketing and automation tools, strate...",RealProfitPros,0.0,0.0
1,2023-03-29 22:58:18+00:00,1641213218520481805,@MecoleHardman4 Chat GPT says it’s 15. 😂,AmyLouWho321,0.0,0.0
2,2023-03-29 22:57:53+00:00,1641213115684536323,https://t.co/FjJSprt0te - Chat with any PDF!\n...,yjleon1976,0.0,0.0
3,2023-03-29 22:57:52+00:00,1641213110915571715,"AI muses: ""In the court of life, we must all f...",ChatGPT_Thinks,0.0,0.0
4,2023-03-29 22:57:26+00:00,1641213003260633088,Most people haven't heard of Chat GPT yet.\nFi...,nikocosmonaut,0.0,0.0


In [3]:
# Descriptive statistics (count, mean, std, min, quartiles, max) as produced by
# DataFrame.describe(); the recorded output covers like_count and retweet_count.
tweet_df.describe()

Unnamed: 0,like_count,retweet_count
count,499974.0,499974.0
mean,7.123208,1.481523
std,216.366469,46.254101
min,0.0,0.0
25%,0.0,0.0
50%,1.0,0.0
75%,2.0,0.0
max,64094.0,16080.0


In [4]:
# Number of distinct (non-null) values per column of the raw frame
for col, n_unique in tweet_df.nunique().items():
    print(col, ":", n_unique)

date : 475394
id : 500007
content : 493744
username : 250006
like_count : 1066
retweet_count : 489


In [5]:
# Average tweet length, measured in non-whitespace characters.
# Every value is passed through str() first, so a missing tweet is counted as
# the three characters of "nan" rather than being skipped.
count = sum(len(''.join(str(tweet).split())) for tweet in tweet_df['content'])

avg_tweet_length = count / len(tweet_df['content'])

print("Average Tweet length is:", avg_tweet_length)

Average Tweet length is: 142.63451831468134


## **Data Preprocessing**

In [6]:
# Count the missing values in each column
tweet_df.isna().sum()

date              0
id                6
content           6
username         34
like_count       62
retweet_count    62
dtype: int64

In [7]:
# Drop every row that has at least one missing value, reporting the row count
# before and after so the loss is visible.
rows_before = len(tweet_df)
print("length: ", rows_before)
tweet_df = tweet_df.dropna()
rows_after = len(tweet_df)
print("Length: ", rows_after)

length:  500036
Length:  499974


In [8]:
# Reduce the timestamp column to its calendar date:
# parse -> take the date part -> convert back with pd.to_datetime
parsed_timestamps = pd.to_datetime(tweet_df['date'])
calendar_dates = parsed_timestamps.dt.date
tweet_df['date'] = pd.to_datetime(calendar_dates)

In [9]:
# First and last day covered by the dataset, and the span between them
first_day = tweet_df['date'].min()
last_day = tweet_df['date'].max()

print("Start Date: ", first_day)
print("End Date: ", last_day)

span_in_days = (last_day - first_day).days
print("Duration of Dataset: ", span_in_days, 'days')

Start Date:  2023-01-04 00:00:00
End Date:  2023-03-29 00:00:00
Duration of Dataset:  84 days


In [10]:
# Distinct values per column after dropping incomplete rows and truncating dates
for col, n_unique in tweet_df.nunique().items():
    print(col, ":", n_unique)

date : 85
id : 499974
content : 493705
username : 249998
like_count : 1066
retweet_count : 489


In [11]:
# Preview the first rows of the frame at this stage
tweet_df.head()

Unnamed: 0,date,id,content,username,like_count,retweet_count
0,2023-03-29,1641213230730051584,"Free AI marketing and automation tools, strate...",RealProfitPros,0.0,0.0
1,2023-03-29,1641213218520481805,@MecoleHardman4 Chat GPT says it’s 15. 😂,AmyLouWho321,0.0,0.0
2,2023-03-29,1641213115684536323,https://t.co/FjJSprt0te - Chat with any PDF!\n...,yjleon1976,0.0,0.0
3,2023-03-29,1641213110915571715,"AI muses: ""In the court of life, we must all f...",ChatGPT_Thinks,0.0,0.0
4,2023-03-29,1641213003260633088,Most people haven't heard of Chat GPT yet.\nFi...,nikocosmonaut,0.0,0.0


In [12]:
# One preprocessing function to rule them all(almost!)
def pre_process(text):
    """Normalise the text of a single tweet.

    Steps, in order:
      1. Replace the HTML entities ``&amp;``, ``&lt;`` and ``&gt;`` with
         ``and``, ``<`` and ``>``. The trailing ``;`` is optional in the
         pattern, so it is consumed when present instead of being left
         behind (``&amp;`` used to become ``and;``).
      2. Replace runs of carriage returns / line feeds with one space.
      3. Remove ``@mentions`` completely.
      4. Keep hashtag words but drop the ``#`` and any character that is not
         an ASCII letter or digit (``#Chat_GPT`` -> ``ChatGPT``).
      5. Collapse whitespace runs to one space, trim both ends, lowercase.

    Parameters
    ----------
    text : str
        Tweet text to clean.

    Returns
    -------
    str
        The cleaned, lower-cased text (possibly empty).

    Raises
    ------
    TypeError
        Raised by ``re.sub`` if ``text`` is not a string.
    """
    # HTML entities: match the optional ';' so it does not survive the swap
    text = re.sub(r'&amp;?', 'and', text)
    text = re.sub(r'&lt;?', '<', text)
    text = re.sub(r'&gt;?', '>', text)

    # Remove new line characters
    text = re.sub(r'[\r\n]+', ' ', text)

    # Drop user mentions entirely
    text = re.sub(r'@\w+', '', text)

    # Keep the hashtag word, minus '#' and any non-alphanumeric character
    text = re.sub(
        r'#\w+',
        lambda match: re.sub(r'[^a-zA-Z0-9\s]', '', match.group(0)),
        text,
    )

    # Collapse whitespace runs (raw string avoids the invalid '\s' escape),
    # then trim; after collapsing, only single spaces can remain at the ends
    text = re.sub(r'\s+', ' ', text).strip()

    # Convert to lowercase
    return text.lower()

In [13]:
# Build 'processed_content' from 'content' by applying each cleaning step in order:
#   stop words -> emails -> links -> demojize -> unidecode -> pre_process
cleaning_steps = (
    ntf.remove_stopwords,
    ntf.remove_emails,
    ntf.remove_urls,
    demojize,
    unidecode,
    pre_process,
)

tweet_df['processed_content'] = tweet_df['content']
for step in cleaning_steps:
    tweet_df['processed_content'] = tweet_df['processed_content'].apply(step)


In [14]:
# Distinct values per column, now including the new processed_content column
for col, n_unique in tweet_df.nunique().items():
    print(col, ":", n_unique)

date : 85
id : 499974
content : 493705
username : 249998
like_count : 1066
retweet_count : 489
processed_content : 459701


### **Preprocessing 1.2:**

Removing Duplicates, Spam, Redundancy



In [15]:
# How often each exact raw tweet text occurs; show the five most frequent
content_counts = tweet_df['content'].value_counts()
top_raw = content_counts.head(5)
print("Top 5 most frequent tweets in 'content':")
print(top_raw)

Top 5 most frequent tweets in 'content':
Chat GPT                126
@themattmic Chat GPT    121
@Timtwttt Chat GPT       84
te amo chat gpt          74
Chat gpt                 63
Name: content, dtype: int64


In [16]:
# How often each cleaned tweet text occurs; show the five most frequent.
# NOTE: the printed label says 'preprocessed_content', but the column being
# counted is 'processed_content'.
preprocessed_content_counts = tweet_df['processed_content'].value_counts()
top_processed = preprocessed_content_counts.head(5)
print("Top 5 most frequent tweets in 'preprocessed_content':")
print(top_processed)


Top 5 most frequent tweets in 'preprocessed_content':
chat gpt                                                                 4014
midjourney openai gpt stablediffusion2 dalle chatgpt join: imagine ''     645
chat gpt?                                                                 582
ask chat gpt                                                              333
use chat gpt                                                              220
Name: processed_content, dtype: int64


In [17]:
# Preview the first rows, including the processed_content column
tweet_df.head()

Unnamed: 0,date,id,content,username,like_count,retweet_count,processed_content
0,2023-03-29,1641213230730051584,"Free AI marketing and automation tools, strate...",RealProfitPros,0.0,0.0,"free ai marketing automation tools, strategies..."
1,2023-03-29,1641213218520481805,@MecoleHardman4 Chat GPT says it’s 15. 😂,AmyLouWho321,0.0,0.0,chat gpt says it's 15. :face_with_tears_of_joy:
2,2023-03-29,1641213115684536323,https://t.co/FjJSprt0te - Chat with any PDF!\n...,yjleon1976,0.0,0.0,- chat pdf! check new ai quickly answers quest...
3,2023-03-29,1641213110915571715,"AI muses: ""In the court of life, we must all f...",ChatGPT_Thinks,0.0,0.0,"ai muses: ""in court life, face judge destiny j..."
4,2023-03-29,1641213003260633088,Most people haven't heard of Chat GPT yet.\nFi...,nikocosmonaut,0.0,0.0,"people heard chat gpt yet. first, elite factio..."


In [18]:
# Inspect the cleaned text of the row whose index label is 1
tweet_df.loc[1, 'processed_content']

"chat gpt says it's 15. :face_with_tears_of_joy:"

In [19]:
# For each distinct processed_content keep only one tweet: the first one met
# after ordering by like_count (highest first). Then restore index order.
df_sorted = tweet_df.sort_values(by='like_count', ascending=False)
is_repeat = df_sorted.duplicated(subset='processed_content', keep='first')
df_cleaned = df_sorted[~is_repeat]

tweet_df = df_cleaned.sort_index()

In [20]:
# Row count remaining after de-duplication
remaining_rows = len(tweet_df)
print("Length of Dataframe: ", remaining_rows)

Length of Dataframe:  459701


In [21]:
# Keep only the columns needed downstream (drops id and the raw content)
columns_to_keep = ['date', 'username', 'like_count', 'retweet_count', 'processed_content']
tweet_df = tweet_df[columns_to_keep]
tweet_df.head()

Unnamed: 0,date,username,like_count,retweet_count,processed_content
0,2023-03-29,RealProfitPros,0.0,0.0,"free ai marketing automation tools, strategies..."
1,2023-03-29,AmyLouWho321,0.0,0.0,chat gpt says it's 15. :face_with_tears_of_joy:
2,2023-03-29,yjleon1976,0.0,0.0,- chat pdf! check new ai quickly answers quest...
3,2023-03-29,ChatGPT_Thinks,0.0,0.0,"ai muses: ""in court life, face judge destiny j..."
5,2023-03-29,cordydbarb,0.0,0.0,no! chat gpt putting amazing recipes :tired_fa...


This makes the length of our dataframe **459701 from nearly 500k**. A lot of redundancy and repeated content. Remember there are always pros and cons of these steps, but here the pros outweigh the cons for our analysis.

In [22]:
# Write the cleaned frame to disk without the index column.
# index=False is the documented boolean form; the previous index=None only
# worked because None is falsy.
tweet_df.to_csv('Data/cleaded_data.csv', index=False)

In [None]:
# Preview the first rows of the final frame. The call parentheses are required:
# a bare `tweet_df.head` only displays the bound-method repr.
tweet_df.head()