In [20]:
import pandas as pd
from datetime import date
import os
# Library to handle emojis in tweets
import demoji
# Library to use regular expressions
import re
import string
# Python library to handle natural language
from nltk.corpus import stopwords,wordnet
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import TweetTokenizer
from nltk import pos_tag
import nltk

In [27]:
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Pranay\AppData\Roaming\nltk_data...


True

In [6]:
# Create an identifier containing the path to the folder
folder = "twitter_data"

# Read each file into a dataframe and store them in a list
dfs = []
count = 0
for file_name in os.listdir(folder):
    file_path = os.path.join(folder, file_name)
    df = pd.read_csv(file_path)
    dfs.append(df)
    
# Merge the dataframes 
tweets = pd.concat(dfs, axis=0, ignore_index=True)  
tweets.head()


Unnamed: 0,Tweet_ID,Time_Created,Text,Likes,Retweets,Location
0,1744146496243822931,2024-01-07 23:58:24+00:00,"@SelfMadeMastery Best: $NVDA, $CRWD, $META, $T...",1,0,
1,1744146280576926128,2024-01-07 23:57:32+00:00,Most Notable #Earnings Week of JAN 8th\n\n◦ Mo...,0,0,
2,1744145386850422822,2024-01-07 23:53:59+00:00,🇺🇸 U.S. ECONOMIC DATA 2ND WEEK\n\nTHURS.\n◦ U....,0,0,
3,1744145096592007411,2024-01-07 23:52:50+00:00,$MARA We nailed this play. I kept on hammering...,3,0,"Plan the Trade, Trade the Plan"
4,1743200305033175350,2024-01-05 09:18:34+00:00,$MARA Scaled out of this position yesterday\n\...,1,0,"Plan the Trade, Trade the Plan"


In [7]:
tweets.shape

(287059, 6)

In [8]:
# Checking for duplicates
tweets.duplicated(subset=['Tweet_ID']).sum()

16732

In [9]:
# Dropping duplicates
tweets = tweets.drop_duplicates(subset=['Tweet_ID'], keep='first').reset_index(drop=True)
tweets.shape

(270327, 6)

In [10]:
# Making sure duplicates were dropped
tweets.duplicated(subset=['Tweet_ID']).sum()

0

In [11]:
# Return the number of missing values in each column of the dataset
tweets.isnull().sum()

Tweet_ID             0
Time_Created         0
Text                 0
Likes                0
Retweets             0
Location        116472
dtype: int64

In [12]:
# Drop the location attribute
tweets  = tweets.drop("Location", axis = 1)
tweets.head()

Unnamed: 0,Tweet_ID,Time_Created,Text,Likes,Retweets
0,1744146496243822931,2024-01-07 23:58:24+00:00,"@SelfMadeMastery Best: $NVDA, $CRWD, $META, $T...",1,0
1,1744146280576926128,2024-01-07 23:57:32+00:00,Most Notable #Earnings Week of JAN 8th\n\n◦ Mo...,0,0
2,1744145386850422822,2024-01-07 23:53:59+00:00,🇺🇸 U.S. ECONOMIC DATA 2ND WEEK\n\nTHURS.\n◦ U....,0,0
3,1744145096592007411,2024-01-07 23:52:50+00:00,$MARA We nailed this play. I kept on hammering...,3,0
4,1743200305033175350,2024-01-05 09:18:34+00:00,$MARA Scaled out of this position yesterday\n\...,1,0


In [13]:
# Convert the time created attribute to dates 
tweets['Time_Created'] = pd.to_datetime(tweets['Time_Created']).dt.date
tweets.head()

Unnamed: 0,Tweet_ID,Time_Created,Text,Likes,Retweets
0,1744146496243822931,2024-01-07,"@SelfMadeMastery Best: $NVDA, $CRWD, $META, $T...",1,0
1,1744146280576926128,2024-01-07,Most Notable #Earnings Week of JAN 8th\n\n◦ Mo...,0,0
2,1744145386850422822,2024-01-07,🇺🇸 U.S. ECONOMIC DATA 2ND WEEK\n\nTHURS.\n◦ U....,0,0
3,1744145096592007411,2024-01-07,$MARA We nailed this play. I kept on hammering...,3,0
4,1743200305033175350,2024-01-05,$MARA Scaled out of this position yesterday\n\...,1,0


In [14]:
# Define the start and end dates of the range
start_date = date(2024, 1, 1)
end_date = date(2024, 3, 31)
# Filter the DataFrame based on the date range
filtered_tweets = tweets[(tweets['Time_Created'] >= start_date) & (tweets['Time_Created'] <= end_date)]
filtered_tweets.shape

(269199, 5)

In [22]:
# This function is used to pass the POS tage for each word passed through clean_text function
def get_wordnet_pos(word):
    """Map POS tag to first character lemmatize() accepts"""
    tag = pos_tag([word])[0][1][0].upper()
    tag_dict = {"J": wordnet.ADJ,
                "N": wordnet.NOUN,
                "V": wordnet.VERB,
                "R": wordnet.ADV}

    return tag_dict.get(tag, wordnet.NOUN)

In [23]:
# Cleaning tweets
def clean_text(text):
    # Initialization the twitter tokenizer
    tk = TweetTokenizer(preserve_case=False, strip_handles=True,reduce_len=True) 
    # Initialization the lemmatizer
    lemmatizer = WordNetLemmatizer()  
    # Trying to avoid deleting the negative verbs as it affects the meaning of the tweets.
    stop_words = stopwords.words('english') + ["i'll","i'm", "should", "could"]
    negative_verbs = [ "shan't",'shouldn',"shouldn't",'wasn','weren','won','wouldn','aren','couldn','didn','doesn','hadn','hasn','haven','isn','ma','mightn','mustn',"mustn't",'needn',"needn't","wouldn't","won't","weren't","wasn't","couldn","not","nor","no","mightn't","isn't","haven't","hadn't","hasn't","didn't","doesn't","aren't","don't","couldn't","never"]
    stop_words =[word for word in stop_words if word not in negative_verbs ] 
    
    # Lowering tweets
    lower_tweet = text.lower() 
    # Removing hashtag and cashtag symbols
    tweet = re.sub(r"[#$]"," ",lower_tweet)
    # Removing links from tweets
    tweet = re.sub(r"https?:\/\/.*[\r\n]*"," ", tweet)
    # Translating emojies into thier descriptions
    tweet = demoji.replace_with_desc(tweet)
    # removing numerical values
    tweet = re.sub(r"[0-9]|-->","",tweet)
    # Tokenize the tweets by twitter tokenzier.
    tweet = tk.tokenize(tweet)
    # Choosing the words that don't exist in stopwords, thier lengths are more than 2 letters and then lemmatize them.
    tweet = [lemmatizer.lemmatize(word, get_wordnet_pos(word)) for word in tweet if word not in stop_words and word not in string.punctuation and len(word)>2 and "." not in word]
    # return the tokens in one sentence 
    tweet = " ".join(tweet)
    
    return tweet

In [28]:
# Applying text cleaning and then saving the filtered tweets as a csv file
filtered_tweets['cleaned'] = filtered_tweets["Text"].apply(lambda row:clean_text(row))
filtered_tweets.to_csv("twitter_data/NVDA_final-tweets")
print("Filtered tweets added to the folder")

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_tweets['cleaned'] = filtered_tweets["Text"].apply(lambda row:clean_text(row))


Filtered tweets added to the folder


In [29]:
filtered_tweets.head()

Unnamed: 0,Tweet_ID,Time_Created,Text,Likes,Retweets,cleaned
0,1744146496243822931,2024-01-07,"@SelfMadeMastery Best: $NVDA, $CRWD, $META, $T...",1,0,best nvda crwd meta tsla bad enph use oppurtun...
1,1744146280576926128,2024-01-07,Most Notable #Earnings Week of JAN 8th\n\n◦ Mo...,0,0,notable earnings week jan mon accd tues tlry a...
2,1744145386850422822,2024-01-07,🇺🇸 U.S. ECONOMIC DATA 2ND WEEK\n\nTHURS.\n◦ U....,0,0,flag united state economic data week thurs cpi...
3,1744145096592007411,2024-01-07,$MARA We nailed this play. I kept on hammering...,3,0,mara nail play kept hammer risk reward hence l...
4,1743200305033175350,2024-01-05,$MARA Scaled out of this position yesterday\n\...,1,0,mara scale position yesterday last time specul...
